In [1]:
import os
import pickle
import json
from typing import List, Dict
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

In [2]:

class AdvancedStudyBuddyAI:
    def __init__(self, pdf_directory: str, db_path: str = "studybuddy_vectordb"):
        self.pdf_directory = pdf_directory
        self.db_path = db_path
        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        self.vectorstore = None
        self.qa_chain = None
        self.llm = Ollama(model="llama3.1", temperature=0.2)

    def load_and_process_pdfs(self):
        documents = []
        for filename in os.listdir(self.pdf_directory):
            if filename.endswith('.pdf'):
                file_path = os.path.join(self.pdf_directory, filename)
                loader = PyPDFLoader(file_path)
                documents.extend(loader.load())

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        chunks = text_splitter.split_documents(documents)

        self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
        self.vectorstore.save_local(self.db_path)
        print(f"Processed and saved {len(chunks)} chunks from {len(documents)} documents.")

    def load_vectorstore(self):
        if os.path.exists(self.db_path):
            self.vectorstore = FAISS.load_local(self.db_path, self.embeddings, allow_dangerous_deserialization=True)
            print("Vector database loaded successfully.")
        else:
            print("No existing vector database found. Please process PDFs first.")

    def setup_qa_chain(self):
        if self.vectorstore is None:
            print("Please load or process documents before setting up the QA chain.")
            return

        retriever = self.vectorstore.as_retriever(search_kwargs={"k": 4})
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True
        )

    def query(self, question: str) -> Dict:
        if self.qa_chain is None:
            print("Please set up the QA chain before querying.")
            return {"answer": "QA chain not set up", "sources": []}

        result = self.qa_chain({"query": question})
        return {
            "answer": result['result'],
            "sources": [doc.page_content for doc in result['source_documents']]
        }

    def generate_and_save_flashcards(self, topic: str, num_cards: int):
        if self.vectorstore is None:
            print("Please load or process documents before generating flashcards.")
            return

        # Retrieve relevant chunks based on the topic
        relevant_chunks = self.vectorstore.similarity_search(topic, k=num_cards)
        
        flashcards = []
        for chunk in relevant_chunks:
            content = chunk.page_content
            flashcard_prompt = (
                f"Based on the following content about '{topic}', generate a flashcard with a question and answer:\n\n"
                f"{content}\n\n"
                f"Format the response as JSON with 'question' and 'answer' keys. "
                f"Ensure the question and answer are directly related to the topic '{topic}'."
            )
            
            response = self.llm(flashcard_prompt)
            try:
                flashcard = json.loads(response)
                flashcards.append(flashcard)
            except json.JSONDecodeError:
                print(f"Failed to parse flashcard JSON: {response}")

        # Save flashcards to a text file
        if not os.path.exists("flashcards"):
            os.makedirs("flashcards")
        
        filename = os.path.join("flashcards", f"{topic.replace(' ', '_')}.txt")
        with open(filename, "w") as f:
            for i, card in enumerate(flashcards, 1):
                f.write(f"Flashcard {i}:\n")
                f.write(f"Q: {card['question']}\n")
                f.write(f"A: {card['answer']}\n\n")
        
        print(f"Generated and saved {len(flashcards)} flashcards on the topic '{topic}' to {filename}")


In [3]:
def save_state(studybuddy: AdvancedStudyBuddyAI, filename: str = "advanced_studybuddy_state.pkl"):
    with open(filename, "wb") as f:
        pickle.dump(studybuddy, f)
    print(f"State saved to {filename}")

def load_state(filename: str = "advanced_studybuddy_state.pkl") -> AdvancedStudyBuddyAI:
    if os.path.exists(filename):
        with open(filename, "rb") as f:
            return pickle.load(f)
    return AdvancedStudyBuddyAI("documents/")

In [4]:
studybuddy = load_state()


  from tqdm.autonotebook import tqdm, trange


In [5]:
while True:
    print("\nStudyBuddy AI - Main Menu")
    print("1. Process PDF Documents")
    print("2. Load Vector Database")
    print("3. Setup QA Chain")
    print("4. Ask a Question")
    print("5. Generate Flashcards")
    print("6. Quit")

    choice = input("Enter your choice (1-6): ")

    if choice == "1":
        studybuddy.load_and_process_pdfs()
    elif choice == "2":
        studybuddy.load_vectorstore()
    elif choice == "3":
        studybuddy.setup_qa_chain()
    elif choice == "4":
        question = input("Enter your question: ")
        result = studybuddy.query(question)
        print(f"Answer: {result['answer']}")
        print("\nSources:")
        for i, source in enumerate(result['sources'], 1):
            print(f"{i}. {source[:200]}...")
    elif choice == "5":
        topic = input("Enter the topic for the flashcards: ")
        num_cards = int(input("How many flashcards do you want to generate? "))
        studybuddy.generate_and_save_flashcards(topic, num_cards)
    elif choice == "6":
        print("Thank you for using Advanced StudyBuddy AI. Goodbye!")
        break
    else:
        print("Invalid choice. Please try again.")


StudyBuddy AI - Main Menu
1. Process PDF Documents
2. Load Vector Database
3. Setup QA Chain
4. Ask a Question
5. Generate Flashcards
6. Quit
Vector database loaded successfully.

StudyBuddy AI - Main Menu
1. Process PDF Documents
2. Load Vector Database
3. Setup QA Chain
4. Ask a Question
5. Generate Flashcards
6. Quit


  response = self.llm(flashcard_prompt)


Failed to parse flashcard JSON: Here is the flashcard in JSON format:

```
{
  "question": "What data structure does the heap-sort algorithm start with?",
  "answer": "A single array"
}
```
Failed to parse flashcard JSON: However, I don't see any content about "heap vs stack" in your message. Please provide me with some text or context that discusses the differences between heaps and stacks, so I can create a flashcard for you.

Once you provide the necessary information, I'll format the response as JSON with 'question' and 'answer' keys.
Failed to parse flashcard JSON: Here is the flashcard:

```json
{
  "question": "What data structure is more complex than a MeldableHeap?",
  "answer": "A leftist heap, binomial heap, Fibonacci heap, pairing heap, or skew heap"
}
```

Let me know if you'd like me to adjust anything!
Failed to parse flashcard JSON: Here is the flashcard in JSON format:

```json
{
  "question": "What is a key difference between a heap and a stack?",
  "answer": "A heap 