In [1]:
#Import all the necessary modules
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate

import os
import shutil
import spacy


In [2]:
# Filesystem locations shared by the cells below.
#   CHROMA_PATH: directory the Chroma vector store is persisted to; it is
#                deleted and rebuilt by generate_data_store().
#   DATA_PATH:   directory that generate_data_store() scans for "*.txt" files.
CHROMA_PATH = "chroma"
DATA_PATH = "data/"

# Prompt sent to the chat model by query_data(). It has two placeholders:
#   {context}  - the retrieved chunks, joined with "---" separators
#   {question} - the user's query text
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [3]:
#Generate word embeddings using Spacy instead of OpenAI
class SpacyEmbeddings:
    """
    Embedding adapter that turns text into spaCy document vectors.

    It exposes the ``embed_documents`` / ``embed_query`` pair of methods so
    that an instance can be handed to Chroma as its embedding function, as
    the rest of this notebook does.
    """

    def __init__(self, model_name='en_core_web_md'):
        """
        Load the spaCy pipeline used to compute the vectors.

        Args:
            model_name (str): Name of an installed spaCy pipeline. Defaults to
                'en_core_web_md', the model this class originally hard-coded.
        """
        self.nlp = spacy.load(model_name)

    def embed_documents(self, texts):
        """
        Embed a collection of documents using spaCy.

        Args:
            texts (list): The document strings to embed.

        Returns:
            list: One embedding (a list of floats) per input document, in the
            same order as ``texts``. An empty input yields an empty list.
        """
        # nlp.pipe processes the texts as a batched stream and yields the Doc
        # objects in input order; this is spaCy's recommended way to process
        # many texts, rather than calling nlp(text) once per document.
        return [doc.vector.tolist() for doc in self.nlp.pipe(texts)]

    def embed_query(self, text):
        """
        Embed a single query string using spaCy.

        Args:
            text (str): The query text.

        Returns:
            list: The query embedding as a list of floats.
        """
        return self.nlp(text).vector.tolist()

In [4]:
#Define the function that loads our source text, chunks it, and stores it so it can later inform the LLM
def generate_data_store(data_path=None, chroma_path=None, chunk_size=500, chunk_overlap=100):
    """
    Build a fresh Chroma vector store from the text files in a directory.

    The documents are loaded, split into overlapping character chunks,
    embedded with SpacyEmbeddings, and persisted to disk. Any store already
    present at the target path is deleted first, but only once there is
    something to replace it with.

    Args:
        data_path (str, optional): Directory to load "*.txt" files from.
            Defaults to the module-level DATA_PATH.
        chroma_path (str, optional): Directory to persist the store to.
            Defaults to the module-level CHROMA_PATH.
        chunk_size (int): Maximum chunk size, in characters.
        chunk_overlap (int): Overlap between consecutive chunks, in characters.

    Returns:
        Chroma: The newly created vector store.

    Raises:
        ValueError: If loading and splitting produced no chunks. The existing
            store, if any, is left untouched in that case.
    """
    # Resolve the defaults at call time so that edits to the config cell are
    # picked up without having to re-run this definition.
    if data_path is None:
        data_path = DATA_PATH
    if chroma_path is None:
        chroma_path = CHROMA_PATH

    #Load the documents
    loader = DirectoryLoader(data_path, glob="*.txt")
    documents = loader.load()

    #Split the text into smaller, manageable chunks
    # Smaller chunks make each search hit more focused on, and relevant to,
    # the question being asked
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, #Size in number of characters
        chunk_overlap=chunk_overlap, #Overlap size in number of characters
        length_function=len,
        add_start_index=True, #Records each chunk's offset as 'start_index' metadata
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    #Bail out BEFORE touching the existing database: wiping a working store
    # only to replace it with nothing would silently destroy the index
    if not chunks:
        raise ValueError(
            f"No text chunks were produced from {data_path!r}; "
            f"the existing store at {chroma_path!r} was left unchanged."
        )

    #In order to query each chunk, we need to save them as a database
    # We use ChromaDB for this, which uses vector embeddings as the key
    #Clear the database if one already exists
    if os.path.exists(chroma_path):
        shutil.rmtree(chroma_path)

    #Get spacy embeddings
    spacy_embeddings = SpacyEmbeddings()

    #Create a new database from our loaded documents, using spaCy for the embeddings
    # Save the database to disk to reuse later or deploy elsewhere (e.g. a lambda function)
    db = Chroma.from_documents(
        chunks, spacy_embeddings, persist_directory=chroma_path
    )
    print(f"Saved {len(chunks)} chunks to {chroma_path}.")
    return db

# Build the vector store now. Note that this deletes any existing directory at
# CHROMA_PATH before saving the new one. query_data() below does not use this
# `db` object; it reopens the persisted store from CHROMA_PATH itself.
db = generate_data_store()

Split 1 documents into 1582 chunks.
Saved 1582 chunks to chroma.


In [14]:
#Now we can query the database of stored data chunks
# to find the passages most relevant to a question and answer it
def query_data(query_text, k=3):
    """
    Answer a question using chunks retrieved from the persisted Chroma store.

    The store at CHROMA_PATH is opened with SpacyEmbeddings, the ``k`` best
    matching chunks are retrieved, PROMPT_TEMPLATE is filled in with them,
    and the prompt is sent to ChatOpenAI. The raw search results, the prompt,
    and the final response with its sources are all printed.

    Args:
        query_text (str): The question to answer.
        k (int): Number of chunks to retrieve as context. Defaults to 3.

    Returns:
        None. All output is printed.
    """
    #Prepare the chroma database we created earlier
    embedding_function = SpacyEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    #Search the database for the chunks that best match our query
    # This will help ensure we get a relevant response
    # NOTE(review): the recorded run of this notebook shows negative scores,
    # so with these embeddings the scores are presumably not confined to
    # [0, 1] - check before using them as a threshold
    results = db.similarity_search_with_relevance_scores(query_text, k=k)
    print(results)

    #The above returns a list of tuples containing a document and its relevance score
    if not results:
        print("Unable to find matching results.")
        return

    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    model = ChatOpenAI()
    #invoke() supersedes the deprecated predict(); it returns a message
    # object whose .content attribute holds the reply text
    response_text = model.invoke(prompt).content

    sources = [doc.metadata.get("source") for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

# Example query against the persisted store. Prints the retrieved chunks with
# their scores, the filled-in prompt, and the model's answer with its sources.
# NOTE(review): ChatOpenAI() is constructed without arguments, so credentials
# presumably come from the environment - confirm before running.
query_data("Who is Ender Wiggin?")



[(Document(metadata={'source': 'data\\EndersGame.txt', 'start_index': 490806}, page_content='"What then? What happened to them? When they failed?"\n\n"Why does it matter, Ender?"\n\nEnder didn\'t answer.\n\n"None of them failed at this point in their course, Ender. You made a mistake with Petra. She\'ll recover. But Petra is Petra, and you are you."\n\n"Part of what I am is her. Is what she made me."'), -580.582984231293), (Document(metadata={'source': 'data\\EndersGame.txt', 'start_index': 248793}, page_content='"Are you helping Peter?" asked Graff.\n\nShe didn\'t answer.\n\n"Is Peter such a very bad person, Valentine?"\n\nShe nodded.\n\n"Is Peter the worst person in the world?"\n\n"How can he be? I don\'t know. He\'s the worst person I know."\n\n"And yet you and Ender are his brother and sister. You have the same genes, the same parents, how can he be so bad if--"'), -619.4594423000128), (Document(metadata={'source': 'data\\EndersGame.txt', 'start_index': 13723}, page_content='This i

  warn_deprecated(


Response: Ender Wiggin is a character in the story who is dealing with the influence and manipulation of his siblings, Peter and Valentine, as well as facing challenges and pressure in his training to potentially save the world from an alien threat.
Sources: ['data\\EndersGame.txt', 'data\\EndersGame.txt', 'data\\EndersGame.txt']
