In [1]:
#Import all the necessary modules
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate

import openai
import os
import shutil

Create an OpenAI account and an API key; the later cells need them.
1. Sign up at https://platform.openai.com/signup
2. Create an API key at https://platform.openai.com/account/api-keys
3. Set the environment variable OPENAI_API_KEY to the key value from step 2 before running the next cell.


In [2]:
#Read the OpenAI key from the environment (raises KeyError if OPENAI_API_KEY is unset)
openai.api_key = os.environ["OPENAI_API_KEY"]

#Locations used by the rest of the notebook
DATA_PATH = "data/"     #Folder the source .txt files are loaded from
CHROMA_PATH = "chroma"  #Folder the Chroma database is persisted to

#Prompt sent to the chat model; {context} and {question} are filled in per query
PROMPT_TEMPLATE = (
    "\n"
    "Answer the question based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "---\n"
    "\n"
    "Answer the question based on the above context: {question}\n"
)

In [3]:
#Create the function to load our text to use for informing our LLM
def generate_data_store():
    """Build the Chroma vector store at CHROMA_PATH from the text files in DATA_PATH.

    Loads the files matching ``*.txt`` through DirectoryLoader, splits them into
    overlapping chunks, embeds the chunks with OpenAIEmbeddings and persists the
    result to CHROMA_PATH. An existing directory at CHROMA_PATH is deleted first,
    so the store is always rebuilt from scratch.

    Raises:
        ValueError: if loading and splitting produced no chunks. The check runs
            before the old store is deleted, so an existing store is left intact.
    """
    #Load the documents
    loader = DirectoryLoader(DATA_PATH, glob="*.txt")
    documents = loader.load()

    #Split the text into smaller, manageable chunks
    # This helps, when we search through the data, with the chunk being more
    # useful and relevant to what answer we might be looking for
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, #Size in number of characters
        chunk_overlap=100, #Overlap size in number of characters
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    #Stop before touching the existing database when there is nothing to index;
    # otherwise a missing or empty data folder would wipe a working store
    if not chunks:
        raise ValueError(
            f"No text chunks were produced from {DATA_PATH!r}; "
            f"the existing store at {CHROMA_PATH!r} was left untouched."
        )

    #In order to query each chunk, we need to save them as a database
    # We use ChromaDB for this, which uses vector embeddings as the key
    #Clear the database if one already exists
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    #Create a new database from our loaded documents, using OpenAI for the embeddings
    # Save the database to disk to reuse later or save elsewhere (e.g. a lambda function)
    Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

#Build the Chroma store from the files in DATA_PATH (replaces any existing store at CHROMA_PATH)
generate_data_store()

Split 1 documents into 1582 chunks.
Saved 1582 chunks to chroma.


In [5]:
#Now we can query the database with our uploaded
# data chunks to find a relevant response
def query_data(query_text, k=3, min_relevance=0.7):
    """Answer ``query_text`` using context retrieved from the persisted Chroma store.

    Opens the database at CHROMA_PATH, retrieves up to ``k`` chunks for the
    query, fills PROMPT_TEMPLATE with them and asks ChatOpenAI for an answer.
    The prompt, then the response and the chunk sources, are printed.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve and place in the prompt context.
        min_relevance: Minimum relevance score the first result must reach;
            below it, a "no match" message is printed and the model is not called.

    Returns:
        None. All output is printed.
    """
    #Prepare the chroma database we created earlier
    embedding_function = OpenAIEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    #Search the database for the chunks that best match our query
    # This will help ensure we get a relevant response
    results = db.similarity_search_with_relevance_scores(query_text, k=k)

    #The above returns a list of tuples containing a document and its relevance score
    # Here we ensure the first match has a relevance of at least min_relevance before moving on
    # (presumably results come back best-first -- confirm against the vector store docs)
    if len(results) == 0 or results[0][1] < min_relevance:
        print("Unable to find matching results.")
        return

    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    #invoke() replaces the deprecated predict(); it returns a message object
    # whose .content holds the reply text
    model = ChatOpenAI()
    response_text = model.invoke(prompt).content

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

#Example query: retrieve context from the store and print the model's answer with its sources
query_data("What is the name of Ender's sister?")


Human: 
Answer the question based only on the following context:

Ender remembered his own brother, and the memory was not fond.

---

"While I killed billions."

"I wasn't going to say that."

"So he wanted to use me?"

"He had plans for you, Ender. He would publicly reveal himself when you arrived, going to meet you in front of all the videos. Ender Wiggin's older brother, who also happened to be the great Locke, the architect of peace. Standing next to you, he would look quite mature. And the physical resemblance between you is stronger than ever. It would be quite simple for him, then, to take over."

---

this isn't Fairyland anymore. It's beyond the End of the World, and--"

"I know the names of the places, I just don't know what ney mean."

"Fairyland was programmed in. It's mentioned in a few other places. But nothing talks

about the End of the World. We don't have any experience with it."

"I don't like having the computer screw around with Ender's mind that way. Peter Wiggin