In [2]:
from langchain.document_loaders import TextLoader

loader = TextLoader("./bella_vista.txt")
docs = loader.load()

In [4]:
print(docs)
print(len(docs))

[Document(page_content="Q: What are the hours of operation for Bella Vista?\nA: Bella Vista is open from 11 a.m. to 11 p.m. from Monday to Saturday. On Sundays, we welcome guests from 12 p.m. to 10 p.m.\n\nQ: What type of cuisine does Bella Vista serve?\nA: Bella Vista offers a delightful blend of Mediterranean and contemporary American cuisine. We pride ourselves on using the freshest ingredients, many of which are sourced locally.\n\nQ: Do you offer vegetarian or vegan options at Bella Vista?\nA: Absolutely! Bella Vista boasts a diverse menu that includes a variety of vegetarian and vegan dishes. Our chefs are also happy to customize dishes based on dietary needs.\n\nQ: Is Bella Vista family-friendly?\nA: Yes, Bella Vista is a family-friendly establishment. We have a dedicated kids' menu and offer high chairs and booster seats for our younger guests.\n\nQ: Can I book private events at Bella Vista?\nA: Certainly! Bella Vista has a private dining area perfect for events, parties, or co

In [6]:
from langchain.schema import Document

example_doc = Document(page_content="test", metadata={"important_info": "hi there"})
example_doc

Document(page_content='test', metadata={'important_info': 'hi there'})

Texts are not loaded 1:1 into the database, but in pieces, so called "chunks". You can define the chunk size and the overlap between the chunks.

To create multiple documents (chunks), you can use a text splitter.

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)
documents = text_splitter.split_documents(docs)

[Document(page_content='Q: What are the hours of operation for Bella Vista?', metadata={'source': './bella_vista.txt'}), Document(page_content='A: Bella Vista is open from 11 a.m. to 11 p.m. from Monday to Saturday. On Sundays, we welcome', metadata={'source': './bella_vista.txt'}), Document(page_content='Sundays, we welcome guests from 12 p.m. to 10 p.m.', metadata={'source': './bella_vista.txt'}), Document(page_content='Q: What type of cuisine does Bella Vista serve?', metadata={'source': './bella_vista.txt'}), Document(page_content='A: Bella Vista offers a delightful blend of Mediterranean and contemporary American cuisine. We', metadata={'source': './bella_vista.txt'}), Document(page_content='cuisine. We pride ourselves on using the freshest ingredients, many of which are sourced locally.', metadata={'source': './bella_vista.txt'}), Document(page_content='Q: Do you offer vegetarian or vegan options at Bella Vista?', metadata={'source': './bella_vista.txt'}), Document(page_content='

In [8]:
for doc in documents:
    print(doc)

page_content='Q: What are the hours of operation for Bella Vista?' metadata={'source': './bella_vista.txt'}
page_content='A: Bella Vista is open from 11 a.m. to 11 p.m. from Monday to Saturday. On Sundays, we welcome' metadata={'source': './bella_vista.txt'}
page_content='Sundays, we welcome guests from 12 p.m. to 10 p.m.' metadata={'source': './bella_vista.txt'}
page_content='Q: What type of cuisine does Bella Vista serve?' metadata={'source': './bella_vista.txt'}
page_content='A: Bella Vista offers a delightful blend of Mediterranean and contemporary American cuisine. We' metadata={'source': './bella_vista.txt'}
page_content='cuisine. We pride ourselves on using the freshest ingredients, many of which are sourced locally.' metadata={'source': './bella_vista.txt'}
page_content='Q: Do you offer vegetarian or vegan options at Bella Vista?' metadata={'source': './bella_vista.txt'}
page_content='A: Absolutely! Bella Vista boasts a diverse menu that includes a variety of vegetarian and veg

### Embeddings

Texts are not stored as text in the database, but as vector representations. 
Embeddings are a type of word representation that represents the semantic meaning of words in a vector space.

In [9]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [30]:
embedding1 = embeddings.embed_query(text="The solar system consists of the Sun and the objects that orbit it")
print(embedding1)
print(len(embedding1))

[0.0249906582088126, 0.021015434321445474, -0.010260039364017913, -0.022352891815948222, -0.04037142684617507, 0.022328124222542264, -0.005455096440938061, -0.01635909695880139, -0.0011060347697468343, -0.009436511760995372, 0.009027843675829195, 0.005390081275416761, -0.01441482876923956, 0.002015475486265269, -0.005944259204673611, -0.01544269041484519, 0.011157870678580945, -0.004198133159343381, 0.006439614332421954, -0.0037677936627776402, 0.0023544839000699, -0.007027848401103956, -0.01147985125550365, -0.0001495739758757932, 0.017225968315945663, -0.00417336556593742, 0.011690376730776927, -0.031900857747270424, 0.022811095087926322, -0.00937459277748047, 0.020247633337925043, -0.010904000517863327, -0.019987572675839844, -0.01554176171979164, 0.0155665293131976, 0.004102158036403327, 0.0071021516469831425, 0.01001236156731309, -0.03204946703299662, 0.004099062320058234, 0.011498426484896818, 0.03353553008793513, -0.004671816410030858, -0.005312681847531178, -0.01281730968132900

In [31]:
embedding2 = embeddings.embed_query(text="The solar system consists of the Sun and the objects that orbit it")
embedding3 = embeddings.embed_query(text="Planets, asteroids, and comets are part of our solar system.")
embedding4 = embeddings.embed_query(text="I love baking chocolate chip cookies on weekends.")

In [32]:
import numpy as np

def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    return dot_product / (norm_a * norm_b)

In [34]:
sim_1_2 = cosine_similarity(embedding1, embedding2)
sim_1_4 = cosine_similarity(embedding1, embedding3)
sim_3_4 = cosine_similarity(embedding3, embedding4)

print(sim_1_2, sim_1_4, sim_3_4)


0.9999986122240551 0.901053907123304 0.6977501000551984


### Loading Vectors into VectorDB (FAISS)

As created by OpenAIEmbeddings vectors and documents can now be stored in the database. This DB can be stored as .pkl file

In [35]:
from langchain.vectorstores.faiss import FAISS
import pickle

vectorstore = FAISS.from_documents(documents, embeddings)

with open("vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)

### Loading the database

Before using the database, it must of course be loaded again.

In [42]:
with open("vectorstore.pkl", "rb") as f:
    vectorstore = pickle.load(f)


In [43]:
retriever = vectorstore.as_retriever()

In [41]:
docs = retriever.get_relevant_documents(query="When are the opening hours??")
for doc in docs:
    print(doc)

page_content='Q: What are the hours of operation for Bella Vista?' metadata={'source': './bella_vista.txt'}
page_content='Sundays, we welcome guests from 12 p.m. to 10 p.m.' metadata={'source': './bella_vista.txt'}
page_content='A: While walk-ins are always welcome, we recommend making a reservation, especially during weekends' metadata={'source': './bella_vista.txt'}
page_content='during weekends and holidays, to ensure a seamless dining experience.' metadata={'source': './bella_vista.txt'}


### Now we have to pass the documents to an LLM.

We create a prompt with a question and context. Context is the output from the retriever (Document Store).
LangChain provides chains out-of-the-box to do that, the RetrievalChains

In [45]:
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

from langchain.prompts import PromptTemplate

prompt_template = """You are a helpful assistant for our restaurant.

{context}

Question: {question}
Answer here:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

chain_type_kwargs = {"prompt": PROMPT}

llm = OpenAI()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
)

result = qa.run(query="When are the opening hours??")
print(result)

 Our regular hours of operation are Monday-Saturday 11am-9pm and Sundays 12pm-10pm.
