### Document Loaders

In [None]:
from dotenv import load_dotenv, find_dotenv

# Locate a .env file with find_dotenv() and load its variables into the environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the cells below — confirm.
load_dotenv(find_dotenv())

In [None]:
from langchain_community.document_loaders import TextLoader

# Read the restaurant text file; `docs` is the list of loaded documents
# that the text splitter consumes further down.
loader = TextLoader("./bella_vista.txt")
docs = loader.load()

In [None]:
# Show the loaded documents followed by how many there are (one item per line).
print(docs, len(docs), sep="\n")

In [None]:
from langchain.schema import Document

# A Document is just text (page_content) plus a free-form metadata dict.
example_doc = Document(
    page_content="test",
    metadata={"important_info": "hi there"},
)
example_doc

Texts are not loaded 1:1 into the database but in pieces, so-called "chunks". You can define the chunk size and the overlap between consecutive chunks.

To create multiple documents (chunks), you can use a text splitter.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of at most 100 characters,
# with 20 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)

documents = text_splitter.split_documents(docs)

In [None]:
# Print every chunk produced by the splitter, then display the chunk count.
for chunk in documents:
    print(chunk)

len(documents)

### Embeddings

Texts are not stored as text in the database, but as vector representations.
Embeddings are numerical vectors that represent the semantic meaning of a piece of text (a word, a sentence or a whole chunk) in a vector space, so that texts with similar meaning end up close to each other.

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding model client with default settings; reused below for the example
# queries and for building/loading the FAISS vector store.
embeddings = OpenAIEmbeddings()

In [None]:
# Embed one sentence, then print the raw vector and its dimensionality.
embedding1 = embeddings.embed_query(
    text="The solar system consists of the Sun and the objects that orbit it"
)
print(embedding1, len(embedding1), sep="\n")

In [None]:
# Three more embeddings for the similarity comparison: the same sentence as
# embedding1, a related sentence, and an unrelated one (embedded in this order).
embedding2, embedding3, embedding4 = (
    embeddings.embed_query(text=sentence)
    for sentence in (
        "The solar system consists of the Sun and the objects that orbit it",
        "Planets, asteroids, and comets are part of our solar system.",
        "I love baking chocolate chip cookies on weekends.",
    )
)

In [None]:
import numpy as np

def cosine_similarity(A, B):
    """Return the cosine similarity between two vectors.

    Args:
        A: First vector (any array-like of numbers, e.g. a list of floats).
        B: Second vector, with the same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``: 1 for vectors pointing in the same
        direction, 0 for orthogonal vectors, -1 for opposite vectors.
        If either vector has zero norm the similarity is undefined; 0.0 is
        returned in that case instead of ``nan`` with a RuntimeWarning.

    Raises:
        ValueError: Propagated from ``np.dot`` when the shapes are incompatible.
    """
    # Convert once so list inputs are not re-converted by every numpy call.
    vec_a = np.asarray(A)
    vec_b = np.asarray(B)

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # Guard against 0/0 for all-zero vectors.
        return np.float64(0.0)
    return np.dot(vec_a, vec_b) / denominator

In [None]:
# Compare: identical sentences (1 vs 2), related sentences (1 vs 3),
# and unrelated sentences (3 vs 4).
sim_1_2, sim_1_3, sim_3_4 = (
    cosine_similarity(left, right)
    for left, right in (
        (embedding1, embedding2),
        (embedding1, embedding3),
        (embedding3, embedding4),
    )
)

print(sim_1_2, sim_1_3, sim_3_4)


### Loading Vectors into VectorDB (FAISS)

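The vectors created by OpenAIEmbeddings can now be stored in the database together with their documents. The database can then be saved to disk with `save_local`, which writes the FAISS index and a pickled (.pkl) document store into a folder.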

In [None]:
# Import FAISS from langchain_community (the package this notebook already uses for
# TextLoader); the old `langchain.vectorstores.faiss` import path is deprecated.
from langchain_community.vectorstores import FAISS

# Embed every chunk and build the FAISS index from the resulting vectors.
vectorstore = FAISS.from_documents(documents, embeddings)

# Persist the store to the local "index" folder.
vectorstore.save_local("index") # newer FAISS versions can not be serialized with pickle

### Loading the database

Before using the database, it must of course be loaded again.

In [None]:
# load_local unpickles the stored docstore, so recent langchain_community versions
# refuse to load unless deserialization is explicitly allowed. That is acceptable
# here only because the "index" folder was written by this notebook itself; never
# enable this flag for index files that come from an untrusted source.
# NOTE(review): older langchain_community releases may not accept this keyword —
# confirm against the installed version.
vectorstore = FAISS.load_local(
    "index",
    embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
# Wrap the vector store in a retriever with default search settings.
retriever = vectorstore.as_retriever()

In [None]:
# Query the retriever through `invoke` (the same Runnable API the QA chain at the end
# of the notebook uses); `get_relevant_documents` is deprecated.
# The result gets its own name so it does not overwrite `docs`, the loaded source
# documents that the text-splitter cell reads.
relevant_docs = retriever.invoke("When are the opening hours??")
for doc in relevant_docs:
    print(doc)


In [None]:
# Negative example: `filter` and `k` are passed directly to get_relevant_documents,
# which does not have the intended effect (see the inline note below).
# NOTE(review): presumably the retriever ignores these extra keyword arguments — confirm.
# The next cell shows the working variant, configuring them via `search_kwargs`.
docs = retriever.get_relevant_documents(query="When are the opening hours?", filter={'source': './bella_vista.txt'}, k=3)
for doc in docs:
    print(doc) # does not work!

In [None]:
# The metadata filter and the number of results (k) are configured on the retriever
# itself via `search_kwargs`. This retriever is also the one handed to the QA chain below.
retriever = vectorstore.as_retriever(search_kwargs={"filter": {'source': './bella_vista.txt'}, "k":1})

# Use `invoke` (get_relevant_documents is deprecated) and keep the result under its own
# name so `docs`, the loaded source documents, is not overwritten.
filtered_docs = retriever.invoke("When are the opening hours??")
for doc in filtered_docs:
    print(doc)

### Now we have to pass the documents to an LLM.

We create a prompt with a question and context. The context is the output of the retriever (the document store).
LangChain provides chains out of the box to do that, for example `RetrievalQA`.

In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

from langchain.prompts import PromptTemplate

# Prompt for the QA chain: {context} and {question} are the two template variables;
# the context is the retriever output, the question is the user's query.
prompt_template = (
    "You are a helpful assistant for our restaurant.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Answer here:"
)
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = {"prompt": PROMPT}

# Chat model with default settings, combined with the retriever into a "stuff" QA chain.
llm = ChatOpenAI()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
)

# Ask a question and print whatever the chain returns.
result = qa.invoke(input="When are the opening hours on sunday??")
print(result)