# Vectorstores and Embeddings

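Recall the overall workflow for retrieval-augmented generation (RAG): load the documents, split them into chunks, embed the chunks and store them in a vector store, then retrieve the chunks most relevant to a question.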

In [None]:
from langchain.document_loaders import PyPDFLoader

# PDF files to index, in load order.
# NOTE(review): the same file is listed four times, so its content is loaded
# four times. A later cell asks about "the third lecture", which suggests
# other lecture files were meant to be included - confirm which files exist.
pdf_paths = [
    "docs/MachineLearning-Lecture01.pdf",
    "docs/MachineLearning-Lecture01.pdf",
    "docs/MachineLearning-Lecture01.pdf",
    "docs/MachineLearning-Lecture01.pdf",
]
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Flatten the documents returned by every loader into one list.
docs = [loaded for pdf_loader in loaders for loaded in pdf_loader.load()]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter shared by the cells below: chunk size 1500 with an overlap of 150
# between consecutive chunks.
r_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [None]:
# Split every loaded document into chunks with the splitter configured above.
splits = r_splitter.split_documents(docs)
# Number of chunks produced (displayed as the cell's output).
len(splits)

## Embeddings

Let's take our splits and embed them.

In [None]:
from langchain.embeddings import OllamaEmbeddings

# Embedding model used both for the sentence-similarity demo and for building
# the vector store further down.
embedding = OllamaEmbeddings(model="nomic-embed-text")

In [None]:
# Two sentences with nearly the same meaning plus one unrelated sentence;
# their embeddings are compared in the cells below.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [None]:
# Embed the three sentences, in order, with the same model so the resulting
# vectors can be compared with each other.
embedding1, embedding2, embedding3 = (
    embedding.embed_query(text)
    for text in (sentence1, sentence2, sentence3)
)

In [None]:
import numpy as np

In [None]:
# Similarity score for the two near-synonymous sentences ("dogs" / "canines").
# NOTE(review): a raw dot product equals cosine similarity only when both
# vectors have unit length - confirm the embedding model returns normalized
# vectors before comparing scores across pairs.
np.dot(embedding1, embedding2)

In [None]:
# Similarity score between the "dogs" sentence and the unrelated weather sentence.
np.dot(embedding1, embedding3)

In [None]:
# Similarity score between the "canines" sentence and the unrelated weather sentence.
np.dot(embedding2, embedding3)

## Vectorstores

In [None]:
from langchain.vectorstores import Chroma

In [None]:
# Directory passed to Chroma as its persist directory (on-disk vector store).
# NOTE(review): the name is a misspelling of "persistent"; it is left as-is
# because a later cell references it by this exact name.
presistant_directory = "docs/chroma/"

In [None]:
# Remove old database files, if any, before rebuilding the vector store.
# shutil.rmtree replaces the `!rm -rf` shell escape, which is not valid Python
# outside IPython and requires a POSIX shell. It also reuses the configured
# directory instead of repeating the path. ignore_errors=True keeps the
# "if any" behavior: a missing directory is not an error.
import shutil

shutil.rmtree(presistant_directory, ignore_errors=True)

In [None]:
# Build the vector store: embed every chunk with the embedding model and keep
# the result in the configured persist directory.
vectordb = Chroma.from_documents(
    persist_directory=presistant_directory,
    embedding=embedding,
    documents=splits,
)

In [None]:
# Sanity check: number of entries in the underlying collection
# (expected to match len(splits) - confirm).
# NOTE(review): `_collection` is a private attribute and may change between
# library versions.
print(vectordb._collection.count())


### Similarity Search

In [None]:
# Natural-language query used for the similarity search below.
question = "is there an email i can ask for help"

In [None]:
# Ask the vector store for the 3 chunks closest to the question.
# Note: this rebinds `docs`, which until now held the loaded PDF documents.
docs = vectordb.similarity_search(question, k=3)

In [None]:
# Number of results returned (3 were requested).
len(docs)

In [None]:
# Text of the first returned result.
docs[0].page_content

In [None]:
# Write the vector store to its persist directory.
# NOTE(review): presumably redundant or deprecated in newer Chroma releases
# that persist automatically - confirm against the installed version.
vectordb.persist()

## Failure modes

This seems great, and basic similarity search will get you 80% of the way there very easily. 

But there are some failure modes that can creep up. 

Here are some edge cases that can arise - we'll fix them in the next lesson.

In [None]:
# Query used to illustrate the first failure mode below.
question = "what did they say about matlab?"

In [None]:
# Ask the vector store for the 5 chunks closest to the question.
docs = vectordb.similarity_search(question, k=5)

In [None]:
# First returned result; compare it with the next cell.
docs[0]

In [None]:
# Second returned result. Because the same PDF is loaded four times in the
# loaders cell, this is likely a duplicate of the first result.
docs[1]

In [None]:
# Query that names a specific source ("the third lecture").
# NOTE(review): the loaders cell only lists Lecture01, so no result can
# actually come from a third lecture - confirm the intended inputs.
question = "what did they say about regression in the third lecture?"

In [None]:
# Ask the vector store for the 5 chunks closest to the question.
docs = vectordb.similarity_search(question, k=5)

In [None]:
# Print every returned document so the results can be inspected one by one.
for doc in docs:
    print(doc)

In [None]:
# Text of the fifth result (index 4, the last of the 5 requested).
docs[4].page_content