In [2]:
# Supports retrieval of data from vector DBs for RAG applications
# Concepts: Documents, vector stores, retrievers


In [None]:
# ! pip install langchain-chroma

In [None]:
from dotenv import load_dotenv
# Load environment variables from a local .env file before any client is created.
# NOTE(review): presumably this is where the OpenAI API key used by the later cells lives — confirm.
load_dotenv()

### Sample Docs

In [4]:
from langchain_core.documents import Document

# Each Document pairs `page_content` (the text itself) with `metadata`
# (information about the source document and its relationship to other documents).
# The sample corpus is listed as (text, source) pairs and wrapped in one pass.
documents = [
    Document(page_content=text, metadata={"source": origin})
    for text, origin in (
        (
            "Dogs are great companions, known for their loyalty and friendliness.",
            "mammal-pets-doc",
        ),
        (
            "Cats are independent pets that often enjoy their own space.",
            "mammal-pets-doc",
        ),
        (
            "Goldfish are popular pets for beginners, requiring relatively simple care.",
            "fish-pets-doc",
        ),
        (
            "Parrots are intelligent birds capable of mimicking human speech.",
            "bird-pets-doc",
        ),
        (
            "Rabbits are social animals that need plenty of space to hop around.",
            "mammal-pets-doc",
        ),
    )
]

### Vector Store

Vector search is a common way to store and search over unstructured data. Each piece of text is associated with a numerical vector (an embedding); all vectors have the same dimension, so vector similarity can be used to identify related data.
LangChain `VectorStore` objects contain methods to add text and `Document` objects to the store and to query them by similarity. Embedding models convert text to vectors.

Different vector store technologies:
1. Postgres - separate infrastructure, run locally or by a third party
2. Chroma - in-memory implementation; this is what we use for now

We will use OpenAI embeddings.

In [5]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embed every sample document with OpenAI embeddings and add it to a Chroma vector store.
# Explicit, deterministic ids are passed so that the cell is idempotent.
# NOTE(review): without ids the store is expected to assign fresh ids on every call, so
# re-running this cell in the same kernel would insert the same documents again and
# duplicate search results — confirm against the langchain-chroma documentation.
vectorstore = Chroma.from_documents(
    documents,
    embedding=OpenAIEmbeddings(),
    ids=[f"pet-doc-{position}" for position in range(len(documents))],
)

Calling `.from_documents` here will add the documents to the vector store. VectorStore implements methods for adding documents that can also be called after the object is instantiated. Most implementations will allow you to connect to an existing vector store-- e.g., by providing a client, index name, or other information. See the documentation for a specific integration for more detail.

Once we've instantiated a `VectorStore` that contains documents, we can query it. VectorStore includes methods for querying:

1. Synchronously and asynchronously;
2. By string query and by vector;
3. With and without returning similarity scores;
4. By similarity and maximum marginal relevance (to balance similarity to the query against diversity in the retrieved results).


In [6]:
# Query the store with a plain string; the recorded output below lists the matching Documents.
vectorstore.similarity_search("cat")

[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'})]

In [7]:
# Async variant of the same string query. The bare top-level `await` relies on the
# notebook kernel allowing awaits directly in a cell.
await vectorstore.asimilarity_search("cat")  # async query

[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'})]

In [8]:
# Returns (Document, score) pairs. The score is the metric reported by Chroma — a distance,
# so the best match carries the smallest value (see the recorded output below). Other
# vector stores may report a different similarity metric.
vectorstore.similarity_search_with_score("cat")

[(Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
  0.375326931476593),
 (Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
  0.4833090305328369),
 (Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
  0.4958883225917816),
 (Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'}),
  0.4974174499511719)]

In [9]:
# Embed the query text ourselves, then search with the raw vector instead of a string.
embedding = OpenAIEmbeddings().embed_query("cat")

# Same lookup as the string query, but driven by the precomputed embedding.
vectorstore.similarity_search_by_vector(embedding)

[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'})]

### Retrievers

LangChain `VectorStore` objects do not subclass `Runnable`, and so cannot immediately be integrated into LangChain Expression Language chains.

LangChain `Retrievers` are Runnables, so they implement a standard set of methods (e.g., synchronous and asynchronous invoke and batch operations) and are designed to be incorporated in LCEL chains.

In [10]:
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

# Wrap the vector store's search method in a Runnable and pin k=1,
# so every query yields only its single best-matching document.
retriever = (
    RunnableLambda(vectorstore.similarity_search)
    .bind(k=1)
)

# batch() takes a list of queries and returns one result list per query.
retriever.batch(["cat", "shark"])

[[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'})],
 [Document(page_content='Goldfish are popular pets for beginners, requiring relatively simple care.', metadata={'source': 'fish-pets-doc'})]]

Vector stores implement an `as_retriever` method that will generate a Retriever, specifically a `VectorStoreRetriever`. These retrievers include specific `search_type` and `search_kwargs` attributes that identify which methods of the underlying vector store to call.

In [11]:
# Ask the vector store for a retriever that runs a similarity search
# and returns one document per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=dict(k=1))

# One result list comes back for each query in the batch.
retriever.batch(["cat", "shark"])

[[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'})],
 [Document(page_content='Goldfish are popular pets for beginners, requiring relatively simple care.', metadata={'source': 'fish-pets-doc'})]]

`VectorStoreRetriever` supports search types of `similarity` (default), `mmr` (maximum marginal relevance, described above), and `similarity_score_threshold`. We can use the latter to threshold documents output by the retriever by similarity score.

### Sample RAG implementation

In [12]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt template: the model is told to answer from the retrieved context only.
# {question} and {context} are filled in by the chain below.
messages = """
Answer this question using the provided context only.

{question}

Context:
{context}
"""

model = ChatOpenAI(model = "gpt-3.5-turbo")

# Each prompt message must be a (role, template) tuple. Passing the bare list
# ["human", messages] is read as two separate human messages — one containing the
# literal text "human" — so the role and template are wrapped in a single tuple.
prompt = ChatPromptTemplate.from_messages([("human", messages)])

# The chain input (the user's question) is sent to the retriever to produce "context"
# and passed through unchanged as "question"; both feed the prompt, then the model.
rag_chain = {"context" : retriever , "question" : RunnablePassthrough()} | prompt | model

response = rag_chain.invoke("tell me about cats")
response.content

'Cats are independent pets that often enjoy their own space.'