# Chapter 3. Retrieval: How to Chat with Your Data with RAG

# Basic Retrival-Augmented Generation pipeline

## Indexing
This stage involves preprocessing the external data source and storing embeddings that represent the data in a vector store where they can be easily retrieved.

## Retrieval
This stage involves retrieving the relevant embeddings and data stored in the Vector Store based on a user’s query.

## Generation
This stage involves synthesizing the original prompt with the retrieved relevant documents as one final prompt sent to the model for a prediction.

In [1]:
# Install langchain-chroma and lark, which is required for using the self-query retriever
# See here for details: https://python.langchain.com/docs/how_to/self_query/
%pip install --upgrade --quiet lark langchain-chroma

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Import modules
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI, OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import chain
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_community.utils.math import cosine_similarity
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from typing import Literal
from pydantic import BaseModel, Field

# Import modules for the SQL database query example
import sqlite3
import requests
from langchain_community.utilities.sql_database import SQLDatabase
from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool
from langchain.chains import create_sql_query_chain
from sqlalchemy import create_engine
from sqlalchemy.pool import StaticPool

In [3]:
## Load the document 
loader = TextLoader("TeachingwithGenerativeAI.txt")
doc = loader.load()

## Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
chunks = text_splitter.split_documents(doc)

## Define the embedding model
embed_model = OpenAIEmbeddings()

# Create the vector store
vector_db = FAISS.from_documents(
    documents = chunks, 
    embedding = embed_model)

In [4]:
# Create a retriever
retriever = vector_db.as_retriever()

# Fetch relevant documents
docs = retriever.invoke("Should a faculty member explain their AI policy in a class?")
print(f"number of docs found: {len(docs)}")
print()
print(docs[0].page_content)

number of docs found: 4

* Students only learn from productive effort, and should understand how misuse or overuse of AI threatens that effort
In order to help students understand these things, we recommend that instructors:
* Explain your AI policy in your syllabus, and discuss the reasons you adopted it in class
* Be specific about Dos and Don’ts—“Do acknowledge and describe any AI use”, or “Don’t use any AI for anything other than suggesting topics and sources”
* Explain the limitations of generative AI. 
* Remember that students generally want to learn, and explain to them what they can learn from doing the work, not just the potential punishments for cheating
Additional details can be found in the AI FAQ and in Adapting Assignments to AI.
Academic Integrity and Generative AI


In [5]:
# Create a retriever with k=2
retriever = vector_db.as_retriever(search_kwargs={"k": 2})

# Fetch relevant documents
docs = retriever.invoke("Should a faculty member explain their AI policy in a class?")
print(f"number of docs found: {len(docs)}")

number of docs found: 2


# Generating LLM Predictions Using Relevant Documents

In [6]:
# Build a prompt template
prompt_temp = ChatPromptTemplate.from_template("""Answer the question based only on the following context:
{context}
Question: {question}
""")

# Choose a chatmodel
chatmodel = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Create a chain
chain = prompt_temp | chatmodel

# Fetch relevant documents
docs = retriever.invoke("Should a faculty member explain their AI policy in a class?")

# Invoke the chain to answer the question
response = chain.invoke({
    "question": "Should a faculty member explain their AI policy in a class?", 
    "context": docs})
print(response)

content='Yes, a faculty member should explain their AI policy in a class.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 439, 'total_tokens': 453, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-bf8fb9d3-e550-4231-bd63-5a6728943fca-0' usage_metadata={'input_tokens': 439, 'output_tokens': 14, 'total_tokens': 453, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}


In [9]:
# Incoporating the above pipeline into a function
@chain
def qa(question):
    # Fetch relevant documents 
    docs = retriever.get_relevant_documents(question)
    # Invote the prompt template
    prompt = prompt_temp.invoke({"context": docs, "question": question})
    # Generate a response
    response = chatmodel.invoke(prompt)
    return response

# Run the function
# Note: Given the qa() function is a runnable chain, it should be invoked with qa.invoke()
response = qa.invoke("Should a faculty member explain their AI policy in a class?")
print(response.content)

  docs = retriever.get_relevant_documents(question)


Yes, a faculty member should explain their AI policy in a class according to the provided context.


In [10]:
# We can also return the retrieved documents for further inspection
@chain
def qa(question):
    # Fetch relevant documents 
    docs = retriever.get_relevant_documents(question)
    # Invote the prompt template
    prompt = prompt_temp.invoke({"context": docs, "question": question})
    # Generate a response
    response = chatmodel.invoke(prompt)
    return response, docs
response, docs = qa.invoke("Should a faculty member explain their AI policy in a class?")
print(response.content)
print()
print(len(docs))
print()
print(docs[0].page_content)
print()
print(docs[1].page_content)

Yes, a faculty member should explain their AI policy in a class.

2

* Students only learn from productive effort, and should understand how misuse or overuse of AI threatens that effort
In order to help students understand these things, we recommend that instructors:
* Explain your AI policy in your syllabus, and discuss the reasons you adopted it in class
* Be specific about Dos and Don’ts—“Do acknowledge and describe any AI use”, or “Don’t use any AI for anything other than suggesting topics and sources”
* Explain the limitations of generative AI. 
* Remember that students generally want to learn, and explain to them what they can learn from doing the work, not just the potential punishments for cheating
Additional details can be found in the AI FAQ and in Adapting Assignments to AI.
Academic Integrity and Generative AI

Students and faculty report growing use of generative AI—tools that produce human-like writing (e.g ChatGPT), images (e.g. MidJourney), code (e.g. Microsoft Co-Pilo

# Query Transformation
In a production setting, a user is likely to construct their query in an incomplete, ambiguous, or poorly worded manner that leads to model hallucination. Query transformation is a subset of strategies designed to modify the user’s input to answer the first RAG problem question.

## Rewrite-Retrieve-Read
The Rewrite-Retrieve-Read strategy proposed by a Microsoft Research team simply prompts the LLM to rewrite the user’s query before performing retrieval.

In [11]:
# A demonstration of a poorly written prompt on RAG response
@chain
def qa(question):
    # Fetch relevant documents 
    docs = retriever.get_relevant_documents(question)
    # Invote the prompt template
    prompt = prompt_temp.invoke({"context": docs, "question": question})
    # Generate a response
    response = chatmodel.invoke(prompt)
    return response
question = "I don't know what to say. The weather is perfect for a walk. Should faculty declare their AI policy or not to their students?"
response = qa.invoke(question)
print(response.content)

Faculty should declare their AI policy to their students, as there is currently no default for acceptable vs. unacceptable use of generative AI tools in coursework. Students are using AI without clear directions from their instructors about which uses are acceptable, so it is important for faculty to explain to students what is and is not allowed around AI use in their classes.


In [12]:
# Create a rewrite prompt template
rewrite_prompt_temp = ChatPromptTemplate.from_template("""
Provide a better search query for the vectore database search engine to answer the given question, end the queries with ’**’. Question: {x} Answer:""")

# Define a function to parse the message from the chat model
def parse_rewriter_output(message):
    return message.content.strip('"').strip("**")

# Define the rewriter chain
rewriter = rewrite_prompt_temp | chatmodel | parse_rewriter_output

# Test the rewriter chain
rewriter.invoke(question)

'Should faculty declare their AI policy to students?'

In [13]:
# Redefine the RAG pipeline function to incoporate the above rewrite chain
@chain
def qa_rrr(question):
    # Invote the rewrite prompt template
    rewritten_question = rewriter.invoke(question)
    
    # Fetch relevant documents
    docs = retriever.get_relevant_documents(rewritten_question)
    
    # Assemble the prompt, which now includes the retrived documents and the rewritten question
    prompt = prompt_temp.invoke({"context": docs, "question": rewritten_question})

    # Generate a response
    response = chatmodel.invoke(prompt)
    return response
question = "I don't know what to say. The weather is perfect for a walk. Should faculty declare their AI policy or not to their students?"
response = qa_rrr.invoke(question)
print(response.content)

Yes, faculty should declare their AI policy to students.


# Multi Query Retrieval
A user’s single query can be insufficient to capture the full scope of information required to answer the query comprehensively. The Multi Query Retrieval strategy resolves this problem by instructing an LLM to generate multiple queries based on a user’s initial query, executing a parallel retrieval of each query from the data source, and then inserting the retrieved results as prompt context to generate a final model output.

This strategy is particularly useful for use cases where a single question may rely on multiple perspectives to provide a comprehensive answer.

In [14]:
# Create a multi-perpspective prompt template
perspectives_prompt_temp = ChatPromptTemplate.from_template(
    """You are an AI language model assistant. 
    Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. 
    By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines. Original question: {question}""")
def parse_queries_output(message):
    return message.content.split('\n')

# Build and invoke the query generator chain
query_gen = perspectives_prompt_temp | chatmodel | parse_queries_output
lst_perspectives = query_gen.invoke(question)
print(len(lst_perspectives))
print()
print(lst_perspectives)

5

['1. Is it necessary for faculty to disclose their AI policy to students?', '2. Should faculty communicate their AI policy to students or keep it confidential?', '3. What are the implications of faculty sharing their AI policy with students?', '4. Is transparency about AI policies important for faculty-student relationships?', '5. How does faculty disclosure of their AI policy impact student trust and understanding?']


In [15]:
# Take the list of generated queries, retrieve the most relevant docs for each of them in parallel, 
# and then combine to get the unique union of all the retrieved relevant documents.
def get_unique_union(document_lists):
    # Flatten list of lists, and deduplicate them by including them in a dictionary.
    # Note: a dictionoary can't have duplicate keys, so we can use it to deduplicate.
    deduped_docs = {
        doc.page_content: doc for sublist in document_lists for doc in sublist
    }
    # return a flat list of unique docs
    return list(deduped_docs.values())

# Build the retrieval chain
# Note: Use the retriever.batch() method to retrieve documents in parallel.
retrieval_chain = query_gen | retriever.batch | get_unique_union

In [16]:
# Redefine the RAG pipeline function to incoporate the above multi-query chain
@chain
def multi_query_qa(question):
    # Invote the multi-query prompt template to obtain a list of queries
    lst_perspectives = query_gen.invoke(question)
    
    # Fetch relevant documents
    docs = retrieval_chain.invoke(lst_perspectives)
    
    # Assemble the prompt, which now includes the retrived documents and the rewritten question
    prompt = prompt_temp.invoke({"context": docs, "question": question})

    # Generate a response
    response = chatmodel.invoke(prompt)
    return response
question = "I don't know what to say. The weather is perfect for a walk. Should faculty declare their AI policy or not to their students?"
response = multi_query_qa.invoke(question)
print(response.content)

Faculty should declare their AI policy to their students.


# RAG-Fusion
The RAG-Fusion strategy shares similarities with the Multi Query Retrieval strategy, except that we apply a final reranking step to all the retrieved documents. This reranking step makes use of the Reciprocal Rank Fusion (RRF) algorithm, which involves combining the ranks of different search results to produce a single, unified ranking. By combining ranks from different queries, we pull the most relevant documents to the top of the final list.

In [17]:
# Create a prompt template for RAG-Fusion
prompt_rag_fusion = ChatPromptTemplate.from_template("""
You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):""")
def parse_queries_output(message):
    return message.content.split('\n')
chatmodel = ChatOpenAI(temperature=0)

# Build and invoke the query generator chain
query_gen = prompt_rag_fusion | chatmodel | parse_queries_output
lst_perspectives = query_gen.invoke(question)
print(len(lst_perspectives))
print()
print(lst_perspectives)

4

["1. How to overcome writer's block when you don't know what to say?", '2. Benefits of walking in perfect weather for physical and mental health.', '3. Importance of transparency in faculty declaring their AI policy to students.', '4. Pros and cons of faculty disclosing their AI policy to students.']


The function "reciprocal_rank_fusion()" takes a list of the search results of each query, so a list of lists of documents, where each inner list of documents is sorted by their relevance to that query. The RRF algorithm then calculates a new score for each document based on its ranks (or positions) in the different lists and sorts them to create a final reranked list. After calculating the fused scores, the function sorts the documents in descending order of these scores to get the final reranked list, which is then returned.

In [18]:
# Define the reciprocal rank fusion function.
# Note: Notice the function also takes a k parameter, 
# which determines how much influence documents in each query’s result sets have 
# over the final list of documents. 
# A higher value indicates that lower ranked documents have more influence.
def reciprocal_rank_fusion(results: list[list], k=60):
    """reciprocal rank fusion on multiple lists of ranked documents 
       and an optional parameter k used in the RRF formula
    """
    
    # Initialize a dictionary to hold fused scores for each document
    # Documents will be keyed by their contents to ensure uniqueness
    fused_scores = {}
    documents = {}
    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Use the document contents as the key for uniqueness
            doc_str = doc.page_content
            # If the document hasn't been seen yet,
            # - initialize score to 0
            # - save it for later
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
                documents[doc_str] = doc
            # Update the score of the document using the RRF formula:
            # 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)
    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_doc_strs = sorted(
        fused_scores, key=lambda d: fused_scores[d], reverse=True
    )
    # retrieve the corresponding doc for each doc_str
    return [
        documents[doc_str]
        for doc_str in reranked_doc_strs
    ]

# Build the retrieval chain
retrieval_chain = query_gen | retriever.batch | reciprocal_rank_fusion

In [19]:
# Build a prompt template that take the output from the retrieval chain and the question as input
prompt = ChatPromptTemplate.from_template("""
Answer the following question based on this context:
{context}
Question: {question}
""")

# Build the multi-query chain with RAG-funtion
@chain
def multi_query_qa(input):
    # fetch relevant documents 
    docs = retrieval_chain.invoke(input)
    # format prompt
    formatted = prompt.invoke({"context": docs, "question": input})
    # generate answer
    answer = chatmodel.invoke(formatted)
    return answer

# Invoke the chain
multi_query_qa.invoke(question)

AIMessage(content="Based on the context provided, faculty should declare their AI policy to their students. It is recommended that faculty explain to students what is and is not allowed around AI use in their classes. This includes acknowledging the use of AI tools, being responsible for the content and accuracy of any work submitted, and discussing Dos and Don'ts related to AI use. It is important for faculty to help students understand the limitations of generative AI and the potential consequences of misuse or overuse of AI.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 97, 'prompt_tokens': 688, 'total_tokens': 785, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-af

# Hypothetical Document Embeddings (HyDE)
Hypothetical Document Embeddings (HyDE) is a strategy that involves creating a hypothetical document based on the user’s query, embedding the document, and retrieving relevant documents based on vector similarity. The intuition behind HyDE is that an LLM-generated hypothetical document will be more similar to the most relevant documents than the original query.

In [20]:
# Build a prompt template for the HyDE
prompt_hyde = ChatPromptTemplate.from_template("""
Please write a scientific paper passage to answer the question
Question: {question}
Passage:""")

# Build the chain to generate document from the prompt
generate_doc = prompt_hyde | ChatOpenAI(temperature=0) | StrOutputParser()

# Invote the chain
generate_doc.invoke("Should faculty be required to declare their AI policy for their students?")

"In recent years, the integration of artificial intelligence (AI) technologies in educational settings has become increasingly prevalent. As such, the question of whether faculty should be required to declare their AI policy for their students has garnered significant attention. \n\nOne argument in favor of requiring faculty to declare their AI policy is the importance of transparency and accountability in the use of AI technologies in education. By clearly outlining their AI policy, faculty can ensure that students are aware of how AI is being utilized in their learning environment and can make informed decisions about their participation. This transparency can also help to build trust between faculty and students, as well as promote a culture of openness and communication.\n\nAdditionally, declaring an AI policy can help to mitigate potential ethical concerns surrounding the use of AI in education. Faculty can outline how AI technologies are being used, the data being collected, and 

In [21]:
# Build the retrieval chain
retrieval_chain = generate_doc | retriever

In [22]:
# Build a prompt template that take the output from the retrieval chain and the question as input
prompt = ChatPromptTemplate.from_template("""
Answer the following question based on this context:
{context}
Question: {question}
""")

# Build the HyDe chain
@chain
def hyde_query_qa(input):
    # fetch relevant documents 
    docs = retrieval_chain.invoke(input)
    # format prompt
    formatted = prompt.invoke({"context": docs, "question": input})
    # generate answer
    answer = chatmodel.invoke(formatted)
    return answer

# Invoke the chain
hyde_query_qa.invoke(question)

AIMessage(content="Based on the context provided, it is recommended that faculty declare their AI policy to their students. This is important in order to help students understand the dos and don'ts of AI use, the limitations of generative AI, and to ensure academic integrity. By explaining the AI policy in the syllabus and discussing it in class, faculty can provide clear directions to students on what is and is not allowed in terms of AI use in coursework.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 88, 'prompt_tokens': 455, 'total_tokens': 543, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-0d8fba3f-ed53-48af-86bb-9008b42d2134-0', usage_metadata={'input_tokens': 4

# Query Routing
Although using a single Vector Store is useful, the required data may live in a variety of data sources, including relational databases or other Vector Stores.

For example, you may have two Vector Stores: one for LangChain Python documentation and another for LangChain JS documentation. Given a user’s question, we would like to route the query to the appropriate inferred data source to retrieve relevant docs. Query routing is a strategy used to forward a user’s query to the relevant data source.

## Logical Routing
In logical routing, we give the LLM knowledge of the various data sources at our disposal and then let the LLM reason which data source to apply based on the user’s query.

In [23]:
# Note: a RouteQuery model that validates input for a field called datasource, 
# ensuring it is either "python_docs" or "js_docs", 
# with the purpose of routing a user’s question to the appropriate documentation source.
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""
    # Note: (1) datasource: This attribute uses Literal, meaning it can only take specific values, 
    # "python_docs" or "js_docs". This restricts datasource to these two values, 
    # ensuring that any instance of RouteQuery must have datasource set to one of them.
    # (2) Field: The Field function from pydantic adds metadata to the field. 
    # Here, ... (Ellipsis) is used to indicate that datasource is a required field (without default value), 
    # and description provides additional information about the field’s purpose.
    datasource: Literal["python_docs", "js_docs"] = Field(
        ...,
        description="Given a user question choose which datasource would be most relevant for answering their question",
    )

In [24]:
# LLM with function call
chatmodel = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
structured_llm = chatmodel.with_structured_output(RouteQuery)

# Prompt 
system = """You are an expert at routing a user question to the appropriate data source.
Based on the programming language the question is referring to, route it to the relevant data source."""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)
# Define router 
router = prompt | structured_llm

In [25]:
question = '''
Why doesn't the following code work:
from PIL import Image
import numpy as np

def process_image(image_path, target_size=(224, 224)):
    """
    Processes an image by resizing and normalizing it.
    
    Args:
        image_path (str): Path to the input image file.
        target_size (tuple): Desired output size (width, height).
    
    Returns:
        np.ndarray: The processed image as a normalized numpy array.
    """
    # Load the image
    img = Image.open(image_path)
    
    # Resize the image to the target size
    img = img.resize(target_size)
    
    # Convert the image to a numpy array
    img_array = np.array(img)
    
    # Normalize the image data to a range of 0 to 1
    img_array = img_array / 255.0
    
    # If the image is grayscale, expand dimensions to (height, width, 1)
    if len(img_array.shape) == 2:
        img_array = np.expand_dims(img_array, axis=-1)
    
    return img_array
'''
result = router.invoke({"question": question})
print(result.datasource)
print()
print(result)

python_docs

datasource='python_docs'


In [26]:
# Define a functioin to choose the route
def choose_route(result):
    if "python_docs" in result.datasource.lower():
        ### Logic here 
        return "chain for python_docs"
    else:
        ### Logic here 
        return "chain for js_docs"

# Then we can use the choose_route function to choose the route
# full_chain = router | RunnableLambda(choose_route)

# Semantic Routing
Unlike logical routing, semantic routing involves embedding various prompts that represent various data sources alongside the user’s query, and then performing vector similarity search to retrieve the most similar prompt.

In [27]:
# Two prompts
climate_template = """You are a meteorologist. You are great at answering questions about weather and climate.
Here is a question:
{query}"""
fire_template = """You are a firefighter. You are great at answering questions about fire and emergency situations.
Here is a question:
{query}"""

# Embed prompts
embeddings = OpenAIEmbeddings()
prompt_templates = [climate_template, fire_template]
prompt_embeddings = embeddings.embed_documents(prompt_templates)

In [29]:
# Define the function to route question to prompt that repesents the underling database
@chain
def prompt_router(query):
    # Embed question
    query_embedding = embeddings.embed_query(query)
    # Compute similarity
    similarity = cosine_similarity([query_embedding], prompt_embeddings)[0]
    # Pick the prompt most similar to the input question
    most_similar = prompt_templates[similarity.argmax()]
    return PromptTemplate.from_template(most_similar)

# Test the above function
query = "Would calling a fire department be a good idea?"
print(prompt_router.invoke(query))
print()

# Build the semantic router
semantic_router = (
    prompt_router
    | ChatOpenAI()
    | StrOutputParser()
)

# Invote the router
response = semantic_router.invoke(query)
print(response)

text='You are a firefighter. You are great at answering questions about fire and emergency situations.\nHere is a question:\nWould calling a fire department be a good idea?'

Yes, calling the fire department in an emergency situation is always a good idea. The fire department has trained professionals who can respond quickly and effectively to help mitigate the situation and ensure the safety of everyone involved. It is important to call 911 as soon as possible in the event of a fire or any other emergency.


# Query Construction
Retrieval-augmented generation is an effective strategy to embed and retrieve relevant unstructured data from a Vector Store based on a query. But most data available for use in production apps is structured and typically stored in relational databases. In addition, unstructured data embedded in a Vector Store also contains structured metadata that posesses important information.

Query construction is the process of transforming a natural language query into the query language of the database or data source you are interacting with.

For example, consider the query what are movies about aliens in the year 1980? This question contains an unstructured topic that can be retrieved via embeddings (aliens), but it also contains potential structured components (“year == 1980”).

## Text-to-metadata-filter using self-querying retriever
A self-querying retriever is one that has the ability to query itself. Specifically, given any natural language query, the retriever uses a query-constructing LLM chain to write a structured query and then applies that structured query to its underlying VectorStore. This allows the retriever to not only use the user-input query for semantic similarity comparison with the contents of stored documents but to also extract filters from the user query on the metadata of stored documents and to execute those filters.

See here fore more details: https://python.langchain.com/docs/how_to/self_query/

In [30]:
# Create a vectorstore from a list of Documents
# Note: We have to use Chroma here because FAISS doesn't support self-querying retriever
docs = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated"},
    ),
    Document(
        page_content="Three men walk into the Zone, three men walk out of the Zone",
        metadata={
            "year": 1979,
            "director": "Andrei Tarkovsky",
            "genre": "thriller",
            "rating": 9.9,
        },
    ),
]

# Create a vectorstore from the documents
vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings())

In [31]:
# Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields 
# # that our documents support and a short description of the document contents.
metadata_field_info = [
    AttributeInfo(
        name="genre",
        description="The genre of the movie. One of ['science fiction', 'comedy', 'drama', 'thriller', 'romance', 'action', 'animated']",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="The year the movie was released",
        type="integer",
    ),
    AttributeInfo(
        name="director",
        description="The name of the movie director",
        type="string",
    ),
    AttributeInfo(
        name="rating", description="A 1-10 rating for the movie", type="float"
    ),
]
document_content_description = "Brief summary of a movie"

# Define the llm
llm = ChatOpenAI(temperature=0)

# Build the self-query retriever
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
)

In [32]:
# Test it out
# Note: This example only specifies a filter.
retriever.invoke("I want to watch a movie rated higher than 8.5")

[Document(metadata={'director': 'Andrei Tarkovsky', 'genre': 'thriller', 'rating': 9.9, 'year': 1979}, page_content='Three men walk into the Zone, three men walk out of the Zone'),
 Document(metadata={'director': 'Satoshi Kon', 'rating': 8.6, 'year': 2006}, page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea')]

In [33]:
# This example specifies a query and a filter
retriever.invoke("Has Greta Gerwig directed any movies about women")

[Document(metadata={'director': 'Greta Gerwig', 'rating': 8.3, 'year': 2019}, page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them')]

In [34]:
# This example specifies a composite filter
retriever.invoke("What's a highly rated (above 8.5) science fiction film?")

[Document(metadata={'director': 'Satoshi Kon', 'rating': 8.6, 'year': 2006}, page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea'),
 Document(metadata={'director': 'Andrei Tarkovsky', 'genre': 'thriller', 'rating': 9.9, 'year': 1979}, page_content='Three men walk into the Zone, three men walk out of the Zone')]

In [35]:
# This example specifies a query and composite filter
retriever.invoke(
    "What's a movie after 1990 but before 2005 that's all about toys, and preferably is animated"
)

[Document(metadata={'genre': 'animated', 'year': 1995}, page_content='Toys come alive and have a blast doing so')]

# SQL
Create a chain to translate a question to an SQL query and then execute the query to get the answer

In [36]:
# Create a function to get the engine for the Chinook database
# Code adopted from here: https://python.langchain.com/docs/integrations/tools/sql_database/
def get_engine_for_chinook_db():
    """Pull sql file, populate in-memory database, and create engine."""
    url = "https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sql"
    response = requests.get(url)
    sql_script = response.text

    connection = sqlite3.connect(":memory:", check_same_thread=False)
    connection.executescript(sql_script)
    return create_engine(
        "sqlite://",
        creator=lambda: connection,
        poolclass=StaticPool,
        connect_args={"check_same_thread": False},
    )

# Build the engine
engine = get_engine_for_chinook_db()

# Build the database
db = SQLDatabase(engine)

# Print the table names in the database
print(db.get_usable_table_names())

['Album', 'Artist', 'Customer', 'Employee', 'Genre', 'Invoice', 'InvoiceLine', 'MediaType', 'Playlist', 'PlaylistTrack', 'Track']


In [37]:
# Define the LLM
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Create the sql query chain to translate the question to SQL query
chain = create_sql_query_chain(llm, db)

# Invoke the chain with the question
response = chain.invoke({"question": "How many employees are there?"})
print(response)

SELECT COUNT("EmployeeId") AS "TotalEmployees" FROM "Employee"


In [38]:
# Create the chain to translate the question to SQL query and execute the query
execute_query = QuerySQLDataBaseTool(db=db)
write_query = create_sql_query_chain(llm, db)
chain = write_query | execute_query

# Invoke the chain
chain.invoke({"question": "How many employees are there?"})

'[(8,)]'