<a href="https://colab.research.google.com/github/DeependraChaddha/RAG_Projects/blob/main/RAG_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

First, an environment will be set up, then different methods of query translation will be demonstrated. The methods to be demonstrated are Multi-Query, RAG-Fusion, Decomposition, Step Back and HyDE.

## Setting up Environment

Installing Packages

In [None]:
# Install the LangChain packages (core, community, OpenAI, hub), tiktoken and ChromaDB used below.
!pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

Setting up Langsmith

In [None]:
import getpass
import os

# Turn on LangSmith tracing and point the client at the hosted endpoint.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Ask for the key at run time instead of hard-coding it, so the secret is never
# saved inside the notebook. A key that is already exported is reused as-is.
if not os.environ.get('LANGCHAIN_API_KEY'):
    os.environ['LANGCHAIN_API_KEY'] = getpass.getpass('LangSmith API key: ')

OpenAI API key

In [None]:
import getpass
import os

# Ask for the key at run time instead of hard-coding it, so the secret is never
# saved inside the notebook. A key that is already exported is reused as-is.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

## Multi-Query

Indexing

In [None]:
##### INDEXING #####
import bs4
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# LOAD: fetch the blog post, parsing only the elements whose CSS class is
# one of post-content / post-title / post-header.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    },
)
blog_docs = loader.load()

# SPLIT: cut the loaded documents into chunks of size 300 with an overlap of 50,
# using the tiktoken-based length function of the splitter.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)
splits = text_splitter.split_documents(blog_docs)

# INDEX: embed the chunks with OpenAIEmbeddings, store them in Chroma and
# expose the store as a retriever for the chains below.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)
retriever = vectorstore.as_retriever()

Prompt

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# Multi-Query: different perspectives.
# The LLM rewrites the user question into several alternative questions; each
# one is later sent to the retriever on its own, which widens the search
# compared with a single similarity lookup.
template="""You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

# prompt -> chat model -> plain string -> list of queries (one per line).
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    # Blank lines in the model output would otherwise become empty queries.
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

In [None]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]) -> list:
  """Return the de-duplicated union of several lists of retrieved documents.

  Args:
    documents: One list of documents per generated query.

  Returns:
    A flat list in which every distinct document appears once, in the order
    it was first seen.
  """
  # Serialize each document to a string so that equal documents compare equal
  # and can be used as dictionary keys.
  serialized_docs = [dumps(doc) for sublist in documents for doc in sublist]
  # dict.fromkeys removes duplicates while keeping first-seen order, so the
  # result is deterministic (a plain set would not guarantee any order).
  unique_serialized = dict.fromkeys(serialized_docs)
  return [loads(doc) for doc in unique_serialized]
# Retrieve: build the multi-query retrieval pipeline and run it once.
question = "What is Task Decompositon for LLM agents?"
retrieval_chain = (
    generate_queries      # question -> list of alternative queries
    | retriever.map()     # apply the retriever to every query in the list
    | get_unique_union    # merge the per-query results, dropping duplicates
)
docs = retrieval_chain.invoke({"question": question})
len(docs)


In [None]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# RAG: answer the question using the documents returned by the multi-query
# retrieval chain as context.
template= """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

# The input dict is fanned out: "context" is produced by running the retrieval
# chain on the input, "question" is copied through from the input unchanged.
final_rag_chain = (
    {"context": retrieval_chain,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)
final_rag_chain.invoke({"question": question})

## RAG-Fusion

This is mostly similar to Multi-Query, except that after retrieving the documents for every generated query, each document is given a fused score and the documents are re-ranked by that score.

Prompt

In [None]:
from langchain.prompts import ChatPromptTemplate

# RAG-Fusion: ask the model for several search queries related to the input.
template="""You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template=template)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# prompt -> chat model -> plain string -> list of queries (one per line).
generate_queries = (
    prompt_rag_fusion
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    # Blank lines in the model output would otherwise become empty queries.
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

In [None]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
  """Fuse several ranked document lists with Reciprocal Rank Fusion (RRF).

  Every occurrence of a document at position ``rank`` (0-based) in one of the
  lists adds ``1 / (rank + k)`` to that document's score, so documents that
  appear near the top of many lists end up with the highest fused score.

  Args:
    results: One ranked list of documents per query, best match first.
    k: Smoothing constant of the RRF formula. With the default of 60 the
      denominator is always positive.

  Returns:
    A list of ``(document, fused_score)`` tuples sorted by score, highest
    first. Documents with equal scores keep their first-seen order.
  """
  # Scores are keyed by the serialized document so that the same document
  # retrieved for different queries accumulates into a single entry.
  fused_scores = {}
  for docs in results:
    for rank, doc in enumerate(docs):
      doc_str = dumps(doc)
      fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

  # Highest fused score first; sorted() is stable, so ties keep insertion order.
  ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
  return [(loads(doc_str), score) for doc_str, score in ranked]

# Build the RAG-Fusion retrieval pipeline and run it once.
retrieval_chain_rag_fusion = (
    generate_queries          # question -> list of related search queries
    | retriever.map()         # apply the retriever to every query in the list
    | reciprocal_rank_fusion  # fuse the per-query rankings into one scored list
)
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

In [None]:
from langchain_core.runnables import RunnablePassthrough

# RAG: answer the question using the fused, re-ranked retrieval results
# as context.
template= """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# The input dict is fanned out: "context" is produced by running the
# RAG-Fusion retrieval chain on the input, "question" is copied through.
final_rag_chain = (
    {"context": retrieval_chain_rag_fusion,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})