# **RAG-Fusion Retriever**

## Install Libraries

In [None]:
! pip install sentence-transformers
! pip install --q unstructured langchain
! pip install --q "unstructured[all-docs]"

In [None]:
! pip install langchain_community fastembed chromadb ollama

## Constants

In [32]:
# Define the directory where your PDFs are stored
pdf_directory = "C:/Users/ili/Downloads/test_rag"
save_dir = pdf_directory

## **1. Extract Texts from PDFs**
use **PyPDFLoader** from LangChain_Community to extract textual data <br>
from **Multipple PDFs**

In [None]:
# general
import os
import datetime

# Lancgain
from langchain_community.document_loaders import PyPDFLoader

# Get a list of all PDF files in the directory
pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]

# Initialize lists to hold pages from Nvidia and Tesla PDFs separately
nvidia_pages = []


# Iterate through each PDF file and load it
for pdf_file in pdf_files:
    file_path = os.path.join(pdf_directory, pdf_file)
    print(f"Processing file: {file_path}\n")

    # Load the PDF and split it into pages
    loader = PyPDFLoader(file_path=file_path)
    pages = loader.load()


    nvidia_pages.extend(pages)



In [None]:
# print out the first page of the first document for each category as an example
if nvidia_pages:
    print("=========================================")
    print("First page of the first Nvidia document:")
    print("=========================================\n")
    print(nvidia_pages[0].page_content)
else:
    print("No Nvidia pages found in the PDFs.")

## **2. Split Text**
We'll use RecursiveCharacterTextSplitter to break down the large text bodies from the PDFs into manageable chunks.

### 2.1 Text Splitter

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)

# Split text into chunks for Nvidia pages
nvidia_text_chunks = []
for page in nvidia_pages:
    chunks = text_splitter.split_text(page.page_content)
    nvidia_text_chunks.extend(chunks)



### 2.2 Add Metadata

In [6]:
# Example metadata management (customize as needed)
def add_metadata(chunks, doc_title):
    metadata_chunks = []
    for chunk in chunks:
        metadata = {
            "title": doc_title,
            "author": "company",  # Update based on document data
            "date": str(datetime.date.today())
        }
        metadata_chunks.append({"text": chunk, "metadata": metadata})
    return metadata_chunks

# Add metadata to Nvidia chunks
nvidia_chunks_with_metadata = add_metadata(nvidia_text_chunks, "NVIDIA Financial Report")



## **3. Create Embedding from text chunks**

In [None]:
! ollama pull nomic-embed-text:v1.5

In [None]:
! ollama list

In [8]:
import ollama

# Function to generate embeddings for text chunks
def generate_embeddings(text_chunks, model_name='nomic-embed-text:v1.5'):
    embeddings = []
    for chunk in text_chunks:
        # Generate the embedding for each chunk
        embedding = ollama.embeddings(model=model_name, prompt=chunk)
        embeddings.append(embedding)
    return embeddings

## Example

In [None]:
# Example: Embed Nvidia text chunks
nvidia_texts = [chunk["text"] for chunk in nvidia_chunks_with_metadata]
nvidia_embeddings = generate_embeddings(nvidia_texts)

nvidia_embeddings

## **4. Store and Use Embeddings in Chroma DB**
After generating the embeddings, you can store them in Chroma DB for efficient retrieval

### **CHROMADB**

In [None]:
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_community.embeddings import OllamaEmbeddings

# Wrap Nvidia texts with their respective metadata into Document objects
nvidia_documents = [Document(page_content=chunk['text'], metadata=chunk['metadata']) for chunk in nvidia_chunks_with_metadata]


# Add Nvidia embeddings to the database
nvidia_vector_db = Chroma.from_documents(documents=nvidia_documents,
                      embedding=OllamaEmbeddings(model="nomic-embed-text:v1.5",show_progress=False),
                      collection_name="nvidia-local-rag")

## **5. Query Processing RAG-Fusion Retriever:**

Implement a RAG-Fusion retriever using Chroma DB. Fetch the most relevant chunks from the database based on user queries.

In [11]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever


In [None]:
# LLM from Ollama
local_model = "llama3:latest"
llm = ChatOllama(model=local_model)

In [13]:
 # RAG-Fusion: Related
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
    Generate multiple search queries related to: {question} \n
    Output (4 queries):"""
)

## RAG prompt

In [14]:
retriever = MultiQueryRetriever.from_llm(
                                          nvidia_vector_db.as_retriever(),
                                          ChatOllama(model=local_model),
                                          prompt=QUERY_PROMPT
)

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

## re-rank search results

In [None]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

retrieval_chain_rag_fusion = retriever | reciprocal_rank_fusion


In [None]:
questions = '''What are the main revenue drivers for Nvidia this fiscal year?'''

docs = retrieval_chain_rag_fusion.invoke({"question": questions},)
len(docs)

In [19]:
final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": RunnablePassthrough()} 
    | prompt
    | llm
    | StrOutputParser()
)



In [None]:
from IPython.display import Markdown
questions = '''What are the main revenue drivers for Nvidia this fiscal year?'''
display(Markdown(final_rag_chain.invoke(questions)))

In [None]:
questions = '''Can you some financial advise on Nvidia Stock to the future? should people consider buying it?'''
display(Markdown(final_rag_chain.invoke(questions)))

#  Chatting with Local RAG - Hugging Face Embedding + Llama 3 -> Improve Speed

In [None]:
from langchain_community.llms.ollama import Ollama


local_model = "llama3:latest"
cached_llm = Ollama(model=local_model)

### **CHROMADB**

In [None]:
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

# Load a smaller Hugging Face embedding model
#hf_embedding_model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

# Wrap the Hugging Face model for use with LangChain
#hf_embeddings = HuggingFaceEmbeddings(model=SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2'), show_progress=True)

# Wrap Nvidia texts with their respective metadata into Document objects
nvidia_documents = [Document(page_content=chunk['text'], metadata=chunk['metadata']) for chunk in nvidia_chunks_with_metadata]

model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Add Nvidia embeddings to the database using the smaller Hugging Face model
nvidia_vector_db_hf = Chroma.from_documents(documents=nvidia_documents,
                      embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
                      collection_name="nvidia-local-rag-384")


## RAG prompt

In [25]:
retriever_hf = MultiQueryRetriever.from_llm(
                                          nvidia_vector_db_hf.as_retriever(),
                                          ChatOllama(model=local_model),
                                          prompt=QUERY_PROMPT
)

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt_hf = ChatPromptTemplate.from_template(template)

In [26]:
retrieval_chain_rag_fusion_hf = retriever_hf | reciprocal_rank_fusion

In [27]:
questions = '''What are the main revenue drivers for Nvidia this fiscal year?'''

docs_hf = retrieval_chain_rag_fusion_hf.invoke({"question": questions},)
len(docs_hf)

13

In [30]:
final_rag_chain_hf = (
    {"context": retrieval_chain_rag_fusion_hf, 
     "question": RunnablePassthrough()} 
    | prompt_hf
    | cached_llm
    | StrOutputParser()
)



In [None]:
from IPython.display import Markdown
questions = '''What are the main revenue drivers for Nvidia this fiscal year?'''
display(Markdown(final_rag_chain_hf.invoke(questions)))