# Merger Retriver with filtering and contexual compression

In [None]:
from langchain.document_loaders import PyPDFLoader

# Load the documents
loader_harrypotter  = PyPDFLoader("harrypotter_1.pdf")
documnet_harrypotter = loader_harrypotter.load()

loader_got = PyPDFLoader("got.pdf")
documnet_got = loader_got.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=100)
text_harrypotter = text_splitter.split_documents(documnet_harrypotter)
text_got = text_splitter.split_documents(documnet_got)
print(len(text_harrypotter))
print(len(text_got))

1155
122


In [15]:
import os 
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
embeddings = OpenAIEmbeddings()


In [16]:
from langchain_huggingface import HuggingFaceEmbeddings

# hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
hf_bge_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")


In [None]:
from langchain.vectorstores import Chroma
import chromadb

os.getcwd()
CURRENT_DIR = os.path.dirname(os.path.abspath("."))
DB_DIR = os.path.join(CURRENT_DIR, "db")


In [22]:
DB_DIR

'c:\\Users\\aayus\\OneDrive\\Agentic AI\\my_practice_code\\Agentic_AI_hands_on\\db'

In [None]:
# Create the Chroma client settings to save in local memory 
client_settings = chromadb.config.Settings(
    is_persistent=True,
    persist_directory=DB_DIR,
    anonymized_telemetry=False,
)

In [None]:
# Create the vector store for harrypotter
harrypotter_vectorstore = Chroma.from_documents(text_harrypotter,
                                       hf_bge_embeddings,
                                       client_settings=client_settings,
                                       collection_name="harrypotter",
                                       collection_metadata={"hnsw":"cosine"},
                                       persist_directory="/store/harrypotter")

In [None]:
# Create the vector store for got
got_vectorstore = Chroma.from_documents(text_got,
                                       hf_bge_embeddings,
                                       client_settings=client_settings,
                                       collection_name="got",
                                       collection_metadata={"hnsw":"cosine"},
                                       persist_directory="/store/got")

In [None]:
# Create retrievers for both vector stores
retriever_harrypotter = harrypotter_vectorstore.as_retriever(search_type="mmr",search_kwargs={"k": 5, "include_metadata": True})
retriever_got = got_vectorstore.as_retriever(search_type="mmr",search_kwargs={"k": 5, "include_metadata": True})

# MergerRetrieval 
https://python.langchain.com/docs/integrations/retrievers/merger_retriever/

In [28]:
from langchain.retrievers.merger_retriever import MergerRetriever

lotr = MergerRetriever(retrievers=[retriever_harrypotter, retriever_got])

In [29]:
for chunks in lotr.get_relevant_documents("Who was the jon snow?"):
    print(chunks.page_content)

  for chunks in lotr.get_relevant_documents("Who was the jon snow?"):


yeah, Professor Snape."
"Snape?"
"Yeah -- yer not still on abou' that, are yeh? Look, Snape helped
protect the Stone, he's not about ter steal it."
Harry knew Ron and Hermione were thinking the same as he was. If Snape
had been in on protecting the Stone, it must have been easy to find out
how the other teachers had guarded it. He probably knew everything --
except, it seemed, Quirrell's spell and how to get past Fluffy.
"You're the only one who knows how to get past Fluffy. aren't you,
Aria Stark. 
 
Eddard Stark (also known as Ned Stark), the Lord of Winterfell, is a man of his word, a 
man of honor, loyalty, and justice. We see that the principle of honor is held high by 
Eddard Stark, “Do you think my life is some precious thing to me that I would trade my 
honor for a few more years…of what?” and “It was the only honorable thing to do.”  
  
Eddard Stark’s obedience to the established principles is reminiscent of Immanuel
broken at least twice. This man's name was Albus Dumbledore

In [30]:
for chunks in lotr.get_relevant_documents("Who is a harry potter?"):
    print(chunks.page_content)

Potter.
"Look," said Harry, throwing caution to the winds, "Professor -- it's
about the Sorcerer's tone --"
they must become united in myself—in the unity of my answerability” (Bakhtin, 1990, p. 
2). Answerability implies responsibility of the creator for the creation. Only creators are 
responsible for their lives – that is their creation. They must take the blame and answer 
for their life and deeds. Tyrion Lannister – intelligent, witty, and well versed – can be 
perceived as a philosopher who realizes that he is the creator of his life. Even though
he is?"
"Who?"
"Harry Potter!"
Harry heard the little girl's voice.
"Oh, Mom, can I go on the train and see him, Mom, eh please...."
"You've already seen him, Ginny, and the poor boy isn't something you
goggle at in a zoo. Is he really, Fred? How do you know?"
"Asked him. Saw his scar. It's really there - like lightning."
"Poor dear - no wonder he was alone, I wondered. He was ever so polite
when he asked how to get onto the platform."
a

## See this result is too much messy now lets refine it according to the question and overcome the situation of lost in middle

# Now After understanding step by step it create a pipeline for LLM

In [None]:
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain.document_transformers import LongContextReorder
from re import search

# Create a filter to remove redundant documents based on embeddings
filter = EmbeddingsRedundantFilter(embeddings=hf_bge_embeddings)
reordering = LongContextReorder()
pipeline = DocumentCompressorPipeline(transformers=[filter, reordering])

compression_retriever_reordered = ContextualCompressionRetriever(
    base_compressor=pipeline, base_retriever=lotr,search_kwargs={"k": 3, "include_metadata": True}
)


In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

llm = ChatOpenAI(model="gpt-4o")

# Create a RetrievalQA chain 
qa = RetrievalQA.from_chain_type(
      llm=llm,
      chain_type="stuff",
      retriever = compression_retriever_reordered,
      return_source_documents = True
)

In [None]:
# test the query
query ="who is jon snow?"
results = qa(query)
print(results['result'])
#
print(results["source_documents"])

  results = qa(query)


Jon Snow is a character from the "Game of Thrones" series, adapted from George R.R. Martin's "A Song of Ice and Fire" novels. He is the illegitimate son, or "bastard," of Eddard Stark, the Lord of Winterfell. Jon is raised at Winterfell alongside the Stark children but often feels like an outsider due to his status. He joins the Night's Watch, a sworn brotherhood tasked with guarding the realm from threats beyond the Wall. Jon is known for his sense of honor and leadership qualities, and he plays a significant role throughout the series. Later, it is revealed that Jon has a significant heritage that ties him closely to the larger political and fantastical conflicts within the series.
[_DocumentWithState(metadata={'creationdate': '2018-10-06T19:23:28+10:00', 'source': 'got.pdf', 'creator': 'Microsoft® Word for Office 365', 'page_label': '6', 'total_pages': 14, 'page': 5, 'moddate': '2018-10-06T19:23:28+10:00', 'producer': 'Microsoft® Word for Office 365'}, page_content='Aria Stark. \n \

In [34]:
results = qa("who is a harry potter?")
print(results['result'])
#
print(results["source_documents"])
#
for source in  results["source_documents"]:
    print(source.metadata)

Harry Potter is the main protagonist in the "Harry Potter" book series written by J.K. Rowling. He is a young wizard known for his distinctive lightning-shaped scar on his forehead. The series follows his journey at the Hogwarts School of Witchcraft and Wizardry, where he learns about magic and battles against the dark wizard Lord Voldemort. Harry is famous within the wizarding world as the "Boy Who Lived" because he survived an attack by Voldemort as an infant, which not only marked him with the scar but also led to the initial downfall of Voldemort.
[_DocumentWithState(metadata={'page_label': '8', 'total_pages': 14, 'page': 7, 'source': 'got.pdf', 'producer': 'Microsoft® Word for Office 365', 'creator': 'Microsoft® Word for Office 365', 'creationdate': '2018-10-06T19:23:28+10:00', 'moddate': '2018-10-06T19:23:28+10:00'}, page_content='they must become united in myself—in the unity of my answerability” (Bakhtin, 1990, p. \n2). Answerability implies responsibility of the creator for th