In [1]:
print("Kernel Working Fine!")

Kernel Working Fine!


## Loading the data

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
loader_HP = PyPDFLoader("Data/harrypotter.pdf")

In [4]:
doc_HP = loader_HP.load()

In [5]:
print(len(doc_HP))

3623


In [6]:
loader_got = PyPDFLoader("Data/got.pdf")

In [7]:
doc_got = loader_got.load()

In [8]:
print(len(doc_got))

755


## Text Splitting / Chunking

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [11]:
text_HP = text_splitter.split_documents(doc_HP)

In [13]:
text_got = text_splitter.split_documents(doc_got)

In [16]:
print(len(text_HP))

16976


In [17]:
print(len(text_got))

4434


## Loading the data Embedding Model

The embedding model is used to convert the chunks which we just created into vectors and then we will store these vector embeddings in a vector database.

In [18]:
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings, HuggingFaceBgeEmbeddings

In [19]:
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [20]:
hf_bge_embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en")

  hf_bge_embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en")


In [21]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [22]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [24]:
openai_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

## Ingesting the Data in ChromaDB

In [25]:
from langchain.vectorstores import Chroma
import chromadb

In [27]:
import os
os.getcwd()

'/Users/atharvabot7/Downloads/Reranking'

In [28]:
CURRENT_DIR = os.path.dirname(os.path.abspath("."))

In [29]:
CURRENT_DIR

'/Users/atharvabot7/Downloads'

In [31]:
DB_DIR = os.path.join(CURRENT_DIR, "Database")

In [37]:
client_settings = chromadb.config.Settings(
    is_persistent=True,
    persist_directory=DB_DIR,
    anonymized_telemetry=False
)

In [39]:
hp_vectorstore = Chroma.from_documents(
    text_HP,
    hf_bge_embeddings,
    client_settings=client_settings,
    collection_name="harry_potter",
    collection_metadata={"description": "Harry Potter book collection", "hnsw" : "cosine"},
    persist_directory="Store/Harry_Potter"
)

In [40]:
got_vectorstore = Chroma.from_documents(
    text_got,
    hf_bge_embeddings,
    client_settings=client_settings,
    collection_name="game_of_thrones",
    collection_metadata={"description": "Game of Thrones book collection", "hnsw" : "cosine"},
    persist_directory="Store/Game_Of_Thrones"
)

## Creating a Retriever

In [42]:
retriever_hp = hp_vectorstore.as_retriever(search_type = "mmr", search_kwargs={"k": 5, "include_metadata": True}) 

In [43]:
retriever_got = got_vectorstore.as_retriever(search_type = "mmr", search_kwargs={"k": 5, "include_metadata": True}) 

## Merge the Retrievers

This is also caled Lord of the Retriever (LOTR). 

In [44]:
from langchain.retrievers.merger_retriever import MergerRetriever

In [45]:
lotr = MergerRetriever(retrievers=[retriever_hp, retriever_got])

In [46]:
for chunks in lotr.get_relevant_documents("Who is sister of Sansa Stark?"):
    print(chunks.page_content)
    print(chunks.metadata)
    print("\n")

sister.	For,	though	her	first	jailer	had	died,	there	was	no	change	in	the
pitiful	condition	of	Ariana	Dumbledore.	Her	very	existence	continued	to
be	known	only	to	those	few	outsiders	who,	like	“Dogbreath”	Doge,
could	be	counted	upon	to	believe	in	the	story	of	her	“ill	health.”
Another	 such	 easily	 satisfied	 friend	 of	 the	 family	 was	 Bathilda
Bagshot,	the	celebrated	magical	historian	who	has	lived	in	Godric’s
Hollow	for	many	years.	Kendra,	of	course,	had	rebuffed	Bathilda	when
{'title': 'Harry Potter: The Complete Collection', 'creator': 'calibre 3.27.1 [https://calibre-ebook.com]', 'creationdate': '2019-02-21T16:04:23+00:00', 'producer': 'calibre 3.27.1 [https://calibre-ebook.com]', 'author': 'Rowling, J.K.', 'source': 'Data/harrypotter.pdf', 'page_label': '3277', 'page': 3276, 'total_pages': 3623}


woman, and Sansa . . . Sansa is your sister. You may be as different as the sun and the 
moon, but the same blood flows through both your hearts. You need her, as she needs 
you . .

  for chunks in lotr.get_relevant_documents("Who is sister of Sansa Stark?"):


In [47]:
for chunks in lotr.get_relevant_documents("What is the name of main charater of Harry Potter?"):
    print(chunks.page_content)
    print(chunks.metadata)
    print("\n")

James,	
Sirius!”
“I’m	perfectly	clear	who	he	is,	thanks,	Molly,”	said	Sirius	coldly.
“I’m	not	sure	you	are!”	said	Mrs.	Weasley.	“Sometimes,	the	way	you	talk
about	him,	it’s	as	though	you	think	you’ve	got	your	best	friend	back!”
“What’s	wrong	with	that?”	said	Harry.
“What’s	wrong,	Harry,	is	that	you	are	
not
	your	father,	however	much	you
might	look	like	him!”	said	Mrs.	Weasley,	her	eyes	still	boring	into	Sirius.
“You	are	still	at	school	and	adults	responsible	for	you	should	not	forget	it!”
{'page': 1653, 'source': 'Data/harrypotter.pdf', 'creator': 'calibre 3.27.1 [https://calibre-ebook.com]', 'creationdate': '2019-02-21T16:04:23+00:00', 'author': 'Rowling, J.K.', 'producer': 'calibre 3.27.1 [https://calibre-ebook.com]', 'title': 'Harry Potter: The Complete Collection', 'page_label': '1654', 'total_pages': 3623}


—LORD PETYR BAELISH, called LITTLEFINGER, master of coin,
—LORD STANNIS BARATHEON, master of ships,
—LORD RENLY BARATHEON, master of laws,
—SER BARRISTAN SELMY, Lord Commande

This result is very messy and huge, it can't be fed to a LLM so we have to refine it according to the question asked to overcome the scenario of Lost in the Middle. 