In [1]:
import os

In [4]:
import os
import chromadb
from langchain_community.vectorstores.chroma import Chroma
from langchain_community.document_transformers.long_context_reorder import (
    LongContextReorder
)
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
## Configure and load the BGE embedding model
# The settings live in named variables so they can be inspected or tweaked
# before the model object is built.
model_name = "BAAI/bge-large-en"
model_kwargs = dict(device="cpu")                 # device handed to the underlying model
encode_kwargs = dict(normalize_embeddings=False)  # vectors are not normalized at encode time

# `hf` is the embedding function shared by both vector stores below.
hf = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)
print("Embedding Model Loaded..........")



Embedding Model Loaded..........


## Data Preprocessing

In [6]:
# Read the UN SDG PDF and cut it into overlapping chunks ready for embedding.
# NOTE: the PDF path is absolute and specific to the author's machine.
text_splitter_un_sdg = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
loader_un_sdg = PyPDFLoader(
    r"C:\Users\ADMIN\Desktop\LANGCHAIN\RAG_PROJECTS\Merger_Retriever_LOTR\Data\UN SDG.pdf"
)
documents_un_sdg = loader_un_sdg.load()
texts_un_sdg = text_splitter_un_sdg.split_documents(documents_un_sdg)

In [11]:
# Peek at the second chunk (index 1) to sanity-check the loader/splitter output.
texts_un_sdg[1]

Document(page_content='UNITED NA TIONSTRANSFORMING OUR WORLD:\nTHE 2030 AGENDA FOR \nSUST AINABLE DEVELOPMENT\nsustainabledevelopment.un.orgA/RES/70/1', metadata={'source': 'C:\\Users\\ADMIN\\Desktop\\LANGCHAIN\\RAG_PROJECTS\\Merger_Retriever_LOTR\\Data\\UN SDG.pdf', 'page': 1})

In [12]:
# Read the Paris Agreement PDF and chunk it with the same size/overlap
# settings used for the UN SDG document.
# NOTE: the PDF path is absolute and specific to the author's machine.
text_splitter_paris_agreement = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
loader_paris_agreement = PyPDFLoader(
    r"C:\Users\ADMIN\Desktop\LANGCHAIN\RAG_PROJECTS\Merger_Retriever_LOTR\Data\english_paris_agreement.pdf"
)
documents_paris_agreement = loader_paris_agreement.load()
texts_paris_agreement = text_splitter_paris_agreement.split_documents(
    documents_paris_agreement
)

In [14]:
# Peek at the second chunk (index 1) to sanity-check the loader/splitter output.
texts_paris_agreement[1]

Document(page_content='PARIS AGREEMENT \nThe Parties to this Agreement, \nBeing Parties to the United Nations Framework Convention on Climate \nChange, hereinafter referred to as "the Convention", \nPursuant to the Durban Platform for Enhanced Action established by \ndecision 1/CP.17 of the Conference of the Parties to the Convention at its \nseventeenth session, \nIn pursuit of the objective of the Convention, and being guided by its \nprinciples, including the principle of equity and common but differentiated \nresponsibilities and  respective capabilities, in the light of different national \ncircumstances, \nRecognizing the need for an effective and progressive response to the \nurgent threat of climate change on the basis of the best available scientific \nknowledge, \nAlso recognizing the specific needs and special circumstances of \ndeveloping country Parties, especially those that are particularly vulnerable to the \nadverse effects of climate change, as provided for in the Con

## Create and Store VectorStore

In [16]:
# Build the UN SDG vector store once and reuse it on later runs.
#
# No `ids` are supplied to `from_documents`, so calling it again against a
# persist directory that already holds the collection would re-embed every
# chunk and add it a second time; the duplicates then crowd the top-k results.
# The guard below makes this cell safe under "Restart Kernel -> Run All".
# Delete the directory to force a rebuild (e.g. after changing the PDF, the
# chunking parameters, or the embedding model).
un_sdg_store_dir = r"C:\Users\ADMIN\Desktop\LANGCHAIN\RAG_PROJECTS\Merger_Retriever_LOTR\store\un_sdg_chroma_cosine"

if os.path.isdir(un_sdg_store_dir) and os.listdir(un_sdg_store_dir):
    # Something is already persisted here: open it instead of embedding again.
    load_un_sdg_store = Chroma(
        persist_directory=un_sdg_store_dir,
        embedding_function=hf,
    )
    print("First Vector Store Loaded From Disk.........")
else:
    # First run: embed the chunks and persist them with cosine distance.
    load_un_sdg_store = Chroma.from_documents(
        texts_un_sdg,
        hf,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory=un_sdg_store_dir,
    )
    print("First Vector Store Created.........")

First Vector Store Created.........


In [17]:
# Build the Paris Agreement vector store once and reuse it on later runs.
#
# No `ids` are supplied to `from_documents`, so calling it again against a
# persist directory that already holds the collection would re-embed every
# chunk and add it a second time; the duplicates then crowd the top-k results.
# The guard below makes this cell safe under "Restart Kernel -> Run All".
# Delete the directory to force a rebuild (e.g. after changing the PDF, the
# chunking parameters, or the embedding model).
paris_agreement_store_dir = r"C:\Users\ADMIN\Desktop\LANGCHAIN\RAG_PROJECTS\Merger_Retriever_LOTR\store\paris_agreement_chroma_cosine"

if os.path.isdir(paris_agreement_store_dir) and os.listdir(paris_agreement_store_dir):
    # Something is already persisted here: open it instead of embedding again.
    load_paris_agreement_store = Chroma(
        persist_directory=paris_agreement_store_dir,
        embedding_function=hf,
    )
    print("Second Vector Store Loaded From Disk.........")
else:
    # First run: embed the chunks and persist them with cosine distance.
    load_paris_agreement_store = Chroma.from_documents(
        texts_paris_agreement,
        hf,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory=paris_agreement_store_dir,
    )
    print("Second Vector Store Created.........")

Second Vector Store Created.........


## Load VectorStore

In [18]:
# Re-open the persisted UN SDG collection from disk, using the same embedding
# function that was used when the store was built.
load_un_sdg_store = Chroma(
    embedding_function=hf,
    persist_directory=r"C:\Users\ADMIN\Desktop\LANGCHAIN\RAG_PROJECTS\Merger_Retriever_LOTR\store\un_sdg_chroma_cosine",
)

In [19]:
# Re-open the persisted Paris Agreement collection from disk, using the same
# embedding function that was used when the store was built.
load_paris_agreement_store = Chroma(
    embedding_function=hf,
    persist_directory=r"C:\Users\ADMIN\Desktop\LANGCHAIN\RAG_PROJECTS\Merger_Retriever_LOTR\store\paris_agreement_chroma_cosine",
)

## Init Merger Retriever

In [31]:
# One retriever per vector store; each is asked for k=3 results per query.

# UN SDG store: plain similarity search.
retriever_un_sdg = load_un_sdg_store.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

# Paris Agreement store: "mmr" search type.
retriever_paris_agreement = load_paris_agreement_store.as_retriever(
    search_kwargs={"k": 3},
    search_type="mmr",
)

In [32]:
# Combine both per-document retrievers behind a single retriever object so one
# query is answered from the UN SDG store and the Paris Agreement store.
lotr = MergerRetriever(
    retrievers=[
        retriever_un_sdg,
        retriever_paris_agreement,
    ]
)

In [33]:
# Display the merged retriever's repr to confirm both sub-retrievers and their
# search settings are wired in.
lotr

MergerRetriever(retrievers=[VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000024ACD4DB610>, search_kwargs={'k': 3}), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000024ACBAC5E70>, search_type='mmr', search_kwargs={'k': 3})])

## Perform Semantic Search

In [35]:
# Run the example question through the merged retriever.
# `invoke` is the Runnable entry point for retrievers and replaces the
# deprecated `get_relevant_documents`; it returns the same list of Documents.
query = "Is there any framework available to tackle the climate change?"
docs = lotr.invoke(query)
print(docs)



In [34]:
# Print only the text of each retrieved chunk.
# Reuses `docs` from the query cell above instead of running the identical
# retrieval a second time with a copy-pasted query string.
for chunk in docs:
    print(chunk.page_content)

finance should  represent a progression beyond previous efforts. 
4. The provision of scaled-up financial resources should aim to achieve a 
balance between adaptation and mitigation, taking into account country-driven 
strategies, and the priorities and needs of developing country Parties, especially 
those that are particularly vulnerable to the adverse effects of climate change and 
have significant capacity constraints, such as the least developed countries and 
small island developing States, considering the need for public and grant-based 
resources for adaptation. 
5. Developed country Parties shall biennially communicate indicative 
quantitative and qualitative information related to paragraphs 1 and 3 of this 
Article, as applicable, including, as available, projected levels of public financial 
resources to be provided to developing country Parties. Other Parties providing 
resources are encouraged to communicate biennially such information on a 
voluntary basis.
and+ adaptat

## Reordered Docs (Tackling Lost in the Middle)

In [36]:
# Reorder the merged retrieval results with LongContextReorder to counter the
# "lost in the middle" effect before they are handed to an LLM.
reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(docs)

# Inspect the reordered list. `docs` comes from two retrievers that each
# request k=3, so expect up to 6 documents here. The most relevant ones are
# expected to sit at the beginning and end of the list (per the
# LongContextReorder documentation -- confirm by eye against `docs`).
reordered_docs

[Document(page_content='finance should  represent a progression beyond previous efforts. \n4. The provision of scaled-up financial resources should aim to achieve a \nbalance between adaptation and mitigation, taking into account country-driven \nstrategies, and the priorities and needs of developing country Parties, especially \nthose that are particularly vulnerable to the adverse effects of climate change and \nhave significant capacity constraints, such as the least developed countries and \nsmall island developing States, considering the need for public and grant-based \nresources for adaptation. \n5. Developed country Parties shall biennially communicate indicative \nquantitative and qualitative information related to paragraphs 1 and 3 of this \nArticle, as applicable, including, as available, projected levels of public financial \nresources to be provided to developing country Parties. Other Parties providing \nresources are encouraged to communicate biennially such information