In [5]:
import os
import chromadb
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_transformers import (
    EmbeddingsRedundantFilter,
    EmbeddingsClusteringFilter,
)
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Get the Embedding Model

In [3]:
model_name = "BAAI/bge-large-en"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
print("Embedding Model Loaded..........")

Embedding Model Loaded..........


## Data Preprocessing

In [7]:
loader_un_sdg = PyPDFLoader("data/UN SDG.pdf")
documents_un_sdg = loader_un_sdg.load()
text_splitter_un_sdg = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                   chunk_overlap=100)
texts_un_sdg = text_splitter_un_sdg.split_documents(documents_un_sdg)

In [9]:
texts_un_sdg[0]

Document(page_content='TRANSFORMING OUR WORLD:\nTHE 2030 AGENDA FOR \nSUST AINABLE DEVELOPMENTUNITED NA TIONS', metadata={'source': 'data/UN SDG.pdf', 'page': 0})

In [10]:
loader_paris_agreement = PyPDFLoader("data/english_paris_agreement.pdf")
documents_paris_agreement = loader_paris_agreement.load()
text_splitter_paris_agreement = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                   chunk_overlap=100)
texts_paris_agreement = text_splitter_paris_agreement.split_documents(documents_paris_agreement)

In [11]:
texts_paris_agreement[0]

Document(page_content='PARIS AGREEMENT \n(mm \nUNITED NATIONS \n2015', metadata={'source': 'data/english_paris_agreement.pdf', 'page': 0})

## Create and Store Vectors

In [12]:
un_sdg_store = Chroma.from_documents(texts_un_sdg, hf, collection_metadata={"hnsw:space": "cosine"}, persist_directory="store/un_sdg_chroma_cosine")

In [13]:
paris_agreement_store = Chroma.from_documents(texts_paris_agreement, hf, collection_metadata={"hnsw:space": "cosine"}, persist_directory="store/paris_chroma_cosine")

## Load Vector Store

In [20]:
load_un_sdg_store = Chroma(persist_directory="store/un_sdg_chroma_cosine", embedding_function=hf)

In [21]:
load_paris_agreement_store = Chroma(persist_directory="store/paris_chroma_cosine", embedding_function=hf)

## Init Merge Retriever

In [22]:
retriever_un_sdg = load_un_sdg_store.as_retriever(search_type = "similarity", search_kwargs = {"k":3, "include_metadata": True})

retriever_paris_agreement = load_paris_agreement_store.as_retriever(search_type = "similarity", search_kwargs = {"k":3, "include_metadata": True})

In [23]:
lotr = MergerRetriever(retrievers=[retriever_un_sdg, retriever_paris_agreement])

In [24]:
lotr

MergerRetriever(tags=None, metadata=None, retrievers=[VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], metadata=None, vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x0000028839E541D0>, search_type='similarity', search_kwargs={'k': 3, 'include_metadata': True}), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], metadata=None, vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x0000028839866690>, search_type='similarity', search_kwargs={'k': 3, 'include_metadata': True})])

## Perform Semantic Search

In [25]:
for chunks in lotr.get_relevant_documents("Is there any framework available to tackle the climate change?"):
    print(chunks.page_content)

finance should  represent a progression beyond previous efforts. 
4. The provision of scaled-up financial resources should aim to achieve a 
balance between adaptation and mitigation, taking into account country-driven 
strategies, and the priorities and needs of developing country Parties, especially 
those that are particularly vulnerable to the adverse effects of climate change and 
have significant capacity constraints, such as the least developed countries and 
small island developing States, considering the need for public and grant-based 
resources for adaptation. 
5. Developed country Parties shall biennially communicate indicative 
quantitative and qualitative information related to paragraphs 1 and 3 of this 
Article, as applicable, including, as available, projected levels of public financial 
resources to be provided to developing country Parties. Other Parties providing 
resources are encouraged to communicate biennially such information on a 
voluntary basis.
and+ adaptat