In [1]:
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain.chains import RetrievalQA

In [2]:
# Location of the source PDF, given relative to the notebook's working directory
path = "./Understanding_Climate_Change.pdf"

In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

In [8]:
def replace_t_with_space(list_of_documents):
    """
    Swap every tab character for a single space in each document's text.

    The documents are edited in place; the same list object that was passed
    in is handed back so the call can be used inline.

    Args:
        list_of_documents: A list of document objects, each exposing a
            string 'page_content' attribute.

    Returns:
        The input list, whose documents no longer contain tab characters.
    """

    # One-entry translation table: tab -> space
    tab_to_space = str.maketrans('\t', ' ')

    for document in list_of_documents:
        document.page_content = document.page_content.translate(tab_to_space)

    return list_of_documents

In [10]:
def encode_pdf(path, chunk_size=1000, chunk_overlap=200):
    """
    Build a FAISS vector store from the text of a PDF, embedded via Ollama.

    The PDF is loaded with PyPDFLoader, split into overlapping chunks with
    RecursiveCharacterTextSplitter (lengths measured with ``len``), cleaned
    of tab characters, and embedded with the 'nomic-embed-text' model.

    Args:
        path: The path to the PDF file.
        chunk_size: The desired size of each text chunk.
        chunk_overlap: The amount of overlap between consecutive chunks.

    Returns:
        A FAISS vector store containing the embedded chunks.
    """

    # Read the PDF into document objects
    loaded_docs = PyPDFLoader(path).load()

    # Cut the documents into overlapping chunks and normalise tabs to spaces
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = replace_t_with_space(splitter.split_documents(loaded_docs))

    # Embed every chunk and index the resulting vectors
    embedder = OllamaEmbeddings(model='nomic-embed-text', show_progress=True)
    return FAISS.from_documents(chunks, embedder)

In [11]:
# Embed the PDF with the default chunking (chunk_size=1000, chunk_overlap=200).
# In the saved output below this step ran 97 embedding calls in about 1m46s.
vector_store = encode_pdf(path)

  embeddings = OllamaEmbeddings(model='nomic-embed-text', show_progress=True)
OllamaEmbeddings: 100%|██████████| 97/97 [01:46<00:00,  1.10s/it]


In [12]:
# Bare expression: the notebook echoes the object's repr as the cell output
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x11c36c440>

In [13]:
# Create a retriever over the vector store; no arguments are passed, so the
# store's default retriever settings apply
retriever = vector_store.as_retriever()

In [14]:
from langchain_groq import ChatGroq

In [None]:
import os
from getpass import getpass

# The Groq API key must never be written into the notebook. Read it from the
# GROQ_API_KEY environment variable; if that is unset (or empty), fall back to
# a hidden interactive prompt so the key is neither echoed nor saved in the file.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0,
    max_tokens=4000,
    api_key=os.environ.get("GROQ_API_KEY") or getpass("Groq API key: "),
)

# Document compressor built on the same chat model
compressor = LLMChainExtractor.from_llm(llm)

In [20]:
# Pair the base vector-store retriever with the LLM-based compressor
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever
)

# Create a QA chain with the compressed retriever.
# return_source_documents=True is what makes "source_documents" available in
# the result that the query cell prints.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=compression_retriever,
    return_source_documents=True
)

In [21]:
# Ask a single question through the QA chain, then show the answer followed by
# the documents the chain returned with it
query = "What is the main topic of the document?"
result = qa_chain.invoke({"query": query})
print(result["result"])  # the generated answer
print("Source documents:", result["source_documents"])  # present because return_source_documents=True

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.70it/s]


The main topic of the document is Climate Policy.
Source documents: [Document(metadata={'source': './Understanding_Climate_Change.pdf', 'page': 5}, page_content='Climate Policy \nEffective climate policy is essential for driving large-scale change. International agreements, \nsuch as the Paris Agreement, aim to limit global warming to well below 2 degrees Celsius \nabove pre-industrial levels. National and local policies also play a critical role in \nimplementing mitigation and adaptation strategies. \nInternational Agreements \nInternational climate agreements, such as the Kyoto Protocol and the Paris Agreement, set \ntargets and frameworks for reducing greenhouse gas emissions globally. Cooperation and \ncommitment from all countries are necessary for achieving climate goals. \nNational Policies'), Document(metadata={'source': './Understanding_Climate_Change.pdf', 'page': 28}, page_content='International Climate Negotiations \nConference of the Parties (COP) \nThe Conference of the 