In [52]:
import os
from typing import List

import google.generativeai as genai
from langchain_core.embeddings import Embeddings
from dotenv import load_dotenv

# Load environment variables through python-dotenv before any key is read.
load_dotenv()

# Pass the value of GEMINI_API_KEY (None when the variable is unset) to the
# google-generativeai client.
genai.configure(
    api_key=os.environ.get("GEMINI_API_KEY"),
)

class GeminiEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by ``genai.embed_content``.

    Documents are embedded with the configurable ``task_type``; queries are
    always embedded with the ``"RETRIEVAL_QUERY"`` task type.
    """

    def __init__(self, model_name: str = "text-embedding-004", task_type: str = "RETRIEVAL_DOCUMENT"):
        """Store the embedding settings.

        Args:
            model_name: Model identifier forwarded to ``genai.embed_content``.
            task_type: Task type used when embedding documents.
        """
        self.model_name = model_name
        self.task_type = task_type

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a collection of document texts.

        Args:
            texts: Texts to embed.

        Returns:
            One embedding vector per input text, in input order. An empty
            input yields an empty list without contacting the API.
        """
        batch = list(texts)
        if not batch:
            # Nothing to embed; skip the API call entirely.
            return []

        # Passing the whole list lets the client embed the texts in batched
        # requests instead of one network round trip per text.
        response = genai.embed_content(
            model=self.model_name,
            content=batch,
            task_type=self.task_type,
        )
        return list(response["embedding"])

    def embed_query(self, text: str) -> List[float]:
        """Embed a single search query.

        Args:
            text: Query text to embed.

        Returns:
            The embedding vector for ``text``.
        """
        # Queries use their own task type regardless of ``self.task_type``.
        response = genai.embed_content(
            model=self.model_name,
            content=text,
            task_type="RETRIEVAL_QUERY",
        )
        return response["embedding"]

In [53]:
from langchain_openai import ChatOpenAI

# Chat model reached through the OpenRouter endpoint; the key is read from
# OPENROUTER_API_KEY (None when the variable is unset).
llm = ChatOpenAI(
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=os.environ.get("OPENROUTER_API_KEY"),
    model="meta-llama/llama-3.2-1b-instruct",
    temperature=0.1,
)

In [54]:
# Directory holding the .txt knowledge-base files. Set the KNOWLEDGE_BASE
# environment variable to point at another location; the original local path
# stays as the fallback so existing setups keep working unchanged.
KNOWLEDGE_BASE = os.environ.get(
    "KNOWLEDGE_BASE",
    "/home/olande/Desktop/Rag_Techniques/Contextual Chunk Headers/books",
)

In [55]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.vectorstores import FAISS

# Load every .txt file found under KNOWLEDGE_BASE (the glob is recursive).
# Decoding with "utf-8-sig" strips a leading byte-order mark when a file has
# one, so it no longer ends up inside the document text; files without a BOM
# decode as plain UTF-8.
loader = DirectoryLoader(
    KNOWLEDGE_BASE,
    glob="**/*.txt",
    show_progress=True,
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8-sig"},
)
documents = loader.load()

100%|██████████| 2/2 [00:00<00:00, 211.66it/s]


In [56]:
# Split the loaded documents into chunks of up to 1024 characters (measured
# with len) that overlap by 128 characters.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=128,
    chunk_size=1024,
)
texts = text_splitter.split_documents(documents)

In [57]:
# Embed the chunks with GeminiEmbeddings, index them in FAISS, then expose the
# index through the retriever interface.
vector_store = FAISS.from_documents(texts, embedding=GeminiEmbeddings())
retriever = vector_store.as_retriever()



In [58]:
def pretty_print_docs(docs):
    """Print each document's ``page_content`` under a numbered heading.

    Documents are numbered from 1 and separated by a line of 100 dashes.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))
    
# Baseline: query the plain retriever with no compression applied.
# `invoke` is the supported entry point; `get_relevant_documents` is deprecated.
pretty_print_docs(retriever.invoke("Who is Thomas Jefferson?"))

Document 1:

﻿The Project Gutenberg eBook of The Declaration of Independence of the United States of America
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Declaration of Independence of the United States of America

Author: Thomas Jefferson

Release date: December 1, 1971 [eBook #1]
                Most recently updated: January 1, 2021

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK THE DECLARATION OF INDEPENDENCE OF THE UNITED STATES OF AMERICA ***



December, 1971  [Etext #1]


The Project Gutenberg Etext of The Declaration of Independence.
--------------------

In [59]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor, LLMChainFilter

# Build an LLM-backed extractor from `llm` and wrap the base retriever so the
# retrieved documents pass through it.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)


In [60]:
# Query through the compression retriever. `invoke` is the supported entry
# point; `get_relevant_documents` is deprecated.
compressed_docs = compression_retriever.invoke("What were the grievances against the King?")
pretty_print_docs(compressed_docs)


Document 1:

the forms to which they are accustomed.  But when a long train of abuses and
usurpations, pursuing invariably the same Object evinces a design to reduce
them under absolute Despotism, it is their right, it is their duty, to throw
off such Government, and to provide new Guards for their future security. --Such has been the patient sufferance of these Colonies; and such is now
the necessity which constrains them to alter their former Systems of Government.
The history of the present King of Great Britain is a history of repeated
injuries and usurpations, all having in direct object the establishment
of an absolute Tyranny over these States.  To prove this, let Facts
be submitted to a candid world.
----------------------------------------------------------------------------------------------------
Document 2:

*The grievances against the King: repeated injuries and usurpations, all having in direct object the establishment of an absolute Tyranny over these States.
-----------

In [61]:
# LLM-backed document filter built from the same chat model as the extractor.
_filter = LLMChainFilter.from_llm(llm=llm)

In [62]:
# Use the LLM-backed filter as the compressor for the base retriever.
compression_retriever = ContextualCompressionRetriever(base_compressor=_filter, base_retriever=retriever)

# `invoke` is the supported entry point; `get_relevant_documents` is deprecated.
compressed_docs = compression_retriever.invoke("What is Chris Olande?")
pretty_print_docs(compressed_docs)

Document 1:

Chris Olande: A Professional Profile

Chris Olande is a dynamic and intellectually curious student of Statistics and Programming at Kenyatta University, with a growing portfolio of sophisticated projects that blend statistical rigor with cutting-edge machine learning techniques. His academic and practical pursuits reflect not only a mastery of foundational principles in data science and programming, but also a passion for innovative applications in real-world contexts, including education and artificial intelligence.

From his meticulous handling of sentiment classification tasks using state-of-the-art transformer models to his leadership in organizing educational field studies, Chris demonstrates a rare combination of technical excellence, strategic thinking, and human-centered values. His ability to bridge theory and practice places him at the forefront of a new generation of data-driven professionals.

Academic Background and Technical Expertise
------------------------

In [63]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

# Filter retrieved documents with an embeddings-based similarity threshold of
# 0.5 instead of calling the LLM.
embeddings_filter = EmbeddingsFilter(embeddings=GeminiEmbeddings(), similarity_threshold=0.5)
compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)

# `invoke` is the supported entry point; `get_relevant_documents` is deprecated.
compressed_docs = compression_retriever.invoke("How important were the minorities in the fight for independence?")
pretty_print_docs(compressed_docs)

Document 1:

One-eighth of the whole population were colored slaves, not distributed
generally over the Union, but localized in the Southern part of it.
These slaves constituted a peculiar and powerful interest.  All knew
that this interest was, somehow, the cause of the war.  To strengthen,
perpetuate, and extend this interest was the object for which the
insurgents would rend the Union, even by war; while the government claimed
no right to do more than to restrict the territorial enlargement of it.
----------------------------------------------------------------------------------------------------
Document 2:

In every stage of these Oppressions We have Petitioned for Redress
in the most humble terms:  Our repeated Petitions have been answered
only by repeated injury.  A Prince, whose character is thus marked
by every act which may define a Tyrant, is unfit to be the ruler
of a free People.

Nor have We been wanting in attention to our British brethren.
We have warned them from time 

# Pipelines

In [64]:
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
# Pipeline stages, applied in the order listed in `transformers`.
# 1. Re-split the retrieved documents on "." into pieces of about 500 characters.
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=128, separator=".")
# 2. Redundancy filter using Gemini embeddings with a 0.5 similarity threshold.
redundant_filter = EmbeddingsRedundantFilter(embeddings=GeminiEmbeddings(), similarity_threshold=0.5)
# 3. Relevance filter against the query. The field is `similarity_threshold`;
#    `min_similarity` is not an EmbeddingsFilter field, so the 0.5 cut-off was
#    never being applied.
relevant_filter = EmbeddingsFilter(embeddings=GeminiEmbeddings(), similarity_threshold=0.5)
compressor_pipeline = DocumentCompressorPipeline(transformers=[splitter, redundant_filter, relevant_filter])

In [65]:
# Run the splitter -> redundancy filter -> relevance filter pipeline over the
# documents returned by the base retriever.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor_pipeline,
    base_retriever=retriever,
)

# `invoke` is the supported entry point; `get_relevant_documents` is deprecated.
compressed_docs = compression_retriever.invoke("What were the grievances against the King?")
pretty_print_docs(compressed_docs)

Created a chunk of size 700, which is longer than the specified 500
Created a chunk of size 700, which is longer than the specified 500


Document 1:

He has abdicated Government here, by declaring us out of his Protection
and waging War against us.

He has plundered our seas, ravaged our Coasts, burnt our towns,
and destroyed the lives of our people


In [66]:
# Same pipeline as before, with the LLM-backed extractor (`compressor`)
# inserted right after the splitter.
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[splitter, compressor, redundant_filter, relevant_filter]
)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor,
    base_retriever=retriever,
)

# `invoke` is the supported entry point; `get_relevant_documents` is deprecated.
compressed_docs = compression_retriever.invoke("What were the grievances against the King?")
pretty_print_docs(compressed_docs)

Created a chunk of size 700, which is longer than the specified 500
Created a chunk of size 700, which is longer than the specified 500


Document 1:

He has abdicated Government here, by declaring us out of his Protection
and waging War against us.
He has plundered our seas, ravaged our Coasts, burnt our towns,
and destroyed the lives of our people
