### RAG Pipeline -  Data Ingestion To Vector DB pipeline 

In [33]:
%pip install -q langchain-text-splitters

import os
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


Note: you may need to restart the kernel to use updated packages.


In [None]:

def load_all_pdfs(pdf_directory):
    all_documents=[]
    pdf_dir=Path(pdf_directory)

    pdf_files=list(pdf_dir.glob("**/*.pdf"))

    print(f"found {len(pdf_files)} PDF files in {pdf_directory}")

    for pdf in pdf_files:
        print(f"\nprocessing:{pdf.name}")
        try:
            loader=PyMuPDFLoader(str(pdf))
            documents=loader.load()

            for page_i,doc in enumerate(documents):
                doc.metadata["source_file"]=pdf.name
                doc.metadata["source_path"]=str(pdf)
                doc.metadata["file_type"]="pdf"
                doc.metadata["page"]=doc.metadata.get("page", page_i+1) # if the page number is not available, set it to the page index + 1

            all_documents.extend(documents)
            print(f"loaded {len(documents)} pages from the {pdf.name}")
            doc.metadata["doc_id"]=pdf.stem # set the doc_id to the file name without extension

        except Exception as e:
            print(f"error loading {pdf.name}: {e}")

    print(f"\ntotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents=load_all_pdfs("../data")





found 4 PDF files in ../data

processing:1_Final_DFA.pdf
loaded 160 pages from the 1_Final_DFA.pdf

processing:01_Introduction_to_Attention_Mechanisms.pdf
loaded 3 pages from the 01_Introduction_to_Attention_Mechanisms.pdf

processing:02_Transformer_and_Self_Attention.pdf
loaded 2 pages from the 02_Transformer_and_Self_Attention.pdf

processing:03_Advanced_Attention_Variants.pdf
loaded 3 pages from the 03_Advanced_Attention_Variants.pdf

total documents loaded: 168


In [35]:
all_pdf_documents

[Document(metadata={'producer': 'ReportLab PDF Library - (opensource)', 'creator': '(unspecified)', 'creationdate': '2026-02-23T11:04:25+00:00', 'source': '..\\data\\pdf1\\01_Introduction_to_Attention_Mechanisms.pdf', 'file_path': '..\\data\\pdf1\\01_Introduction_to_Attention_Mechanisms.pdf', 'total_pages': 3, 'format': 'PDF 1.4', 'title': '(anonymous)', 'author': '(anonymous)', 'subject': '(unspecified)', 'keywords': '', 'moddate': '2026-02-23T11:04:25+00:00', 'trapped': '', 'modDate': "D:20260223110425+00'00'", 'creationDate': "D:20260223110425+00'00'", 'page': 0, 'source_file': '01_Introduction_to_Attention_Mechanisms.pdf', 'file_type': 'pdf'}, page_content="Introduction to Attention Mechanisms\nA Comprehensive Overview for Deep Learning Practitioners\n1. What is Attention?\nAttention mechanisms are a fundamental component of modern deep learning architectures. Inspired\nby the human cognitive ability to focus on relevant parts of information while ignoring irrelevant details,\natte

In [110]:
### text splitting get into chunks

from langchain_core import documents
import hashlib # we will use this to create a unique hash for each chunk based on its content, this will help us to avoid storing duplicate chunks in the vector store and also to easily identify which chunks are similar when we retrieve them later.


def split_documents(documents,chunk_size=1000,chunk_overlap=200): # what is a chunk overlap - ans - it is the number of characters that will be repeated in the next chunk to provide context
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len, # this is the function that will be used to calculate the length of the text, in this case we are using the built-in len function which counts the number of characters in the text. This is important because we want to split the text into chunks of a certain size, and we need to know how long the text is to do that.
        separators=["\n\n","\n"," ",""] #  these are the separators that will be used to split the text. The text splitter will try to split the text using these separators in order. So it will first try to split the text using double newlines, then single newlines, then spaces, and finally if it can't split the text using any of those separators, it will split the text at the chunk size regardless of the separator. This is important because we want to try to split the text at natural break points (like paragraphs or sentences) before splitting it at arbitrary points (like in the middle of a word).
    )

    #add chunk metadata + stable chunk_id based on content hash
    for i,d in enumerate(split_docs):
        d.metadata["chunk_index"]=i

        source=d.metadata.get("source_path",d.metadata.get("source_file","unknown_source"))
        page=d.metadata.get("page","unknown_page")

        #stable chunk id
        raw=f"{source}|{page}|{i}|{d.page_content}".encode("utf-8")
        d.metadata["chunk_id"]=hashlib.sha1(raw).hexdigest() # we use sha1 here because it is fast and produces a short hash, we don't need a cryptographic hash for this purpose
        

    split_docs=text_splitter.split_documents(documents)
    print(f"split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        print("\n example chunk:")
        print(f"content: {split_docs[0].page_content[:200]}")
        print(f"metadata: {split_docs[0].metadata}")
    return split_docs

chunks=split_documents(all_pdf_documents)
chunks

UnboundLocalError: cannot access local variable 'split_docs' where it is not associated with a value

### Embedding & Vector DB

In [37]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
import uuid
from typing  import List , Dict , Any , Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
class EmbeddingManager:
    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager

        Args:
            model_name: HuggingFace model name for sentance embeddings
        """
        self.model_name=model_name
        self.model=None
        self._load_model() # its a protected function because we don't want the user to call it directly, we want them to use the get_embedding function which will call this function if the model is not already loaded.


    def _load_model(self):
        """
        Load the SentenceTransformer model
        """
        try:
            print(f"Loading embedding model:{self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension:{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error loading model {self.model_name}:{e}")
            raise e
        
    def generate_embeddings(self,texts:List[str])->np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
        texts:list of text strings to embed

        Returns:
            numpy array of embeddins with shape (len(texts),embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded. Call _load_model() to load the model before generating embeddings.")
        
        print(f"Generating embeddings fro {len(texts)} texts")
        embeddings=self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embedings with stage:{embeddings.shape}")
        return embeddings
    
embedding_manager=EmbeddingManager()
embedding_manager

 
    

Loading embedding model:all-MiniLM-L6-v2


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully. Embedding dimension:384


<__main__.EmbeddingManager at 0x2340f67ba10>

### Vector Store

In [112]:
class VectorStore:
    """Manages a vector store using ChromaDB for storing and retrieving document embeddings."""

    def __init__(self,collection_name:str ="pdf_documents",persist_directory: str ="../data/vector_store"):
        self.collection_name=collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""

        try:
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory)

            self.collection=self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={
                    "description":"PDF document embeddings for RAG",
                    "hnsw:space":"cosine" # this is a custom metadata field that we can use to specify the distance metric for the HNSW index. ChromaDB will use this information to optimize the index for cosine similarity, which is the most common distance metric for text embeddings. This is important because it will improve the performance of similarity search in the vector store.
                }
            )
            print(f"vector store initialize. collection:{self.collection_name}")
            print(f"Existing documents in collection:{self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise e
        

    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        
        """
        add documents and their embedings tp the vector store
        
        Args:
            documents: list of documents to add to the store. Each document should have a unique ID in its metadata under the key "id".
            embeddings: numpy array of embeddings corresponding to the documents, with shape (len(documents), embedding_dim)
        """

        if(len(documents)!=len(embeddings)):
            raise ValueError("number of documents and embeddings must be the same" )

        print(f"adding {len(documents)} document to vector store...")

        #prepare data for chroma db    
        ids=[]
        metadatas=[]
        documents_text=[]
        embeddings_list=[]

        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):

            #generate id
            doc_id=doc.metadata.get("chunk_id") # generate a unique id for the document using uuid and the index of the document in the list. This ensures that even if there are duplicate documents, they will have unique IDs in the vector store.
            if not doc_id:
                doc_id=f"fallback_{uuid.uuid4().hex}"

            #generate metadata
            metadata=dict(doc.metadata) # make a copy of the document metadata to avoid modifying the original document's metadata
            metadata['doc_index']=i # add the index of the document in the original list to the metadata. This can be useful for debugging and for retrieving the original document later if needed.
            metadata['context_length']=len(doc.page_content) # add the length of the document text to the metadata. This can be useful for filtering documents based on their length during retrieval.  
            metadatas.append(metadata)

            #Document content

            documents_text.append(doc.page_content)

            #embeddings
            embeddings_list.append(embedding.tolist()) # convert the embedding from a numpy array to a list so that it can be stored in ChromaDB, which expects embeddings to be in list format.

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"succesfully added {len(documents)} documents to vector store")
            print(f"total documnets in collection:{self.collection.count()}")

        except Exception as e:
            print(f"error adding documents to vector store: {e}")
            raise e 
        

vector_store=VectorStore()
vector_store


vector store initialize. collection:pdf_documents
Existing documents in collection:574


<__main__.VectorStore at 0x2340f7792b0>

In [113]:
chunks

[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2024-09-18T14:45:11+05:30', 'source': '..\\data\\pdf\\1_Final_DFA.pdf', 'file_path': '..\\data\\pdf\\1_Final_DFA.pdf', 'total_pages': 160, 'format': 'PDF 1.7', 'title': 'Microsoft PowerPoint - 1_Final_DFA (2) (4) [Compatibility Mode]', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-09-18T14:45:11+05:30', 'trapped': '', 'modDate': "D:20240918144511+05'30'", 'creationDate': "D:20240918144511+05'30'", 'page': 0, 'source_file': '1_Final_DFA.pdf', 'file_type': 'pdf'}, page_content='Deterministic\nFinite Automata\n9/18/2024\n1\nFinite Automata\nAnd Regular Languages'),
 Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2024-09-18T14:45:11+05:30', 'source': '..\\data\\pdf\\1_Final_DFA.pdf', 'file_path': '..\\data\\pdf\\1_Final_DFA.pdf', 'total_pages': 160, 'format': 'PDF 1.7', 'title': 'Microsoft PowerPoint - 1_Final_DFA (2) (4) [Compatibility Mode]

## convert the text to embeddings

In [114]:
# extraxt the text
texts=[doc.page_content for doc in chunks] # extract the text content from each document chunk to create a list of strings that can be passed to the embedding model for generating embeddings. Each element in the "texts" list corresponds to the text content of a document chunk, which will be embedded and stored in the vector store for later retrieval during RAG operations.

# generate the embeddings
embeddings=embedding_manager.generate_embeddings(texts)

# store in the vecor db
vector_store.add_documents(chunks,embeddings)

Generating embeddings fro 180 texts


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Generated embedings with stage:(180, 384)
adding 180 document to vector store...
error adding documents to vector store: Non-empty lists are required for ['ids'] in add.


ValueError: Non-empty lists are required for ['ids'] in add.

### Retrival Pipeline From Vector Store

In [None]:
class RAGRetrival:
    
    def __init__(self,vector_store:VectorStore,embedding_manager:EmbeddingManager):
        """Initailize the retriver
        Args:
        vector_store: instance of the VectorStore class that manages the vector database for storing and retrieving document embeddings.
        embedding_manager: instance of the EmbeddingManager class that handles generating embeddings for query texts.
        """
        self.vector_store=vector_store
        self.embedding_manager=embedding_manager 

    def retrive(self,query:str,top_k:int=5,score_threshold:float=0.0)-> List[Dict[str,Any]]:
        """
        retrive relevent dpcument for a  query
        
        Args:
        query; the input query comes form the user that we want to find relevant documents for. This is typically a natural language question or statement that the user inputs to the RAG system.
        top_k; top k results to return. This parameter controls how many of the most relevant documents will be returned by the retriever. A higher value of top_k will return more documents, but may also include less relevant ones, while a lower value will return fewer but more relevant documents.
        score_threashold: the minimum cosine similarity score for a document to be considered relevant and included in the results. This parameter helps filter out documents that are not sufficiently similar to the query, ensuring that only documents with a cosine similarity score above this threshold are returned in the results.

        Returns:
        A list of dictionaries, where each dictionary contains the retrieved document's text, metadata, and its cosine similarity score with respect to the query. The list is sorted in descending order of relevance (highest cosine similarity score first).


        """

        print(f"retriving document for the query {query} with top_k={top_k} and score_threshold={score_threshold}")

        #generate embedding for the query
        query_embedding=self.embedding_manager.generate_embeddings([query])[0] # generate embedding for the query and take the first element of the resulting array since we are only embedding one query at a time.

        # search in the vector store
        try:
            results=self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()], # convert the query embedding from a numpy array to a list so that it can be used in the ChromaDB query, which expects embeddings to be in list format.
                n_results=top_k
            )

            #process results
            retrived_docs=[]

            if results['documents'] and results['documents'][0]: # check if there are any documents in the results and if the first element of the documents list is not empty
                documents=results['documents'][0]
                metadatas=results['metadatas'][0]
                distances=results['distances'][0] # retrieve the cosine similarity scores (distances) for the retrieved documents from the query results. This can be useful for filtering the results based on relevance and for debugging purposes.
                ids=results['ids'][0] # retrieve the list of document IDs from the query results. This can be useful for debugging and for retrieving the original documents later if needed.

                for i,(doc_id,document,metadata,distance) in enumerate(zip(ids,documents,metadatas,distances)):  
                    # convert distcnce t0o similarity score
                    similarity_score=1-distance # since cosine similarity is 1 - cosine distance, we can convert the distance to a similarity score by subtracting the distance from 1. This gives us a similarity score that ranges from 0 to 1, where 1 means the document is identical to the query and 0 means it is completely different.

                    if similarity_score>=score_threshold:
                        retrived_docs.append(
                            {
                                'id':doc_id,
                                'content':document,
                                'metadata':metadata,
                                'similarity_score':similarity_score,
                                'distance':distance,
                                'rank':i+1
                            }
                        )

                print(f"retrived {len(retrived_docs)} documents after filtering")
            else:
                print("no document found")

            return retrived_docs
        except Exception as e:
            print(f"error while retrieving documents: {e}")
            return []
               


rag_retriver=RAGRetrival(vector_store,embedding_manager)
rag_retriver
 

        

<__main__.RAGRetrival at 0x2340f67bb60>

In [None]:
rag_retriver.retrive("Sparse Attention")

retriving document for the query Sparse Attention with top_k=5 and score_threshold=0.0
Generating embeddings fro 1 texts


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embedings with stage:(1, 384)
retrived 5 documents after filtering


[{'id': 'doc_1e783488_12',
  'content': 'Advanced Attention Variants\nEfficient Attention, Sparse Attention, Flash Attention & Beyond\n1. The Quadratic Problem\nStandard self-attention has O(n^2) time and memory complexity with respect to sequence length n. For\na sequence of 1000 tokens, the attention matrix has 1,000,000 entries. For 10,000 tokens, it has\n100,000,000 entries. This makes standard attention prohibitively expensive for long documents,\nhigh-resolution images, or genomic sequences. This challenge spurred a wave of research into efficient\nattention variants.\n2. Sparse Attention\nSparse attention restricts each token to attend to only a subset of other tokens, reducing complexity to\nO(n * sqrt(n)) or O(n * log(n)). The key insight is that not all token pairs are equally important — most of\nthe attention weight is concentrated in a small fraction of positions.\n2.1 Longformer Attention\nLongformer (Beltagy et al., 2020) combines local windowed attention with global att

### Integration VectorDB Context Pipeline With LLM Output

In [None]:
## simple rag pipeline using gemini

from langchain_groq import ChatGroq
from dotenv import load_dotenv

load_dotenv()

# initialize gemini model
groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(
    model_name="openai/gpt-oss-20b",
    groq_api_key=groq_api_key,
    temperature=0,
    max_tokens=1024
)


## simple rag functions

def rag_simple(query, retriver, llm, top_k=3):
    # retrive the context
    results = retriver.retrive(query, top_k=top_k)
    if not results:
        return "No relevant documents found."

    context = "\n\n".join([doc["content"] for doc in results])

    if not context:
        return "No relevant documents found."

    # context + prompt -> llm
    prompt = """use the following context to answer the question concisely and accurately.

Context:
{context}

Question:
{query}

Answer:
"""

    response = llm.invoke(prompt.format(context=context, query=query))
    return response.content if hasattr(response, "content") else str(response)
    
    


In [None]:
answer=rag_simple("what is a attention mechanism?",rag_retriver,llm)

retriving document for the query what is a attention mechanism? with top_k=3 and score_threshold=0.0
Generating embeddings fro 1 texts


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embedings with stage:(1, 384)
retrived 3 documents after filtering


In [None]:
print(answer)

An attention mechanism is a neural network component that lets the model dynamically weight and focus on different parts of an input sequence when generating an output. Instead of compressing the entire input into a single fixed‑size vector, attention allows the model to “look back” at the full input and decide which elements are most relevant at each step, improving performance on tasks with long or complex sequences.


### Enhance RAG Pipeline Features

In [None]:
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - returns answer, sources, confidence score and optionally the context used for answering the query
    """

    results = retriever.retrive(query, top_k=top_k, score_threshold=min_score)

    if not results:
        return {'answer': 'No relevent content found', 'sources': [], 'confidence': 0.0, 'context': ''}

    context = "\n\n".join(doc['content'] for doc in results)
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('sorce', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]

    confidence = max(doc['similarity_score'] for doc in results)

    prompt = f"""Use the following context to answer the question concisely and accurately.
        Context: {context}
        Question: {query}
        Answer: 
        """

    response = llm.invoke(prompt.format(context=context, query=query))

    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence,
        'context': context if return_context else ''
    }

    return output

In [None]:
#example usage of the advanced RAG function

result=rag_advanced("Who first introduced the concept of attention in machine translation, and when?", rag_retriver, llm, top_k=5, min_score=0.2, return_context=True)
print("answer: ", result["answer"])
print("sources: ", result["sources"])
print("confidence: ",result['confidence'])
print("context:",result['context'])


retriving document for the query Who first introduced the concept of attention in machine translation, and when? with top_k=5 and score_threshold=0.2
Generating embeddings fro 1 texts


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embedings with stage:(1, 384)
retrived 2 documents after filtering
answer:  Bahdanau et al. introduced the concept in 2014.
sources:  [{'source': '01_Introduction_to_Attention_Mechanisms.pdf', 'page': 0, 'score': 0.2522565722465515, 'preview': 'Introduction to Attention Mechanisms\nA Comprehensive Overview for Deep Learning Practitioners\n1. What is Attention?\nAttention mechanisms are a fundamental component of modern deep learning architectures. Inspired\nby the human cognitive ability to focus on relevant parts of information while ignoring...'}, {'source': '01_Introduction_to_Attention_Mechanisms.pdf', 'page': 0, 'score': 0.2522565722465515, 'preview': 'Introduction to Attention Mechanisms\nA Comprehensive Overview for Deep Learning Practitioners\n1. What is Attention?\nAttention mechanisms are a fundamental component of modern deep learning architectures. Inspired\nby the human cognitive ability to focus on relevant parts of information while ignoring...'}]
confidence:  