## RAG Pipeline - Data Ingestion to vector DB pipeline

In [1]:
import os 
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [2]:
### Read all the PDFs inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in the specified directory."""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find the PDF files in the directory
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            ## Add metadata to each document
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata ["file_type"] = "pdf"
                              
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
        except Exception as e:
            print(f"Error loading {pdf_file.name}: {e}")

        print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

    # Process all PDFs in the data directory
    all_pdf_documents = process_all_pdfs("../data")



In [3]:
import os
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def process_all_pdfs(pdf_directory):
    """Process all PDF files in the specified directory."""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process\n")

    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = "pdf"

            all_documents.extend(documents)
            print(f" → Loaded {len(documents)} pages\n")

        except Exception as e:
            print(f"Error loading {pdf_file.name}: {e}")

    print(f"\n✅ Total documents loaded: {len(all_documents)}")
    return all_documents


# Run the function
all_pdf_documents = process_all_pdfs("../data")


Found 2 PDF files to process

Processing file: Effieient_Mobile_Network_Design.pdf
 → Loaded 10 pages

Processing file: Object_Detection.pdf
 → Loaded 9 pages


✅ Total documents loaded: 19


In [10]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-03-05T02:07:26+00:00', 'author': '', 'keywords': '', 'moddate': '2021-03-05T02:07:26+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf/Effieient_Mobile_Network_Design.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'source_file': 'Effieient_Mobile_Network_Design.pdf', 'file_type': 'pdf'}, page_content='Coordinate Attention for Efﬁcient Mobile Network Design\nQibin Hou1 Daquan Zhou1 Jiashi Feng1\n1National University of Singapore\n{andrewhoux,zhoudaquan21}@gmail.com\nAbstract\nRecent studies on mobile network design have demon-\nstrated the remarkable effectiveness of channel atten-\ntion (e.g., the Squeeze-and-Excitation attention) for lifting\nmodel performance, but they generally neglect the posi-\ntional information, which is important 

In [4]:
### Text Splitting get into chunks
def split_documents(documents, chunk_size = 1000, chunk_overlap = 200):
    """Split documents into smaller chunks for better RAG performance."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split{len(documents)} documents into {len(split_docs)} chunks")

    # Show example of chunk
    if split_docs:
        print("\nExample chunk:")
        print(f"content:{split_docs[0].page_content[:200]}...") # split_doc[0] first 200 characters
        print(f"metadata:{split_docs[0].metadata}")
        return split_docs
    

In [5]:
chunks = split_documents(all_pdf_documents)  # Split the loaded documents into chunks
chunks

Split19 documents into 109 chunks

Example chunk:
content:Coordinate Attention for Efﬁcient Mobile Network Design
Qibin Hou1 Daquan Zhou1 Jiashi Feng1
1National University of Singapore
{andrewhoux,zhoudaquan21}@gmail.com
Abstract
Recent studies on mobile net...
metadata:{'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-03-05T02:07:26+00:00', 'author': '', 'keywords': '', 'moddate': '2021-03-05T02:07:26+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf/Effieient_Mobile_Network_Design.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'source_file': 'Effieient_Mobile_Network_Design.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-03-05T02:07:26+00:00', 'author': '', 'keywords': '', 'moddate': '2021-03-05T02:07:26+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf/Effieient_Mobile_Network_Design.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'source_file': 'Effieient_Mobile_Network_Design.pdf', 'file_type': 'pdf'}, page_content='Coordinate Attention for Efﬁcient Mobile Network Design\nQibin Hou1 Daquan Zhou1 Jiashi Feng1\n1National University of Singapore\n{andrewhoux,zhoudaquan21}@gmail.com\nAbstract\nRecent studies on mobile network design have demon-\nstrated the remarkable effectiveness of channel atten-\ntion (e.g., the Squeeze-and-Excitation attention) for lifting\nmodel performance, but they generally neglect the posi-\ntional information, which is important 

## Embedding and VectorStoreDB

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager.

        Args:
            model_name (str): HuggingFace model name for sentence embeddings.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model."""
        try:
            print(f"🔄 Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"✅ Model loaded successfully! "
                  f"Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"❌ Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Args:
            texts (List[str]): List of strings to embed.

        Returns:
            np.ndarray: Array of embeddings.
        """
        if not self.model:
            raise ValueError("Embedding model is not loaded.")

        embeddings = self.model.encode(texts, show_progress_bar=True)
        return embeddings


# ✅ Initialize the embedding manager
embedding_manager = EmbeddingManager()

# ✅ Example: generate embeddings
texts = ["Machine learning is fascinating.", "Transformers revolutionized NLP."]
embeddings = embedding_manager.generate_embeddings(texts)

print("\n✅ Generated embeddings shape:", embeddings.shape)


🔄 Loading embedding model: all-MiniLM-L6-v2
✅ Model loaded successfully! Embedding dimension: 384


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


✅ Generated embeddings shape: (2, 384)


## VectorStore

In [8]:
import os
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store.

        Args:
            collection_name (str): Name of the ChromaDB collection.
            persist_directory (str): Directory to persist the vector store.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_vector_store()

    def _initialize_vector_store(self):
        """Initialize ChromaDB client and collection."""
        try:
            # Ensure directory exists
            os.makedirs(self.persist_directory, exist_ok=True)

            # Initialize Chroma client with persistence
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Create or load a collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Document embeddings for RAG"}
            )

            print(f"Vector store initialized with collection: '{self.collection_name}'")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Dict[str, Any]], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.

        Args:
            documents (List[Dict]): List of LangChain Documents (or dicts) with metadata.
            embeddings (np.ndarray): Corresponding embeddings for the documents.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")

        print(f"Adding {len(documents)} documents to the vector store...")

        # Prepare data for Chroma
        ids = []
        metadatas = []
        document_texts = []
        embedding_list = []

        for i, (doc, emb) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"

            # Metadata
            metadata = dict(doc.metadata) if "metadata" in doc else {}
            metadata["index"] = i
            metadata["content_length"] = len(doc.page_content if hasattr(doc, "page_content") else str(doc))

            # Store
            ids.append(doc_id)
            metadatas.append(metadata)
            document_texts.append(doc.page_content if hasattr(doc, "page_content") else str(doc))
            embedding_list.append(emb.tolist())

        # Add to ChromaDB collection
        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=document_texts,
                embeddings=embedding_list
            )

            print(f"Successfully added {len(documents)} documents.")
            print(f"Total in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise


# Initialize and test
vectorstore = VectorStore()
vectorstore



Vector store initialized with collection: 'pdf_documents'
Existing documents in collection: 327


<__main__.VectorStore at 0x15fc8a120>

In [9]:
### Let's  convert the text into embeddings
texts = [doc.page_content for doc in chunks]

# Generate embeddings for the text chunks
embeddings = embedding_manager.generate_embeddings(texts)

# Store in the vector Database
vectorstore.add_documents(chunks, embeddings)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Adding 109 documents to the vector store...
Successfully added 109 documents.
Total in collection: 436


## Retriever Pipeline From VectorStore

In [11]:
class RAGRetriever:
    """Handles query-based document retrieval from the vector store."""

    def __init__(self, vector_store, embedding_manager):
        """
        Initialize the RAG retriever.

        Args:
            vector_store: VectorStore instance containing document embeddings.
            embedding_manager: EmbeddingManager instance for generating query embeddings.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for the given query.

        Args:
            query (str): The search query.
            top_k (int): Number of top documents to retrieve.
            score_threshold (float): Minimum similarity score for retrieval.

        Returns:
            List[Dict]: List of retrieved documents with metadata.
        """
        print(f" Retrieving top {top_k} documents for query: '{query}'")
        print(f" Score threshold: {score_threshold}")

        # Generate embedding for the query
        try:
            query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        except Exception as e:
            print(f"Error generating embedding for query: {e}")
            return []

        # Query the vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
            )

            retrieved_docs = []
            for doc, metadata, distance in zip(
                results["documents"][0], results["metadatas"][0], results["distances"][0]
            ):
                similarity_score = 1 - distance  # Convert distance to similarity
                if similarity_score >= score_threshold:
                    retrieved_docs.append({
                        "document": doc,
                        "metadata": metadata,
                        "similarity_score": round(similarity_score, 4)
                    })

            if retrieved_docs:
                print(f"Retrieved {len(retrieved_docs)} documents after applying threshold.")
            else:
                print(" No documents met the similarity threshold.")

            return retrieved_docs

        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
rag_retriever = RAGRetriever(vectorstore, embedding_manager)


# Example usage (assuming you already initialized these elsewhere)
# rag_retriever = RAGRetriever(vectorstore, embedding_manager)
# results = rag_retriever.retrieve("What is ChromaDB?", top_k=3, score_threshold=0.6)
# print(results)


In [12]:
rag_retriever

<__main__.RAGRetriever at 0x15ff78440>

In [13]:
rag_retriever.retrieve("What is Object_Detection?")

 Retrieving top 5 documents for query: 'What is Object_Detection?'
 Score threshold: 0.0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved 4 documents after applying threshold.


[{'document': 'effectiveness of the proposed methods. Code is available at\nhttps://github.com/megvii-research/AnchorDETR.\nIntroduction\nThe object detection task is to predict a bounding box and a\ncategory for each object of interest in an image. In the last\ndecades, there are many great progresses in object detection\nbased on the CNN (Ren et al. 2015; Cai and Vasconcelos\n2018; Redmon et al. 2016; Lin et al. 2017; Zhang et al. 2020;\nQiao, Chen, and Yuille 2020; Chen et al. 2021). Recently,\nCarion et al. (Carion et al. 2020) propose the DETR which\nis a new paradigm of object detection based on the trans-\nformer. It uses a set of learned object queries to reason about\nthe relations of the objects and the global image context to\noutput the ﬁnal predictions set. However, the learned object\nquery is very hard to explain. It does not have an explicit\nphysical meaning and the corresponding prediction slots of\neach object query do not have a speciﬁc mode. As shown in',
  'metada

In [14]:
rag_retriever.retrieve("Coordinate Attention for Efficient Mobile Network Design")

 Retrieving top 5 documents for query: 'Coordinate Attention for Efficient Mobile Network Design'
 Score threshold: 0.0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved 5 documents after applying threshold.


[{'document': 'of the proposed coordinate attention over the SE attention\nin more powerful mobile networks, we take EfﬁcientNet-b0\n[38] as our baseline here. EfﬁcientNet is based on archi-\ntecture search algorithms. and contains SE attention. To\ninvestigate the performance of the proposed coordinate at-\ntention on EfﬁcientNet, we simply replace the SE attention\nwith our proposed coordinate attention. For other settings,\nwe follow the original paper. The results have been listed in\nTable 5. Compared to the original EfﬁcientNet-b0 with SE\nattention included and other methods that have comparable\n6',
  'metadata': {'content_length': 587, 'index': 34},
  'similarity_score': 0.471},
 {'document': 'of the proposed coordinate attention over the SE attention\nin more powerful mobile networks, we take EfﬁcientNet-b0\n[38] as our baseline here. EfﬁcientNet is based on archi-\ntecture search algorithms. and contains SE attention. To\ninvestigate the performance of the proposed coordinat

## Integration VectorDB Context Pipeline with LLM Output

In [15]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

load_dotenv()

# Initialize Groq LLM
groq_api_key = "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" 
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model="llama-3.1-8b-instant",  # ✅ Supported model!
    temperature=0.0,
    max_tokens=2048
)


### Simple RAG Function: retrieve context + generate response
def rag_simple(query, retriever, llm, top_k=3):
    # Retrieve the context
    results = retriever.retrieve(query, top_k=top_k)

    # Handle both dict or Document objects
    def get_content(doc):
        if isinstance(doc, dict):
            return doc.get("content") or doc.get("page_content") or ""
        else:
            return getattr(doc, "page_content", "")

    context = "\n\n".join([get_content(doc) for doc in results]) if results else ""

    if not context:
        return "No relevant context found."

    # Generate the answer using Groq LLM
    prompt = f"""Use the following context to answer the question concisely.
Context: {context}
Question: {query}
Answer:"""

    response = llm.invoke(prompt)
    return response.content


In [16]:
# let's try it out by asking some questions

In [17]:
response = llm.invoke("Say hi in 5 words.")
print(response.content)


Hello, how are you today?


In [18]:
answer = rag_simple("What is attention mechanism?", rag_retriever, llm)
print(answer)


 Retrieving top 3 documents for query: 'What is attention mechanism?'
 Score threshold: 0.0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved 3 documents after applying threshold.
The attention mechanism is a technique used in deep learning models, particularly in sequence-to-sequence tasks such as machine translation, text summarization, and image captioning. It allows the model to focus on specific parts of the input sequence that are relevant to the output, rather than considering the entire sequence equally.

In essence, the attention mechanism is a weighted sum of the input elements, where the weights are learned during training. This enables the model to selectively attend to certain parts of the input, improving its ability to capture long-range dependencies and relationships between different elements of the input sequence.


In [19]:
answer = rag_simple("Explain RAG in simple terms?", rag_retriever, llm)
print(answer)


 Retrieving top 3 documents for query: 'Explain RAG in simple terms?'
 Score threshold: 0.0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 No documents met the similarity threshold.
No relevant context found.


## Enhanced RAG Pipeline Features

In [20]:
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    # Retrieve top documents
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    
    if not results:
        return {
            "answer": "No relevant content found.",
            "sources": [],
            "confidence": 0.0
        }

    # Prepare context and sources
    context = "\n\n".join(doc.get("content", "") for doc in results)

    sources = [
        {
            "source": doc.get("metadata", {}).get("source", "unknown"),
            "page": doc.get("metadata", {}).get("page", "unknown"),
            "score": doc.get("metadata", {}).get("similarity_score", 0.0),
            "preview": doc.get("content", "")[:200] + "..."
        }
        for doc in results
    ]

    confidence = max(doc.get("metadata", {}).get("similarity_score", 0.0) for doc in results)

    # Generate answer
    prompt = f"""
    Use the following context to answer the question concisely.
    Context: {context}
    Question: {query}
    Answer:"""
    
    response = llm.invoke(prompt)

    output = {
        "answer": response.content,
        "sources": sources,
        "confidence": confidence
    }

    if return_context:
        output["context"] = context

    return output


In [21]:
result = rag_advanced(
    "What is Coordinate Attention?",
    rag_retriever,
    llm,
    top_k=3,
    min_score=0.1,
    return_context=True
)

print("Answer:", result["answer"])
print("Confidence:", result["confidence"])
print("Sources:", result["sources"])
print("Context preview:", result["context"][:300])


 Retrieving top 3 documents for query: 'What is Coordinate Attention?'
 Score threshold: 0.1


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved 3 documents after applying threshold.
Answer: I'm ready to answer your question. However, you didn't provide the context. Please provide the context so I can give a concise answer to your question.
Confidence: 0.0
Sources: [{'source': 'unknown', 'page': 'unknown', 'score': 0.0, 'preview': '...'}, {'source': 'unknown', 'page': 'unknown', 'score': 0.0, 'preview': '...'}, {'source': 'unknown', 'page': 'unknown', 'score': 0.0, 'preview': '...'}]
Context preview: 






In [22]:
result = rag_advanced(
    "Anchors in Object Detection?",
    rag_retriever,
    llm,
    top_k=3,
    min_score=0.1,
    return_context=True
)

print("Answer:", result["answer"])
print("Confidence:", result["confidence"])
print("Sources:", result["sources"])
print("Context preview:", result["context"][:300])


 Retrieving top 3 documents for query: 'Anchors in Object Detection?'
 Score threshold: 0.1


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved 3 documents after applying threshold.
Answer: In object detection, anchors are pre-defined bounding box proposals with different scales and aspect ratios. They are used as a starting point for predicting object locations and classes. Anchors help the model to focus on potential object locations, reducing the search space and improving detection accuracy.
Confidence: 0.0
Sources: [{'source': 'unknown', 'page': 'unknown', 'score': 0.0, 'preview': '...'}, {'source': 'unknown', 'page': 'unknown', 'score': 0.0, 'preview': '...'}, {'source': 'unknown', 'page': 'unknown', 'score': 0.0, 'preview': '...'}]
Context preview: 




