# Data Ingestion to Vector DB Pipeline

In [1]:
from langchain_community.document_loaders import PyPDFLoader, PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def process_all_pdfs(pdf_directory):
    """Process all PDF files in the specified directory and return a list of documents."""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing : {pdf_file.name}")
        try:
            loader=PyPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type']='pdf'

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
        except Exception as e:
            print(f"Error processing {pdf_file.name}: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents


all_pdf_documents = process_all_pdfs("./data")


Found 3 PDF files to process

Processing : attention_is_all_you_need.pdf
Loaded 11 pages

Processing : cnn.pdf
Loaded 11 pages

Processing : embedding.pdf
Loaded 27 pages

Total documents loaded: 49


In [3]:
all_pdf_documents

[Document(metadata={'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On 

In [4]:
def split_documents(documents, chunk_size=300, chunk_overlap=70):
    """Split documents into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split into {len(split_docs)} chunks")

    if split_docs:
        print(f"Example chunk:")
        print(f"Content : {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
              
    return split_docs

In [5]:
chunks=split_documents(all_pdf_documents)
chunks

Split into 526 chunks
Example chunk:
Content : Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz...
Metadata: {'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality whi

[Document(metadata={'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On 

## Embedding and VectorStoreDB

In [6]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
class EmbeddingManager:
    """"Handles document embeddings using SentenceTransformer."""
    def __init__ (self, model_name: str = "all-MiniLM-L6-v2"):
        """"Intialize the embedding moanager
        Args:
            model_name (str, optional): The name of the SentenceTransformer model to use. Defaults to "all-MiniLM-L6-v2".
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise
    def generate_embedding(self,texts: List[str])-> np.ndarray:
        """Generate embeddings for a list of texts.
        Args:
            texts (List[str]): List of texts to embed.

        Returns:
            np.ndarray: Array of shape (num_texts, embedding_dim) containing the embeddings.
        """
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        print(f"Generating embeddings for {len(texts)} texts.")
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x2bfd408b380>

# VectorStore

In [8]:
class VectorStore:
    """Manages a vector store using ChromaDB."""
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Initialize the vector store.
        Args:
            collection_name (str, optional): The name of the ChromaDB collection. Defaults to "documents".
            persist_directory (str, optional): Directory to persist the ChromaDB data. Defaults to ".chromadb/".
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize the ChromaDB client."""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(name=self.collection_name, metadata={"description": "PDF document embedding for RAG"})

            print(f"Vector store initialized with collection: {self.collection_name}")
            print(f"Exisiting documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise
    
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store.
        Args:
            documents (List[Any]): List of document objects.
            embeddings (np.ndarray): Array of shape (num_documents, embedding_dim) containing the embeddings.
        """

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings.")

        print(f"Adding {len(documents)} documents to the vector store.")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i,(doc,embedding) in enumerate(zip(documents, embeddings)):
            doc_id=f"doc_{uuid.uuid4().hex[:8]}+{i}"
            ids.append(doc_id)

            metadata=dict(doc.metadata)
            metadata['doc_index']=i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )

            print(f"Added {len(documents)} documents to the vector store.")
            print(f"Total documents in collection after addition: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized with collection: pdf_documents
Exisiting documents in collection: 34747


<__main__.VectorStore at 0x2bfd463c050>

In [9]:
chunks

[Document(metadata={'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On 

In [10]:
texts=[doc.page_content for doc in chunks]

embeddings=embedding_manager.generate_embedding(texts)

vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 526 texts.
Generated embeddings with shape: (526, 384)
Adding 526 documents to the vector store.
Added 526 documents to the vector store.
Total documents in collection after addition: 35273


## Retriever Pipeline from VectorStore

In [11]:
class RAGRetriever:
    """Retrieves relevant documents from the vector store based on a query."""
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """Initialize the retriever.
        Args:
            vector_store (VectorStore): The vector store instance.
            embedding_manager (EmbeddingManager): The embedding manager instance.
            top_k (int, optional): Number of top similar documents to retrieve. Defaults to 5.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int=3, score_threshold: float=0.0) -> List[Dict[str, Any]]:
        """Retrieve the most relevant documents for the given query.
        Args:
            query (str): The input query string."""
        
        print(f"Retrieving documents for query: {query}")
        print(f"Top K: {top_k}, Score Threshold: {score_threshold}")

        query_embedding = self.embedding_manager.generate_embedding([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            retrieved_docs=[]

            if results['documents'] and results['documents'][0]:
                documents=results['documents'][0]
                metadatas=results['metadatas'][0]
                distances=results['distances'][0]
                ids=results['ids'][0]

                for i,(doc_id, document, metadata, distance) in enumerate(zip(ids,documents, metadatas, distances)):
                    similarity_score = 1 - distance

                    if similarity_score>=score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "content": document,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            'distance':distance,
                            'rank' : i+1
                        })
                        print(f"Retreived {len(retrieved_docs)} documents (after filtering)")
                    else:
                        print("No document found")
                    return retrieved_docs
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
        
rag_retriever = RAGRetriever(vectorstore, embedding_manager)


In [12]:
rag_retriever

<__main__.RAGRetriever at 0x2bfd56b41d0>

In [13]:
rag_retriever.retrieve("What is Data Grouping Strategy")

Retrieving documents for query: What is Data Grouping Strategy
Top K: 3, Score Threshold: 0.0
Generating embeddings for 1 texts.
Generated embeddings with shape: (1, 384)
Retreived 1 documents (after filtering)


[{'id': 'doc_96b15816+251',
  'content': 'pling tasks based on weighted randomness during training. Building on this, we propose\na reﬁned Data Grouping Strategy, extending the granularity from task-level to dataset-\nlevel partitioning. We posit that dataset-level grouping captures more domain-speciﬁc',
  'metadata': {'source': 'data\\pdf\\embedding.pdf',
   'content_length': 259,
   'doi': 'https://doi.org/10.48550/arXiv.2508.21632',
   'title': 'QZhou-Embedding Technical Report',
   'source_file': 'embedding.pdf',
   'moddate': '2025-09-01T00:50:53+00:00',
   'page': 11,
   'creator': 'arXiv GenPDF (tex2pdf:)',
   'page_label': '12',
   'license': 'http://creativecommons.org/licenses/by/4.0/',
   'producer': 'pikepdf 8.15.1',
   'arxivid': 'https://arxiv.org/abs/2508.21632v1',
   'file_type': 'pdf',
   'doc_index': 251,
   'creationdate': '2025-09-01T00:50:53+00:00',
   'total_pages': 27,
   'author': 'Peng Yu; En Xu; Bin Chen; Haibiao Chen; Yinfei Xu',
   'keywords': ''},
  'simila

## RAG with GROQ LLM 

In [14]:
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage, SystemMessage, AIMessage

In [26]:
class GroqLLM:
    def __init__(self, model_name: str = "gemma2-9b-it", api_key: str = None):
        self.model_name = model_name
        self.api_key = api_key or os.getenv("GROQ_API_KEY")

        if not self.api_key:
            raise ValueError("GROQ_API_KEY must be provided either as an argument or in environment variables.")

        self.llm=ChatGroq(groq_api_key=self.api_key, model_name=self.model_name,
        temperature=0.1,max_tokens=1024)

        print(f"Initialized GROQ LLM with model: {self.model_name}")

    def generate_response(self, query : str, context: str, max_length : int = 500) ->str: 
        """Generate a response from the LLM based on the query and context.
        Args:
            query (str): The input query string.
            context (str): The context to provide to the LLM.
            max_length (int, optional): Maximum length of the response. Defaults to 500.

        Returns:
            str: The generated response from the LLM.
        """

        prompt_template = PromptTemplate(
            input_variables=["context","question"],
            template="""You are a helpful AI assistant. Use the following context to answer the question accurately and concisely.

Context:
{context}

Question: {question}

Answer: Provide a clear and informative answer based on the context above. If the context doesn't contain enough information to answer the question, say so."""
        )

        formatted_prompt = prompt_template.format(context=context, question=query)

        try:
            messages = [HumanMessage(content=formatted_prompt)]
            response = self.llm.invoke(messages)
            return response.content
        
        except Exception as e:
            print(f"Error generating response: {e}")
            return "I'm sorry, I couldn't generate a response at this time."
    
    def generate_response_simple(self, query : str, context : str) -> str:
        simple_prompt=f"""Use the following context to answer the question accurately and concisely.\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""

        try:
            messages=[HumanMessage(content=simple_prompt)]
            response=self.llm.invoke(messages)
            return response.content
        except Exception as e:
            print(f"Error generating response: {e}")
            return "I'm sorry, I couldn't generate a response at this time."




In [27]:
try:
    groq_llm=GroqLLM(api_key=os.getenv("GROQ_API_KEY"))
    print("GROQ LLM initialized successfully.")
except Exception as e:
    print(f"Warning : {e}")
    print("Please ensure you have set the GROQ_API_KEY in your environment variables to use the GROQ LLM.")
    groq_llm=None

Initialized GROQ LLM with model: gemma2-9b-it
GROQ LLM initialized successfully.


In [15]:
groq_api_key = os.getenv("GROQ_API_KEY")

llm=ChatGroq(groq_api_key=groq_api_key,model_name="gemma2-9b-it",temperature=0.1,max_tokens=1024)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    ## retriever the context
    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    ## generate the answwer using GROQ LLM
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response=llm.invoke([prompt.format(context=context,query=query)])
    return response.content

In [16]:
answer=rag_simple("What is maxpoolaing layer?",rag_retriever,llm)
print(answer)

Retrieving documents for query: What is maxpoolaing layer?
Top K: 3, Score Threshold: 0.0
Generating embeddings for 1 texts.
Generated embeddings with shape: (1, 384)
Retreived 1 documents (after filtering)
A max-pooling layer is a type of convolutional neural network layer that reduces the spatial dimensions (size) of an activation map while preserving its depth (number of channels). It does this by selecting the maximum value from a small region (defined by filters and stride) within the input map. 



In [17]:
answer=rag_simple("Attention is all you need",rag_retriever,llm)
print(answer)

Retrieving documents for query: Attention is all you need
Top K: 3, Score Threshold: 0.0
Generating embeddings for 1 texts.
Generated embeddings with shape: (1, 384)
Retreived 1 documents (after filtering)
That's the title of a seminal paper in the field of natural language processing.  



In [18]:
answer=rag_simple("What is Fifa worldcup?",rag_retriever,llm)
print(answer)

Retrieving documents for query: What is Fifa worldcup?
Top K: 3, Score Threshold: 0.0
Generating embeddings for 1 texts.
Generated embeddings with shape: (1, 384)
No document found
No relevant context found to answer the question.
