# Data Ingestion

In [1]:
from langchain_core.documents import Document

In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("./data")

Found 4 PDF files to process

Processing: attention.pdf
  ✓ Loaded 15 pages

Processing: mapreduce.pdf
  ✓ Loaded 13 pages

Processing: rag.pdf
  ✓ Loaded 19 pages

Processing: gfs.pdf
  ✓ Loaded 15 pages

Total documents loaded: 62


In [4]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\nai

In [5]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs


In [6]:
chunks=split_documents(all_pdf_documents)
chunks

Split 62 documents into 329 chunks

Example chunk:
Content: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'attention.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\nai

Above are the examples of how files from directory can be loaded and how txt files can be loaded, similarly any type of file can be ingested and converted into Document type. 

# Embedding Manager

In [7]:
import numpy as np
import uuid
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
class EmbeedingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the EmbeddingManager with a specified model.
        Args:
            model_name (str): The name of the sentence transformer model to use.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        """
        Load the sentence transformer model.
        """
        try:
            print(f"Loading model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
        
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.
        Args:
            texts (List[str]): A list of strings to generate embeddings for.
        Returns:
            numpy array of embedding with shape (len(texts), embedding_dimension)
        """
        if not self.model:
            raise ValueError("Model is not loaded.")
        print(f"Generating embeddings for {len(texts)} texts.")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
embedding_manager = EmbeedingManager()
embedding_manager

Loading model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeedingManager at 0x15b512510>

# Vector Store

In [9]:
class VectorStore:
    def  __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "data/vector_store"):
        """
        Initialize the VectorStore with ChromaDB.
        Args:
            collection_name (str): The name of the collection to use in ChromaDB.
            persist_directory (str): Directory to persist the database.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """
        Initialize the ChromaDB client and collection.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={"description": "Collection of PDF document embeddings"}
            )
            print(f"Vector store initialized with collection: {self.collection_name}")
            print((f"Existing number of documents in the collection: {self.collection.count()}"))
        
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise
    
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.
        Args:
            documents (List[Any]): List of documents to add.
            embeddings (np.ndarray): Corresponding embeddings for the documents.
        """
        if len(documents) != embeddings.shape[0]:
            raise ValueError("Number of documents and embeddings must match.")
        
        print(f"Adding {len(documents)} documents to the vector store.")
        ids = []
        metadatas = []
        documents_text = []
        embedding_list = []

        for i,(doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embedding_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents.")
            print("total number of documents in the collection:", self.collection.count())

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vector_store = VectorStore()
vector_store

Vector store initialized with collection: pdf_documents
Existing number of documents in the collection: 0


<__main__.VectorStore at 0x15b6b34d0>

In [10]:
# convert text to embeddings
texts = [doc.page_content for doc in chunks]
# get embeddings
embeddings = embedding_manager.generate_embeddings(texts)
# store in vector db
vector_store.add_documents(chunks, embeddings)

Generating embeddings for 329 texts.


Batches: 100%|██████████| 11/11 [00:05<00:00,  2.06it/s]

Generated embeddings with shape: (329, 384)
Adding 329 documents to the vector store.
Successfully added 329 documents.
total number of documents in the collection: 329





# RAG Retriever

In [11]:
class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeedingManager):
        """
        Initialize the RAGRetriever with a vector store and embedding manager.
        Args:
            vector_store (VectorStore): The vector store to retrieve documents from.
            embedding_manager (EmbeedingManager): The embedding manager to generate query embeddings.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve the top_k most relevant documents for a given query.
        Args:
            query (str): The input query string.
            top_k (int): The number of top documents to retrieve.
        Returns:
            List of dictionaries containing the retrieved documents and their metadata.
        """
        print(f"Retrieving top {top_k} documents for query: '{query}, Score Threshold: {score_threshold}'")

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
            )

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                ids = results['ids'][0]
                distances = results['distances'][0]

            for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                sim_score = 1 - distance
                if sim_score >= score_threshold:
                    retrieved_docs.append({
                        "id": doc_id,
                        "content": document,
                        "metadata": metadata,
                        "similarity_score": sim_score,
                        "rank": i + 1
                    })
            
                print(f"Retrieved {len(retrieved_docs)} documents after applying score threshold.")
            else:
                print("No documents found.")
            return retrieved_docs
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
        
rag_retriever = RAGRetriever(vector_store, embedding_manager)
rag_retriever


<__main__.RAGRetriever at 0x15b7e16a0>

In [12]:
rag_retriever.retrieve("What is a attention is all you need?")

Retrieving top 5 documents for query: 'What is a attention is all you need?, Score Threshold: 0.0'
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents after applying score threshold.
Retrieved 2 documents after applying score threshold.
Retrieved 2 documents after applying score threshold.
Retrieved 2 documents after applying score threshold.
Retrieved 2 documents after applying score threshold.
No documents found.





[{'id': 'doc_e496d131_12',
  'content': '3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n3',
  'metadata': {'source': 'data/attention.pdf',
   'doc_index': 12,
   'page': 2,
   'title': '',
   'creator': 'LaTeX with hyperref',
   'total_pages': 15,
   'source_file': 'attention.pdf',
   'creationdate': '2024-04-10T21:11:43+00:00',
   'trapped': '/False',
   'keywords': '',
   'content_length': 216,
   'subject': '',
   'moddate': '2024-04-10T21:11:43+00:00',
   'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
   'author': '',
   'file_type': 'pdf',
   'page_label': '3',
   'producer': 'pdfTeX-1.40.25'},
  'similarity_score': 0.13304245471954346,
  'rank': 1},
 {'id': 'doc_bfd1c9de_196',
  'content': '[58] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkore

# Introducing LLM

In [13]:
import os
from getpass import getpass

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") or \
    getpass("Enter your OpenAI API key: ")

In [14]:
### Simple RAG pipeline with Groq LLM
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the OpenAI LLM (set your OPENAI_API_KEY in environment)
openai_api_key = os.getenv("OPENAI_API_KEY")

llm=ChatOpenAI(openai_api_key=openai_api_key,model_name="gpt-4.1-mini",temperature=0.1,max_tokens=1024)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    ## retriever the context
    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    ## generate the answwer using GROQ LLM
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response=llm.invoke([prompt.format(context=context,query=query)])
    return response.content

In [15]:
answer=rag_simple("What is attention mechanism?",rag_retriever,llm)
print(answer)

Retrieving top 3 documents for query: 'What is attention mechanism?, Score Threshold: 0.0'
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]


Generated embeddings with shape: (1, 384)
Retrieved 1 documents after applying score threshold.
Retrieved 2 documents after applying score threshold.
Retrieved 3 documents after applying score threshold.
No documents found.
The attention mechanism is a function that maps a query and a set of key-value pairs to an output vector, computed as a weighted sum of the values, where the weights are determined by the similarity between the query and the keys.


In [16]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("What is Attention is all you need?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving top 3 documents for query: 'What is Attention is all you need?, Score Threshold: 0.1'
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.30it/s]


Generated embeddings with shape: (1, 384)
Retrieved 1 documents after applying score threshold.
Retrieved 1 documents after applying score threshold.
Retrieved 1 documents after applying score threshold.
No documents found.
Answer: "Attention Is All You Need" is a seminal paper that introduced the Transformer model, which relies entirely on attention mechanisms to process sequences, eliminating the need for recurrent or convolutional layers. It uses self-attention to map queries, keys, and values to outputs, enabling efficient and parallelizable sequence modeling.
Sources: [{'source': 'attention.pdf', 'page': 2, 'score': 0.12962275743484497, 'preview': '3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n3...'}]
Confidence: 0.12962275743484497
Context Preview: 3.2 Attention
An attention function can be described as mapping