### RAG Pipeline - Data Ingestion ###

In [7]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [16]:
# Read all PDF files in the 'data' directory

def process_all_pdfs(pdf_directory):
    
    all_documents=[]
    pdf_dir= Path(pdf_directory)
    
    pdf_files=list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files.")

    for pdf in pdf_files:

        print(f"\nProcessing file: {pdf.name}")
        try:

            loader= PyPDFLoader(str(pdf))
            documents= loader.load()

            #Metadata source info
            for doc in documents:
                doc.metadata["source"]= str(pdf.name)
                doc.metadata['file_type']='pdf'

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")

        except Exception as e:
            print(f"Error loading {pdf.name}: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("../data/pdf_files")


Found 3 PDF files.

Processing file: How to Master Public Speaking in 30 Days.pdf
Loaded 64 pages

Processing file: Public Speaking in Business.pdf
Loaded 32 pages

Processing file: PublicSpeaking_Skills.pdf
Loaded 37 pages

Total documents loaded: 133


In [17]:
all_pdf_documents

[Document(metadata={'producer': 'Skia/PDF m143 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'How to Master Public Speaking in 30 Days', 'source': 'How to Master Public Speaking in 30 Days.pdf', 'total_pages': 64, 'page': 0, 'page_label': '1', 'file_type': 'pdf'}, page_content="How  to  Master  Public  Speaking  in  30  Days    \n Chapter  1:  The  Psychology  of  Fear  and  Stage  Fright  \n \nSomething\n \nstrange\n \nhappens\n \nwhen\n \nwe\n \nstep\n \ninto\n \nthe\n \nspotlight\n \nthat\n \nbrings\n \neven\n \nthe\n \nmost\n \nconfident\n \nto\n \ntheir\n \nknees.\n \nSpeaking\n \nin\n \nfront\n \nof\n \nan\n \naudience\n \nhas\n \na\n \nway\n \nof\n \nstirring\n \nup\n \nemotions\n \nwe\n \noften\n \ntry\n \nto\n \navoid—fear,\n \ndoubt,\n \nand\n \nuncertainty—all\n \ncoming\n \nto\n \nthe\n \nsurface\n \nat\n \nonce.\n \nI\n \nknow\n \nexactly\n \nhow\n \nyou\n \nfeel\n \nright\n \nnow.\n \nThat\n \nknot\n \nin\n \nyour\n \nstomach\n \nwhen\n \nyou\n \

### Text Splitting by creating chunks ###

In [25]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size= chunk_size,
        chunk_overlap= chunk_overlap,
        length_function= len,
        separators= ["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        print(f"Sample document chunk:\n{split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs


In [26]:
chunk= split_documents(all_pdf_documents)
chunk

Split 133 documents into 374 chunks
Sample document chunk:
How  to  Master  Public  Speaking  in  30  Days    
 Chapter  1:  The  Psychology  of  Fear  and  Stage  Fright  
 
Something
 
strange
 
happens
 
when
 
we
 
step
 
into
 
the
 
spotlight
 
that
 
b...
Metadata: {'producer': 'Skia/PDF m143 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'How to Master Public Speaking in 30 Days', 'source': 'How to Master Public Speaking in 30 Days.pdf', 'total_pages': 64, 'page': 0, 'page_label': '1', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Skia/PDF m143 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'How to Master Public Speaking in 30 Days', 'source': 'How to Master Public Speaking in 30 Days.pdf', 'total_pages': 64, 'page': 0, 'page_label': '1', 'file_type': 'pdf'}, page_content="How  to  Master  Public  Speaking  in  30  Days    \n Chapter  1:  The  Psychology  of  Fear  and  Stage  Fright  \n \nSomething\n \nstrange\n \nhappens\n \nwhen\n \nwe\n \nstep\n \ninto\n \nthe\n \nspotlight\n \nthat\n \nbrings\n \neven\n \nthe\n \nmost\n \nconfident\n \nto\n \ntheir\n \nknees.\n \nSpeaking\n \nin\n \nfront\n \nof\n \nan\n \naudience\n \nhas\n \na\n \nway\n \nof\n \nstirring\n \nup\n \nemotions\n \nwe\n \noften\n \ntry\n \nto\n \navoid—fear,\n \ndoubt,\n \nand\n \nuncertainty—all\n \ncoming\n \nto\n \nthe\n \nsurface\n \nat\n \nonce.\n \nI\n \nknow\n \nexactly\n \nhow\n \nyou\n \nfeel\n \nright\n \nnow.\n \nThat\n \nknot\n \nin\n \nyour\n \nstomach\n \nwhen\n \nyou\n \

### Embeddings & Vector Store

In [27]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import Tuple, Any, Dict ,List
from sklearn.metrics.pairwise import cosine_similarity




In [None]:
class EmbeddingManager:

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        
        """Args:
        HuggingFace Model for creating sentence embeddings"""

        self.model_name = model_name
        self.model= None
        self._load_model()

    def _load_model(self):
        """Load the embedding model."""

        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")

        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.
            Returns a numpy array of embeddings in shape (len(texts), embedding_dim)
        """

        if not self.model:
            raise ValueError("Model not loaded. Call _load_model() first.")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings= self.model.encode(texts,show_progress_bar=True )
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
#Initialize Embedding Manager

embedding_manager = EmbeddingManager()
embedding_manager
        

Loading embedding model: all-MiniLM-L6-v2
Model all-MiniLM-L6-v2 loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x1fde4abb730>

In [None]:
class VectorStore:

    """Manages storage and retrieval of embeddings using ChromaDB."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):

        """Initialize the Vector Store"""

        """Args:
        collection_name: Name of the ChromaDB collection.
        persist_directory: Directory to persist the ChromaDB database.
        """

        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection."""

        try:
            #Creating a ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client= chromadb.PersistentClient(path=self.persist_directory)

            #Get or Create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Pdf Documments Embeddings for RAG"}
                )
            
            print(f"ChromaDB initialized with collection: {self.collection_name}")
            print(f"Existing number of documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise

    def add_documents(self,documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store."""

        if len(documents)!= len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        
        print(f"Adding {len(documents)} documents to the vector store...")

        #Prepare data for ChromaDB
        ids=[]
        metadatas=[]
        document_text=[]
        embedding_list=[]

        for i, (doc,embedding) in enumerate(zip(documents, embeddings)):

            #Generate unique ID
            doc_id= f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            #Prepare metadata
            metadata= dict(doc.metadata)
            metadata['doc_index']=i
            metadata['content_length']= len(doc.page_content)
            metadatas.append(metadata)

            #Document Content
            document_text.append(doc.page_content)

            #Embedding
            embedding_list.append(embedding.tolist())

            try:
                self.collection.add(
                    ids=ids,
                    embeddings=embedding_list,
                    metadatas=metadatas,
                    documents=document_text
                )
                print(f"Successfully added {len(documents)} documents to the vector store.")
                print(f"Total documents in collection after addition: {self.collection.count()}")

            except Exception as e:
                print(f"Error adding documents to vector store: {e}")
                raise
        
#Initialize Vector Store

vector_store = VectorStore()
vector_store

        

ChromaDB initialized with collection: pdf_documents
Existing number of documents in collection: 0


<__main__.VectorStore at 0x1fde4ab83d0>

In [30]:
#Converting all text to embeddings

texts= [doc.page_content for doc in chunk]
texts

["How  to  Master  Public  Speaking  in  30  Days    \n Chapter  1:  The  Psychology  of  Fear  and  Stage  Fright  \n \nSomething\n \nstrange\n \nhappens\n \nwhen\n \nwe\n \nstep\n \ninto\n \nthe\n \nspotlight\n \nthat\n \nbrings\n \neven\n \nthe\n \nmost\n \nconfident\n \nto\n \ntheir\n \nknees.\n \nSpeaking\n \nin\n \nfront\n \nof\n \nan\n \naudience\n \nhas\n \na\n \nway\n \nof\n \nstirring\n \nup\n \nemotions\n \nwe\n \noften\n \ntry\n \nto\n \navoid—fear,\n \ndoubt,\n \nand\n \nuncertainty—all\n \ncoming\n \nto\n \nthe\n \nsurface\n \nat\n \nonce.\n \nI\n \nknow\n \nexactly\n \nhow\n \nyou\n \nfeel\n \nright\n \nnow.\n \nThat\n \nknot\n \nin\n \nyour\n \nstomach\n \nwhen\n \nyou\n \nthink\n \nabout\n \nspeaking\n \nin\n \npublic,\n \nthe\n \nracing\n \nthoughts,\n \nthe\n \noverwhelming\n \nanxiety,\n \nthe\n \nsweaty\n \npalms\n \n–\n \nI've\n \nbeen\n \nthere,\n \nand\n \nI\n \nwant\n \nyou\n \nto\n \nknow\n \nthat\n \nyou're\n \nnot\n \nalone.\n \nIt's\n \nnatural\n \nfor\n \n

In [31]:
# Generate embeddings for the text chunks

embeddings= embedding_manager.generate_embeddings(texts)
embeddings

Generating embeddings for 374 texts...


Batches: 100%|██████████| 12/12 [00:24<00:00,  2.04s/it]

Generated embeddings with shape: (374, 384)





array([[ 0.13743944, -0.00214785,  0.03094154, ...,  0.07324756,
        -0.18237361, -0.00422344],
       [ 0.1178247 , -0.03078717,  0.03761166, ...,  0.09016085,
        -0.17064448, -0.02636734],
       [ 0.10892881, -0.00808657,  0.03524508, ...,  0.02996231,
        -0.10208246,  0.00253035],
       ...,
       [ 0.10202026, -0.07871489, -0.00672026, ...,  0.0573048 ,
        -0.1447678 ,  0.00855733],
       [ 0.06959243,  0.04550058,  0.03115555, ...,  0.07395021,
        -0.13995108, -0.02286959],
       [-0.00596692,  0.02089189,  0.00525316, ...,  0.09934128,
        -0.11237007,  0.01335948]], shape=(374, 384), dtype=float32)

In [32]:
# Store embeddings and documents in vector store

vector_store.add_documents(chunk, embeddings)

Adding 374 documents to the vector store...
Successfully added 374 documents to the vector store.
Total documents in collection after addition: 1
Successfully added 374 documents to the vector store.
Total documents in collection after addition: 2
Successfully added 374 documents to the vector store.
Total documents in collection after addition: 3
Successfully added 374 documents to the vector store.
Total documents in collection after addition: 4
Successfully added 374 documents to the vector store.
Total documents in collection after addition: 5
Successfully added 374 documents to the vector store.
Total documents in collection after addition: 6
Successfully added 374 documents to the vector store.
Total documents in collection after addition: 7
Successfully added 374 documents to the vector store.
Total documents in collection after addition: 8
Successfully added 374 documents to the vector store.
Total documents in collection after addition: 9
Successfully added 374 documents to th

### RAG - Retriever Pipleine from Vector Store ###

In [33]:
class RAGRetriever:
    """RAG Retriever to fetch relevant documents from vector store based on query."""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """Args:
        vector_store: Instance of VectorStore to retrieve documents from.
        embedding_manager: Instance of EmbeddingManager to generate query embeddings.
        top_k: Number of top similar documents to retrieve.
        """

        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        # self.top_k = top_k

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float= 0.0) -> List[Dict[str, Any]]:

        """Retrieve relevant documents for the query"""

        """Args:
        query: User query string.
        top_k: Number of top similar documents to retrieve.
        score_threshold: Minimum similarity score to consider a document relevant."""

        """Returns:
        List of dictionaries containing document content and metadata."""

        print(f"Retrieving documents for query: {query}")
        print(f"Top k: {top_k}, Score threshold: {score_threshold}")

        #Generate embedding for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        #Search in vector store
        try:
            results= self.vector_store.collection.query(
                query_embeddings= [query_embedding.tolist()],
                n_results= top_k
                # include=['documents','metadatas','distances']
            )

            #Process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:

                documents= results['documents'][0]
                metadatas= results['metadatas'][0]
                distances= results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, doc, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    
                    similarity_score = 1 - distance  # Convert distance to similarity score

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': doc,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents after applying score threshold.")
            else:
                print("No documents retrieved from vector store.")

            return retrieved_docs
        
        except Exception as e:
            print(f"Error retrieving documents: {e}")
            return []
        
rag_retriever = RAGRetriever(vector_store, embedding_manager)
rag_retriever


<__main__.RAGRetriever at 0x1fde4ed0c40>

In [36]:
rag_retriever.retrieve("Why is public speaking important?")

Retrieving documents for query: Why is public speaking important?
Top k: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 25.53it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents after applying score threshold.





[{'id': 'doc_ec7428a3_370',
  'content': 'anxiety, time-consuming preparation, and potential criticism. By understanding these \nadvantages and disadvantages, speakers can better prepare and deliver effective speeches. \nOvercoming the disadvantages and enhancing the advantages requires continuous practice, \npreparation, and a willingness to learn and grow. Effective public speaking can lead to \nsignificant personal and professional development, making it a valuable skill to master.',
  'metadata': {'content_length': 443,
   'moddate': '2024-06-11T10:54:58+00:00',
   'producer': 'iLovePDF',
   'creator': 'PyPDF',
   'page': 33,
   'page_label': '34',
   'creationdate': '',
   'source': 'PublicSpeaking_Skills.pdf',
   'file_type': 'pdf',
   'total_pages': 37,
   'doc_index': 370},
  'similarity_score': 0.48015207052230835,
  'distance': 0.5198479294776917,
  'rank': 1},
 {'id': 'doc_3574c266_308',
  'content': 'provide clear, thoughtful responses. \n \n1.5.2 Professional Benefits \nIn