### Data Ingestion

In [4]:
from langchain_core.documents import Document

### Import a text file

In [5]:
import os
# Make sure the folder for the raw text inputs exists; a no-op if it already does.
os.makedirs(name='../data/text_files', exist_ok=True)

In [6]:
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader, CSVLoader

# Read the sample RAG article as UTF-8 text. `load()` returns a list of
# Document objects, so `document` is a list even though the name is singular.
loader = TextLoader(
    file_path='../data/text_files/RAG_example.txt',
    encoding='utf-8',
)
document = loader.load()
print(document)


[Document(metadata={'source': '../data/text_files/RAG_example.txt'}, page_content='What is RAG\nTo begin, let\'s examine a simple chatbot system without RAG:\n\n\nWhile the chatbot can respond to common questions based on its training dataset, it may lack access to the most up-to-date or domain-specific knowledge.\n\nA real-world example would be asking ChatGPT "What is my mother\'s name?". ChatGPT cannot answer this question because it doesn\'t have access to external knowledge, such as your family members\' information.\n\nfailed response\n\nTo address this limitation, we need to provide external knowledge to the model (in this example, a list of family members\' names):\n\n\nA RAG system consists of two key components:\n\nA retrieval model that fetches relevant information from an external knowledge source, which could be a database, search engine, or any other information repository.\nA language model that generates responses based on the retrieved knowledge.\nThere are several way

### Directory Loader

In [7]:
from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file under ../data/text_files (searched recursively via the
# '**/*.txt' glob) using TextLoader.
# The encoding is pinned to UTF-8 to match the single-file TextLoader call
# earlier in this notebook; without it TextLoader opens files with the platform
# default encoding, which can fail on UTF-8 content (notably on Windows).
dir_loader = DirectoryLoader(
    '../data/text_files',
    glob='**/*.txt',
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'},
)

text_documents = dir_loader.load()
print(text_documents)

[Document(metadata={'source': '..\\data\\text_files\\RAG_example.txt'}, page_content='What is RAG\nTo begin, let\'s examine a simple chatbot system without RAG:\n\n\nWhile the chatbot can respond to common questions based on its training dataset, it may lack access to the most up-to-date or domain-specific knowledge.\n\nA real-world example would be asking ChatGPT "What is my mother\'s name?". ChatGPT cannot answer this question because it doesn\'t have access to external knowledge, such as your family members\' information.\n\nfailed response\n\nTo address this limitation, we need to provide external knowledge to the model (in this example, a list of family members\' names):\n\n\nA RAG system consists of two key components:\n\nA retrieval model that fetches relevant information from an external knowledge source, which could be a database, search engine, or any other information repository.\nA language model that generates responses based on the retrieved knowledge.\nThere are several 

In [9]:
from langchain_community.document_loaders import PyMuPDFLoader
# Collect every PDF under ../data/pdf_files (recursive glob) and parse each
# one with PyMuPDFLoader. Note that `dir_loader` is rebound here.
dir_loader = DirectoryLoader(
    path='../data/pdf_files',
    glob='**/*.pdf',
    loader_cls=PyMuPDFLoader,
)
pdf_documents = dir_loader.load()
print(pdf_documents)

[Document(metadata={'producer': 'Skia/PDF m143 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf_files\\How to Master Public Speaking in 30 Days.pdf', 'file_path': '..\\data\\pdf_files\\How to Master Public Speaking in 30 Days.pdf', 'total_pages': 64, 'format': 'PDF 1.4', 'title': 'How to Master Public Speaking in 30 Days', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content="How to Master Public Speaking in 30 Days \n \n\u200b\n \nChapter 1: The Psychology of Fear and Stage Fright\u200b\n\u200b\nSomething strange happens when we step into the spotlight that brings even the most \nconfident to their knees. Speaking in front of an audience has a way of stirring up \nemotions we often try to avoid—fear, doubt, and uncertainty—all coming to the surface \nat once. I know exactly how you feel right now. That knot in your stomach when you \nthink about speaking in public, the r

In [12]:
from langchain_community.document_loaders import CSVLoader

# Look for CSV files under ../data/text_files (the same folder as the text
# inputs) and parse each one with CSVLoader. `dir_loader` is rebound again.
dir_loader = DirectoryLoader(
    path='../data/text_files',
    glob='**/*.csv',
    loader_cls=CSVLoader,
)
csv_documents = dir_loader.load()
print(csv_documents)



### Embeddings & VectorStore

In [15]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import Tuple, Any, Dict ,List
from sklearn.metrics.pairwise import cosine_similarity




In [19]:
class EmbeddingManager:
    """Wraps a SentenceTransformer model and turns lists of texts into embeddings.

    The model is loaded eagerly in ``__init__``, so an invalid model name fails
    at construction time rather than on the first call to
    ``generate_embeddings``.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Create the manager and load the embedding model.

        Args:
            model_name: Name of the HuggingFace model used for creating
                sentence embeddings.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the embedding model named by ``self.model_name``.

        Raises:
            Exception: Any error raised while constructing the
                SentenceTransformer is printed and then re-raised unchanged.
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")

        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: The strings to embed.

        Returns:
            A numpy array of embeddings in shape (len(texts), embedding_dim).

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly. A truthiness test would consult the
        # model object's __len__/__bool__ (if it defines one), which says
        # nothing about whether a model was actually loaded.
        if self.model is None:
            raise ValueError("Model not loaded. Call _load_model() first.")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
# Build the shared embedding manager with the default sentence-embedding model;
# the model is loaded as part of construction.
embedding_manager = EmbeddingManager(model_name='all-MiniLM-L6-v2')
embedding_manager
        

Loading embedding model: all-MiniLM-L6-v2


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model all-MiniLM-L6-v2 loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x1a7cc788bb0>

In [23]:
class VectorStore:
    """Manages storage and retrieval of embeddings using ChromaDB."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Initialize the vector store and open (or create) its collection.

        Args:
            collection_name: Name of the ChromaDB collection.
            persist_directory: Directory to persist the ChromaDB database.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection.

        Creates ``self.persist_directory`` if needed, opens a persistent
        ChromaDB client on it, and gets or creates ``self.collection_name``.

        Raises:
            Exception: Any initialization error is printed and re-raised.
        """
        try:
            # Create the on-disk location first so the persistent client can use it.
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Reuse the collection if it already exists, otherwise create it.
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Pdf Documments Embeddings for RAG"}
            )

            print(f"ChromaDB initialized with collection: {self.collection_name}")
            print(f"Existing number of documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store.

        All records are prepared first and then written with a single
        ``collection.add`` call.

        Args:
            documents: Objects exposing ``metadata`` (a mapping) and
                ``page_content`` (text), one per embedding.
            embeddings: One embedding vector per document, in the same order.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error from the collection is printed and re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")

        print(f"Adding {len(documents)} documents to the vector store...")

        # Nothing to write: skip the collection call entirely for empty input.
        if len(documents) == 0:
            return

        # Prepare data for ChromaDB.
        ids = []
        metadatas = []
        document_text = []
        embedding_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Random prefix plus position keeps IDs unique within and across calls.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's document object is not mutated.
            metadata = dict(doc.metadata or {})
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            document_text.append(doc.page_content)

            # Plain Python lists; asarray also tolerates list-typed rows.
            embedding_list.append(np.asarray(embedding).tolist())

        # Write once, after the loop. Adding inside the loop would resubmit the
        # growing lists (and therefore already-used IDs) on every iteration.
        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=document_text
            )
            print(f"Successfully added {len(documents)} documents to the vector store.")
            print(f"Total documents in collection after addition: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
        
# Open (or create) the persistent ChromaDB collection with the class defaults,
# spelled out here so the storage location is visible at the call site.
vector_store = VectorStore(
    collection_name="pdf_documents",
    persist_directory="../data/vector_store",
)
vector_store

        

ChromaDB initialized with collection: pdf_documents
Existing number of documents in collection: 0


<__main__.VectorStore at 0x1a7cd87efb0>