# Indexing

### Data load

In [1]:
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader,DirectoryLoader

# Load every PDF found under ../data/pdf_file/ (the '**/*.pdf' glob also matches
# sub-directories), parsing each file with PyMuPDFLoader.
dirload  = DirectoryLoader(
    path='../data/pdf_file/',
    glob='**/*.pdf',
    loader_cls=PyMuPDFLoader
)
dir_docs = dirload.load()
# Number of Document objects returned by the loader.
print(len(dir_docs))

9


### Data splitting

In [2]:
# splitting from langchain_text_splitters 
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping, character-based chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(dir_docs)

# NOTE(review): the message says "blog post", but dir_docs is produced by the
# PDF directory loader — consider rewording.
print(f"Split blog post into {len(all_splits)} sub-documents.")
# Plain-text view of the chunks; this is what gets passed to the embedding model.
chunks = [x.page_content for x in all_splits]

Split blog post into 42 sub-documents.


### Embedding

In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class EmbeddingManager:
    """Wrap a SentenceTransformer model and turn lists of strings into embeddings.

    The model is loaded eagerly in ``__init__``, so an invalid model name fails
    at construction time rather than on the first ``generate_Embedding`` call.
    """
    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """
        Args:
            model_name: Hugging Face model name for sentence embedding; passed
                straight to ``SentenceTransformer``.
        """
        self.model_name=model_name
        self.model=None
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer model and store it on ``self.model``.

        Raises:
            Exception: whatever ``SentenceTransformer(...)`` raised; it is
                logged and then re-raised unchanged.
        """
        try:
            print(f'Initalizing the embedding model : {self.model_name}')
            self.model = SentenceTransformer(self.model_name)
            print(f'Model of embedding has sucessfully Intialized, Embedding size is {self.model.get_sentence_embedding_dimension()}')
        except Exception as e:
            # Include the underlying error so the log line is useful on its own.
            print(f"Error while Initialing the model : {self.model_name} : {e}")
            raise

    def generate_Embedding(self,texts:list[str])->np.ndarray:
        """
        Encode a list of texts with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            The value returned by ``self.model.encode(texts, show_progress_bar=True)``;
            for the configured model this is an array of shape
            ``(len(texts), embedding_dim)``.

        Raises:
            ValueError: if no model has been loaded (``self.model is None``).
        """
        # Compare against None explicitly: a plain truthiness test would go
        # through the model object's __len__/__bool__ and could reject a
        # perfectly valid (but "empty-looking") model instance.
        if self.model is None:
            raise ValueError('No Model Given')
        print(f'Generating Embedding for {len(texts)}')
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f'Shapeof Embedding are {embeddings.shape}')
        return embeddings

# Initialize the embedding model and embed the text of every chunk.
embeddingManager = EmbeddingManager()
embedding = embeddingManager.generate_Embedding(chunks)

Initalizing the embedding model : all-MiniLM-L6-v2
Model of embedding has sucessfully Intialized, Embedding size is 384
Generating Embedding for 42


Batches: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]

Shapeof Embedding are (42, 384)





### Vector DB

In [5]:
from chromadb.config import Settings
import chromadb
from sklearn.metrics.pairwise import cosine_similarity
from typing import List,Dict,Any,Tuple
import uuid
import os

In [6]:
class VectorStore:
    """Thin wrapper around a persistent ChromaDB collection of chunk embeddings."""
    def __init__(self,collection_name:str="pdf_documents",persist_directory:str="../data/vector_store"):
        """
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name=collection_name
        self.persist_directory = persist_directory
        self.client=None
        self.collection=None
        self._initalize_store()

    def _initalize_store(self):
        """Create the persistent ChromaDB client and get-or-create the collection.

        Raises:
            Exception: any error from directory creation or ChromaDB; it is
                logged and re-raised unchanged.
        """
        try:
            # Create persistent ChromaDB client (directory is created if missing)
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create the collection.
            # NOTE(review): no distance metric is configured here, so the
            # collection uses ChromaDB's default — presumably squared L2; confirm,
            # because RetreivalManager turns distances into scores with `1 - distance`.
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"Pdf Document embeddings for RAG"}
            )
            print(f"Vector store intialized, collection name: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error while intializing the vectorDB {e}")
            raise

    def add_documents(self,documents: List[Any],embedding:np.ndarray):
        """
        Add documents and their embeddings to the vector DB.

        Args:
            documents : List of LangChain documents (objects exposing
                ``page_content`` and ``metadata``)
            embedding : corresponding embeddings, one row per document

        Raises:
            ValueError: if ``documents`` and ``embedding`` differ in length.
            Exception: any error raised by the collection while adding; it is
                logged and re-raised unchanged.
        """
        # A mismatch would otherwise be silently truncated by zip() below and
        # leave some documents without a stored embedding.
        if len(documents) != len(embedding):
            raise ValueError(
                "Length of document and embedding are not same: "
                f"{len(documents)} documents vs {len(embedding)} embeddings."
            )
        if len(documents) == 0:
            # Nothing to store; skip the call to the collection entirely.
            print("No documents to add to vectorDB")
            return
        print(f"{len(documents)} documents are being adding to vectorDB")

        # Parallel lists in the layout expected by collection.add(...)
        ids=[]
        metadatas=[]
        documents_text=[]
        embedding_text = []

        for i, (doc,embedd) in enumerate(zip(documents,embedding)):
            # NOTE(review): ids are random per call, so adding the same documents
            # twice stores them twice rather than overwriting.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's Document objects are not mutated.
            md = dict(doc.metadata)
            md['doc_index'] = i
            md['content_lenght']=len(doc.page_content)
            metadatas.append(md)

            documents_text.append(doc.page_content)
            # Store each embedding row as a plain Python list.
            embedding_text.append(embedd.tolist())
        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_text,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total Document stored in vectorDB is {self.collection.count()}")
        except Exception as e:
            # f-string so the actual error is shown instead of the literal "{e}".
            print(f"Error while feeding the document : {e}")
            raise

# Open (or create) the persistent collection and store every chunk with its embedding.
# NOTE(review): add_documents generates fresh random ids on each call, so re-running
# this cell appends another copy of every chunk (the recorded output shows the
# collection growing from 168 to 210 entries).
vectordb = VectorStore()
vectordb.add_documents(all_splits,embedding)

Vector store intialized, collection name: pdf_documents
Existing documents in collection: 168
42 documents are being adding to vectorDB
Successfully added 42 documents to vector store
Total Document stored in vectorDB is 210


# Retrieval and generation

## Retrieval

In [7]:
class RetreivalManager:
    """
    Retrieve the stored chunks most similar to a query from a VectorStore.
    """
    # Annotations are quoted so this class can be defined without VectorStore /
    # EmbeddingManager having to exist at definition time.
    def __init__(self, vector_store: "VectorStore", embedding_manager: "EmbeddingManager"):
        """
        Args:
            vector_store: store whose ``collection`` is queried
            embedding_manager: used to embed the query text
        """
        print('Retreival has been Intialized')
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retreive(self,query:str,top_k:int=5,score_threshold:float=0.0)->List[dict[str,Any]]:
        """
        Embed ``query`` and return the matching chunks from the vector store.

        Args:
            query: text given by the user
            top_k: number of nearest results requested from the collection
            score_threshold: minimum similarity score a result needs to be kept

        Returns:
            List of dictionaries with keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank``. The list is empty
            when the collection returns no documents.

        Raises:
            Exception: any error raised while querying the collection; it is
                logged and re-raised unchanged.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        embedded_query = self.embedding_manager.generate_Embedding([query])[0]
        try:
            result = self.vector_store.collection.query(
                query_embeddings=[embedded_query.tolist()],
                n_results=top_k
            )
            # Always hand back a list so callers can iterate without a None check.
            if not (result['documents'] and result['documents'][0]):
                print('No Document found')
                return []

            # Results are nested one level per query; only one query was sent.
            hits = zip(
                result['ids'][0],
                result['distances'][0],
                result['documents'][0],
                result['metadatas'][0],
            )
            retreived_docs = []
            # rank reflects the position in the raw result list, before filtering.
            for rank, (doc_id, distance, document, metadata) in enumerate(hits, start=1):
                # NOTE(review): `1 - distance` is a cosine similarity only if the
                # collection uses cosine distance; the metric is not set where the
                # collection is created — confirm, otherwise scores can go negative
                # and the threshold may drop valid hits.
                similarity_score = 1 - distance
                if similarity_score >= score_threshold:
                    retreived_docs.append({
                        'id': doc_id,
                        'content': document,
                        'metadata': metadata,
                        'similarity_score': similarity_score,
                        'distance': distance,
                        'rank': rank
                    })
            print(f'{len(retreived_docs)} retreived from vectordb')
            return retreived_docs
        except Exception as e:
            print(f'Error while retreiving the data : {e}')
            raise
# Wire the retriever to the vector store and embedding model created earlier.
rm = RetreivalManager(vectordb, embeddingManager)

Retreival has been Intialized


In [8]:
# Smoke test: the recorded top hit for this query comes from laptopinvoice.pdf.
rm.retreive("Is this a Dell vostro laptop")

Retrieving documents for query: 'Is this a Dell vostro laptop'
Top K: 5, Score threshold: 0.0
Generating Embedding for 1


Batches: 100%|██████████| 1/1 [00:00<00:00, 16.84it/s]

Shapeof Embedding are (1, 384)
5 retreived from vectordb





[{'id': 'doc_01285239_38',
  'content': 'Maharashtra -421302, Bhiwandi, MAHARASHTRA, India - 421302, IN-MH\nGSTIN - 27AAFCI2112P1ZH\nTotal items: 1\nProduct\nTitle\nQty\nGross\nAmount ₹\nDiscounts\n/Coupons ₹\nTaxable\nValue ₹\nIGST ₹\nTotal ₹\nLaptops\nFSN: \nCOMG36KGCDP4HHVT\nHSN/SAC: 84713010\nDELL Vostro Core i3 10th \nGen - (8 GB/512 GB SSD\n/Windows 10) Vostro 3401 \nThin and Light Laptop\nWarranty: 1 Year Onsite Warranty\n1. [IMEI/Serial No: \n ]\n1NBSYH3\n 18.0 %\nIGST:\n1\n35990.00\n-2500.00\n28381.36\n5108.64\n33490.00\nShipping And Handling Charges\n1\n40.00',
  'metadata': {'keywords': '',
   'author': '',
   'subject': '',
   'doc_index': 38,
   'source': '../data/pdf_file/laptopinvoice.pdf',
   'moddate': '2023-07-14T12:32:41+05:30',
   'creationDate': "D:20230714123241+05'30'",
   'total_pages': 1,
   'page': 0,
   'format': 'PDF 1.4',
   'start_index': 678,
   'trapped': '',
   'file_path': '../data/pdf_file/laptopinvoice.pdf',
   'content_lenght': 498,
   'title': '',


In [11]:
# Smoke test: the query phrase appears verbatim in the recorded top hit.
rm.retreive("Arithmetic Expansion")

Retrieving documents for query: 'Arithmetic Expansion'
Top K: 5, Score threshold: 0.0
Generating Embedding for 1


Batches: 100%|██████████| 1/1 [00:00<00:00, 17.44it/s]

Shapeof Embedding are (1, 384)
5 retreived from vectordb





[{'id': 'doc_5eb5c9dd_14',
  'content': 'Expansion\n[me@linuxbox ~]$ echo ~foo\n/home/foo\nArithmetic Expansion\nThe shell allows arithmetic to be performed by expansion. This allows us to use the shell\nprompt as a calculator.\n[me@linuxbox ~]$ echo $((2 + 2))\n4\nArithmetic expansion uses the following form:\n$((expression))\nwhere expression is an arithmetic expression consisting of values and arithmetic opera-\ntors.\nArithmetic expansion supports only integers (whole numbers, no decimals) but can per-',
  'metadata': {'producer': 'cairo 1.18.0 (https://cairographics.org)',
   'file_path': '../data/pdf_file/output.pdf',
   'subject': '',
   'creationdate': '2025-01-14T15:25:37+05:30',
   'keywords': '',
   'format': 'PDF 1.7',
   'creationDate': "D:20250114152537+05'30",
   'content_lenght': 458,
   'start_index': 0,
   'title': '',
   'moddate': '',
   'modDate': '',
   'trapped': '',
   'author': '',
   'source': '../data/pdf_file/output.pdf',
   'creator': '',
   'total_pages': 

## Generation