In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
# Load the PDF into a list of documents; later cells split this list into chunks.
loader = PyMuPDFLoader("cc2.pdf")
doc = loader.load()
# Display only the number of loaded documents: echoing `doc` itself would dump
# the full text of the PDF into the cell output.
len(doc)

CREATING THE CHUNKS

In [None]:
from langchain_text_splitters import CharacterTextSplitter
# Split the loaded documents on newlines into pieces of at most 1000 characters,
# with 200 characters of overlap between neighbouring pieces.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
chunks = text_splitter.split_documents(doc)

# Keep only the raw text of every chunk; this is what gets embedded later.
chunk_content = [piece.page_content for piece in chunks]
len(chunk_content)

IMPORTING LIBRARIES

In [None]:
import numpy as np
import uuid
from typing import Any, List, Dict, Tuple
from sentence_transformers import SentenceTransformer
import os
import chromadb
from chromadb.config import Settings

MAKING THE EMBEDDING MANAGER

In [None]:
class EmbeddingManager:
    """Wrap a SentenceTransformer model and turn texts into embedding vectors."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Remember the model name and load the model right away.

        Args:
            model_name: name handed to ``SentenceTransformer``.

        Raises:
            Exception: whatever ``SentenceTransformer`` raises while loading.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model named by ``self.model_name``.

        Prints the model name and embedding dimension on success. On failure
        the error is printed and re-raised.
        """
        try:
            self.model = SentenceTransformer(self.model_name)
            # get_sentence_embedding_dimension is a method; call it so the
            # number is printed rather than the bound-method repr.
            dimension = self.model.get_sentence_embedding_dimension()
            print(f"model loaded successfully: {self.model_name}\n Embedding dimensions: {dimension}")
        except Exception as e:
            print("error: ",e)
            raise

    def generate_embeddings(self, texts: List[str])-> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``texts``.

        Raises:
            ValueError: if no model is loaded.
        """
        # Explicit None check: truthiness of a model object may depend on its
        # container length rather than on whether it was loaded.
        if self.model is None:
            raise ValueError("model not loaded")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        return embeddings

CREATING THE VECTOR STORAGE

In [None]:
class VectorStorage:
    """Persistent ChromaDB collection holding document chunks and their embeddings."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "./data/vector_store"):
        """Remember the settings and open (or create) the collection.

        Args:
            collection_name: name of the ChromaDB collection.
            persist_directory: directory where ChromaDB keeps its files.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize()

    def _initialize(self):
        """Create the persist directory, the client and the collection.

        Prints the collection name and its current document count. On failure
        the error is printed and re-raised.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            # The client is the handle to the on-disk ChromaDB store.
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"desc":"cc unit 3 pdf"}
            )
            print("Vector store initialized: ", self.collection_name)
            print("existing docs in collections: ", self.collection.count())
        except Exception as e:
            print("error setting up vector store: ", e)
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Store documents and their embeddings in the collection.

        Ids are derived from each document's source, position and text, and
        rows are written with ``upsert``. Storing the same documents again
        (for example when the notebook is re-run) therefore overwrites the
        existing rows instead of adding duplicates.

        Args:
            documents: objects exposing ``page_content`` and ``metadata``.
            embeddings: one vector per document, in the same order.

        Raises:
            ValueError: if the number of documents and embeddings differ.
            Exception: whatever the collection raises while writing.
        """
        if len(documents)!= len(embeddings):
            raise ValueError("Number of docs doesnt match embedding size")
        if len(documents) == 0:
            # Nothing to write; skip the call to the collection entirely.
            print("no documents to add")
            return

        ids = []
        metadatas = []
        doc_text = []
        embed_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Copy the metadata so the caller's document is not modified.
            metadata = dict(doc.metadata)

            # Deterministic id: identical input always maps to the same row.
            # The index keeps identical chunks within one batch distinct.
            fingerprint = f"{metadata.get('source', '')}|{i}|{doc.page_content}"
            digest = uuid.uuid5(uuid.NAMESPACE_URL, fingerprint).hex[:16]
            ids.append(f"doc_{digest}_{i}")

            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            doc_text.append(doc.page_content)

            # asarray lets plain lists through as well as numpy rows.
            embed_list.append(np.asarray(embedding).tolist())
        try:
            self.collection.upsert(
                ids=ids,
                embeddings=embed_list,
                metadatas=metadatas,
                documents=doc_text
            )
            print(f"added {len(documents)} to the vector store")
            print("total documents in collection: ", self.collection.count())

        except Exception as e:
            print("some error: ", e)
            raise

# Open (or create) the persistent store, spelling out the collection name and
# directory that VectorStorage would otherwise use by default.
vector_store = VectorStorage(
    collection_name="pdf_documents",
    persist_directory="./data/vector_store",
)
vector_store
        

GENERATING THE EMBEDDINGS

In [None]:
# EmbeddingManager loads its model inside __init__, so no explicit
# _load_model() call is needed (it would only load the model a second time).
embed_manager = EmbeddingManager()
embeddings = embed_manager.generate_embeddings(chunk_content)

ADDING DOCUMENTS INTO THE VECTOR STORE

In [None]:
# Store every chunk together with its embedding, then show the shape of the
# embedding matrix.
vector_store.add_documents(documents=chunks, embeddings=embeddings)
embeddings.shape

CREATING THE RETRIEVAL PIPELINE

In [None]:
# retriever pipeline

class ragretriver:
    """Retrieve the stored chunks that are closest to a text query."""

    # Annotations are quoted so this class can be defined without the
    # annotated classes being in scope at definition time.
    def __init__(self, vector_store: "VectorStorage", embed_manager: "EmbeddingManager"):
        """Keep references to the vector store and the embedding manager.

        Args:
            vector_store: object exposing a ``collection`` with a ``query`` method.
            embed_manager: object exposing ``generate_embeddings``.
        """
        self.vector_store = vector_store
        self.embed_manager = embed_manager

    def retrieve(self, query: str, top_k: int =5, score_threshold: float=0.0)-> List[Dict[str,Any]]:
        """Return the chunks most similar to ``query``.

        Args:
            query: text to search for.
            top_k: number of results requested from the collection.
            score_threshold: hits whose score (``1 - distance``) is below
                this value are dropped.

        Returns:
            One dict per kept hit with the keys ``id``, ``content`` (the text
            of that hit), ``metadata``, ``similarity_score``, ``distance`` and
            ``rank`` (1-based position in the raw result list). The
            misspelled key ``similarit_score`` is kept as an alias for
            existing callers.

        Raises:
            Exception: whatever the collection query raises, after printing it.
        """
        print("Retrieving document for the query: ", query)
        query_embedding = self.embed_manager.generate_embeddings([query])[0]

        try:
            res = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            retrieved_docs = []

            if res['documents'] and res['documents'][0]:
                # Results are nested per query; only one query is sent, so
                # take the first (and only) entry of each field.
                documents = res['documents'][0]
                metadatas = res['metadatas'][0]
                distances = res['distances'][0]
                ids = res['ids'][0]

                for rank, (doc_id, document, metadata, distance) in enumerate(
                    zip(ids, documents, metadatas, distances), start=1
                ):
                    # NOTE(review): 1 - distance is a bounded similarity only
                    # for cosine distance; the collection's metric is set
                    # elsewhere - confirm it matches.
                    similarity_score = 1-distance

                    if similarity_score>=score_threshold:
                        retrieved_docs.append({
                            'id':doc_id,
                            # Text of this hit only, not the whole result list.
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            # Misspelled legacy key, kept for compatibility.
                            'similarit_score': similarity_score,
                            'distance':distance,
                            'rank':rank
                        })
                print(f"{len(retrieved_docs)} documents fetched!")
            else:
                print("No document matched")
            return retrieved_docs
        except Exception as e:
            # Include the exception so the failure can be diagnosed.
            print("some error:", e)
            raise
retriver = ragretriver(vector_store, embed_manager)
ans = retriver.retrieve("briefly explain the Characteristics of PaaS")
# Show the text of the best hit. Newlines become spaces so words on adjacent
# lines are not glued together, and an empty result list no longer raises
# IndexError.
''.join(ans[0]['content']).replace('\n', ' ') if ans else "no documents retrieved"

CONNECTING TO AN LLM FOR ENHANCING THE CONTEXT RESPONSE

In [None]:
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
# Load environment variables from a local .env file.
# NOTE(review): presumably this is where the Google API key comes from - confirm.
load_dotenv()

# The keyword names must be spelled exactly: the misspelled `temprature` and
# `max_token` do not match the `temperature` and `max_output_tokens` parameters,
# so those settings were not applied.
llm = ChatGoogleGenerativeAI(
    model='gemini-2.5-flash',
    temperature=0,
    max_output_tokens=1024,
)

def rag_res(query, retriver, llm, top_k=3):
    """Answer ``query`` with the LLM, grounded in retrieved context.

    Args:
        query: the user's question.
        retriver: object with ``retrieve(query, top_k=...)`` returning a list
            of dicts that carry a ``'content'`` entry (a string or a sequence
            of strings).
        llm: object with ``invoke(prompt)`` returning something with a
            ``content`` attribute.
        top_k: number of hits requested from the retriever.

    Returns:
        The LLM's answer text, or a short message when the query is empty or
        no context was retrieved.
    """
    # An empty question cannot be answered; skip retrieval and the LLM call.
    if not query or not query.strip():
        return "query is empty"

    # retrieve the context
    hits = retriver.retrieve(query, top_k=top_k)

    # Use every hit, not just the first. `content` may be one string or a
    # sequence of strings; repeated texts are included only once.
    pieces = []
    for hit in hits or []:
        content = hit['content']
        parts = [content] if isinstance(content, str) else list(content)
        for part in parts:
            # Collapse whitespace to single spaces; dropping newlines
            # outright would glue neighbouring words together.
            text = " ".join(part.split())
            if text and text not in pieces:
                pieces.append(text)
    context = " ".join(pieces)

    if not context:
        return f"no relevant context to the query: {query}"
    prompt = "use the below context to answer the query precisely: {context} and the query is: {query}. just give straight your response no need to add your statements and also dont be like according to the context provided and all such opening statements"

    reply = llm.invoke(prompt.format(context=context, query=query))
    return reply.content


    

In [None]:
# An empty query gives the retriever and the LLM nothing to answer, so ask an
# actual question about the document.
ans = rag_res("briefly explain the Characteristics of PaaS", retriver, llm)
ans