<a href="https://colab.research.google.com/github/Amrutvarsh/temporary/blob/main/PDF_READER_RAGMODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

I've added a cell to install the necessary libraries. Please run it and then try running the code cell again.

In [4]:
!pip install -U langchain langchain-core langchain-community pypdf pymupdf sentence-transformers faiss-cpu chromadb python-dotenv -q

In [5]:
import os
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [6]:
def Process_all_pdfs(pdf_directory):
    all_documents=[]

    pdf_dir=Path(pdf_directory)
    pdf_files=[str(f) for f in pdf_dir.glob("**/*.pdf")]

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        try:
            print(type(pdf_file))
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                doc.metadata['source_file'] = str(pdf_file)
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
        except Exception as e:
            print(f" Error:{e}")

    return all_documents

all_pdfs=Process_all_pdfs("../content/sample_data/pdf_files")

all_pdfs


Found 1 PDF files to process
<class 'str'>


[Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250916165023', 'source': '../content/sample_data/pdf_files/QZhou-Embedding Technical Report.pdf', 'file_path': '../content/sample_data/pdf_files/QZhou-Embedding Technical Report.pdf', 'total_pages': 27, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20250916165023', 'page': 0, 'source_file': '../content/sample_data/pdf_files/QZhou-Embedding Technical Report.pdf', 'file_type': 'pdf'}, page_content='QZhou-Embedding Technical Report\nKingsoft AI\nQZhou-Embedding Technical Report\nPeng Yu, En Xu, Bin Chen, Haibiao Chen, Yinfei Xu\nKingsoft AI∗\nAugust 2025\nAbstract\nWe present QZhou-Embedding, a general-purpose contextual text embed-\nding model with exceptional text representation capabilities.\nBuilt upon the\nQwen2.5-7B-Instruct foundation model, we designed a uniﬁed multi-task frame-\nwork comprising special

In [7]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter (
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators = ["\n\n", "\n", " ", ""]

    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        print(f"Example Chunk")
        print(f"Content: {split_docs[0].page_content[:200]}")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs

In [8]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Any,Tuple,Dict
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
chunks= split_documents(all_pdfs)

Split 27 documents into 89 chunks
Example Chunk
Content: QZhou-Embedding Technical Report
Kingsoft AI
QZhou-Embedding Technical Report
Peng Yu, En Xu, Bin Chen, Haibiao Chen, Yinfei Xu
Kingsoft AI∗
August 2025
Abstract
We present QZhou-Embedding, a general-
Metadata: {'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250916165023', 'source': '../content/sample_data/pdf_files/QZhou-Embedding Technical Report.pdf', 'file_path': '../content/sample_data/pdf_files/QZhou-Embedding Technical Report.pdf', 'total_pages': 27, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20250916165023', 'page': 0, 'source_file': '../content/sample_data/pdf_files/QZhou-Embedding Technical Report.pdf', 'file_type': 'pdf'}


In [13]:
chunks

[Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250916165023', 'source': '../content/sample_data/pdf_files/QZhou-Embedding Technical Report.pdf', 'file_path': '../content/sample_data/pdf_files/QZhou-Embedding Technical Report.pdf', 'total_pages': 27, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20250916165023', 'page': 0, 'source_file': '../content/sample_data/pdf_files/QZhou-Embedding Technical Report.pdf', 'file_type': 'pdf'}, page_content='QZhou-Embedding Technical Report\nKingsoft AI\nQZhou-Embedding Technical Report\nPeng Yu, En Xu, Bin Chen, Haibiao Chen, Yinfei Xu\nKingsoft AI∗\nAugust 2025\nAbstract\nWe present QZhou-Embedding, a general-purpose contextual text embed-\nding model with exceptional text representation capabilities.\nBuilt upon the\nQwen2.5-7B-Instruct foundation model, we designed a uniﬁed multi-task frame-\nwork comprising special

In [10]:
class EmbeddingManager:
    def __init__(self, model_name: str= "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print(f"Printing the Embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded Successfully: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error Loading Model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded")

        print(f"Generatig embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress=True)
        print(f"Generated Embeddings with shape: {embeddings.shape}")
        return embeddings


embeddings_manager = EmbeddingManager()

embeddings_manager



Printing the Embedding model: all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model Loaded Successfully: 384


<__main__.EmbeddingManager at 0x7efbef763f50>

In [11]:
class VectorStore:
  def __init__(self, collection_name: str = "pdf_documents", persistent_directory : str = "../content/sample_data/vector_store"):
    self.collection_name=collection_name
    self.persistent_directory=persistent_directory
    self.client=None
    self.collection=None
    self._initialize_store()

  def _initialize_store(self):
    try:
      os.makedirs(self.persistent_directory,exist_ok=True)
      self.client=chromadb.PersistentClient(path=self.persistent_directory)
      self.collection=self.client.get_or_create_collection(
          name=self.collection_name,
          metadata={"description":"PDF files embedding for RAG"}
          )
      print(f"Vector Store Initialized. Collection name:{self.collection_name}")
      print(f"Existing documents in the colection:{self.collection.count()}")
    except Exception as e:
      print(f"Error initializing vector store: {e}")
      raise
  def add_documents(self,documents: List[Any], embeddings: np.ndarray):

    if len(documents) != len(embeddings):
      raise ValueError("Number of documents must match number of embeddings")

    ids=[]
    metadatas=[]
    documents_text=[]
    embeddings_list=[]


    for i, (doc,embedding) in enumerate(zip(documents,embeddings)):
      doc_id=f"doc_{uuid.uuid4().hex[:8]}_{i}"
      ids.append(doc_id)
      metadata=dict(doc.metadata)
      metadata["doc_index"]=i
      metadata["context_length"]=len(doc.page_content)
      metadatas.append(metadata)
      documents_text.append(doc.page_content)
      embeddings_list.append(embedding.tolist()) # Corrected line: append individual embedding

    try:

      self.collection.add(
          ids=ids,
          documents=documents_text,
          metadatas=metadatas,
          embeddings=embeddings_list
      )
      print(f"Added {len(documents)} documents to the vector store")
      print(f"Total documents in the collection:{self.collection.count()}")
    except Exception as e:
      print(f"Error adding documents to the vector store: {e}")
      raise

vectorStore = VectorStore()

vectorStore

Vector Store Initialized. Collection name:pdf_documents
Existing documents in the colection:0


<__main__.VectorStore at 0x7efbecc64ec0>

In [12]:
texts=[doc.page_content for doc in chunks]

embeddings=embeddings_manager.generate_embeddings(texts)

vectorStore.add_documents(chunks,embeddings)

Generatig embeddings for 89 texts...
Generated Embeddings with shape: (89, 384)
Added 89 documents to the vector store
Total documents in the collection:89


In [19]:
class RAGRetriver:
  """ Handles Query based retrival from Vector Store"""
  def __init__(self,vector_store : VectorStore, embeddings_manager: EmbeddingManager):
    self.vector_store=vector_store
    self.embeddings_manager=embeddings_manager

  def retrieve(self, query: str, top_k: int = 5, threshold: float=0.0) -> List[Dict[str, Any]]:

    print(f"Retrieving documents for query: {query}")
    print(f"Recieved data for top {top_k} documents with threshold {threshold}")

    query_embedding=self.embeddings_manager.generate_embeddings([query])[0]
    #print(query_embedding)

    try:
      results = self.vector_store.collection.query(
          query_embeddings=[query_embedding.tolist()],
          n_results=top_k,
      )
      print(results)
      print(f"Retrieved {len(results['documents'][0])} documents from the vector store")
      retrieved_docs=[]

      if results["documents"] and results["documents"][0]:
        documents=results["documents"][0]
        metadatas=results["metadatas"][0]
        distances=results["distances"][0]
        ids = results["ids"][0]

        for i,(doc_id,document,metadata,distance) in enumerate(zip(ids,documents,metadatas,distances)):

          similarity_score = 1-distance
          if similarity_score>= threshold:
            retrieved_docs.append(
                {
                    "id":doc_id,
                    "content":document,
                    "metadata":metadata,
                    "similarity_score":similarity_score,
                    "distance":distance,
                    "rank": i+1
                }
            )
        print(f"Retrieved {len(retrieved_docs)} documents from the vector store")
      else:
        print(f"No documnets")
      return retrieved_docs
    except Exception as e:
      print(f"Error retrieving documents from the vector store: {e}")
      return []

retriver=RAGRetriver(vectorStore,embeddings_manager)

retriver







<__main__.RAGRetriver at 0x7efbe099e000>

In [15]:
retriver.retrieve("Unified Multi-task Learning Framework")

Retrieving documents for query: Unified Multi-task Learning Framework
Recieved data for top 5 documents with threshold 0.0
Generatig embeddings for 1 texts...
Generated Embeddings with shape: (1, 384)
{'ids': [['doc_d7b9dd75_9', 'doc_cc25641d_7', 'doc_84161022_18', 'doc_f46ad8ed_13', 'doc_dd87fc60_17']], 'embeddings': None, 'documents': [['erage scores on CMTEB[22] and MTEB[23] benchmarks, ranking ﬁrst overall on both\nCMTEB and MTEB leaderboards, demonstrating the eﬀectiveness of our approach.\nThe contributions of our work are summarized as follows:\n• We propose a uniﬁed multi-task learning framework that systematically coordi-\nnates both data processing and training pipelines, enhancing diversity in datasets\nand eﬃciency in model training ;\n• We develop advanced data synthesis techniques powered by LLM, including Para-\nphrasing, Data augmentation, and Hard negative generation.\nThese methods\nsigniﬁcantly enhance the quality of training corpora, thereby improving model’s\nrobus

[{'id': 'doc_d7b9dd75_9',
  'content': 'erage scores on CMTEB[22] and MTEB[23] benchmarks, ranking ﬁrst overall on both\nCMTEB and MTEB leaderboards, demonstrating the eﬀectiveness of our approach.\nThe contributions of our work are summarized as follows:\n• We propose a uniﬁed multi-task learning framework that systematically coordi-\nnates both data processing and training pipelines, enhancing diversity in datasets\nand eﬃciency in model training ;\n• We develop advanced data synthesis techniques powered by LLM, including Para-\nphrasing, Data augmentation, and Hard negative generation.\nThese methods\nsigniﬁcantly enhance the quality of training corpora, thereby improving model’s\nrobustness and generalization capabilities;\n• We emply a two-stage training paradigm: Stage 1 focuses exclusively on retrieval\ncapability building, establishing strong foundational retrieval performance; and\nstage 2 implements balanced training with controled retrieval/non-retrieval task',
  'metadata':

In [16]:
from langchain import Cohere
import os
os.environ["COHERE_API_KEY"] = "ShfsREOcnyfmLnYtkvOMRCeczeVXqCRQzX1AFWBw"
FM_Model_3 = 'command-light-nightly'
llm=Cohere(model = FM_Model_3, max_tokens = 1000, temperature = 1)

  llm=Cohere(model = FM_Model_3, max_tokens = 1000, temperature = 1)


In [17]:
from langchain_cohere import ChatCohere
import os
from google.colab import userdata

# Access your API key from Colab secrets
os.environ["COHERE_API_KEY"] = userdata.get('COHERE_API_KEY') # Please replace with your actual API key

# The model 'command-light-nightly' was not found. Please check the Cohere documentation
# for available models and ensure your API key has access to the model you want to use.
# Replacing with 'command-r' as a common alternative.
FM_Model_3 = 'command-a-03-2025' # Replace with an available model from Cohere documentation
llm=ChatCohere(model = FM_Model_3, max_tokens = 1000, temperature = 0.1)

def simple_RAG(query,rag_retriver,llm,top_k=3):
  retrieved_docs=rag_retriver.retrieve(query,top_k=top_k)
  context="\n".join([f"{doc['content']}" for doc in retrieved_docs])

  if not context.strip():
    return "No Relevant Documents Found"

  prompt =f""" Use the following context to answer the questio concisely.
            context: {context}
            Question:{query}
            Answer:"""

  response=llm.invoke([prompt.format(context=context,query=query)])
  return response.content



In [20]:
answer = simple_RAG("What is Unified Multi-task Learning Framework?",retriver,llm)
print(answer)

Retrieving documents for query: What is Unified Multi-task Learning Framework?
Recieved data for top 3 documents with threshold 0.0
Generatig embeddings for 1 texts...
Generated Embeddings with shape: (1, 384)
{'ids': [['doc_d7b9dd75_9', 'doc_cc25641d_7', 'doc_84161022_18']], 'embeddings': None, 'documents': [['erage scores on CMTEB[22] and MTEB[23] benchmarks, ranking ﬁrst overall on both\nCMTEB and MTEB leaderboards, demonstrating the eﬀectiveness of our approach.\nThe contributions of our work are summarized as follows:\n• We propose a uniﬁed multi-task learning framework that systematically coordi-\nnates both data processing and training pipelines, enhancing diversity in datasets\nand eﬃciency in model training ;\n• We develop advanced data synthesis techniques powered by LLM, including Para-\nphrasing, Data augmentation, and Hard negative generation.\nThese methods\nsigniﬁcantly enhance the quality of training corpora, thereby improving model’s\nrobustness and generalization capa