In [29]:
import os
import chromadb
from dotenv import load_dotenv
from pypdf import PdfReader
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
import uuid
from langchain.document_loaders import PyPDFLoader
from typing import Dict, List, Tuple
# Load environment variables from .env file
load_dotenv()

# def load_pdf(file_path: str) -> str:
#     """
#     Reads text content from a PDF file and returns it as a single string.
    
#     Args:
#         file_path (str): Path to the PDF file.
    
#     Returns:
#         str: Concatenated text from all pages.
#     """
#     reader = PdfReader(file_path)
#     text = ""
#     for page in reader.pages:
#         text += page.extract_text() or ""  # Handle cases where extract_text returns None
#     return text

True

In [30]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Chroma embedding function that delegates to the Gemini embedding API.

    Every call reads GEMINI_API_KEY from the environment, configures the
    ``genai`` client with it, and embeds the supplied documents using
    ``task_type="retrieval_document"``.

    Raises:
        ValueError: If GEMINI_API_KEY is unset or empty.
    """
    def __call__(self, input: Documents) -> Embeddings:
        api_key = os.getenv("GEMINI_API_KEY")
        if api_key:
            genai.configure(api_key=api_key)
            response = genai.embed_content(
                model="models/embedding-001",
                content=input,
                task_type="retrieval_document",
                title="Custom query",
            )
            return response["embedding"]
        # No key available: fail before any API call is attempted.
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

In [31]:
def create_chroma_db(documents: List[str], path: str, name: str) -> tuple:
    """
    Build a new persistent Chroma collection and fill it with text chunks.

    Args:
        documents (List[str]): Text chunks to store; each is embedded via
            GeminiEmbeddingFunction when added.
        path (str): Directory used by the persistent Chroma client.
        name (str): Name of the collection to create.

    Returns:
        tuple: ``(collection, name)`` - the new collection and its name.
    """
    client = chromadb.PersistentClient(path=path)
    collection = client.create_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )
    # One add() per chunk; the id is the chunk's position rendered as a string.
    for position, chunk in enumerate(documents):
        collection.add(documents=[chunk], ids=str(position))
    return collection, name

In [32]:
def load_chroma_collection(path: str, name: str):
    """
    Open an existing collection from a persistent Chroma store.

    Args:
        path (str): Directory used by the persistent Chroma client.
        name (str): Name of the collection to fetch.

    Returns:
        The collection returned by ``get_collection``, bound to a
        GeminiEmbeddingFunction instance.
    """
    client = chromadb.PersistentClient(path=path)
    return client.get_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

In [33]:

import re
import os

def clean_pdf(pdf_path: str, min_word_count: int = 20) -> str:
    """
    Reads a PDF file, drops pages with fewer than the specified word count,
    and returns the remaining text as a single string.

    Args:
        pdf_path (str): Path to the PDF file
        min_word_count (int): Minimum word count for a page to be included (default: 20)

    Returns:
        str: The stripped text of every kept page, in document order, each
            followed by a blank line ("\\n\\n"). Empty string if no page
            reaches the threshold.

    Raises:
        FileNotFoundError: If nothing exists at pdf_path.
    """
    # Check if file exists
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

    # load() yields one Document per PDF page. load_and_split() additionally
    # runs a default text splitter over the pages, so the word-count filter
    # below could end up judging overlapping chunks instead of whole pages.
    pages = PyPDFLoader(pdf_path).load()

    kept_text = []
    for page in pages:
        page_content = page.page_content.strip()
        # A "word" is a maximal run of word characters between word boundaries.
        word_count = len(re.findall(r'\b\w+\b', page_content))
        if word_count >= min_word_count:
            kept_text.append(page_content + "\n\n")

    # Join once instead of growing a string inside the loop.
    return "".join(kept_text)

# Example usage in Jupyter notebook:
# Replace 'your_pdf_file.pdf' with the path to your PDF file
# pdf_content = clean_pdf(r'pdfs\2312.10997v5.pdf')
# # 
# # # Access the cleaned content
# print("Cleaned Content Preview (first 500 chars):")
# print(pdf_content[:500] + "..." if len(pdf_content) > 500 else pdf_content)


In [34]:
def embed_query(query: str) -> List[float]:
    """
    Turn a query string into an embedding vector via the Gemini API.

    The request uses ``task_type="retrieval_query"`` and the
    ``models/embedding-001`` model.

    Args:
        query (str): The query text.

    Returns:
        List[float]: The ``"embedding"`` entry of the API response.

    Raises:
        ValueError: If GEMINI_API_KEY is unset or empty.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("Gemini API Key not provided.")

    genai.configure(api_key=api_key)
    response = genai.embed_content(
        model="models/embedding-001",
        content=query,
        task_type="retrieval_query",
    )
    return response["embedding"]

def get_relevant_passage(query: str, db, n_results: int = 3) -> List[str]:
    """
    Look up the stored chunks closest to a query.

    The query is embedded with ``embed_query`` and passed to the
    collection's ``query`` method as the only query embedding.

    Args:
        query (str): User's question.
        db: Chroma collection object.
        n_results (int): Number of top results to request.

    Returns:
        List[str]: The documents returned for that single query embedding.
    """
    hits = db.query(
        query_embeddings=[embed_query(query)],
        n_results=n_results,
    )
    # 'documents' holds one list per query embedding; only one was sent.
    documents_per_query = hits['documents']
    return documents_per_query[0]

In [35]:
def make_rag_prompt(query: str, relevant_passage: str) -> str:
    """
    Assemble the instruction prompt sent to the Gemini model.

    Single and double quotes are removed from the passage and each newline
    is replaced with a space before it is embedded in the prompt template.

    Args:
        query (str): User's question.
        relevant_passage (str): Retrieved text to include in the prompt.

    Returns:
        str: Formatted prompt string.
    """
    # Character-level mapping: drop both quote characters, newline -> space.
    cleanup_table = {ord("'"): None, ord('"'): None, ord("\n"): " "}
    flattened = relevant_passage.translate(cleanup_table)
    return f"""You are a helpful and informative bot that answers questions using text from the reference passage below. 
    Respond in complete sentences, be comprehensive, and include all relevant background information. 
    dont write hufe number of words, simply give the answer to the question.
    Since the user may not know the context, break down complicated concepts and use a friendly, conversational tone. 
    If the passage doesn’t contain enough information to answer the question, say you don’t have enough info to provide a full answer.
    QUESTION: '{query}'
    PASSAGE: '{flattened}'

    ANSWER:
    """

In [36]:
def gemini_answer(prompt: str) -> str:
    """
    Send a prompt to the Gemini generative model and return its text reply.

    Args:
        prompt (str): Formatted prompt string.

    Returns:
        str: The ``text`` attribute of the generated response.

    Raises:
        ValueError: If GEMINI_API_KEY is unset or empty.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        genai.configure(api_key=api_key)
        generator = genai.GenerativeModel('gemini-2.0-pro-exp-02-05')
        return generator.generate_content(prompt).text
    # No key available: fail before any API call is attempted.
    raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

In [37]:
def generate_answer(db, query: str) -> str:
    """
    Run the full retrieve-then-generate pipeline for one question.

    Fetches the top three chunks for the query, joins them with single
    spaces into one passage, builds the prompt, and asks the model.

    Args:
        db: Chroma collection object.
        query (str): User's question.

    Returns:
        str: Generated answer.
    """
    passages = get_relevant_passage(query, db, n_results=3)
    context = " ".join(passages)
    return gemini_answer(make_rag_prompt(query, context))

In [41]:
def create_chroma_db_if_not_exists(documents, path, name):
    """
    Create and populate the collection only if the store does not have it yet.

    Existence is determined by asking the Chroma client for its collections
    rather than by looking for a ``<path>/<name>`` directory on disk.
    NOTE(review): the persistent store does not appear to lay collections out
    as name-based directories, which is why the filesystem test was replaced -
    confirm against the chromadb version in use.

    Args:
        documents (List[str]): Text chunks to embed if the collection is created.
        path (str): Directory path for ChromaDB persistence.
        name (str): Name of the Chroma collection.

    Returns:
        tuple | None: The ``(collection, name)`` pair from ``create_chroma_db``
            when the collection was created; ``None`` when it already existed.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    # list_collections() returns Collection objects in some chromadb releases
    # and plain name strings in others; getattr() normalizes both to names.
    existing_names = {
        getattr(entry, "name", entry) for entry in chroma_client.list_collections()
    }
    if name in existing_names:
        print(f"Collection [{name}] already exists. Skipping creation.")
        return None  # or return existing collection if you load it elsewhere
    return create_chroma_db(documents=documents, path=path, name=name)


In [59]:
def main(
    pdf_path: str = os.path.join("pdfs", "Premarathna, Akila Nishan - 4500013458 - Cost Amendment.docx.pdf"),
    db_path: str = "vectordb",
    collection_name: str = "rag_experiment_v5",
    test_query: str = " What are the deliverybles that he got?",
):
    """
    Run the end-to-end pipeline: load a PDF, chunk it, (re)build the Chroma
    collection, then answer one test question and print the answer.

    If the collection cannot be loaded it is created and populated from the
    chunks; otherwise its existing entries are deleted and replaced. The
    query step runs in both cases.

    Args:
        pdf_path (str): PDF to ingest. The default is built with
            os.path.join so it resolves on any platform.
        db_path (str): Directory path for ChromaDB persistence.
        collection_name (str): Name of the Chroma collection.
        test_query (str): Question to ask once the collection is populated.

    Raises:
        FileNotFoundError: Propagated from clean_pdf when pdf_path is missing.
        ValueError: If the PDF yields no usable chunks (raised before the
            collection is touched, so existing embeddings are not wiped).
    """
    # Step 1: Read and scrape text from PDF
    print("Loading PDF...")
    pdf_text = clean_pdf(pdf_path)

    # Step 2: Chunk the text using RecursiveCharacterTextSplitter
    print("Splitting text into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Adjusted for meaningful chunks
        chunk_overlap=100,  # Overlap to maintain context
        length_function=len,
    )
    # Drop fragments of 50 characters or fewer after stripping whitespace.
    chunked_text = [
        chunk for chunk in text_splitter.split_text(pdf_text)
        if len(chunk.strip()) > 50
    ]
    print(f"Created {len(chunked_text)} chunks.")
    if not chunked_text:
        raise ValueError(f"No usable text chunks were extracted from: {pdf_path}")

    # Step 3: Load the collection if it exists, otherwise create it.
    # Only the load call sits inside the try, so failures in later steps are
    # not mistaken for "collection not found".
    try:
        print("Checking if collection exists...")
        db = load_chroma_collection(path=db_path, name=collection_name)
    except Exception as exc:
        # The exception type for a missing collection is not pinned down here,
        # so the reason is printed rather than silently discarded.
        print("Collection not found. Creating new ChromaDB collection...")
        print(f"Reason: {exc}")
        db, name = create_chroma_db(documents=chunked_text, path=db_path, name=collection_name)
        print(f"Created new collection: {name}")
    else:
        print(f"Collection [{collection_name}] found. Updating embeddings...")

        # Step 4: Delete all existing documents in the collection
        print("Deleting existing embeddings...")
        all_ids = db.get()['ids']
        if all_ids:
            db.delete(ids=all_ids)
            print(f"Deleted {len(all_ids)} existing documents from the collection.")

        # Step 5: Add new documents to the collection
        print("Adding new embeddings...")
        for i, chunk in enumerate(chunked_text):
            db.add(documents=[chunk], ids=str(i))
        print(f"Added {len(chunked_text)} new document chunks.")

    # Step 6: Query the system (for freshly created and refreshed collections alike)
    print(f"\nQuerying: '{test_query}'")
    answer = generate_answer(db, test_query)
    print("Answer:")
    print(answer)

In [60]:
# Entry point: run the pipeline only when this code executes as the top-level
# module (__name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Loading PDF...
Splitting text into chunks...
Created 9 chunks.
Checking if collection exists...
Collection [rag_experiment_v5] found. Updating embeddings...
Deleting existing embeddings...
Deleted 143 existing documents from the collection.
Adding new embeddings...


  db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())


Added 9 new document chunks.

Querying: ' What are the deliverybles that he got?'
Answer:
Based on the document you provided, it looks like Akila Nishan Premarathna, who is the consultant in this "Amendment to Consultancy Contract," has a specific task or deliverable mentioned.

The passage states one clear deliverable:
1.  **"Validate the model outputs and retrain the model using new wastewater treatment plants for Mexico to keep the model up to date."** This particular task has a due date of 15 June 2025.

The document also mentions "Provided (Annexure 1.A) is not carried out, the fee will be deducted in proportion." This strongly suggests that "Annexure 1.A" contains a more detailed list of the deliverables. However, the actual content of Annexure 1.A isn't included in the text you've shared, so I don't have enough info to provide a full list of all the deliverables he got.

Additionally, as part of his responsibilities, the consultant must make any "modifications/improvements to th