In [1]:
import os
import chromadb
from dotenv import load_dotenv
from pypdf import PdfReader
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
import uuid
# Load environment variables from .env file
load_dotenv()

def load_pdf(file_path: str) -> str:
    """
    Reads text content from a PDF file and returns it as a single string.
    
    Args:
        file_path (str): Path to the PDF file.
    
    Returns:
        str: Concatenated text from all pages.
    """
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""  # Handle cases where extract_text returns None
    return text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Custom embedding function using the Gemini AI API for document retrieval.
    """
    def __call__(self, input: Documents) -> Embeddings:
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        return genai.embed_content(model=model,
                                   content=input,
                                   task_type="retrieval_document",
                                   title=title)["embedding"]

In [3]:
def create_chroma_db(documents: List[str], path: str, name: str) -> tuple:
    """
    Creates a Chroma database with the provided documents.
    
    Args:
        documents (List[str]): List of text chunks to embed.
        path (str): Directory path for ChromaDB persistence.
        name (str): Name of the Chroma collection.
    
    Returns:
        tuple: Chroma collection object and its name.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())
    for i, d in enumerate(documents):
        db.add(documents=[d], ids=str(i))
    return db, name

In [4]:
def load_chroma_collection(path: str, name: str):
    """
    Loads an existing Chroma collection.
    
    Args:
        path (str): Directory path of ChromaDB.
        name (str): Name of the collection to load.
    
    Returns:
        chromadb.Collection: Loaded Chroma collection.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())
    return db

In [6]:
def embed_query(query: str) -> List[float]:
    """
    Embeds the query using Gemini AI with task_type="retrieval_query".
    
    Args:
        query (str): The query text.
    
    Returns:
        List[float]: The embedded query vector.
    """
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided.")
    genai.configure(api_key=gemini_api_key)
    model = "models/embedding-001"
    return genai.embed_content(model=model,
                               content=query,
                               task_type="retrieval_query")["embedding"]

def get_relevant_passage(query: str, db, n_results: int = 3) -> List[str]:
    """
    Retrieves the most relevant text chunks from ChromaDB based on the query.
    
    Args:
        query (str): User's question.
        db: Chroma collection object.
        n_results (int): Number of top results to retrieve.
    
    Returns:
        List[str]: List of relevant text chunks.
    """
    query_embedding = embed_query(query)
    results = db.query(query_embeddings=[query_embedding], n_results=n_results)
    return results['documents'][0]

In [7]:
def make_rag_prompt(query: str, relevant_passage: str) -> str:
    """
    Creates a prompt for the Gemini model using the query and relevant text.
    
    Args:
        query (str): User's question.
        relevant_passage (str): Retrieved text to include in the prompt.
    
    Returns:
        str: Formatted prompt string.
    """
    escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
    prompt = f"""You are a helpful and informative bot that answers questions using text from the reference passage below. 
    Respond in complete sentences, be comprehensive, and include all relevant background information. 
    Since the user may not know the context, break down complicated concepts and use a friendly, conversational tone. 
    If the passage doesn’t contain enough information to answer the question, say you don’t have enough info to provide a full answer.
    QUESTION: '{query}'
    PASSAGE: '{escaped}'

    ANSWER:
    """
    return prompt

In [8]:
def gemini_answer(prompt: str) -> str:
    """
    Generates an answer using the Gemini AI model.
    
    Args:
        prompt (str): Formatted prompt string.
    
    Returns:
        str: Generated answer text.
    """
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel('gemini-2.0-pro-exp-02-05')
    answer = model.generate_content(prompt)
    return answer.text

In [9]:
def generate_answer(db, query: str) -> str:
    """
    Generates an answer to the query using the RAG pipeline.
    
    Args:
        db: Chroma collection object.
        query (str): User's question.
    
    Returns:
        str: Generated answer.
    """
    relevant_text = get_relevant_passage(query, db, n_results=3)
    prompt = make_rag_prompt(query, " ".join(relevant_text))
    answer = gemini_answer(prompt)
    return answer

In [30]:
def main():
    # Define file path and database parameters
    file_path = r"Resume2.pdf"
    db_path = r"vectordb"
    collection_name = "rag_experiment_v5"

    # Step 1: Read and scrape text from PDF
    print("Loading PDF...")
    pdf_text = load_pdf(file_path)

    # Step 2: Chunk the text using RecursiveCharacterTextSplitter
    print("Splitting text into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Adjusted for meaningful chunks
        chunk_overlap=100,  # Overlap to maintain context
        length_function=len,
    )
    chunked_text = text_splitter.split_text(pdf_text)
    chunked_text = [chunk for chunk in chunked_text if len(chunk.strip()) > 50]
    print(f"Created {len(chunked_text)} chunks.")

    
    # # Step 3: Embed and store in ChromaDB (uncomment to create anew)
    print("Creating new ChromaDB collection...")
    db, name = create_chroma_db(documents=chunked_text, path=db_path, name=collection_name)

    # Step 4: Load the collection
    db = load_chroma_collection(path=db_path, name=collection_name)

        #   Step 4: Delete all existing documents in the collection
    # all_ids = db.get()['ids']
    # if all_ids:
    #     db.delete(ids=all_ids)
    #     print(f"Deleted {len(all_ids)} existing documents from the collection.")



    # Step 5: Query the system
    test_query = "What are the projct that he has done?"
    print(f"\nQuerying: '{test_query}'")
    answer = generate_answer(db, test_query)
    print("Answer:")
    print(answer)

if __name__ == "__main__":
    main()

Loading PDF...
Splitting text into chunks...
Created 3 chunks.

Querying: 'What are the projct that he has done?'


Number of requested results 3 is greater than number of elements in index 0, updating n_results = 0


Answer:
That's a great question! You're asking about the projects that a particular person has completed.

However, the provided text passage is completely empty. Because there's no information in the passage, I can't tell you who "he" refers to or what projects that person might have worked on.

To answer your question, I'd need a passage that mentions a specific person and describes the projects they have been involved with. Sorry I couldn't be more helpful with the text given!


In [41]:
def main():
    # Define file path and database parameters
    file_path = r"CV - Akila  Nishan .pdf"
    db_path = r"vectordb"
    collection_name = "rag_experiment_v3"

    # Step 1: Read and scrape text from PDF
    print("Loading PDF...")
    pdf_text = load_pdf(file_path)

    # Step 2: Chunk the text using RecursiveCharacterTextSplitter
    print("Splitting text into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Adjusted for meaningful chunks
        chunk_overlap=100,  # Overlap to maintain context
        length_function=len,
    )
    chunked_text = text_splitter.split_text(pdf_text)
    chunked_text = [chunk for chunk in chunked_text if len(chunk.strip()) > 50]
    print(f"Created {len(chunked_text)} chunks.")

    #   # # Step 3: Embed and store in ChromaDB (uncomment to create anew)
    # print("Creating new ChromaDB collection...")
    # db, name = create_chroma_db(documents=chunked_text, path=db_path, name=collection_name)


    # Step 3: Load the ChromaDB collection
    print("Loading ChromaDB collection...")
    db = load_chroma_collection(path=db_path, name=collection_name)

    # Step 4: Delete all existing documents in the collection
    all_ids = db.get()['ids']
    if all_ids:
        db.delete(ids=all_ids)
        print(f"Deleted {len(all_ids)} existing documents from the collection.")

    # Step 5: Add new documents to the collection
    print("Adding new documents to ChromaDB...")
    for i, chunk in enumerate(chunked_text):
        db.add(documents=[chunk], ids=[str(i)])
    print(f"Added {len(chunked_text)} new documents to the collection.")

    # Step 6: Query the system
    test_query = "what are the experience he has?"
    print(f"\nQuerying: '{test_query}'")
    answer = generate_answer(db, test_query)
    print("Answer:")
    print(answer)

if __name__ == "__main__":
    main()

Loading PDF...
Splitting text into chunks...
Created 7 chunks.
Loading ChromaDB collection...
Deleted 7 existing documents from the collection.
Adding new documents to ChromaDB...
Added 7 new documents to the collection.

Querying: 'what are the experience he has?'
Answer:
Based on the passage, this individual has a diverse range of experiences, particularly blending surveying science, data collection, and cutting-edge AI development. Let's break it down:

1.  **Data Collection and Geospatial Analysis:**
    *   **Hands-on Field Experience:** They have practical experience collecting data using several advanced technologies:
        *   **UAV LIDAR:** This involves using drones (Unmanned Aerial Vehicles) equipped with Light Detection and Ranging (LIDAR) sensors to create detailed 3D maps of the ground surface.
        *   **TLS (Terrestrial Laser Scanner):** This is ground-based laser scanning, also used to create highly accurate 3D models (point clouds) of objects and environments. Th