Important Imports

In [None]:
from google import genai
from google.genai.errors import ClientError
from google.genai import types
import os
from dotenv import load_dotenv
from google.genai.types import EmbedContentConfig
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb

Load the API key from the .env file

In [None]:
# Copy the variables declared in a local .env file into the process environment,
# then read the Gemini key from there so it never has to appear in the notebook.
load_dotenv()
api_key = os.environ.get("GEMINI_API_KEY")

# Shared client used by every embedding / generation call further down.
client = genai.Client(api_key=api_key)

Extract Text from PDF Function

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Page texts are joined with a newline so the last word of one page does
    not fuse with the first word of the next page.

    Args:
        pdf_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        The joined page text; ``""`` when the PDF has no pages; or a message
        starting with ``"An error occurred: "`` when anything fails while
        reading. Failures are reported through that string instead of being
        raised, so callers must check for the prefix.
    """
    try:
        reader = PdfReader(pdf_path)
        page_count = len(reader.pages)

        if page_count == 0:
            print("PDF has no pages")
            return ""

        page_texts = []
        for page in reader.pages:
            text = page.extract_text()
            # Skip pages for which no text came back (None or empty string).
            if text:
                page_texts.append(text)

        print(f"Number of pages in PDF: {page_count}")

        # A single join avoids repeated string concatenation and inserts the
        # page separator in one place.
        return "\n".join(page_texts)

    except Exception as e:
        return f"An error occurred: {e}"


Use the extract_text_from_pdf function to extract the text from the PDF

In [None]:
file_to_extract = "./chemistry9.pdf"
extracted_txt_content = extract_text_from_pdf(file_to_extract)

# extract_text_from_pdf signals failure by returning a message that begins with
# this prefix. Only the start of the string is checked, so a document whose own
# text happens to contain the phrase is not mistaken for a failure.
if extracted_txt_content.startswith("An error occurred:"):
    print(extracted_txt_content)
else:
    # Show a short preview rather than the whole document to keep the output small.
    print(f"Extracted {len(extracted_txt_content)} characters. Preview:")
    print(extracted_txt_content[:500])

Extract the pdf name to use as metadata

In [None]:
def get_fileName(file):
    """Return the file-name part of a path.

    Works whether or not the path starts with ``./`` (for example both
    ``"./chemistry9.pdf"`` and ``"chemistry9.pdf"`` give ``"chemistry9.pdf"``),
    instead of blindly dropping the first two characters.

    Args:
        file: Path to the file as a string.

    Returns:
        The final component of the path, with any directory part removed.
    """
    return os.path.basename(file)

# Name of the source PDF; later cells use it as the prefix of every chunk id
# and store it as the "Source" metadata value.
file_Name = get_fileName(file_to_extract)
print(file_Name)

Split the extracted text into Chunks using LangChain

In [None]:
def chunk_text(text, chunk_size=300, chunk_overlap=50):
    """Cut ``text`` into overlapping pieces with LangChain's recursive splitter.

    Every newline in ``text`` is turned into a space before splitting.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for the
        flattened text.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "add_start_index": True,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    # Same effect as replacing each "\n" with a single space.
    flattened_text = " ".join(text.split("\n"))
    return text_splitter.split_text(flattened_text)
txt_after_chunk_array = chunk_text(extracted_txt_content)

# Report how many chunks were produced and show only the first few of them;
# printing the full list floods the notebook output for any real document.
print(len(txt_after_chunk_array))
print(txt_after_chunk_array[:3])

Embedding the Chunks With Gemini-Embedding-Model

In [None]:
embedding_dim = 768 # increase the dimension to increase the quality


def _l2_normalize(vector):
    """Scale ``vector`` to unit length; an all-zero vector is returned unchanged."""
    norm = sum(component * component for component in vector) ** 0.5
    if norm == 0:
        return list(vector)
    return [component / norm for component in vector]


# One embed_content response per chunk, in the same order as txt_after_chunk_array.
embeddings = []
for chunk_index, chunk in enumerate(txt_after_chunk_array):
    chunk_response = client.models.embed_content(
        model="gemini-embedding-001",
        contents=[chunk],
        config=types.EmbedContentConfig(
            task_type="RETRIEVAL_DOCUMENT",
            output_dimensionality=embedding_dim,
        ),
    )
    # Fail with a clear message here instead of an opaque indexing error below.
    if not chunk_response.embeddings:
        raise RuntimeError(f"No embedding returned for chunk {chunk_index}")
    embeddings.append(chunk_response)

# processed_embeddings contains the float vectors, one per chunk.
# Per the Gemini embeddings guide, vectors requested at a reduced
# output_dimensionality are not unit length and should be normalized before
# they are compared, so each vector is L2-normalized here.
processed_embeddings = [
    _l2_normalize(e.embeddings[0].values) for e in embeddings
]

# Print a summary instead of every vector to keep the output readable.
print(f"Embedded {len(processed_embeddings)} chunks with {embedding_dim} dimensions each")
if processed_embeddings:
    print("First vector preview:", processed_embeddings[0][:5])

Create ChromaDB Collection

In [None]:
# Client whose data is stored under ./chroma_data (path relative to the
# directory the notebook is run from).
chroma_client = chromadb.PersistentClient(path="./chroma_data")

# Open the collection if it already exists, otherwise create it; the metadata
# dict is attached to the collection itself, not to individual records.
collection = chroma_client.get_or_create_collection(
    name="TestTryCollection",
    metadata={"source": "ARJ's"}
)

Insert data into ChromaDB

In [None]:
chunk_count = len(txt_after_chunk_array)

# Deterministic ids ("<file name>_chunk_<n>") mean that re-running this cell
# overwrites the same records instead of adding duplicates.
ids = [f"{file_Name}_chunk_{position}" for position in range(chunk_count)]
chunk_metadatas = [
    {"chunk_index": position, "Source": file_Name}
    for position in range(chunk_count)
]
print(chunk_count)

# Store text, vector and metadata for every chunk in one call.
collection.upsert(
    ids=ids,
    documents=txt_after_chunk_array,
    embeddings=processed_embeddings,
    metadatas=chunk_metadatas,
)

Embed the user query/prompt with Gemini

In [None]:
query_text = "What is organic chemistry?"
# formula of force?
# What is  Tyndall Effect and how can it be observed?
# Antoine  Laurent Lavoisier born in? ====> 1943
# In physics what is written in section 9.1.1?

# Reset both names first: when the query is empty or the API call fails, the
# checks below (and the next cell) must not see a missing name or a value left
# over from an earlier run of this cell.
query_response = None
embedded_query = None

length_of_query = len(query_text.strip())
if length_of_query == 0:
    print("Query is Empty")
else:
    try:
        query_response = client.models.embed_content(
            model="gemini-embedding-001",
            contents=[query_text],
            config=types.EmbedContentConfig(
                task_type="RETRIEVAL_QUERY",
                # Must match the dimension used for the stored chunk vectors.
                output_dimensionality=embedding_dim,
            ),
        )
    except ClientError as e:
        print(f"An error occurred while embedding the query: {e.message}")

# embedded_query contains the float values (VECTOR) of the query.
# The truthiness test covers a missing response, embeddings=None and an empty list.
if query_response is not None and query_response.embeddings:
    embedded_query = query_response.embeddings[0].values
    # Print only the size and the first few values instead of the whole vector.
    print(f"Query embedding has {len(embedded_query)} dimensions; first values: {embedded_query[:5]}")
else:
    print("I think Embedded_Query is empty")

Getting results from ChromaDB

In [None]:
# Clear any earlier result first, so a failed query embedding cannot leave the
# matches of a previous question behind for the answer-generation cell.
results = None

# Also accepts query_response being None, not only embeddings being None or empty.
if query_response is not None and query_response.embeddings:

    # Fetch the 5 stored chunks closest to the query vector.
    results = collection.query(
        query_embeddings=[embedded_query],
        n_results=5,
    )
    print("Query results:", results["documents"])

else:
    print("Failed to generate query embedding")
    print("Response:", query_response)

Getting response from Gemini

In [None]:
try:
    # The prompt is built inside the try block on purpose: if `results` is not
    # usable, the failure is reported by the handler below like any other error.
    prompt = f"This is my question:{query_text} tell me from this context: {results['documents']}. Remember to lastly mention only Source from {results['metadatas']}"
    response = client.models.generate_content(
        model="gemini-2.5-flash-lite",
        contents=[prompt],
    )
    print(response.text)
except Exception as e:
    # The overload notice may appear in the exception's `message` attribute
    # (when it has one) or in its string form; both get the same report.
    overloaded = (
        "model is overloaded" in str(getattr(e, "message", ""))
        or "model is overloaded" in str(e)
    )
    if overloaded:
        print(f"The model is overloaded. Please try again later.\n actual error : {e}")
    else:
        print(f"An error occurred: {e}")