Important Imports

In [None]:
from google import genai
from google.genai.errors import ClientError
from google.genai import types
import os
from dotenv import load_dotenv
from google.genai.types import EmbedContentConfig
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb

Load apiKey from .env file

In [None]:
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=api_key)

Extract Text from PDF Function

In [None]:
def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        full_text = ""

        if len(reader.pages) == 0:
            print("PDF has no pages")
            return ""

        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text = page.extract_text()
            if text:
                full_text += text

        print(f"Number of pages in PDF: {len(reader.pages)}")

        return full_text

    except Exception as e:
        return f"An error occurred: {e}"


Use extract_text function to extract from pdf

In [None]:
file_to_extract = "./chemistry9.pdf"
extracted_txt_content = extract_text_from_pdf(file_to_extract)

# Uncomment this line to print extracted text
if "An error occurred" not in extracted_txt_content:
    print("Extracted Text:")
    print(extracted_txt_content)
else:
    print(extracted_txt_content)

Extract the pdf name to use as metadata

In [None]:
def get_fileName(file):
    return file[2:len(file)]

file_Name = get_fileName(file_to_extract)
print(file_Name)

Split the extracted text into Chunks using LangChain

In [None]:
def chunk_text(text, chunk_size=300, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,      
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = splitter.split_text(text)
    return chunks
txt_after_chunk_array = chunk_text(extracted_txt_content)

# Uncomment this line to print output of Chunked Text
print(len(txt_after_chunk_array))
print(txt_after_chunk_array)

Embedding the Chunks With Gemini-Embedding-Model

In [None]:
embedding_dim = 768 # increase the dimension to increase the quality

embeddings = []
for chunk in txt_after_chunk_array:
    response = client.models.embed_content( 
        model="gemini-embedding-001",
        contents=[types.Part.from_text(text=chunk)],
        config=types.EmbedContentConfig(
            task_type="RETRIEVAL_DOCUMENT",
            output_dimensionality=embedding_dim
        ),
    )
    embeddings.append(response)

# processed_embeddings contains the floating number(VECTORS)
processed_embeddings = [
    e.embeddings[0].values for e in embeddings
]
print(processed_embeddings)

Create ChromaDB Collection

In [None]:
chroma_client = chromadb.PersistentClient(path="./chroma_data")

collection = chroma_client.get_or_create_collection(
    name="TestTryCollection",
    metadata={"source": "ARJ's"}
)

Insert data into ChromaDB

In [None]:
ids = [f"{file_Name}_chunk_{i}" for i in range(len(txt_after_chunk_array))]
print(len(txt_after_chunk_array))

collection.upsert(
    ids=ids,
    documents=txt_after_chunk_array,
    embeddings=processed_embeddings,
    metadatas=[{"chunk_index": i, "Source": file_Name} for i in range(len(txt_after_chunk_array))]
)

Embed User query/promt with Gemini

In [51]:
query_text = "Can u give me whole sumary of chemistry document?"
length_of_query = len(query_text.strip())
if length_of_query == 0:
    print("Query is Empty")
else:  
    try:
        query_response = client.models.embed_content(
            model="gemini-embedding-001",
            contents=[types.Part.from_text(text=query_text)],
            config=types.EmbedContentConfig(
                task_type="RETRIEVAL_QUERY",
                output_dimensionality=768
            ),
        )
    except ClientError as e:
            print(f"An error occurred while embedding the query: {e.message}")

# embedded_query contaids float value (VECTOR)
if query_response.embeddings is not None:
    embedded_query = query_response.embeddings[0].values
    print(embedded_query)
else:
    print("I think Embedded_Query is empty")

[0.023519054, -0.031091241, 0.025897365, -0.037018612, 0.009995096, 0.032825585, 0.008069212, -0.005838712, -0.0059679532, 0.02835334, -0.00038137767, -0.0033704282, 0.011216595, 0.029334174, 0.11914424, 0.0021371306, -0.008214597, -0.0048142406, -0.008940175, -0.0062225824, -0.021553395, 0.03695683, 0.015918693, -0.010109461, 0.011459379, -0.0006371037, 0.0058113756, 0.005330755, 0.030027572, 0.018309193, 0.01453936, 0.031152505, -0.01238154, 0.013443175, -0.007067455, 0.029167011, 0.005421869, -0.021258567, -0.00019548161, -0.0040478073, -0.0146163795, 0.028436022, 0.020476785, -0.016991777, 0.0034780935, 0.0064536934, 0.004283533, -0.0028745143, 0.0020937559, 0.039495956, -0.012784641, -0.0018386074, -0.006300956, -0.1580355, 0.002254757, -0.022800615, -0.014498369, -0.00029258567, -0.0050365734, -0.0026211329, -0.021258047, 0.03996928, -0.0317543, 0.004220885, 0.004137122, -0.010693188, 0.003623403, 0.0033731356, -0.02647372, -0.009576349, -0.0036600323, -0.01699987, -0.011132659, 

Getting result from chromaDB

In [52]:
if query_response.embeddings is not None and len(query_response.embeddings) > 0:
    
    results = collection.query(
        query_embeddings=[embedded_query],
        n_results=5
    )
    print("Query results:", results["documents"])

else:
    print("Failed to generate query embedding")
    print("Response:", query_response)

Query results: [['combining elements.\n• The composition of a compound is the\nsame throughout. We can also observe\nthat the texture and the colour of the\ncompound are the same throughout.\nThus, we can summarise the physical\nand chemical nature of matter in the\nfollowing graphical organiser:', 'daily life. The particles are called the dispersed phase and\nthe medium in which they are distributed is called the\ndispersion medium.\n• Pure substances can be elements or compounds. An element\nis a form of matter that cannot be broken down by chemical', '2. Try segr egating the things\naround you as pure substances\nor mixtures.\n2.4 What are the Types of Pure\nSubstances?\nOn the basis of their chemical composition,\nsubstances can be classified either as elements\nor compounds.\n2.4.1 ELEMENTS\nRobert Boyle was the first scientist to use the', 'properties, though the starting materials were\nthe same. Group I has carried out the activity\ninvolving a physical change whereas in case\n

Getting response from Gemini

In [53]:
try:
    response = client.models.generate_content(
        model="gemini-2.5-flash-lite",
        contents=[f"This is my question:{query_text} tell me from this context: {results['documents']}. Remember to lastly mention only Source from {results['metadatas']}"]
    )
    print(response.text)
except Exception as e:
    if hasattr(e, "message") and "model is overloaded" in str(e.message):
        print(f"The model is overloaded. Please try again later.\n actual error : {e}")
    elif "model is overloaded" in str(e):
        print(f"The model is overloaded. Please try again later.\n actual error : {e}")
    else:
        print(f"An error occurred: {e}")

This chemistry document discusses the nature of matter, specifically focusing on pure substances and mixtures. It explains that compounds are formed by combining elements and have a uniform composition, texture, and color. Pure substances are further classified into elements and compounds. An element is defined as a basic form of matter that cannot be broken down into simpler substances by chemical reactions. Elements are typically divided into metals, non-metals, and metalloids, with metals possessing properties like luster. The document also touches upon physical and chemical changes, differentiating between mixtures and compounds. It mentions that matter can exist as dispersed phases within a dispersion medium, referring to this as a colloid.

Source: chemistry9.pdf
