Important Imports

In [2]:
from google import genai
from google.genai.errors import ClientError
from google.genai import types
import os
from dotenv import load_dotenv
from google.genai.types import EmbedContentConfig
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb

Load apiKey from .env file

In [8]:
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=api_key)

Extract Text from PDF Function

In [3]:
def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        full_text = ""

        if len(reader.pages) == 0:
            print("PDF has no pages")
            return ""

        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text = page.extract_text()
            if text:
                full_text += text

        print(f"Number of pages in PDF: {len(reader.pages)}")

        return full_text

    except Exception as e:
        return f"An error occurred: {e}"


Use extract_text function to extract from pdf

In [4]:
file_to_extract = "./chemistry9.pdf"
extracted_txt_content = extract_text_from_pdf(file_to_extract)

# Uncomment this line to print extracted text
if "An error occurred" not in extracted_txt_content:
    print("Extracted Text:")
    print(extracted_txt_content)
else:
    print(extracted_txt_content)

Number of pages in PDF: 12
Extracted Text:
Fig. 2.1: Some consumable items
Have you ever noticed the word ‘pure’
written on the packs of these consumables?
For a common person pure means having no
adulteration. But, for a scientist all these things
are actually mixtures of different substances
and hence not pure. For example, milk is
actually a mixture of water, fat, proteins, etc.
When a scientist says that something is pure,
it means that all the constituent particles of
that substance are the same in their chemical
nature. A pure substance consists of a single
type of particle. In other words, a substance is
a pure single form of matter.
As we look around, we can see that most
of the matter around us exists as mixtures of
two or more pure components, for example,
sea water, minerals, soil, etc., are all mixtures.
2.1 What is a Mixture?
Mixtures  are constituted by more than one
kind of pur e for m of matter . We know that
dissolved sodium chloride can be separated
from water by the 

Extract the pdf name to use as metadata

In [5]:
def get_fileName(file):
    return file[2:len(file)]

file_Name = get_fileName(file_to_extract)
print(file_Name)

chemistry9.pdf


Split the extracted text into Chunks using LangChain

In [6]:
def chunk_text(text, chunk_size=300, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,      
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = splitter.split_text(text.replace("\n", " "))
    return chunks
txt_after_chunk_array = chunk_text(extracted_txt_content)

# Uncomment this line to print output of Chunked Text
print(len(txt_after_chunk_array))
print(txt_after_chunk_array)

102
['Fig. 2.1: Some consumable items Have you ever noticed the word ‘pure’ written on the packs of these consumables? For a common person pure means having no adulteration. But, for a scientist all these things are actually mixtures of different substances and hence not pure. For example, milk is', 'and hence not pure. For example, milk is actually a mixture of water, fat, proteins, etc. When a scientist says that something is pure, it means that all the constituent particles of that substance are the same in their chemical nature. A pure substance consists of a single type of particle. In', 'consists of a single type of particle. In other words, a substance is a pure single form of matter. As we look around, we can see that most of the matter around us exists as mixtures of two or more pure components, for example, sea water, minerals, soil, etc., are all mixtures. 2.1 What is a', 'soil, etc., are all mixtures. 2.1 What is a Mixture? Mixtures  are constituted by more than one kind of

Embedding the Chunks With Gemini-Embedding-Model

In [None]:
embedding_dim = 768 # increase the dimension to increase the quality

embeddings = []
for chunk in txt_after_chunk_array:
    response = client.models.embed_content( 
        model="gemini-embedding-001",
        contents=[types.Part.from_text(text=chunk)],
        config=types.EmbedContentConfig(
            task_type="RETRIEVAL_DOCUMENT",
            output_dimensionality=embedding_dim
        ),
    )
    embeddings.append(response)

# processed_embeddings contains the floating number(VECTORS)
processed_embeddings = [
    e.embeddings[0].values for e in embeddings
]
print(processed_embeddings)

[[0.010219787, 0.0034245164, 0.01557367, -0.052826636, 0.002486584, 0.004269603, 0.022641059, 0.011748183, 0.0007073015, 0.014189176, 0.0002995694, -0.017385414, 0.0033854516, 0.009257884, 0.12621458, -0.011298914, -0.0013208603, -0.008200574, -0.010897795, -0.01380192, -0.003821583, -0.0013705222, 0.014615176, -0.003608623, -0.007681531, -0.0010042473, -0.0064259283, -0.012505332, 0.035267893, -0.0003166749, 0.0052983393, 0.016187144, 0.014872656, 0.01660874, 0.029108794, 0.030024033, 0.000121044824, -0.020564584, 0.008844858, 0.002654637, -0.022161078, 0.021355195, 0.007197431, -0.034722812, -0.0035853912, -0.0015103902, 0.021191927, 0.006314406, 0.0015293697, 0.016270274, 0.0031827583, 0.0061573344, 0.008172135, -0.20489067, -0.0150524685, -0.009620785, 0.004198337, -0.0098212985, -0.0008324049, -0.010299273, -0.012646662, 0.0068071075, -0.033659358, 0.015974544, -0.009321056, -0.017722402, 0.013255447, 0.011202649, -0.025005408, 0.00054580235, 0.00541787, 0.014150996, -0.0039698775

Create ChromaDB Collection

In [None]:
chroma_client = chromadb.PersistentClient(path="./chroma_data")

collection = chroma_client.get_or_create_collection(
    name="TestTryCollection",
    metadata={"source": "ARJ's"}
)

Insert data into ChromaDB

In [24]:
ids = [f"{file_Name}_chunk_{i}" for i in range(len(txt_after_chunk_array))]
print(len(txt_after_chunk_array))

collection.upsert(
    ids=ids,
    documents=txt_after_chunk_array,
    embeddings=processed_embeddings,
    metadatas=[{"chunk_index": i, "Source": file_Name} for i in range(len(txt_after_chunk_array))]
)

102


Embed User query/promt with Gemini

In [None]:
query_text = "Sigma boy?"
# formula of force?
# What is  Tyndall Effect and how can it be observed?
# Antoine  Laurent Lavoisier born in? ====> 1943
# In physics what is written in section 9.1.1?
length_of_query = len(query_text.strip())
if length_of_query == 0:
    print("Query is Empty")
else:  
    try:
        query_response = client.models.embed_content(
            model="gemini-embedding-001",
            contents=[types.Part.from_text(text=query_text)],
            config=types.EmbedContentConfig(
                task_type="RETRIEVAL_QUERY",
                output_dimensionality=768
            ),
        )
    except ClientError as e:
            print(f"An error occurred while embedding the query: {e.message}")

# embedded_query contaids float value (VECTOR)
if query_response.embeddings is not None:
    embedded_query = query_response.embeddings[0].values
    print(embedded_query)
else:
    print("I think Embedded_Query is empty")

[-0.0112012625, 0.0052748392, -0.0014822321, -0.08731079, -0.0073958086, -0.007059469, 0.030706372, 0.004948542, 0.0023799294, 0.028466225, -0.0018662058, -0.0046251253, 0.006279308, 0.011785142, 0.11731017, -0.0015903235, 0.01269764, -0.008373451, -0.011953818, -0.0084730415, 0.0039619054, 0.028942226, 0.013488285, -0.0037768849, -0.03776428, 0.015685469, -0.0008707984, 0.0060372613, 0.0046472517, 0.012057147, 0.00908559, 0.007547889, 0.0027310557, -0.012378203, -0.0011616723, 0.002567247, 0.014607552, -0.018551817, 0.019294258, 0.016031362, -0.03405031, 0.010667586, 0.0007134786, 0.003498356, -0.0011615418, -0.016691517, 0.0028570413, 0.0047611627, -0.025397625, 0.031237906, 0.010518306, 0.009970734, 0.008830994, -0.13199109, 0.016162576, -0.002722637, 0.0029938475, -0.011815497, -0.025133908, -0.0029621308, -0.024770195, -0.032770135, -0.0055141253, 0.0006349114, -0.00029879127, -0.010705902, -0.0054125334, 0.018773627, 0.021922074, -0.025639113, 0.026490511, 0.013990069, -0.0107154

Getting result from chromaDB

In [None]:
if query_response.embeddings is not None and len(query_response.embeddings) > 0:
    
    results = collection.query(
        query_embeddings=[embedded_query],
        n_results=5
    )
    print("Query results:", results["documents"])

else:
    print("Failed to generate query embedding")
    print("Response:", query_response)

Query results: [['consists of a single type of particle. In other words, a substance is a pure single form of matter. As we look around, we can see that most of the matter around us exists as mixtures of two or more pure components, for example, sea water, minerals, soil, etc., are all mixtures. 2.1 What is a', 'soil, etc., are all mixtures. 2.1 What is a Mixture? Mixtures  are constituted by more than one kind of pur e for m of matter . We know that dissolved sodium chloride can be separated from water by the physical process of evaporation. However, sodium chloride is itself a pure substance and cannot', 'soil are not single pure substances. Whatever the source of a pure substance may be, it will always have the same characteristic properties. Therefore, we can say that a mixture contains more than one pure substance. 2.1.1 TYPES  OF MIXTURES Depending upon the nature of the components that form a']]


Getting response from Gemini

In [30]:
try:
    response = client.models.generate_content(
        model="gemini-2.5-flash-lite",
        contents=[f"This is my question:{query_text} tell me from this context: {results['documents']}. Remember to lastly mention only Source from {results['metadatas']}"]
    )
    print(response.text)
except Exception as e:
    if hasattr(e, "message") and "model is overloaded" in str(e.message):
        print(f"The model is overloaded. Please try again later.\n actual error : {e}")
    elif "model is overloaded" in str(e):
        print(f"The model is overloaded. Please try again later.\n actual error : {e}")
    else:
        print(f"An error occurred: {e}")

From the provided context, a mixture is defined as matter that **consists of two or more pure components** or **more than one kind of pure form of matter**. This means that mixtures are not single pure substances. Examples given include sea water, minerals, and soil.

Source: chemistry9.pdf
