Important Imports

In [3]:
from google import genai
from google.genai.errors import ClientError
from google.genai import types
import os
from dotenv import load_dotenv
from google.genai.types import EmbedContentConfig
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb

Load apiKey from .env file

In [4]:
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=api_key)

Extract Text from PDF Function

In [11]:
def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        full_text = ""

        if len(reader.pages) == 0:
            print("PDF has no pages")
            return ""

        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text = page.extract_text()
            if text:
                full_text += text

        print(f"Number of pages in PDF: {len(reader.pages)}")

        return full_text

    except Exception as e:
        return f"An error occurred: {e}"


Use extract_text function to extract from pdf

In [12]:
file_to_extract = "./physics9.pdf"
extracted_txt_content = extract_text_from_pdf(file_to_extract)

# Uncomment this line to print extracted text
if "An error occurred" not in extracted_txt_content:
    print("Extracted Text:")
    print(extracted_txt_content)
else:
    print(extracted_txt_content)

Number of pages in PDF: 13
Extracted Text:
SCIENCE100
We have learnt about the motion of objects and
force as the cause of motion. We have learnt
that a force is needed to change the speed or
the direction of motion of an object. We always
observe that an object dropped from a height
falls towards the earth. We know that all the
planets go around the Sun. The moon goes
around the earth. In all these cases, there must
be some force acting on the objects, the planets
and on the moon. Isaac Newton could grasp
that the same force is responsible for all these.
This force is called the gravitational force.
In this chapter we shall learn about
gravitation and the universal law of
gravitation. We shall discuss the motion of
objects under the influence of gravitational
force on the earth. W e shall study how the
weight of a body varies fr om place to place.
We shall also discuss the conditions for
objects to float in liquids.
9.1 Gravitation
We know that the moon goes around the earth.
An objec

Extract the pdf name to use as metadata

In [7]:
def get_fileName(file):
    return file[2:len(file)]

file_Name = get_fileName(file_to_extract)
print(file_Name)

physics9.pdf


Split the extracted text into Chunks using LangChain

In [13]:
def chunk_text(text, chunk_size=50, chunk_overlap=20):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,      
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = splitter.split_text(text)
    return chunks
txt_after_chunk_array = chunk_text(extracted_txt_content)

# Uncomment this line to print output of Chunked Text
print(txt_after_chunk_array)

['Name: Abhinav Ranjan Jha', 'Date of Birth: 20 February 2010\nAge: 15 years', 'Age: 15 years\nCity: Patna\nClass: IX “SA”', 'Class: IX “SA”\nSchool: DAV Public School', 'Programming Languages: Python, JavaScript, C, SQL', 'MarkUp language:\n HTML\nHobbies/Interests:', 'Coding,\n \nCycling\nLanguages Known:\n \nEnglish,', 'English,\n \nHindi\nProjects:\n \nChatPat(v1.0.0\n \n-', 'ChatPat(v1.0.0\n \n-\n \nvx.0.0)']


Embedding the Chunks With Gemini-Embedding-Model

In [14]:
embedding_dim = 768 # increase the dimension to increase the quality

embeddings = []
for chunk in txt_after_chunk_array:
    response = client.models.embed_content( 
        model="gemini-embedding-001",
        contents=[types.Part.from_text(text=chunk)],
        config=types.EmbedContentConfig(
            task_type="RETRIEVAL_DOCUMENT",
            output_dimensionality=embedding_dim
        ),
    )
    embeddings.append(response)

# processed_embeddings contains the floating number(VECTORS)
processed_embeddings = [
    e.embeddings[0].values for e in embeddings
]
print(processed_embeddings)

[[-0.028288603, 0.011316441, 0.021177443, -0.08887243, -0.007773921, 0.014291063, 0.01166405, 0.0032429744, -0.017630113, -0.010506441, -0.012111592, -0.009897813, 0.012954663, 0.05100484, 0.15955597, -0.014177027, 0.005721896, -0.0039103567, 0.01527842, -0.026101787, -0.0067235986, -0.0012921334, 0.01620375, -0.018274512, 0.0054986533, -0.016674561, 0.013596752, -0.00936319, 0.023986286, 0.005656058, 0.009060542, 0.0119634885, 0.0066242595, 0.018767312, 0.026501648, 0.016539792, 0.00927238, -0.02996141, 0.002775658, 0.018423814, 0.00737156, 0.0006110452, -0.014664958, -0.011211761, -0.014488122, 0.0047233584, -0.009562415, -0.010894712, -0.023037856, -2.9822882e-06, 0.005805902, -0.00075789035, -0.02359218, -0.22342145, 0.0050656376, 0.022054078, -0.0012183762, 0.009624204, 0.029114256, -0.013634856, 0.0048616114, 0.007658894, -0.02866823, -0.026786929, -0.0049291602, -0.013851402, 0.009304287, 0.005208459, -0.01611906, -0.0005259475, 0.014781622, 0.009425784, -0.0057263505, -0.030013

Create ChromaDB Collection

In [15]:
chroma_client = chromadb.PersistentClient(path="./chroma_data")

collection = chroma_client.get_or_create_collection(
    name="gemini_docs2",
    metadata={"source": f"{file_Name}"}
)

Insert data into ChromaDB

In [16]:
ids = [f"chunk_{i}" for i in range(len(txt_after_chunk_array))]
print(len(txt_after_chunk_array))

collection.upsert(
    ids=ids,
    documents=txt_after_chunk_array,
    embeddings=processed_embeddings,
    metadatas=[{"chunk_index": i} for i in range(len(txt_after_chunk_array))]
)

9


Embed User query/promt with Gemini

In [17]:
query_text = "Name?"
length_of_query = len(query_text.strip())
if length_of_query == 0:
    print("Query is Empty")
else:  
    try:
        query_response = client.models.embed_content(
            model="gemini-embedding-001",
            contents=[types.Part.from_text(text=query_text)],
            config=types.EmbedContentConfig(
                task_type="RETRIEVAL_QUERY",
                output_dimensionality=768
            ),
        )
    except ClientError as e:
            print(f"An error occurred while embedding the query: {e.message}")

# embedded_query contaids float value (VECTOR)
if query_response.embeddings is not None:
    embedded_query = query_response.embeddings[0].values
    print(embedded_query)
else:
    print("I think Embedded_Query is empty")

[-0.02247996, 0.010816294, 0.020459212, -0.06426284, -0.018476829, 0.017050104, -0.0002156399, -0.012122546, -0.024850456, 0.0029644126, -0.004852283, -0.0047627683, 0.003698364, 0.020765414, 0.10624794, -0.0130087845, 0.004432829, -0.007805837, 0.0043061255, -0.041547216, -0.004197069, 0.015372407, 0.019727346, -0.009646261, 0.013594128, 0.0032353161, 0.021697655, -0.007884656, 0.029932734, 0.016551893, -0.01474209, -0.00840796, -0.0017433363, -0.019239182, 0.020297322, 0.0021785654, 0.0105069745, -0.018656116, -0.007532805, 0.009621702, -0.013630909, 0.003460251, 0.005603592, -0.011533679, -0.014023908, 0.015330655, -0.024212398, -0.023415972, -0.014128784, 0.03280614, 0.012080822, 0.029607492, -0.03715688, -0.13635808, -0.0025479763, 0.012935425, -0.029930443, -0.0019410506, 0.012957454, -0.006122887, -0.010193626, -0.014807395, -0.019220987, -0.027632114, 0.0044936035, 0.0143576255, -0.005306374, 0.0055680475, 0.0016861958, -0.006661468, 0.013171574, -0.009384698, -0.0024459197, -0

Getting result from chromaDB

In [18]:
if query_response.embeddings is not None and len(query_response.embeddings) > 0:
    
    results = collection.query(
        query_embeddings=[embedded_query],
        n_results=3
    )
    print("Query results:", results)
    
else:
    print("Failed to generate query embedding")
    print("Response:", query_response)

Query results: {'ids': [['chunk_0', 'chunk_1', 'chunk_3']], 'embeddings': None, 'documents': [['Name: Abhinav Ranjan Jha', 'Date of Birth: 20 February 2010\nAge: 15 years', 'Class: IX “SA”\nSchool: DAV Public School']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[{'chunk_index': 0}, {'chunk_index': 1}, {'chunk_index': 3}]], 'distances': [[0.23940828442573547, 0.27175137400627136, 0.2725144624710083]]}


Getting response from Gemini

In [19]:
response = client.models.generate_content(
    model="gemini-2.0-flash-lite",
    contents=[f"This is my question:{query_text}\n tell me from this context:\n {results['documents']}"]
)
print(response.text)

The name is **Abhinav Ranjan Jha**.

