In [1]:
!pip install -q -U google-genai PyMupdf chromadb sentence-transformers

In [2]:
import fitz
import chromadb
from google import genai
import re

In [3]:
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
Key = "AIzaSyDQ0S8anqLotDjxKcdLpnDeDAMNTbyEHT0"
from google import genai
def geminiGen(chunks,Command):
        context = "\n".join(chunks)
        prompt = f"""You are a helpful assistant. Use the context to answer the user's question.

            Context:
            {context}

            Question: {Command}
            Answer:"""
        client = genai.Client(api_key= Key)
        response = client.models.generate_content(
            model="gemini-2.0-flash", contents=prompt
        )
        n = response
        return n

In [5]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text() for page in doc])

In [6]:
def chunk_text(text, chunk_size=500):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

In [15]:
def setup_chroma(chunks):
    client = chromadb.Client()

    # Delete existing collection if it exists
    try:
        client.delete_collection(name="pdf_chunks")
    except:
        pass  # Collection didn't exist

    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    collection = client.create_collection(name="pdf_chunks", embedding_function=embedding_function)
    collection.add(documents=chunks, ids=[str(i) for i in range(len(chunks))])
    return collection

In [8]:
def retrieve_chunks(question, collection, k=5):
    results = collection.query(query_texts=[question], n_results=k)
    return results['documents'][0]

In [9]:


# ===== 7. Full Pipeline Execution =====
def run_rag(pdf_path, question):
    print("📄 Extracting text...")
    text = extract_text_from_pdf(pdf_path)

    print("🔄 Chunking text...")
    chunks = chunk_text(text)

    print("🧠 Building vector database...")
    collection = setup_chroma(chunks)

    print("🔍 Retrieving relevant context...")
    relevant_chunks = retrieve_chunks(question, collection)

    print("🤖 Loading LLM...")


    print("💬 Generating answer...")
    answer = geminiGen(relevant_chunks, question)

    return answer


In [18]:
if __name__ == "__main__":
    path = "C:\\Python\\Vanka-By-Anton-Chekhov-book-PDF.pdf"
    question = "who was the vanka's favourite?"

    try:
        answer = run_rag(path, question)
        print("\n📝 Answer:\n", answer)
    except Exception as e:
        print(f"Error: {e}")
        # Reset ChromaDB client if needed
        client = chromadb.Client()
        try:
            client.delete_collection(name="pdf_chunks")
        except:
            pass

📄 Extracting text...
🔄 Chunking text...
🧠 Building vector database...
🔍 Retrieving relevant context...
🤖 Loading LLM...
💬 Generating answer...

📝 Answer:
 candidates=[Candidate(content=Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, inline_data=None, text="Olga Ignatyevna was Vanka's favourite.\n")], role='model'), citation_metadata=None, finish_message=None, token_count=None, finish_reason=<FinishReason.STOP: 'STOP'>, avg_logprobs=-0.0084820045874669, grounding_metadata=None, index=None, logprobs_result=None, safety_ratings=None)] create_time=None response_id=None model_version='gemini-2.0-flash' prompt_feedback=None usage_metadata=GenerateContentResponseUsageMetadata(cache_tokens_details=None, cached_content_token_count=None, candidates_token_count=13, candidates_tokens_details=[ModalityTokenCount(modality=<MediaModality.TEXT: 'TEXT'>, token_count=13)], prompt_token_co