In [2]:
# ✅ Cell 2: Load and Chunk PDF using PyMuPDF + LangChain

import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Load the PDF and extract text
def load_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages, in page order, joined with newline characters.
    """
    # The context manager closes the document even when text extraction raises,
    # so the file handle is never leaked on a bad page.
    with fitz.open(file_path) as doc:
        text_pages = [page.get_text() for page in doc]
    return "\n".join(text_pages)

# Chunk text into overlapping pieces
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[Document]:
    """Split ``text`` into overlapping LangChain ``Document`` chunks.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The documents produced by ``RecursiveCharacterTextSplitter.create_documents``.
    """
    # Separators listed from coarsest to finest: blank line, newline, period,
    # space, and finally the empty string.
    separator_priority = ["\n\n", "\n", ".", " ", ""]
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": separator_priority,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).create_documents([text])

# Load and chunk the document
pdf_path = "Medical_book.pdf"
raw_text = load_pdf(pdf_path)
documents = chunk_text(raw_text)

# Print summary
print(f"Total chunks created: {len(documents)}")
if documents:
    print(f"Sample chunk:\n\n{documents[0].page_content[:500]}...")
else:
    # Indexing documents[0] on an empty list would raise IndexError; this can
    # happen when no text could be extracted (presumably e.g. an image-only PDF).
    print(f"No chunks were produced from {pdf_path}; check that the PDF contains extractable text.")


Total chunks created: 3392
Sample chunk:

The GALE
ENCYCLOPEDIA
of MEDICINE
SECOND EDITION

The GALE
ENCYCLOPEDIA
of MEDICINE
SECOND EDITION
J A C Q U E L I N E  L .  L O N G E ,  E D I T O R
D E I R D R E  S .  B L A N C H F I E L D ,  A S S O C I AT E  E D I T O R
V O L U M E
A-B
1...


In [5]:
# ✅ Cell 4: Generate embeddings using bge-large-en-v1.5 (HuggingFace)

from sentence_transformers import SentenceTransformer
import numpy as np

# Instantiate the sentence-embedding model by its HuggingFace Hub identifier
model = SentenceTransformer("BAAI/bge-large-en-v1.5")

# Pull the plain strings out of the chunked LangChain Document objects
texts = [chunk.page_content for chunk in documents]

# Encode every chunk (normalized vectors, batches of 32) and convert the result
# to plain Python lists in one step so it can be stored later
embeddings = model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    normalize_embeddings=True,
).tolist()

embedding_count, embedding_dim = len(embeddings), len(embeddings[0])
print(f"✅ Generated {embedding_count} embeddings of dimension {embedding_dim}")


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Batches: 100%|██████████| 106/106 [40:00<00:00, 22.65s/it]


✅ Generated 3392 embeddings of dimension 1024


In [None]:
# ✅ Cell 6: Pinecone (v3.x) initialization and index creation

from pinecone import Pinecone, ServerlessSpec

import os

# Read the API key from the PINECONE_API_KEY environment variable so the secret
# does not have to be committed with the notebook; the placeholder is only a
# fallback that must be replaced if the variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "Enter_Your_Pinecone_API_Key_Here")

# Name of the Pinecone index used by this notebook. It was not defined in any
# visible cell; the value matches the index name in this cell's recorded output.
INDEX_NAME = "medical-chatbot-index"

# Create Pinecone client instance
pc = Pinecone(api_key=PINECONE_API_KEY)

# Check and create index if it doesn't exist
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        # The index dimension must match the embedding vectors generated earlier
        dimension=len(embeddings[0]),
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )
    print(f"✅ Created index: {INDEX_NAME}")
else:
    print(f"✅ Index already exists: {INDEX_NAME}")


✅ Created index: medical-chatbot-index


In [13]:
# ✅ Cell 7: Upload embeddings and metadata to Pinecone

# Open a handle to the target index
index = pc.Index(INDEX_NAME)

# Number of vectors sent per upsert call (large uploads are sent in batches)
batch_size = 100

# One record per chunk: positional id, embedding values, and the chunk text as metadata
vectors = [
    {"id": f"chunk-{position}", "values": vector, "metadata": {"text": text}}
    for position, (text, vector) in enumerate(zip(texts, embeddings))
]

# Send the records slice by slice, reporting a 1-based batch number after each call
for batch_number, start in enumerate(range(0, len(vectors), batch_size), start=1):
    index.upsert(vectors=vectors[start:start + batch_size])
    print(f"✅ Uploaded batch {batch_number}")

print("✅ All embeddings uploaded to Pinecone!")


✅ Uploaded batch 1
✅ Uploaded batch 2
✅ Uploaded batch 3
✅ Uploaded batch 4
✅ Uploaded batch 5
✅ Uploaded batch 6
✅ Uploaded batch 7
✅ Uploaded batch 8
✅ Uploaded batch 9
✅ Uploaded batch 10
✅ Uploaded batch 11
✅ Uploaded batch 12
✅ Uploaded batch 13
✅ Uploaded batch 14
✅ Uploaded batch 15
✅ Uploaded batch 16
✅ Uploaded batch 17
✅ Uploaded batch 18
✅ Uploaded batch 19
✅ Uploaded batch 20
✅ Uploaded batch 21
✅ Uploaded batch 22
✅ Uploaded batch 23
✅ Uploaded batch 24
✅ Uploaded batch 25
✅ Uploaded batch 26
✅ Uploaded batch 27
✅ Uploaded batch 28
✅ Uploaded batch 29
✅ Uploaded batch 30
✅ Uploaded batch 31
✅ Uploaded batch 32
✅ Uploaded batch 33
✅ Uploaded batch 34
✅ All embeddings uploaded to Pinecone!


In [None]:
# ✅ Cell 9: Initialize Gemini 1.5 Flash from Google Generative AI

import google.generativeai as genai

import os

# 🔐 Read the API key from the GEMINI_API_KEY environment variable so the secret
# does not have to be committed with the notebook; the placeholder is only a
# fallback that must be replaced if the variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "Enter_Your_Gemini_API_Key_Here")

# Configure Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Load the Gemini 1.5 Flash model (the model name below is what is actually used)
gemini_model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")


# The message names the model that was actually requested above
print("✅ Gemini 1.5 Flash loaded successfully.")


✅ Gemini 1.5 Pro loaded successfully.


In [30]:
# ✅ Cell 10: Embed user query and retrieve top-k similar chunks from Pinecone

def search_similar_chunks(query: str, top_k: int = 5):
    """Return the stored text of the ``top_k`` index matches for ``query``.

    Args:
        query: The user question to embed and search with.
        top_k: Number of matches requested from the index.

    Returns:
        A list with the ``metadata['text']`` value of each match, in the order
        the index returned them.
    """
    # Encode the query with the module-level ``model``, normalized like the chunks
    embedded_query = model.encode(query, normalize_embeddings=True).tolist()

    # Metadata must be included because the chunk text lives there
    response = index.query(vector=embedded_query, top_k=top_k, include_metadata=True)

    chunks = []
    for hit in response['matches']:
        chunks.append(hit['metadata']['text'])
    return chunks


In [31]:
# ✅ Cell 11: Generate medical answer using Gemini and retrieved context

def ask_medical_question(question: str, top_k: int = 5):
    """Answer ``question`` with Gemini, using retrieved chunks as context.

    Args:
        question: The user's question.
        top_k: Number of chunks to request from ``search_similar_chunks``.

    Returns:
        The ``text`` attribute of the response from ``gemini_model.generate_content``.
    """
    # Retrieval: fetch the most similar chunks and separate them with blank lines
    retrieved = search_similar_chunks(question, top_k=top_k)
    context_text = "\n\n".join(retrieved)

    # Prompt layout: instruction, then Context, Question, and an open Answer slot
    prompt = (
        "You are a helpful medical assistant. "
        "Use the following context to answer the question.\n\n"
        f"Context:\n{context_text}\n\n"
        f"Question:\n{question}\n\n"
        "Answer:"
    )

    # Generation: hand the assembled prompt to the module-level Gemini model
    return gemini_model.generate_content(prompt).text

# ▶️ Example usage:
# print(ask_medical_question("What are the symptoms of diabetes?"))


In [32]:
# Run the retrieve-then-generate pipeline on a sample question and show the answer
answer = ask_medical_question("What are the symptoms of diabetes?")
print(answer)

Based on the provided text, symptoms of Type I diabetes mellitus include fatigue and abnormally high levels of glucose in the blood (hyperglycemia).  The text also mentions that if diabetes is left untreated, it can damage or cause failure of the eyes, kidneys, nerves, heart, blood vessels, and other body organs.

