In [None]:
# Cell: Extract tables and text from PDF report
# Opens 'report.pdf' and iterates pages to extract tables and text.
# Outputs: all_tables list and all_text string for downstream processing.
# Usage: ensure 'report.pdf' exists in workspace.
import pdfplumber

# Collect every table and the text of every page of the report.
all_tables = []
page_texts = []
with pdfplumber.open('report.pdf') as pdf:
    for page in pdf.pages:
        # Extract tables
        all_tables.extend(page.extract_tables())
        # Also extract text as fallback. Guard against a None result (a page
        # with no extractable text) so the concatenation below cannot raise
        # a TypeError.
        page_texts.append(page.extract_text() or "")

# One trailing newline per page, built with a single join instead of
# repeated string concatenation inside the loop.
all_text = "".join(page_text + "\n" for page_text in page_texts)

print("Extracted", len(all_tables), "tables")
print("Extracted text length:", len(all_text))

Extracted 1 tables
Extracted text length: 32524


In [None]:
# Cell: Read the Gemini API key from the environment
# Looks up the YOUR_GEMINI_API_KEY environment variable and keeps the result in
# a notebook-level variable for the LLM cells. The value is None when the
# environment variable is not set.
import os
YOUR_GEMINI_API_KEY = os.environ.get("YOUR_GEMINI_API_KEY")

In [None]:
# Cell: Use Gemini LLM to analyze report text and extract metrics
# Configures model, chunks 'all_text', sends prompts and aggregates responses.
# Parses LLM output to collect patient info, metrics, and issues.
# Prints JSON 'output' with Patient_info, Metrices, and Issues.
# Use with caution: ensure API keys and privacy rules are followed.
from urllib import response
import google.generativeai as genai

# Assume API key is set in environment
genai.configure(api_key=YOUR_GEMINI_API_KEY)  # replace with your key

model = genai.GenerativeModel('gemini-2.5-flash-lite')

# Chunk the text if too large (Gemini has high limits, but to be safe)
chunk_size = 10000  # characters
chunks = [all_text[i:i+chunk_size] for i in range(0, len(all_text), chunk_size)]

# Task instructions sent with every chunk. Built from adjacent string literals
# so that no Python statement can be swallowed into the prompt text by a stray
# line continuation.
instructions = (
    "Analyze the following part of a blood test report.\n"
    "1. Extract patient information: Name, Age, Sex, Printed On date.\n"
    "2. Extract all the metrics and their values, including the normal ranges.\n"
    "3. Identify any low or high values and list them as issues.\n"
    "4. If all values are within normal ranges, return 'Good' for all the value "
    "ranges present in the blood report.\n"
)

metric_text = ""
patient_text = ""
for chunk in chunks:
    # The report text itself must be part of the prompt; without it the model
    # has nothing to analyze.
    prompt = f"{instructions}\nReport text:\n{chunk}"
    # Rebinds 'response' (imported from urllib at the top of this cell) to the
    # actual model reply before '.text' is read.
    response = model.generate_content(prompt)
    full_response = response.text.strip()
    metric_text += full_response + "\n"
    patient_text += full_response + "\n"


def _field_value(line):
    """Return the text after the first ':' without whitespace or Markdown '*' markers."""
    return line.split(':', 1)[1].strip(' *\t')


# Parse the response to extract issues
metrices = []
issues = []
patient_info = {}

for line in metric_text.split('\n'):
    entry = line.strip()
    is_abnormal = 'Low' in line or 'High' in line
    # Every classified line is a metric; only Low/High lines are issues.
    if is_abnormal or 'Good' in line:
        metrices.append(entry)
    if is_abnormal:
        issues.append(entry)
    elif 'Name:' in line:
        patient_info['name'] = _field_value(line)
    elif 'Age:' in line:
        patient_info['age'] = _field_value(line)
    elif 'Sex:' in line:
        patient_info['sex'] = _field_value(line)
    elif 'Printed On:' in line or 'Date:' in line:
        patient_info['Date'] = _field_value(line)

# For now, focus on extraction - output the issues and patient info
output = {
    'Patient_info': patient_info,
    'Metrices': metrices,
    'Issues': issues
}

import json
print(json.dumps(output, indent=4))

  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


{
    "Patient_info": {
        "name": "** Lyubochka Svetka",
        "age": "** 41 Y",
        "sex": "** Male (Note: The report lists \"Sex/Age : Male / 41 Y\" but later \"elaM-sraeY\" which translates to \"Male-Years\" suggesting a potential transcription error or formatting issue. Based on the explicit \"Male\" next to Sex/Age, we'll use that.)",
        "Date": "** 28-Feb-2023 10:26"
    },
    "Metrices": [
        "*   **Good Hemoglobin:** 14.5 g/dL (Range: 13.0 - 16.5)",
        "*   **Good RBC Count:** 4.79 million/cmm (Range: 4.5 - 5.5)",
        "*   **Good Hematocrit:** 43.3 % (Range: 40 - 49)",
        "*   **Good MCV:** 90.3 fL (Range: 83 - 101)",
        "*   **Good MCH:** 30.2 pg (Range: 27.1 - 32.5)",
        "*   **Good MCHC:** 33.4 g/dL (Range: 32.5 - 36.7)",
        "*   **Good RDW CV:** 13.60 % (Range: 11.6 - 14)",
        "*   **High WBC Count:** 10570 /cmm (Range: 4000 - 10000)",
        "*   **Good Neutrophils (%):** 73 % (Range: 40 - 80)",
        "*   **Low L

In [None]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Single source of truth for the on-disk Chroma location. This is the path the
# store was actually being opened from; previously this variable held a
# different path and was never used.
persistent_directory = "./chroma_db"

token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
# NOTE(review): presumably this must be the same embedding model that was used
# when the store was populated - confirm against the ingestion code.
embeddings = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=token,
    model="sentence-transformers/all-MiniLM-L6-v2"
)

vectorstore = Chroma(
    persist_directory=persistent_directory,
    embedding_function=embeddings
)

# Sample query to sanity-check retrieval.
query = "what happens if Glucose count is low in blood test"

docs = vectorstore.similarity_search(query, k=5)

print("Retrieved context:")
for i, doc in enumerate(docs):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)
    print("-" * 50)

Retrieved context:

--- Result 1 ---
UNDERSTANDING BLOOD GLUCOSE LEVELS
AND SYMPTOMS SEEN IN BLOOD TESTS

Glucose is the primary source of energy for the body. Blood glucose levels reflect
how well the body regulates sugar, which is essential for normal brain function,
muscle activity, and overall metabolism. Abnormal glucose levels can indicate
diabetes, prediabetes, or other metabolic conditions.

--------------------------------------------------

WHAT IS GLUCOSE?

Glucose is a type of sugar that comes from the food we eat, especially carbohydrates.
After digestion, glucose enters the bloodstream and is transported into cells with
the help of insulin, a hormone produced by the pancreas.

--------------------------------------------------

NORMAL BLOOD GLUCOSE LEVELS

Blood glucose is measured using different tests.

Fasting Blood Glucose:
Normal: 70–99 mg/dL
Prediabetes: 100–125 mg/dL
Diabetes: 126 mg/dL or higher

--- Result 2 ---
Post-Meal (2 hours after eating):
Normal: Below 140

In [None]:
# Cell: Construct a safety-first assistant prompt using extracted issues
# Builds a detailed instruction prompt using 'issues_text' and strict rules.
# Performs a similarity search in the vectorstore for supporting docs.
# Prints retrieved documents for review before LLM summarization.
# One abnormal finding per line, as collected in output["Issues"].
issues_text = "\n".join(output["Issues"])

prompt = f"""
You are a medical report analysis assistant.

### Context
You are given extracted blood test abnormalities from a patient report.
Use ONLY the information available in the Vector Store to answer.
Do NOT use prior knowledge or assumptions.
If relevant information is missing, explicitly say:
"I am sorry, I do not have enough information to provide an answer."

### Identified Blood Test Issues
{issues_text}

### Instructions
For EACH clearly identified abnormal parameter:
1. Explain what the parameter indicates (in simple medical terms)
2. List possible causes ONLY if present in the Vector Store
3. Suggest general remedies or lifestyle changes ONLY if present in the Vector Store
4. Avoid giving diagnoses, medications, or dosages
5. If reference ranges or patient context are missing, explicitly mention the uncertainty

### Output Format
Return the answer in the following structure:

#### [Parameter Name]
- **What it means:**
- **Possible causes (from Vector Store):**
- **General remedies / precautions (from Vector Store):**
- **Confidence level:** High / Medium / Low (based on available context)

### Safety Rules
- Do NOT hallucinate medical facts
- Do NOT combine multiple issues unless the Vector Store explicitly links them
- If an issue appears to be metadata, duplicated, or inconclusive, clearly state that

### Closing
End with a brief note stating that this is informational and not a medical diagnosis.
"""

# Retrieve with the abnormal findings themselves rather than the whole
# instruction template: the template's boilerplate dominates the embedding and
# pulls in passages unrelated to the patient's issues. When no issues were
# found, fall back to the full prompt.
retrieval_query = issues_text if issues_text.strip() else prompt
docs = vectorstore.similarity_search(retrieval_query, k=5)

print("Retrieved context:")
for i, doc in enumerate(docs):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)

Retrieved context:

--- Result 1 ---
WHEN TO SEE A DOCTOR

Medical evaluation is important if:
- Blood glucose levels are consistently high or low
- Symptoms of diabetes appear
- HbA1c is in the prediabetic or diabetic range
- There is a family history of diabetes

--------------------------------------------------

FINAL NOTE

Blood glucose imbalance develops gradually and may go unnoticed for years.
Regular screening, healthy eating, and timely medical care can prevent
complications and support long-term metabolic health.

--- Result 2 ---
HDL (High-Density Lipoprotein):
Known as “good cholesterol,” HDL helps remove excess cholesterol from the blood.

Triglycerides:
Another type of fat in the blood that, when elevated, further increases heart risk.

--------------------------------------------------

WHAT DOES HIGH CHOLESTEROL MEAN IN BLOOD TESTS?

High cholesterol is detected through a lipid profile blood test.

Typical reference values:
Total Cholesterol: Less than 200 mg/dL
LDL Ch

In [None]:
# Cell: Append retrieved context to the prompt and call Gemini LLM
# Joins the page content of every document in 'docs' (not just the last one
# left over from a display loop) and appends it to 'prompt' under its own
# heading. The combined text goes into 'final_prompt', leaving 'prompt'
# untouched so that re-running this cell does not keep growing it.
# Stores the LLM response text in 'full_response' for output.
# Ensure 'model', 'prompt' and 'docs' are defined by previous cells.
retrieved_context = "\n\n".join(retrieved_doc.page_content for retrieved_doc in docs)
final_prompt = f"{prompt}\n### Vector Store Context\n{retrieved_context}\n"
response = model.generate_content(final_prompt)
full_response = response.text.strip()

In [None]:
# Cell: Save LLM analysis to a Markdown file
# Formats 'full_response' into a markdown string and writes to disk.
# Creates 'blood_report_analysis.md' for sharing or review.
# Prints filename on successful save.
# Put the LLM answer under a top-level heading: heading line, one blank line,
# the answer, then a final newline.
md_content = f"# Blood Report Analysis\n\n{full_response}\n"

# Overwrite any previous report; UTF-8 so non-ASCII characters are written as-is.
with open("blood_report_analysis.md", mode="w", encoding="utf-8") as report_file:
    report_file.write(md_content)

print("Markdown file saved as blood_report_analysis.md")

Markdown file saved as blood_report_analysis.md
