In [59]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import json
import fitz     
import httpx

In [None]:
# Relative path of the PDF report that the cells below read and parse.
path = './resources/sample_report-3.pdf'

In [61]:
# Function to extract the text from file
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string: each page's ``get_text("text")`` output, with
        pages separated by ``"\\n"``.
    """
    # Open the document as a context manager so the underlying file handle
    # is released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

# Extract the raw text of the report at `path`; later cells pass it to the parser.
text = extract_text_from_pdf(path)

In [62]:
text

' \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nPatient Name:  \nReferred By:  \nAge / Sex: \n \nDate: \n \nInvestigations:  \nDaily Case \nNumber: \n \nPatient ID: \n \n \n \n \nCOMPLETE BLOOD COUNT (CBC) \n \n \nNote: Please proofread the reference ranges given here and make changes as needed. The ones not given vary with the \npatient’s age. Labsmart is not responsible for any mismatch in values. \n \n~~~ End of report ~~~ \n \n \n \n \nDr._______________  \nMBBS, MD Pathologist \nTEST \n \nVALUE \nUNIT \nREFERENCE \n Hemoglobin \n \n \ng/dl \n12 - 15 \n Total Leukocyte Count \n \n \ncumm \n4,800 - 10,800 \n Differential Leucocyte Count \nNeutrophils \n \n \n% \n40 - 80 \nLymphocyte \n \n \n% \n20 - 40 \nEosinophils \n \n \n% \n1 - 6 \nMonocytes \n \n \n% \n2 - 10 \nBasophils \n \n \n% \n< 2 \n Platelet Count \n \n \nlakhs/cumm \n1.5 - 4.1 \n Total RBC Count \n \n \nmillion/cumm \n3.9 - 4.8 \n Hematocrit Value, Hct \n \n \n% \n36 - 46 \n Mean Corpuscular Volume, MCV \n \n \nfL \n83 - 

In [63]:
import re

def extract_json_from_text(text):
    """Return the first JSON array of objects embedded in ``text``.

    The input may carry arbitrary text around the JSON (for example
    Markdown code fences or explanatory sentences). Every ``[`` that is
    followed by ``{`` is tried as a candidate start, and the JSON decoder
    itself determines where the array ends. This keeps nested arrays and
    string values containing ``}]`` intact, and lets a later valid array be
    found when an earlier bracketed snippet is not valid JSON.

    Args:
        text: String that may contain a JSON array of objects.

    Returns:
        The decoded list, or ``[]`` when no valid array is found. Failures
        are reported with ``print`` rather than raised.
    """
    if not isinstance(text, str):
        print("❌ JSON parsing failed:", f"expected str, got {type(text).__name__}")
        return []

    decoder = json.JSONDecoder()
    last_error = None
    # Candidate starts: every "[" whose first non-blank successor is "{".
    for candidate in re.finditer(r"\[\s*\{", text):
        try:
            # raw_decode parses one complete JSON value starting at the given
            # index and ignores whatever trails it.
            parsed, _ = decoder.raw_decode(text, candidate.start())
        except json.JSONDecodeError as err:
            last_error = err
            continue
        if isinstance(parsed, list):
            return parsed

    print("❌ JSON parsing failed:", last_error or "No JSON array found")
    return []

In [64]:
import google.generativeai as genai
import json

# Read the Gemini API key from the environment instead of embedding it in the
# notebook: anything committed here is visible to everyone with access to the
# file. A key that was ever stored in this cell should be treated as leaked
# and revoked.
import os

_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=_gemini_api_key)

def extract_with_gemini(text, model_name="models/gemini-1.5-flash"):
    """Ask a Gemini model to pull test results out of raw report text.

    Blank lines are stripped from ``text``, the remainder is embedded in a
    fixed prompt, and the model's reply is handed to
    ``extract_json_from_text`` to obtain the JSON list.

    Args:
        text: Raw report text (for example the output of a PDF extractor).
        model_name: Name passed to ``genai.GenerativeModel``. Defaults to
            the model this notebook originally hard-coded.

    Returns:
        Whatever ``extract_json_from_text`` returns for the model's reply
        (the prompt requests a list of ``{"name", "value"}`` objects).
        Returns ``[]`` when ``text`` has no non-blank lines or when the
        request fails; failures are printed rather than raised.
    """
    # Clean up blank lines for better Gemini accuracy
    clean_text = "\n".join(
        line for line in (text or "").splitlines() if line.strip()
    )
    if not clean_text:
        # Nothing to parse: skip the API round-trip entirely.
        return []

    # NOTE(review): the prompt never tells the model to ignore a report's
    # reference-range column, so on reports without measured values the model
    # may return ranges as "value" - consider tightening the wording.
    prompt = f"""
You are a medical report parser.

The following text is a blood test report extracted from a PDF. The format is broken across multiple lines — each test's name, value, and unit may appear on separate lines.

---

Your job:
1. Extract all **test results** that include:
   - `"name"`: the test name (e.g., "Hemoglobin")
   - `"value"`: the actual test value with units, if available (e.g., "11.9 g/dL")

2. Ignore any metadata (e.g., "Patient Name", "Age", "Note", etc.)

3. Return the output as a valid **JSON list**, like this:

[
  {{
    "name": "Hemoglobin",
    "value": "11.9 g/dL"
  }},
  {{
    "name": "Platelet Count",
    "value": "1.5 lakhs/cumm"
  }}
]

---

Text:
{clean_text}
"""

    model = genai.GenerativeModel(model_name)

    try:
        response = model.generate_content(prompt)
        return extract_json_from_text(response.text)
    except Exception as e:
        # JSON decoding problems are already handled inside
        # extract_json_from_text, so anything caught here comes from the API
        # call or from reading the response.
        print("Gemini request failed:", e)
        return []


In [65]:
# Send the extracted report text to Gemini and keep the parsed result.
tokens = extract_with_gemini(text)

In [66]:
tokens

[{'name': 'Hemoglobin', 'value': '12 - 15 g/dl'},
 {'name': 'Total Leukocyte Count', 'value': '4,800 - 10,800 cumm'},
 {'name': 'Platelet Count', 'value': '1.5 - 4.1 lakhs/cumm'},
 {'name': 'Total RBC Count', 'value': '3.9 - 4.8 million/cumm'},
 {'name': 'Hematocrit Value, Hct', 'value': '36 - 46 %'},
 {'name': 'Mean Corpuscular Volume, MCV', 'value': '83 - 101 fL'},
 {'name': 'Mean Cell Haemoglobin, MCH', 'value': '27 - 32 Pg'},
 {'name': 'Mean Cell Haemoglobin CON, MCHC', 'value': '31.5 - 34.5 %'}]

#### Resources used:
 - https://pymupdf.readthedocs.io/en/latest/recipes-text.html#how-to-extract-all-document-text
 - https://ollama.com/library