In [68]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import json
import fitz     
import httpx

In [69]:
# Relative path to the sample report PDF that the cells below parse.
path = './resources/sample_report-3.pdf'

In [70]:
# Function to extract the text from file
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The ``get_text("text")`` output of each page, in page order,
        separated by a single newline character.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

# Run the extraction on the sample PDF; `text` holds the newline-joined page text.
text = extract_text_from_pdf(path)

In [71]:
text

'Cholesterol\nmg/dL\nDesirable : <200\nBorderline High : 200-239\nHigh : >240\nCholesterol oxidase – Peroxidase method\n189.0\nTriglyceride\nH\nmg/dL\nNormal : <150\nBorderline : 150-199\nHigh : 200-499\nVery High : >500 \nEzymatic (Lipase/GK/GPO/POD)\n168.0\nHDL Cholesterol\nmg/dL\nLow: <40.0 \nHigh: >60.0  \nPTA/MgCl2\n60.0\nDirect LDL\nH\nmg/dL\nOptimal: <100\nNear to above Optimal: \n100–129\nBorderline High: 130-159\nHigh: 160–189\nVery High: =190\nDirect measured\n100.39\nVLDL\nmg/dL\n15 - 35\nCalculated\n33.60\nCHOL/HDL Ratio\nUp to 5.0\nCalculated\n3.1\nLDL/HDL Ratio\nUp to 3.5\nCalculated\n1.7\nLipid Profile\nTest\nResult\nUnit\nBiological Ref. Interval\nDr.  Sanjeev Shah\nDr.Yash Shah\nDr. Purvish Darji\nMD Path\nMD Path\nMD(Path)\nThis is an Electronically Authenticated Report.\n# Referred Test\nMale / 41 Y\n01-Feb-1982\nSex/Age\nStatus :\nFinal\n20-Feb-2023 11:29\nApproved on\n:\n:\nRef. Id\n:\nPrinted On\n28-Feb-2023 10:26\n:\nRef. By\n:\n1. NRL SAWPL Gujarat Ahmedabad Pal

In [72]:
import re

def extract_json_from_text(text):
    """Extract the first JSON array of objects embedded in free-form text.

    The text may contain arbitrary content around the array (prose, Markdown
    code fences, ...). Each ``[`` is tried in order as a potential start of a
    JSON document; the first one that decodes to a non-empty list whose first
    element is an object is returned. Using the JSON decoder itself (rather
    than a regex) means nested arrays/objects and ``}]`` sequences inside
    string values do not truncate the match.

    Args:
        text: String that is expected to contain a JSON array somewhere.

    Returns:
        list: The decoded array, or ``[]`` if nothing suitable was found or
        ``text`` is not a string. Failures are reported via ``print``.
    """
    try:
        decoder = json.JSONDecoder()
        start = text.find("[")
        while start != -1:
            try:
                # raw_decode parses one JSON value beginning at `start` and
                # ignores whatever trails it.
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, list) and candidate and isinstance(candidate[0], dict):
                return candidate
            start = text.find("[", start + 1)
        raise ValueError("No JSON array found")
    except Exception as e:
        # Best-effort helper: callers rely on getting a list back, never an exception.
        print("❌ JSON parsing failed:", e)
        return []

In [None]:
import json
import os
import warnings

import google.generativeai as genai
from dotenv import load_dotenv

# Read the API key from the local env file so it never appears in the notebook.
load_dotenv(dotenv_path="keys.env")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # Surface the misconfiguration here; otherwise it only shows up later as an
    # empty result from the Gemini call.
    warnings.warn(
        "GOOGLE_API_KEY was not found in keys.env or the environment; "
        "Gemini requests will likely fail."
    )
genai.configure(api_key=GOOGLE_API_KEY)

def extract_with_gemini(text, model_name="models/gemini-1.5-flash"):
    """Ask a Gemini model to pull test names and values out of raw report text.

    Blank lines are stripped from ``text``, the remainder is embedded in a
    parsing prompt, and the model's reply is scanned for a JSON array with
    ``extract_json_from_text``.

    Args:
        text: Raw text of the report (e.g. as extracted from a PDF).
        model_name: Gemini model identifier passed to ``genai.GenerativeModel``.

    Returns:
        list: Dicts with ``"name"`` and ``"value"`` keys as requested by the
        prompt, or ``[]`` when the input is empty, the request fails, or no
        JSON array is found in the reply.
    """
    if not text:
        return []

    clean_text = "\n".join(line for line in text.splitlines() if line.strip())
    if not clean_text:
        # Nothing to parse; skip the API call.
        return []

    prompt = f"""
      You are a medical report parser.

      The following text is a blood test report extracted from a PDF. The format is broken across multiple lines — each test's name, value, and unit may appear on separate lines.

      ---

      Your job:
      1. Extract all **test results** that include:
        - `"name"`: the test name (e.g., "Hemoglobin")
        - `"value"`: the actual test value with units, if available (e.g., "11.9 g/dL")

      2. Ignore any metadata (e.g., "Patient Name", "Age", "Note", etc.)

      3. Return the output as a valid **JSON list**, like this:

      [
        {{
          "name": "Hemoglobin",
          "value": "11.9 g/dL"
        }},
        {{
          "name": "Platelet Count",
          "value": "1.5 lakhs/cumm"
        }}
      ]

      ---

      Text:
      {clean_text}
    """

    try:
        model = genai.GenerativeModel(model_name)
        response = model.generate_content(prompt)
        return extract_json_from_text(response.text)
    except Exception as e:
        # JSON errors are already handled inside extract_json_from_text, so
        # anything reaching here is a request/response failure.
        print("Gemini request failed:", e)
        return []


In [74]:
# Send the extracted report text to Gemini; `tokens` is the parsed list of
# {"name", "value"} dicts (an empty list if the call or parsing failed).
tokens = extract_with_gemini(text)

In [75]:
tokens

[{'name': 'Cholesterol', 'value': '189.0 mg/dL'},
 {'name': 'Triglyceride', 'value': '168.0 mg/dL'},
 {'name': 'HDL Cholesterol', 'value': '60.0 mg/dL'},
 {'name': 'Direct LDL', 'value': '100.39 mg/dL'},
 {'name': 'VLDL', 'value': '33.60 mg/dL'},
 {'name': 'CHOL/HDL Ratio', 'value': '3.1'},
 {'name': 'LDL/HDL Ratio', 'value': '1.7'}]