<a href="https://colab.research.google.com/github/Balavignesh-25/Clinical-Decision-Support-for-Patient-Reports/blob/main/LLM_based_Clinical_Decision_Support_for_Patient_Reports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# System packages: the Tesseract OCR engine and the Poppler PDF utilities.
# presumably these are the native backends for pytesseract / pdf2image — confirm against their docs
!apt-get update -qq
!apt-get install -y tesseract-ocr poppler-utils
# Python packages used by this notebook (Groq client, OCR wrapper, PDF-to-image, imaging)
!pip install -q groq pytesseract pdf2image pillow


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 53 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.12 [186 kB]
Fetched 186 kB in 0s (2,404 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 117540 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22

In [2]:
import json
import re
from datetime import datetime
from typing import Dict, List

import pytesseract
from pdf2image import convert_from_path
from groq import Groq
from google.colab import files, userdata


In [4]:
def get_groq_client() -> Groq:
    """
    Build a Groq client from the API key stored in Colab Secrets.

    Returns:
        Groq: a client created with the ``GROQ_API_KEY`` secret.

    Raises:
        RuntimeError: if the secret does not exist, is empty, or this
            notebook has not been granted access to it.
    """
    try:
        api_key = userdata.get("GROQ_API_KEY")
    except userdata.SecretNotFoundError as exc:
        # userdata.get raises for a missing secret instead of returning None,
        # so translate it into the same friendly error as an empty value.
        raise RuntimeError(
            "❌ GROQ_API_KEY not found.\n"
            "Add it in Colab → Secrets and restart runtime."
        ) from exc
    except userdata.NotebookAccessError as exc:
        raise RuntimeError(
            "❌ GROQ_API_KEY is not accessible from this notebook.\n"
            "Enable notebook access in Colab → Secrets and restart runtime."
        ) from exc

    if not api_key:
        raise RuntimeError(
            "❌ GROQ_API_KEY not found.\n"
            "Add it in Colab → Secrets and restart runtime."
        )
    return Groq(api_key=api_key)

# Shared client used by every LLM call in the cells below
client = get_groq_client()


In [5]:
# Prompt for the patient reports; the result is keyed by uploaded file name
uploaded = files.upload()

# The OCR step converts PDFs to images, so anything that is not a PDF is ignored
pdf_files = [name for name in uploaded if name.lower().endswith(".pdf")]
skipped_files = [name for name in uploaded if name not in pdf_files]

# Stop here with a clear message rather than failing later in the pipeline
if not pdf_files:
    raise ValueError("No PDF reports were uploaded. Upload at least one .pdf file.")

print("📄 Uploaded patient reports:")
for name in pdf_files:
    print(" -", name)

if skipped_files:
    print("Skipped non-PDF files:")
    for name in skipped_files:
        print(" -", name)


Saving report_2025_latest_followup.pdf to report_2025_latest_followup.pdf
Saving report_2024_hospital_admission.pdf to report_2024_hospital_admission.pdf
Saving report_2024_complication_screening.pdf to report_2024_complication_screening.pdf
Saving report_2023_followup.pdf to report_2023_followup.pdf
Saving report_2022_initial_diagnosis.pdf to report_2022_initial_diagnosis.pdf
üìÑ Uploaded patient reports:
 - report_2025_latest_followup.pdf
 - report_2024_hospital_admission.pdf
 - report_2024_complication_screening.pdf
 - report_2023_followup.pdf
 - report_2022_initial_diagnosis.pdf


In [6]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    OCR every page of a PDF and return the combined text.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        The OCR text of all non-blank pages joined with newlines, with
        leading and trailing whitespace removed.
    """
    ocr_pages = [
        pytesseract.image_to_string(page)
        for page in convert_from_path(pdf_path)
    ]
    # Pages whose OCR result is only whitespace contribute nothing
    non_blank_pages = [page_text for page_text in ocr_pages if page_text.strip()]
    return "\n".join(non_blank_pages).strip()


In [7]:
def anonymize_phi_text(text: str):
    """
    Replace patient identifiers in ``text`` with placeholder tokens.

    Three kinds of identifier are masked:
      * the value following ``Patient Name:`` -> ``<<PATIENT_NAME>>``
      * e-mail-like strings (``\\S+@\\S+``)    -> ``<<EMAIL_n>>``
      * standalone 10-digit numbers           -> ``<<PHONE_n>>``

    Each distinct value gets exactly one token, numbered in order of first
    appearance.

    Args:
        text: raw report text.

    Returns:
        tuple: ``(anonymized_text, phi_map)`` where ``phi_map`` maps every
        placeholder token to the original text it replaced.
    """
    phi_map = {}

    def _original(value: str) -> str:
        # A match may contain a placeholder inserted by an earlier step
        # (e.g. the name inside an e-mail address); store the raw text so
        # every phi_map value can be restored in a single replacement.
        for token, replaced in phi_map.items():
            value = value.replace(token, replaced)
        return value

    def _mask(pattern: str, label: str, source: str) -> str:
        # One regex pass: each distinct match gets a single numbered token.
        tokens = {}

        def _swap(match):
            found = match.group()
            if found not in tokens:
                token = f"<<{label}_{len(tokens)}>>"
                tokens[found] = token
                phi_map[token] = _original(found)
            return tokens[found]

        return re.sub(pattern, _swap, source)

    # Patient name
    name_match = re.search(r"Patient Name:\s*(.*)", text)
    if name_match:
        name = name_match.group(1).strip()
        # str.replace("", token) would insert the token between every
        # character, so an empty capture must be skipped.
        if name:
            phi_map["<<PATIENT_NAME>>"] = name
            text = text.replace(name, "<<PATIENT_NAME>>")

    # Emails first: an address may itself contain a 10-digit run, which must
    # be masked as part of the address rather than as a separate phone.
    text = _mask(r"\S+@\S+", "EMAIL", text)

    # Phones
    text = _mask(r"\b\d{10}\b", "PHONE", text)

    return text, phi_map


In [8]:
def safe_json_loads(text: str) -> dict:
    """
    Parse the first JSON object embedded in a model reply.

    Decoding starts at the first ``{`` and stops at the end of that object,
    so surrounding prose or code fences — including trailing text that
    happens to contain braces — does not break parsing.

    Args:
        text: raw model output.

    Returns:
        dict: the decoded JSON object.

    Raises:
        ValueError: if ``text`` is empty/None or contains no ``{``.
        json.JSONDecodeError: (a ``ValueError`` subclass) if the text from
            the first ``{`` onward is not a valid JSON object.
    """
    start = text.find("{") if text else -1
    if start == -1:
        raise ValueError("No valid JSON found in model output")

    # raw_decode ignores whatever follows the object, unlike json.loads
    parsed, _end = json.JSONDecoder().raw_decode(text, start)
    return parsed


In [9]:
# System instructions for the per-report event extraction call
EVENT_SYSTEM_PROMPT = """
Return ONLY valid JSON.
Extract:
- report_date (YYYY-MM-DD if available)
- medical_events (date, event, category)
"""

def extract_events(text: str) -> dict:
    """
    Ask the model for the report date and medical events found in ``text``.

    Args:
        text: report text sent as the user message.

    Returns:
        dict: the JSON object parsed from the model's reply.
    """
    conversation = [
        {"role": "system", "content": EVENT_SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=conversation,
        temperature=0,
    )
    reply = completion.choices[0].message.content
    return safe_json_loads(reply)


In [10]:
def merge_phi_map(text: str, phi_map: dict, global_map: dict) -> str:
    """
    Fold one report's placeholder map into ``global_map`` without collisions.

    Token names restart for every report, so the same token can stand for
    different values in different reports. A value already present in
    ``global_map`` reuses its existing token; a token that is already taken
    by a different value is renamed to a fresh, unused token.

    Args:
        text: anonymized text of the report.
        phi_map: token -> original value for this report.
        global_map: token -> original value across all reports (updated in place).

    Returns:
        ``text`` with any renamed tokens substituted.
    """
    token_by_value = {value: token for token, value in global_map.items()}
    renames = {}

    for token, value in phi_map.items():
        target = token_by_value.get(value)
        if target is None:
            target = token
            stem = token[:-2] if token.endswith(">>") else token
            suffix = 1
            # Pick a name unused both globally and by this report's other tokens
            while target in global_map or (target != token and target in phi_map):
                target = f"{stem}_{suffix}>>"
                suffix += 1
            global_map[target] = value
            token_by_value[value] = target
        if target != token:
            renames[token] = target

    if renames:
        # Single pass so that swapped names (A -> B and B -> A) cannot chain
        pattern = re.compile("|".join(re.escape(old) for old in renames))
        text = pattern.sub(lambda match: renames[match.group()], text)
    return text


reports = []
global_phi_map = {}

for pdf in pdf_files:
    raw_text = extract_text_from_pdf(pdf)
    anonymized_text, phi_map = anonymize_phi_text(raw_text)

    # A plain dict.update() would let a later report overwrite an earlier
    # report's value for the same token name.
    anonymized_text = merge_phi_map(anonymized_text, phi_map, global_phi_map)

    extracted = extract_events(anonymized_text)
    # Keep the anonymized text alongside the extracted fields for later cells
    extracted["raw_text"] = anonymized_text
    reports.append(extracted)


In [11]:
def parse_date(d):
    """
    Parse a report date string into a ``datetime``.

    Accepts ``YYYY-MM-DD`` as well as day-month-year forms with a month name
    such as ``12-Mar-2022``, ``12 Mar 2022`` and ``12 March 2022``.
    Surrounding whitespace is ignored.

    Args:
        d: the date string; may be ``None`` or any non-string value.

    Returns:
        datetime: the parsed date, or ``datetime.min`` when ``d`` is not a
        string or matches none of the supported formats.
    """
    if not isinstance(d, str):
        return datetime.min

    candidate = d.strip()
    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d %b %Y", "%d %B %Y"):
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            # Not this format; try the next one
            continue
    return datetime.min

# Fail with a clear message instead of an IndexError on reports[-1]
if not reports:
    raise ValueError("No reports available. Upload and process at least one report first.")

# Oldest first. parse_date returns datetime.min for dates it cannot parse,
# so such reports sort to the front and are never picked as the latest
# unless every report is undated.
reports.sort(key=lambda r: parse_date(r.get("report_date")))
latest_report = reports[-1]
previous_reports = reports[:-1]


In [12]:
# System instructions for summarising everything before the latest report
HISTORY_PROMPT = """
Extract historical medical information ONLY.

Return JSON:
{
  "past_diseases": [],
  "past_medications": [],
  "past_consultations": []
}
"""

# Anonymized text of every report except the latest, in sorted order
history_text = "\n\n".join(r["raw_text"] for r in previous_reports)

if history_text.strip():
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": HISTORY_PROMPT},
            {"role": "user", "content": history_text}
        ],
        temperature=0
    )

    history_structured = safe_json_loads(response.choices[0].message.content)
else:
    # With no earlier reports there is nothing to summarise; skip the model
    # call rather than sending it an empty message and trusting the reply.
    history_structured = {
        "past_diseases": [],
        "past_medications": [],
        "past_consultations": []
    }


In [13]:
# System instructions for analysing the most recent report
ABNORMALITY_PROMPT = """
You are a clinical decision-support system.

Tasks:
- Identify abnormal lab values
- Assess severity
- Provide recommendations
- Suggest general medications (non-prescriptive)
- Advise doctor consultation timeframe if severe

Return JSON only.
"""

# Only the latest report's anonymized text is sent for this analysis
abnormality_messages = [
    {"role": "system", "content": ABNORMALITY_PROMPT},
    {"role": "user", "content": latest_report["raw_text"]},
]

response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=abnormality_messages,
    temperature=0,
)

latest_report_analysis = safe_json_loads(response.choices[0].message.content)


In [14]:
# Set to False to keep placeholder tokens in the final output
ENABLE_PHI_REIDENTIFICATION = True

def reidentify_phi(data: dict, phi_map: dict, enable: bool):
    """
    Optionally swap placeholder tokens in ``data`` back to the original values.

    Args:
        data: JSON-serialisable result dict that may contain tokens.
        phi_map: token -> original value.
        enable: when False, tokens are left in place.

    Returns:
        dict: a new dict with a ``phi_anonymized`` flag added — ``True`` when
        tokens were left in place, ``False`` when they were restored. The
        input ``data`` is not modified.
    """
    if not enable:
        # Shallow copy so the caller's dict does not gain an extra key
        return {**data, "phi_anonymized": True}

    data_str = json.dumps(data)
    for token, original in phi_map.items():
        # The value is spliced into serialized JSON, so it must be
        # JSON-escaped first (quotes, backslashes, newlines, non-ASCII);
        # [1:-1] drops the surrounding quotes that json.dumps adds.
        escaped = json.dumps(str(original))[1:-1]
        data_str = data_str.replace(token, escaped)

    restored = json.loads(data_str)
    restored["phi_anonymized"] = False
    return restored


In [15]:
# Combine the history summary with the latest-report analysis, then restore
# (or keep masked) the identifiers according to ENABLE_PHI_REIDENTIFICATION.
final_output = reidentify_phi(
    {
        "medical_history_summary": history_structured,
        "latest_report_analysis": latest_report_analysis,
    },
    global_phi_map,
    ENABLE_PHI_REIDENTIFICATION,
)

# Pretty-print the combined result
print(json.dumps(final_output, indent=2))


{
  "medical_history_summary": {
    "past_diseases": [
      {
        "disease": "Type 2 Diabetes Mellitus",
        "diagnosis_date": "12-Mar-2022"
      },
      {
        "disease": "Stable Angina",
        "diagnosis_date": "22-Aug-2024"
      }
    ],
    "past_medications": [
      {
        "medication": "Metformin",
        "dosage": "500 mg once daily",
        "start_date": "18-Apr-2023"
      },
      {
        "medication": "Metformin",
        "dosage": "500 mg twice daily",
        "start_date": "10-Jan-2024"
      },
      {
        "medication": "Aspirin",
        "dosage": "75 mg once daily",
        "start_date": "22-Aug-2024"
      }
    ],
    "past_consultations": [
      {
        "consultation": "Ophthalmology review",
        "advised_date": "10-Jan-2024"
      },
      {
        "consultation": "Cardiology follow-up",
        "advised_date": "22-Aug-2024"
      }
    ]
  },
  "latest_report_analysis": {
    "patient": {
      "name": "John Doe",
      "age": 