In [1]:
import os
import io
import fitz  # PyMuPDF (For extracting text from non-scanned PDFs)
import pytesseract  # OCR for scanned PDFs
import cv2  # OpenCV for image processing
import pdf2image  # Convert PDF to images
import re  # Regular expressions for cleaning text
import numpy as np
import pandas as pd  # For handling CSV
import requests  # For Google Maps API (Finding Nearby Specialists)
from tabulate import tabulate  # For structured output formatting
import ipywidgets as widgets
from IPython.display import display

In [11]:
# Step 1: Upload PDF (Jupyter Compatible)
# Single-file upload widget restricted to ".pdf"; get_uploaded_pdf() reads
# whatever the user selected from this widget's ``value``.
upload_button = widgets.FileUpload(multiple=False, accept='.pdf')
display(upload_button)

def get_uploaded_pdf():
    """Return the uploaded PDF as an in-memory binary stream, or None.

    Reads the module-level ``upload_button`` widget. Two layouts of
    ``FileUpload.value`` are supported:

    * a dict keyed by filename (ipywidgets 7), and
    * a tuple/list of per-file dicts (ipywidgets 8).

    Returns:
        io.BytesIO wrapping the ``content`` of the first uploaded file,
        or None when nothing has been uploaded yet.
    """
    uploaded = upload_button.value
    if not uploaded:
        return None

    # dict -> its values are the per-file records; sequence -> the items are.
    if isinstance(uploaded, dict):
        entries = list(uploaded.values())
    else:
        entries = list(uploaded)
    file_info = entries[0]

    # ``content`` may be bytes or a memoryview depending on the widget
    # version; bytes() normalizes both before wrapping in a stream.
    return io.BytesIO(bytes(file_info['content']))

FileUpload(value={}, accept='.pdf', description='Upload')

In [12]:
# Step 2: Convert PDF to Images (if scanned)
def pdf_to_images(pdf_file, dpi=300):
    """Rasterize the pages of a PDF stream into images (used before OCR).

    Args:
        pdf_file: Seekable binary file-like object holding the PDF.
        dpi: Rendering resolution forwarded to pdf2image. Defaults to 300,
            the value that was previously hard-coded.

    Returns:
        The result of ``pdf2image.convert_from_bytes`` for the document.
    """
    # Rewind before reading: process_medical_report() calls this after
    # extract_text_from_pdf() has already read the stream to the end, and a
    # read from EOF would hand pdf2image an empty byte string.
    pdf_file.seek(0)
    return pdf2image.convert_from_bytes(pdf_file.read(), dpi=dpi)

In [13]:
# Step 3: Extract Text using OCR (for scanned PDFs)
def ocr_extract_text(images):
    """Run Tesseract OCR over page images and return the combined text.

    Each image is converted to a NumPy array, reduced to grayscale,
    binarized with a fixed threshold of 150 (max value 255) and passed to
    Tesseract with the English language model. The text of every page is
    followed by a newline in the returned string.
    """
    def _ocr_page(page_image):
        # Grayscale + hard threshold before handing the page to Tesseract.
        gray = cv2.cvtColor(np.array(page_image), cv2.COLOR_RGB2GRAY)
        binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)[1]
        return pytesseract.image_to_string(binary, lang="eng")

    return "".join(_ocr_page(page_image) + "\n" for page_image in images)

In [14]:
# Step 4: Extract Text from Selectable PDFs
def extract_text_from_pdf(pdf_file):
    """Return the text PyMuPDF extracts from a PDF stream.

    The stream is rewound and read in full, opened with PyMuPDF as a PDF,
    and the plain text of each page is collected. Every page's text is
    followed by a newline in the returned string.
    """
    pdf_file.seek(0)
    raw_bytes = pdf_file.read()
    with fitz.open(stream=raw_bytes, filetype="pdf") as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "".join(chunk + "\n" for chunk in page_texts)


In [15]:
# Step 5: Clean Extracted Text
def clean_text(text):
    """Normalize extracted report text to single-space-separated tokens.

    Only ASCII letters, digits and the characters ``. , % ( ) : / -`` are
    kept. Every run of anything else (including whitespace and newlines) is
    replaced by exactly one space, and leading/trailing spaces are removed.

    Doing this in one pass matters: collapsing whitespace first and then
    replacing disallowed characters with spaces re-introduces runs of
    spaces around every removed character.
    """
    return re.sub(r'[^a-zA-Z0-9.,%():/-]+', ' ', text).strip()


In [16]:
# Step 6: Read Threshold Values from CSV
def load_thresholds(csv_path="threshold3.csv"):
    """Load the reference-range table from a CSV file.

    Returns the DataFrame produced by ``pd.read_csv(csv_path)``. If the
    path does not exist, a warning is printed and None is returned instead.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)

    print("⚠️ Warning: Threshold CSV file not found!")
    return None

# Loaded once at cell execution; None when the CSV is missing.
threshold_df = load_thresholds()

In [17]:
# Step 7: Extract Lab Values
def _parse_bound(raw):
    """Convert one MIN/MAX cell to float; '-', blank or NaN mean no bound (None)."""
    if pd.isna(raw):
        return None
    if isinstance(raw, str):
        raw = raw.strip()
        if raw in ("", "-"):
            return None
    return float(raw)  # ValueError on junk is handled by the caller


def _build_test_pattern(test_name):
    """Compile a regex that finds ``test_name`` and captures the first number after it."""
    # Escape every word so names such as "Hemoglobin (Hb)" are matched
    # literally and cannot inject capture groups; allow any whitespace run
    # between the words of a multi-word name.
    name_regex = r"\s+".join(re.escape(part) for part in test_name.split())
    # (?<!\w) stops the name from matching in the middle of another word
    # (e.g. "Iron" inside "Environment"); suffixes such as plurals still match.
    return re.compile(rf"(?<!\w){name_regex}.*?(\d+\.?\d*)", re.IGNORECASE)


def extract_lab_values(text, threshold_df):
    """Find lab results in report text and classify them against reference ranges.

    Args:
        text: Report text to search (normally the output of ``clean_text``).
        threshold_df: DataFrame with the columns ``TEST``, ``MIN VALUE`` and
            ``MAX VALUE``, or None. A bound given as ``-`` (or left empty)
            means that side of the range is unbounded.

    Returns:
        A list of ``[test_name, value, status]`` lists, one per CSV row whose
        test name occurs in ``text``. ``value`` is the first number that
        follows the name, ``status`` is one of "🔻 Too Low", "🔺 Too High" or
        "✅ Normal". When the test name equals a key of
        ``specialist_recommendations`` ignoring case, that key's spelling is
        reported; otherwise the (stripped) CSV name is used. Returns ``[]``
        when ``threshold_df`` is None.
    """
    if threshold_df is None:
        return []

    # Lower-cased recommendation key -> canonical spelling of that key.
    canonical_names = {key.lower(): key for key in specialist_recommendations}

    results = []
    for _, row in threshold_df.iterrows():
        raw_name = row["TEST"]
        # Blank / NaN names cannot be searched for (an empty pattern would
        # match the first number in the document), so skip such rows.
        if not isinstance(raw_name, str) or not raw_name.strip():
            continue
        test_name = raw_name.strip()

        try:
            min_val = _parse_bound(row["MIN VALUE"])
            max_val = _parse_bound(row["MAX VALUE"])
        except (TypeError, ValueError):
            continue  # Skip rows whose bounds are not numeric

        match = _build_test_pattern(test_name).search(text)
        if not match:
            continue

        value = float(match.group(1))
        if min_val is not None and value < min_val:
            status = "🔻 Too Low"
        elif max_val is not None and value > max_val:
            status = "🔺 Too High"
        else:
            status = "✅ Normal"

        reported_name = canonical_names.get(test_name.lower(), test_name)
        results.append([reported_name, value, status])

    return results


In [22]:
# Step 8: Specialist Recommendations (Based on Abnormal Values)
# Static lookup table: lab test name -> advice shown when that test is flagged.
#   "condition"   : text printed after "This may indicate:".
#   "specialists" : list of strings printed one per line under the
#                   "Recommended Doctors & Hospitals" heading.
# extract_lab_values() compares the keys with the CSV test names ignoring
# case and reports the key's spelling; process_medical_report() then looks the
# reported name up with ``test in specialist_recommendations``.
# NOTE(review): an entry is only reachable if the threshold CSV has a TEST row
# with the same name -- confirm this for "Tuberculosis", which names a
# condition rather than a lab test.
specialist_recommendations = {
    "Calcium": {
        "condition": "Calcium deficiency, which may lead to bone problems.",
        "specialists": [
            "Dr Balamurugan J - Kauvery Hospitals",
            "Dr Kanniraj - Magna Ortho Clinic",
            "Chennai Ortho Clinic",
            "Shri Bone & Joints"
        ]
    },
    "Sodium": {
        "condition": "Sodium imbalance, which may affect heart health.",
        "specialists": [
            "Dr. D Vaidhynathan - Apollo",
            "Dr A.B Gopalamurugan - Royapettah Chennai",
            "Dr.Dhamodaran K - Sidharam Heart Clinic Adyar"
        ]
    },
    "Monocytes": {
        "condition": "Liver or tumor-related issues.",
        "specialists": [
            "Dr. Aswin Krishna - Apollo (Liver Specialist)",
            "Dr. S. Arulprakash - MGM Healthcare",
            "Dr. Anisha Ashok - Laser and Laparoscopic Hospital"
        ]
    },
    "Lymphocytes": {
        "condition": "Possible signs of cancer.",
        "specialists": [
            "Adyar Cancer Institute",
            "Dr Vimalathithan - C Dot Hospital",
            "MGM Cancer Institute"
        ]
    },
    "Potassium": {
        "condition": "Potassium imbalance, which may indicate diabetes.",
        "specialists": [
            "Dr. Mohan's Diabetes Specialities Centre - Gopalapuram",
            "Dr Shanmugasundar - Magna Clinic",
            "Dr. Kavitha G - Nannalam Clinic"
        ]
    },
    "Iron": {
        "condition": "Excess iron or low transferrin, which may indicate Hemochromatosis.",
        "specialists": [
            "Dr. B Benjamin - MGM Healthcare Malar Hospitals, Adyar",
            "Dr Akila Mani - Apollo Speciality Hospitals, Vanagaram",
            "Dr. Mohamed Sajjid - MS Child Care Clinic, Royapettah"
        ]
    },
    "Uric Acid": {
        "condition": "Excessive uric acid, which may lead to gout.",
        "specialists": [
            "Dr. Waseem Ahmed N - Billroth Hospital, Raja Annamalai Puram",
            "Dr. Sheethal Suresh - MGM Healthcare Malar Hospitals, Adyar",
            "Dr. Krishnamurthy - Apollo Cancer Centers, Teynampet Chennai"
        ]
    },
    "Tuberculosis": {
        "condition": "High lymphocytes and monocytes, which may indicate tuberculosis.",
        "specialists": [
            "Dr. Raghavan K - Billroth Hospital, Raja Annamalai Puram",
            "Dr. Roshan Kumar - SIMS Hospital, Vadapalani",
            "Dr. Pravin K Aggarwal - Apollo Spectra Hospital, Alwarpet"
        ]
    },
    "Random Glucose": {
        "condition": "Abnormal levels may cause Diabetic Ketoacidosis.",
        "specialists": [
            "Dr. Sundararaman P G - Billroth Hospital, Raja Annamalai Puram",
            "Dr. Bharat R - Arka Center fot Hormonal Health, Anna Nagar",
            "Dr. Geethalakshmi - Dr. Kamakshi Memorial Hospital, Pallikaranai"
        ]
    },
    "Albumin": {
        # The embedded "\n" makes the printed condition wrap onto a second line.
        "condition": "Low albumin levels (hypoalbuminemia) \n can cause fluid retention and swelling (Edema)",
        "specialists": [
            "Dr. Jeysel Suraj - Suraj Hospital, Medavakkam",
            "Dr. Radhiga G - Billroth Hospital, Raja Annamalai Puram",
            "Dr. V. Jina Das - MGM Healthcare Malar Hospitals, Adyar"
        ]
    }
}

In [23]:
# Step 9: Process Medical Report
def process_medical_report():
    """Run the full pipeline on the uploaded PDF and print the findings.

    Steps:
        1. Fetch the uploaded file via ``get_uploaded_pdf()``; stop with a
           message when nothing was uploaded.
        2. Extract the embedded text; if it is empty or whitespace only,
           fall back to rendering the pages and running OCR.
        3. Clean the text and match it against ``threshold_df``.
        4. Print a table of detected values and, for every value flagged
           "Too High" / "Too Low" that has an entry in
           ``specialist_recommendations``, print the associated condition
           and specialists.

    Returns:
        None. All results are printed.
    """
    print("\n📂 **Upload a medical report PDF...**")
    pdf_file = get_uploaded_pdf()

    if not pdf_file:
        print("❌ No file uploaded. Exiting...")
        return

    print("\n📄 **Processing Report...**\n")

    # Try extracting text directly
    text = extract_text_from_pdf(pdf_file)

    if not text.strip():  # If no text found, use OCR
        print("⚠️ No selectable text found. Using OCR...\n")
        # The direct extraction above read the stream to EOF; rewind so the
        # rasterizer receives the whole document instead of empty bytes.
        pdf_file.seek(0)
        images = pdf_to_images(pdf_file)
        text = ocr_extract_text(images)

    text = clean_text(text)  # Clean extracted text

    # Extract lab values
    values = extract_lab_values(text, threshold_df)

    # Display Lab Test Results
    print("\nLab Test Results:\n")
    if values:
        print(tabulate(values, headers=["Test", "Result", "Status"], tablefmt="fancy_grid"))
    else:
        print("⚠️ No lab test values detected in the report.")

    # Check for abnormalities and suggest specialists
    for test, value, status in values:
        is_abnormal = "Too High" in status or "Too Low" in status
        if not is_abnormal or test not in specialist_recommendations:
            continue

        recommendation = specialist_recommendations[test]
        print(f"\n⚠️ You have abnormal levels of {test}.")
        print(f"🔹 This may indicate: {recommendation['condition']}")
        print("\n Recommended Doctors & Hospitals:")
        for specialist in recommendation["specialists"]:
            print(f"🔹 {specialist}")

# Run the pipeline. Execute this cell only after a PDF has been selected in the
# upload widget: with no upload, get_uploaded_pdf() returns None and the
# function prints "No file uploaded" and returns without processing anything.
process_medical_report()


📂 **Upload a medical report PDF...**

📄 **Processing Report...**


Lab Test Results:

╒═════════════════╤══════════╤═════════════╕
│ Test            │   Result │ Status      │
╞═════════════════╪══════════╪═════════════╡
│ Albumin         │      1   │ 🔻 Too Low  │
├─────────────────┼──────────┼─────────────┤
│ Creatinine      │      0.8 │ 🔻 Too Low  │
├─────────────────┼──────────┼─────────────┤
│ Total bilirubin │      5   │ ✅ Normal   │
├─────────────────┼──────────┼─────────────┤
│ Total protein   │      4   │ 🔻 Too Low  │
├─────────────────┼──────────┼─────────────┤
│ Triglycerides   │    129   │ 🔺 Too High │
├─────────────────┼──────────┼─────────────┤
│ Uric Acid       │      4   │ 🔺 Too High │
╘═════════════════╧══════════╧═════════════╛

⚠️ You have abnormal levels of Albumin.
🔹 This may indicate: Low albumin levels (hypoalbuminemia) 
 can cause fluid retention and swelling (Edema)

 Recommended Doctors & Hospitals:
🔹 Dr. Jeysel Suraj - Suraj Hospital, Medavakkam
🔹 Dr. Radhiga