# In [None]:
import pandas as pd
import joblib
import pdfplumber
import re
import os
import tkinter as tk
from tkinter import filedialog
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the persisted prediction artifacts once, at import time. The paths are
# relative, so they are resolved against the current working directory.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only load artifact files that come from a trusted source.
model = joblib.load("diabetes_model.pkl")
scaler = joblib.load("scaler.pkl")
label_encoder = joblib.load("label_encoder.pkl")

# Column names given to the DataFrame that diagnose_from_pdf passes to
# scaler.transform. NOTE(review): presumably this must match the column order
# the scaler and model were fitted with -- confirm against the training code.
features = ['age', 'hypertension', 'heart_disease', 'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# 1. File Selector using Tkinter
def select_pdf():
    """Ask the user to pick a PDF through a native file-open dialog.

    A temporary, hidden Tk root window is created to host the dialog and is
    destroyed again before returning, so no invisible window or Tcl
    interpreter is left behind.

    Returns:
        The path chosen by the user, or an empty (falsy) value when the
        dialog is cancelled.
    """
    root = tk.Tk()
    root.withdraw()  # only the dialog should be visible, not the blank root
    try:
        return filedialog.askopenfilename(
            title="Select PDF file",
            filetypes=[("PDF Files", "*.pdf")],
        )
    finally:
        # Release the hidden root even if the dialog raises.
        root.destroy()

# 2. Extract raw text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Pages are visited in document order. Each page that yields text
    contributes that text followed by a newline; pages for which
    ``extract_text()`` returns nothing are skipped.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for sheet in document.pages:
            content = sheet.extract_text()
            if not content:
                continue
            chunks.append(content)
            chunks.append("\n")
    return "".join(chunks)

# 3. Extract parameters using regex
def extract_parameters(text):
    """Pull the model's input fields out of free-form report text.

    Every field is located with a case-insensitive regular expression and
    converted to the type the rest of the pipeline works with. Only the
    first match of each pattern is used.

    Args:
        text: Raw text of the report.

    Returns:
        A dict with the keys "age", "bmi", "blood_glucose_level",
        "HbA1c_level", "hypertension", "heart_disease" and
        "smoking_history", inserted in that order. "age" and
        "blood_glucose_level" are ints, "bmi" and "HbA1c_level" are floats,
        the two flags are 1 or 0, and "smoking_history" is a lower-cased
        string. A field that is not found maps to None.
    """

    def _to_flag(raw):
        # "yes"/"1" -> 1; the pattern only lets "no"/"0" through otherwise.
        return 1 if raw.lower() in ("yes", "1") else 0

    # A well-formed decimal ("27", "27.5", ".5"). The looser [\d.]+ would
    # also capture "6.8." or a lone ".", which float() rejects.
    decimal = r"(\d*\.?\d+)"

    # key -> (pattern, converter applied to capture group 1)
    fields = {
        # The lookbehind stops "Age" from matching inside other words; with
        # IGNORECASE a bare "Age" also hits "Page 1", "Dosage 500", ...
        "age": (r"(?<![A-Za-z])Age[:\s]*(\d+)", int),
        "bmi": (r"BMI[:\s]*" + decimal, float),
        "blood_glucose_level": (
            r"(?:Glucose|Blood Glucose Level)[:\s]*(\d+)",
            int,
        ),
        "HbA1c_level": (
            r"HbA1c(?:\s*[:\-])?\s*(?:\n\s*)?" + decimal,
            float,
        ),
        "hypertension": (r"Hypertension[:\s]*(Yes|No|1|0)", _to_flag),
        "heart_disease": (r"Heart Disease[:\s]*(Yes|No|1|0)", _to_flag),
        "smoking_history": (
            r"Smoking History[:\s]*(never|former|current|ever|not current|unknown)",
            str.lower,
        ),
    }

    data = {}
    for key, (pattern, convert) in fields.items():
        match = re.search(pattern, text, re.IGNORECASE)
        # None marks a field the report did not contain.
        data[key] = convert(match.group(1).strip()) if match else None

    return data

# 4. Classify Diabetes Type
def classify_diabetes(hba1c, glucose):
    """Map an HbA1c / glucose pair onto a coarse category label.

    The rules are checked from most to least severe and the first one that
    applies wins. All comparisons are strict (a value exactly on a
    threshold does not satisfy it).

    NOTE(review): these cut-offs are this script's own heuristic; confirm
    with a clinical reference before relying on the labels.

    Args:
        hba1c: HbA1c level.
        glucose: Blood glucose level.

    Returns:
        One of "Type 3c Diabetes", "Type 1 Diabetes", "Type 2 Diabetes",
        "Pre-Diabetes" or "Non-Diabetic".
    """
    if hba1c > 6.5 and glucose > 200:
        return "Type 3c Diabetes"

    if hba1c > 6.5 and glucose > 126:
        if hba1c > 7.5:
            return "Type 1 Diabetes"
        return "Type 2 Diabetes"

    if hba1c > 5.7 and glucose > 100:
        return "Pre-Diabetes"

    return "Non-Diabetic"

# 5. Main prediction function
def diagnose_from_pdf():
    """Run the interactive diagnosis flow for one user-selected PDF report.

    Steps: ask for a PDF, extract its text, parse the required parameters,
    encode and scale them, run the loaded model, and print the result
    together with the rule-based type from classify_diabetes.

    Problems (no file chosen, file missing, a required value not found, an
    unrecognized smoking-history label) are reported with print() and end
    the function early.

    Returns:
        None. All output goes to stdout.
    """
    pdf_path = select_pdf()
    if not pdf_path:
        print("❌ No file selected.")
        return

    if not os.path.exists(pdf_path):
        print(f"❌ File '{pdf_path}' does not exist.")
        return

    print("📄 Reading report...")
    text = extract_text_from_pdf(pdf_path)

    print("🔍 Extracting parameters...")
    data = extract_parameters(text)

    # Ensure all required features are present
    for feature in features:
        if data.get(feature) is None:
            print(f"⚠️ Missing value for: {feature}")
            return

    # Encode smoking history. transform() raises ValueError for a label the
    # encoder was not fitted with, so report that instead of crashing.
    try:
        data["smoking_history"] = label_encoder.transform([data["smoking_history"]])[0]
    except ValueError:
        print(f"⚠️ Unrecognized smoking history: {data['smoking_history']}")
        return

    # Build the row in `features` order. The dict returned by
    # extract_parameters uses a different key order, so taking
    # list(data.values()) would put values under the wrong column names.
    row = [data[name] for name in features]
    df = pd.DataFrame([row], columns=features)
    scaled_data = scaler.transform(df)

    # Predict
    prediction = model.predict(scaled_data)
    result = "Diabetic" if prediction[0] == 1 else "Non-Diabetic"
    diabetes_type = classify_diabetes(data["HbA1c_level"], data["blood_glucose_level"])

    # Output
    print("\n📊 Prediction Result:")
    print("----------------------------")
    for k, v in data.items():
        print(f"{k}: {v}")
    print("----------------------------")
    print(f"✅ Diagnosis: {result}")
    print(f"🧬 Type: {diabetes_type}")
    print("----------------------------")

# Script entry point: start the interactive diagnosis only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    diagnose_from_pdf()
