In [5]:
import fitz  # PyMuPDF for PDF extraction
import re
import pandas as pd
import joblib  # For loading ML model

# Load the trained model
model = joblib.load("diet_recommendation_model.pkl")

def extract_text_from_pdf(data1):
    """Extracts text from a PDF file."""
    try:
        doc = fitz.open(data1)
        text = ""
        for page in doc:
            text += page.get_text("text") + "\n"  
        return text.strip()
    except Exception as e:
        print(f"❌ Error reading PDF: {e}")
        return ""

def extract_values(text):
    """Extracts required values from the text using regex."""
    data = {}

    # Regular expressions to capture values
    patterns = {
        "Name": r"(?:Mr\.|Mrs\.|Ms\.|Dr\.)?\s*([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
        "Age": r"(\d{1,3})\s*(?:Years|Yr|Yrs)?",
        "Gender": r"(Male|Female|Other)",
        "Cholesterol Total":r"Cholesterol Total[:\s]+(\d+)|Cholesterol Total\s*[:\s]+(\d+)",
        "LDL Cholesterol": r"LDL Cholesterol\s*<\d+\.\d+\s*mg/dL[:\s]+(\d+)",
        "HDL Cholesterol": r"HDL Cholesterol\s*>\d+\.\d+\s*mg/dL[:\s]+(\d+)",
        "Triglycerides": r"Triglycerides\s*<\d+\.\d+\s*mg/dL[:\s]+(\d+)"
    }
    
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            value = match.group(1).strip()
            data[key] = int(value) if value.isdigit() else value
        else:
            data[key] = None  # If not found, set to None

    # Convert Gender to numeric format
    if data.get("Gender") == "Male":
        data["Gender"] = 1
    elif data.get("Gender") == "Female":
        data["Gender"] = 0
    else:
        data["Gender"] = None  # Handle missing gender

    return data

def get_user_inputs():
    """Ask user for missing inputs like BP, smoking, diabetes."""
    try:
        systolic_bp = int(input("Enter Systolic Blood Pressure: "))
        diastolic_bp = int(input("Enter Diastolic Blood Pressure: "))
        smoking = int(input("Are you a smoker? (1: Yes, 0: No): "))
        diabetes = int(input("Do you have diabetes? (1: Yes, 0: No): "))
        heart_attack = int(input("Have you had a heart attack? (1: Yes, 0: No): "))

        return systolic_bp, diastolic_bp, smoking, diabetes, heart_attack
    except ValueError:
        print("❌ Invalid input! Please enter numbers only.")
        return None, None, None, None, None

def predict_diet(data):
    """Uses the trained ML model to predict the diet recommendation."""
    # Ensure all values are present
    required_keys = ["Age", "Gender", "Cholesterol Total", "LDL Cholesterol", "HDL Cholesterol"]
    for key in required_keys:
        if data.get(key) is None:
            print(f"❌ Missing data: {key}. Please check PDF extraction.")
            return None

    # Get additional user inputs
    systolic_bp, diastolic_bp, smoking, diabetes, heart_attack = get_user_inputs()
    
    if None in [systolic_bp, diastolic_bp, smoking, diabetes, heart_attack]:
        print("❌ Error in user inputs. Exiting.")
        return None

    # Prepare final data for model prediction
    input_data = pd.DataFrame([[  
        data["Age"], data["Gender"], data["Cholesterol Total"],  
        data["LDL Cholesterol"], data["HDL Cholesterol"],  
        data["Cholesterol Total"] - data["HDL Cholesterol"],  # Non-HDL Cholesterol  
        systolic_bp, diastolic_bp, smoking, diabetes, heart_attack  
    ]], columns=["age", "sex", "total_cholesterol", "ldl", "hdl", "non_hdl",  
                 "systolic_bp", "diastolic_bp", "smoking", "diabetes", "heart_attack"])

    # Predict diet
    predicted_label = model.predict(input_data)[0]

    # Mapping diet labels to recommendations
    diet_mapping = {
        0: "Low-fat, high-fiber diet",
        1: "Increase healthy fats (avocados, nuts)",
        2: "Heart-healthy Mediterranean diet",
        3: "Balanced diet"
    }

    # Print recommendation
    diet_recommendation = diet_mapping.get(predicted_label, "No recommendation available")
    print("\n🔹 **Diet Recommendation:**", diet_recommendation)

# ----------------- MAIN EXECUTION -----------------
pdf_path = "D:/sem 8 project/data1.pdf"  # Replace with your actual PDF file path
pdf_text = extract_text_from_pdf(pdf_path)
extracted_data = extract_values(pdf_text)

# Debug: Print extracted keys
print("\n📌 Extracted Data Keys:", extracted_data.keys())

if extracted_data:
    print("\n📌 Extracted Data:")
    for key, value in extracted_data.items():
        print(f"{key}: {value}")

    predict_diet(extracted_data)



📌 Extracted Data Keys: dict_keys(['Name', 'Age', 'Gender', 'Cholesterol Total', 'LDL Cholesterol', 'HDL Cholesterol', 'Triglycerides'])

📌 Extracted Data:
Name: Report Status    
Male
Age: 43
Gender: 1
Cholesterol Total: None
LDL Cholesterol: 26
HDL Cholesterol: 118
Triglycerides: 23
❌ Missing data: Cholesterol Total. Please check PDF extraction.
