In [3]:
import fitz  # PyMuPDF for PDF extraction
import re
import pandas as pd
import joblib  # For loading ML model

# Load the trained model
model = joblib.load("diet_recommendation_model.pkl")

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join([page.get_text("text") for page in doc])
        return text.strip()
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def extract_values(text):
    """Extracts required values using multiple regex patterns."""
    data = {}

    # Define multiple patterns for each key
    patterns = {
        "Name": [
            r"Mr\.?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
            r"Name[:\s]+(?:Mr\.|Mrs\.|Ms\.|Dr\.)?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
            
        ],
        "Age": [
            r"(\d{1,3})\s*(?:Years|Yr|Yrs)?",
            r"Age[:\s]+(\d+)"
        ],
        "Gender": [
            r"(Male|Female|Other)",
            r"Gender[:\s]+(Male|Female)"
        ],
        "Total Cholesterol": [
            r"Total Cholesterol[:\s]+(\d+)",
            r"(\d+)\s*Cholesterol\s*Total"
        ],
        "Triglycerides": [
            r"Triglycerides[:\s]+(\d+)",
            r"(\d+)\s*Triglycerides"
        ],
        "HDL Cholesterol": [
            r"HDL Cholesterol[:\s]+(\d+)",
            r"(\d+)\s*HDL\s*Cholesterol"
        ],
        "LDL Cholesterol": [
            r"LDL Cholesterol[:\s]+(\d+)",
            r"(\d+)\s*LDL\s*Cholesterol"
        ]
    }
    
    # Try each pattern in order until a match is found
    for key, pattern_list in patterns.items():
        for pattern in pattern_list:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                data[key] = int(value) if value.isdigit() else value
                break  # Stop trying once a match is found
        else:
            data[key] = None  # No match found

    # Convert Gender to numeric format
    data["Gender"] = 1 if data.get("Gender") == "Male" else 0 if data.get("Gender") == "Female" else None
    
    return data

# Test function
pdf_path = "D:/sem 8 project/data1.pdf"
pdf_text = extract_text_from_pdf(pdf_path)
extracted_data = extract_values(pdf_text)

if extracted_data:
    print("\n📌 Extracted Data:")
    for key, value in extracted_data.items():
        print(f"{key}: {value}")



📌 Extracted Data:
Name: ARUN  PATORKAR
Age: 43
Gender: 1
Total Cholesterol: 167
Triglycerides: 132
HDL Cholesterol: 23
LDL Cholesterol: 118


In [1]:
import fitz  # PyMuPDF for PDF extraction
import re
import pandas as pd
import joblib  # For loading ML model

# Load the trained model
model = joblib.load("diet_recommendation_model.pkl")

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join([page.get_text("text") for page in doc])
        return text.strip()
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def extract_values(text):
    """Extracts required values using multiple regex patterns."""
    data = {}

    # Define multiple patterns for each key
    patterns = {
        "Name": [
            r"\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
            r"Mr\.?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
            r"Name[:\s]+(?:Mr\.|Mrs\.|Ms\.|Dr\.)?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)"
        ],
        "Age": [
            r"(\d{1,3})\s*(?:Years|Yr|Yrs)?",
            r"Age[:\s]+(\d+)"
        ],
        "Gender": [
            r"(Male|Female|Other)",
            r"Gender[:\s]+(Male|Female)"
        ],
        "Total Cholesterol": [
            r"Cholesterol Total[\s]+(\d+)",
            r"Total Cholesterol[:\s]+(\d+)",
            r"Cholesterol Total[:\s]+(\d+)",
            r"(\d+)\s*Cholesterol\s*Total",
            r"(\d+)\s*Cholesterol\s+"

        ],
        "Triglycerides": [
            r"Triglycerides[:\s]+(\d+)",
            r"(\d+)\s*Triglycerides"
        ],
        "HDL Cholesterol": [
            r"HDL Cholesterol[:\s]+(\d+)",
            r"(\d+)\s*HDL\s*Cholesterol"
        ],
        "LDL Cholesterol": [
            r"LDL Cholesterol[:\s]+(\d+)",
            r"(\d+)\s*LDL\s*Cholesterol"
        ]
    }
    
    # Try each pattern in order until a match is found
    for key, pattern_list in patterns.items():
        for pattern in pattern_list:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                data[key] = int(value) if value.isdigit() else value
                break  # Stop trying once a match is found
        else:
            data[key] = None  # No match found

    # Convert Gender to numeric format
    data["Gender"] = 1 if data.get("Gender") == "Male" else 0 if data.get("Gender") == "Female" else None
    
    return data

def get_user_inputs():
    """Ask user for missing inputs like BP, smoking, diabetes."""
    try:
        systolic_bp = int(input("Enter Systolic Blood Pressure: "))
        diastolic_bp = int(input("Enter Diastolic Blood Pressure: "))
        smoking = int(input("Are you a smoker? (1: Yes, 0: No): "))
        diabetes = int(input("Do you have diabetes? (1: Yes, 0: No): "))
        heart_attack = int(input("Have you had a heart attack? (1: Yes, 0: No): "))

        return systolic_bp, diastolic_bp, smoking, diabetes, heart_attack
    except ValueError:
        print("Invalid input! Please enter numbers only.")
        return None, None, None, None, None

def predict_diet(data):
    """Uses the trained ML model to predict the diet recommendation."""
    if None in data.values():
        print("Missing data! Please check PDF extraction.")
        return None

    # Get additional user inputs
    systolic_bp, diastolic_bp, smoking, diabetes, heart_attack = get_user_inputs()
    
    if None in [systolic_bp, diastolic_bp, smoking, diabetes, heart_attack]:
        print("Error in user inputs. Exiting.")
        return None

    # Prepare final data for model prediction
    input_data = pd.DataFrame([[
        data["Age"], data["Gender"], data["Total Cholesterol"],
        data["LDL Cholesterol"], data["HDL Cholesterol"],
        data["Total Cholesterol"] - data["HDL Cholesterol"],  # Non-HDL Cholesterol
        systolic_bp, diastolic_bp, smoking, diabetes, heart_attack
    ]], columns=["age", "sex", "total_cholesterol", "ldl", "hdl", "non_hdl", 
                 "systolic_bp", "diastolic_bp", "smoking", "diabetes", "heart_attack"])
    
    # Predict diet
    predicted_label = model.predict(input_data)[0]
    
    # Mapping diet labels to recommendations
    diet_mapping = {
        0: "\n Low-fat, high-fiber diet - \n fruits,\n vegetables, \n grains, \n beans, \n peas,\n  lentils",
        1: "\n Increase healthy fats - \n avocados,\n nuts,\n seeds,\n fatty fish,\n olives",
        2: "\nHeart-healthy Mediterranean diet\n Vegetables \n Fruits. \n Whole grains.\nBeans. Nuts seeds \n Olive oil\nSeasoning with herbs and spices. ",
        3: "\nBalanced diet"
    }
    
    # Print recommendation
    diet_recommendation = diet_mapping.get(predicted_label, "No recommendation available")
    print("\n🔹 **Diet Recommendation:**", diet_recommendation)

# Test function
pdf_path = r"D:\sem 8 project\data1.pdf"
pdf_text = extract_text_from_pdf(pdf_path)
extracted_data = extract_values(pdf_text)

if extracted_data:
    print("\n📌 Extracted Data:")
    for key, value in extracted_data.items():
        print(f"{key}: {value}")
    
    predict_diet(extracted_data)



📌 Extracted Data:
Name: Report Status    
Male
Age: 43
Gender: 1
Total Cholesterol: 167
Triglycerides: 132
HDL Cholesterol: 23
LDL Cholesterol: 118

🔹 **Diet Recommendation:** 
 Increase healthy fats - 
 avocados,
 nuts,
 seeds,
 fatty fish,
 olives
