In [5]:
import pdfplumber  # Better for tables
import fitz  # PyMuPDF for text extraction
import re
import json
import pandas as pd
import joblib  # For loading ML model

# Load the trained ML model
diet_model = joblib.load("diet_recommendation_model.pkl")

# ---------------------- 1️⃣ EXTRACT TEXT FROM PDF ----------------------

def extract_text_from_pdf(pdf_path):
    """ Extract text from a given PDF file using both pdfplumber and PyMuPDF """
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                extracted_text = page.extract_text()
                if extracted_text:
                    text += extracted_text + "\n"
        
        if not text.strip():  # If pdfplumber fails, fallback to PyMuPDF
            doc = fitz.open(pdf_path)
            text = "\n".join([page.get_text("text") for page in doc])
        
        return text.strip()
    except Exception as e:
        print(f"❌ Error reading PDF: {e}")
        return ""

# ---------------------- 2️⃣ STRUCTURE DATA USING REGEX ----------------------

def clean_text(text):
    """ Normalize text by removing extra spaces and special characters """
    text = text.replace("\n", " ")
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

def parse_lipid_profile(text):
    """ Extract lipid profile values from raw text using regex patterns """
    lipid_profile = {
        "Cholesterol Total": None,
        "Triglycerides": None,
        "HDL Cholesterol": None,
        "LDL Cholesterol": None,
        "VLDL Cholesterol": None,
        "Non-HDL Cholesterol": None,
        "Age": None,
        "Gender": None
    }
    
    # Define regex patterns
    patterns = {
        "Cholesterol Total": [r"Cholesterol Total[:\s]+(\d+)", 
                              r"Total Cholesterol[:\s]+(\d+)"],
        "Triglycerides": [r"Triglycerides[:\s]+(\d+)",
                          r"(\d+)\s*Triglycerides"],
        "HDL Cholesterol": [r"HDL Cholesterol[:\s]+(\d+)",
                            r"(\d+)\s*HDL"],
        "LDL Cholesterol": [r"LDL Cholesterol[:\s]+(\d+)",
                            r"(\d+)\s*LDL"],
        "VLDL Cholesterol": [r"VLDL Cholesterol[:\s]+(\d+)", r"(\d+)\s*VLDL"],
        "Non-HDL Cholesterol": [r"Non-HDL Cholesterol[:\s]+(\d+)", r"(\d+)\s*Non-HDL"],
        "Age": [r"Age[:\s]+(\d+)"],
        "Gender": [r"Gender[:\s]+(Male|Female)" ,r"(Male|Female|Other)"]
    }
    
    text = clean_text(text)
    
    for key, pattern_list in patterns.items():
        for pattern in pattern_list:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                lipid_profile[key] = int(value) if value.isdigit() else value
                break
    
    lipid_profile["Gender"] = 1 if lipid_profile["Gender"] == "Male" else 0 if lipid_profile["Gender"] == "Female" else None
    return lipid_profile

# ---------------------- 3️⃣ ML MODEL PREDICTION ----------------------

def predict_diet(data):
    """Uses the trained ML model to predict the diet recommendation."""
    if None in data.values():
        print("❌ Missing data! Please check PDF extraction.")
        return None
    
    # Get additional inputs
    try:
        systolic_bp = int(input("Enter Systolic Blood Pressure: "))
        diastolic_bp = int(input("Enter Diastolic Blood Pressure: "))
        smoking = int(input("Are you a smoker? (1: Yes, 0: No): "))
        diabetes = int(input("Do you have diabetes? (1: Yes, 0: No): "))
        heart_attack = int(input("Have you had a heart attack? (1: Yes, 0: No): "))
    except ValueError:
        print("❌ Invalid input! Please enter numbers only.")
        return None
    
    # Prepare input data
    input_data = pd.DataFrame([[
        data["Age"], data["Gender"], data["Cholesterol Total"],
        data["LDL Cholesterol"], data["HDL Cholesterol"],
        data["Cholesterol Total"] - data["HDL Cholesterol"],  # Non-HDL Cholesterol
        systolic_bp, diastolic_bp, smoking, diabetes, heart_attack
    ]], columns=["age", "sex", "total_cholesterol", "ldl", "hdl", "non_hdl", 
                 "systolic_bp", "diastolic_bp", "smoking", "diabetes", "heart_attack"])
    
    # Predict diet
    predicted_label = diet_model.predict(input_data)[0]
    
    diet_mapping = {
        0: "Low-fat, high-fiber diet - fruits, vegetables, grains, beans, peas, lentils",
        1: "Increase healthy fats - avocados, nuts, seeds, fatty fish, olives",
        2: "Heart-healthy Mediterranean diet - vegetables, fruits, whole grains, beans, nuts, olive oil",
        3: "Balanced diet"
    }
    
    diet_recommendation = diet_mapping.get(predicted_label, "No recommendation available")
    print("\n🔹 **Diet Recommendation:**", diet_recommendation)

# ---------------------- RUN SCRIPT ----------------------

if __name__ == "__main__":
    pdf_path = r"D:\sem 8 project\lipid3.pdf"
    text = extract_text_from_pdf(pdf_path)
    if text:
        extracted_data = parse_lipid_profile(text)
        print("\n📌 Extracted Data:")
        for key, value in extracted_data.items():
            print(f"{key}: {value}")
        predict_diet(extracted_data)



📌 Extracted Data:
Cholesterol Total: None
Triglycerides: None
HDL Cholesterol: None
LDL Cholesterol: None
VLDL Cholesterol: None
Non-HDL Cholesterol: None
Age: 1
Gender: None
❌ Missing data! Please check PDF extraction.


In [1]:
#spypsf 

In [1]:
import pdfplumber
import spacy
import re
import joblib
import pandas as pd

# Load ML model
diet_model = joblib.load("diet_recommendation_model.pkl")

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# ---------------------- 1️⃣ EXTRACT TEXT FROM PDF ----------------------

def extract_text_from_pdf(pdf_path):
    """ Extract text from a given PDF file using pdfplumber """
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                extracted_text = page.extract_text()
                if extracted_text:
                    text += extracted_text + "\n"
        return text.strip()
    except Exception as e:
        print(f"❌ Error reading PDF: {e}")
        return ""

# ---------------------- 2️⃣ STRUCTURE DATA USING spaCy NER ----------------------

def extract_health_data(text):
    """ Use spaCy to extract lipid profile values and health-related information """
    lipid_profile = {
        "Cholesterol Total": None,
        "Triglycerides": None,
        "HDL Cholesterol": None,
        "LDL Cholesterol": None,
        "VLDL Cholesterol": None,
        "Non-HDL Cholesterol": None,
        "Age": None,
        "Gender": None
    }

    # Process text with spaCy
    doc = nlp(text)

    # Extract numbers using regex
    numbers = re.findall(r"(\d+)", text)

    # Extract entities
    for ent in doc.ents:
        if ent.label_ == "CARDINAL" and ent.text.isdigit():
            value = int(ent.text)
            # Assign values based on heuristics (you may need to improve this logic)
            if "cholesterol" in ent.sent.text.lower():
                if "total" in ent.sent.text.lower():
                    lipid_profile["Cholesterol Total"] = value
                elif "hdl" in ent.sent.text.lower():
                    lipid_profile["HDL Cholesterol"] = value
                elif "ldl" in ent.sent.text.lower():
                    lipid_profile["LDL Cholesterol"] = value
                elif "vldl" in ent.sent.text.lower():
                    lipid_profile["VLDL Cholesterol"] = value
                elif "non-hdl" in ent.sent.text.lower():
                    lipid_profile["Non-HDL Cholesterol"] = value
            elif "triglycerides" in ent.sent.text.lower():
                lipid_profile["Triglycerides"] = value
            elif "age" in ent.sent.text.lower():
                lipid_profile["Age"] = value

        elif ent.label_ == "PERSON":  # Assume gender information is mentioned
            if "male" in ent.text.lower():
                lipid_profile["Gender"] = 1  # Male
            elif "female" in ent.text.lower():
                lipid_profile["Gender"] = 0  # Female

    return lipid_profile

# ---------------------- 3️⃣ ML MODEL PREDICTION ----------------------

def predict_diet(data):
    """Uses the trained ML model to predict the diet recommendation."""
    if None in data.values():
        print("❌ Missing data! Please check PDF extraction.")
        return None

    # Get additional inputs
    try:
        systolic_bp = int(input("Enter Systolic Blood Pressure: "))
        diastolic_bp = int(input("Enter Diastolic Blood Pressure: "))
        smoking = int(input("Are you a smoker? (1: Yes, 0: No): "))
        diabetes = int(input("Do you have diabetes? (1: Yes, 0: No): "))
        heart_attack = int(input("Have you had a heart attack? (1: Yes, 0: No): "))
    except ValueError:
        print("❌ Invalid input! Please enter numbers only.")
        return None

    # Prepare input data
    input_data = pd.DataFrame([[
        data["Age"], data["Gender"], data["Cholesterol Total"],
        data["LDL Cholesterol"], data["HDL Cholesterol"],
        data["Cholesterol Total"] - data["HDL Cholesterol"],  # Non-HDL Cholesterol
        systolic_bp, diastolic_bp, smoking, diabetes, heart_attack
    ]], columns=["age", "sex", "total_cholesterol", "ldl", "hdl", "non_hdl",
                 "systolic_bp", "diastolic_bp", "smoking", "diabetes", "heart_attack"])

    # Predict diet
    predicted_label = diet_model.predict(input_data)[0]

    diet_mapping = {
        0: "Low-fat, high-fiber diet - fruits, vegetables, grains, beans, peas, lentils",
        1: "Increase healthy fats - avocados, nuts, seeds, fatty fish, olives",
        2: "Heart-healthy Mediterranean diet - vegetables, fruits, whole grains, beans, nuts, olive oil",
        3: "Balanced diet"
    }

    diet_recommendation = diet_mapping.get(predicted_label, "No recommendation available")
    print("\n🔹 **Diet Recommendation:**", diet_recommendation)

# ---------------------- RUN SCRIPT ----------------------

if __name__ == "__main__":
    pdf_path = r"D:\sem 8 project\lipid2.pdf"
    text = extract_text_from_pdf(pdf_path)
    if text:
        extracted_data = extract_health_data(text)
        print("\n📌 Extracted Data:")
        for key, value in extracted_data.items():
            print(f"{key}: {value}")
        predict_diet(extracted_data)



📌 Extracted Data:
Cholesterol Total: None
Triglycerides: None
HDL Cholesterol: 3
LDL Cholesterol: None
VLDL Cholesterol: None
Non-HDL Cholesterol: None
Age: 4
Gender: None
❌ Missing data! Please check PDF extraction.
