In [1]:
import pandas as pd
import numpy as np

# Function to generate random cholesterol values
def generate_data(samples=1000):
    data = []
    
    for _ in range(samples):
        age = np.random.randint(20, 80)  # Random age between 20 and 80
        gender = np.random.choice(["Male", "Female"])
        total_cholesterol = np.random.randint(150, 300)  # Cholesterol range
        triglycerides = np.random.randint(50, 250)
        hdl = np.random.randint(30, 70)
        ldl = np.random.randint(80, 190)

        # Assign diet based on cholesterol levels
        if total_cholesterol > 240 or ldl > 160:
            diet = "Low Fat Diet"
        elif 200 <= total_cholesterol <= 240 or ldl > 130:
            diet = "Mediterranean Diet"
        else:
            diet = "High Protein Diet"

        data.append([age, gender, total_cholesterol, triglycerides, hdl, ldl, diet])

    return pd.DataFrame(data, columns=["Age", "Gender", "Total Cholesterol", "Triglycerides", "HDL", "LDL", "Diet"])

# Generate dataset with 1000 samples
df = generate_data(1000)
df.to_csv("cholesterol_diet_dataset.csv", index=False)
print(df.head())


   Age  Gender  Total Cholesterol  Triglycerides  HDL  LDL               Diet
0   67  Female                254             51   32  147       Low Fat Diet
1   61    Male                283            115   63  100       Low Fat Diet
2   28  Female                290             75   61  164       Low Fat Diet
3   61    Male                271            130   69  174       Low Fat Diet
4   79  Female                154             65   32   85  High Protein Diet


In [2]:
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

def extract_values(text):
    """Extracts cholesterol values using regex."""
    data = {}
    
    # Regular expressions to extract key values (adjust patterns if needed)
    patterns = {
        "total_cholesterol": r"Total Cholesterol[:\s]+(\d+)",
        "triglycerides": r"Triglycerides[:\s]+(\d+)",
        "hdl": r"HDL Cholesterol[:\s]+(\d+)",
        "ldl": r"LDL Cholesterol[:\s]+(\d+)",
        "age": r"Age[:\s]+(\d+)",
        "gender": r"Gender[:\s]+(Male|Female)"
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            data[key] = match.group(1)

    return data

# Example Usage
pdf_path = r"D:\sem 8 project\pdf_path.pdf"  # Use raw string (r"") to avoid escape issues
pdf_text = extract_text_from_pdf(pdf_path)
user_data = extract_values(pdf_text)
print(user_data)


{'total_cholesterol': '220', 'triglycerides': '180', 'hdl': '50', 'ldl': '140', 'age': '21', 'gender': 'Male'}


In [None]:
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file while handling multiple page layouts."""
    try:
        doc = fitz.open(pdf_path)
        text = ""

        for page in doc:
            text += page.get_text("text") + "\n"  # Extracts text from each page

        return text.strip()
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def extract_values(text):
    """Extracts cholesterol values using regex while ignoring unnecessary text."""
    data = {}

    # Regular expressions to extract key values (handles variations in spacing)
    patterns = {
        # "patient_name": r"Patient Name[:\s]+([\w\s]+)",
        "Name": r"Name[:\s]+([A-Za-z\.]+(?: [A-Za-z\.]+)*)",
        "age": r"Age[:\s]+(\d+)",
        "gender": r"Gender[:\s]+(Male|Female)",
        "total_cholesterol": r"Total Cholesterol[:\s]+(\d+)",
        "triglycerides": r"Triglycerides[:\s]+(\d+)",
        "hdl": r"HDL Cholesterol[:\s]+(\d+)",
        "ldl": r"LDL Cholesterol[:\s]+(\d+)"
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            data[key] = match.group(1).strip()

    return data

# Example Usage
pdf_path = "D:\sem 8 project\pdf_path.pdf"  # Corrected the file path
pdf_text = extract_text_from_pdf(pdf_path)
user_data = extract_values(pdf_text)

print("Extracted Data:", user_data)


In [None]:
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(data1):
    """Extracts text from a PDF file while handling multiple page layouts."""
    try:
        doc = fitz.open(data1)
        text = ""
        for page in doc:
            text += page.get_text("text") + "\n"  # Extract text from each page
        return text.strip()
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def clean_text(text):
    """Cleans up text formatting issues (extra colons, misplaced newlines, etc.)."""
    text = re.sub(r"\s{2,}", " ", text)  # Replace multiple spaces with one
    text = re.sub(r"[:\n]+", ": ", text)  # Normalize colons and newlines
    return text

def extract_values(text):
    """Extracts required values using regex while handling formatting variations."""
    text = clean_text(text)  # Pre-process text to fix formatting issues
    data = {}

    # Debugging: Print extracted text sample
    print("Extracted Text Sample:", text[:1500])

    # Updated regex patterns based on new format
    patterns = {
        "Name": r"Name(?:Mr\.|Mrs\.|Ms\.|Dr\.)?\s*([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
        "Age": r"(\d{1,3})\s*(?:Years|Yr|Yrs)?",
        "Gender": r"(Male|Female|Other)",
        "Total Cholesterol": r"Cholesterol Total\s+<\d+\.\d+\s*mg/dL[:\s]+(\d+)",
        "LDL Cholesterol": r"LDL Cholesterol\s*<\d+\.\d+\s*mg/dL[:\s]+(\d+)",
        "HDL Cholesterol": r"HDL Cholesterol\s*>\d+\.\d+\s*mg/dL[:\s]+(\d+)",
        "Triglycerides": r"Triglycerides\s*<\d+\.\d+\s*mg/dL[:\s]+(\d+)"
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            value = match.group(1).strip()
            data[key] = int(value) if value.isdigit() else value  # Convert numbers to integers
        else:
            print(f"Warning: {key} not found!")

    return data

def format_output(data):
    """Formats extracted data with newline separation."""
    return "\n".join(f"{key} {value}" for key, value in data.items())

# Use the correct PDF file path
pdf_path = "D:\sem 8 project\data1.pdf"  # Replace with actual PDF path
pdf_text = extract_text_from_pdf(pdf_path)
user_data = extract_values(pdf_text)

formatted_output = format_output(user_data)
print("\n" + formatted_output)  # Print the formatted result


In [None]:
from pdf2image import convert_from_path
import pytesseract
import re

# Convert PDF pages to images
images = convert_from_path("data1.pdf")

# OCR on each page
text = "\n".join(pytesseract.image_to_string(img) for img in images)

# Regex Extraction (Same as NLP method)
patterns = {
    "Total Cholesterol": r"Cholesterol\s+Total\s+(\d+)",
    "LDL Cholesterol": r"LDL\s+(\d+)",
    "HDL Cholesterol": r"HDL\s+(\d+)",
    "Triglycerides": r"Triglycerides\s+(\d+)"
}
extracted_data = {key: re.search(pattern, text).group(1) if re.search(pattern, text) else None for key, pattern in patterns.items()}

print(extracted_data)


In [3]:
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(rovmyp2hxteu0g4ztodi1vru):
    """Extracts text from a PDF file while handling multiple page layouts."""
    try:
        doc = fitz.open(rovmyp2hxteu0g4ztodi1vru)
        text = ""

        for page in doc:
            text += page.get_text("text") + "\n"  # Extracts text from each page

        return text.strip()
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def extract_values(text):
    """Extracts cholesterol values using regex while ignoring unnecessary text."""
    data = {}

    # Regular expressions to extract key values (handles variations in spacing)
    patterns = {
        "patient_name": r"Patient Name[:\s]+([\w\s]+)",
        "age": r"Age[:\s]+(\d+)",
        "gender": r"Gender[:\s]+(Male|Female)",
        "total_cholesterol": r"Total Cholesterol[:\s]+(\d+)",
        "triglycerides": r"Triglycerides[:\s]+(\d+)",
        "hdl": r"HDL Cholesterol[:\s]+(\d+)",
        "ldl": r"LDL Cholesterol[:\s]+(\d+)"
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            data[key] = match.group(1).strip()

    return data

# Example Usage
pdf_path = "D:\sem 8 project\rovmyp2hxteu0g4ztodi1vru.pdf"  # Corrected the file path
pdf_text = extract_text_from_pdf(pdf_path)
user_data = extract_values(pdf_text)

print("Extracted Data:", user_data)


ovmyp2hxteu0g4ztodi1vru.pdf'ile: 'D:\sem 8 project
Extracted Data: {}


  pdf_path = "D:\sem 8 project\rovmyp2hxteu0g4ztodi1vru.pdf"  # Corrected the file path


In [None]:
pip install pymupdf pdfplumber


In [None]:
import fitz  # PyMuPDF for text extraction
import pdfplumber  # Extracts tables from PDFs
import re  # Regular expressions for data extraction

def extract_text_from_pdf(rovmyp2hxteu0g4ztodi1vru):
    """Extracts raw text from a PDF file."""
    try:
        doc = fitz.open(rovmyp2hxteu0g4ztodi1vru)
        text = ""
        for page in doc:
            text += page.get_text("text") + "\n"  # Extracts text from each page
        return text.strip()
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def extract_table_from_pdf(rovmyp2hxteu0g4ztodi1vru):
    """Extracts data from tables in a PDF."""
    extracted_table_data = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                table = page.extract_table()
                if table:
                    for row in table:
                        extracted_table_data.append(row)
        return extracted_table_data
    except Exception as e:
        print(f"Error extracting tables: {e}")
        return []

def extract_values(text):
    """Extracts cholesterol and patient data using regex."""
    data = {}

    # Updated regex patterns (handling cases where numbers are merged)
    patterns = {
        "name": r"Name\s+([\w\s]+)",
        "age": r"(\d{2})\s*Years",
        "gender": r"Gender\s*(Male|Female)",
        "total_cholesterol": r"Cholesterol\s*Total\s*(\d+)",
        "triglycerides": r"Triglycerides\s*(\d+)",
        "hdl": r"HDL\s*Cholesterol\s*(\d+)",
        "ldl": r"LDL\s*Cholesterol\s*(\d+)"
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            data[key] = match.group(1).strip()

    return data

def main():
    # ✅ Use correct PDF path
    pdf_path = "D:\\sem 8 project\\rovmyp2hxteu0g4ztodi1vru.pdf"  # Double backslashes
  # Uploaded file in this environment

    # Extract text
    raw_text = extract_text_from_pdf(pdf_path)
    
    # Extract values using regex
    extracted_data = extract_values(raw_text)

    # Extract table data (if needed)
    table_data = extract_table_from_pdf(pdf_path)

    # Print results
    print("\nExtracted Text Data:")
    print(extracted_data)

    print("\nExtracted Table Data:")
    for row in table_data:
        print(row)

if __name__ == "__main__":
    main()
