In [3]:
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(data1):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        data1: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages separated by newlines, with leading and
        trailing whitespace stripped. If the file cannot be opened or read,
        the error is printed and an empty string is returned.
    """
    try:
        # The context manager closes the document even if a page fails to
        # extract, so the file handle is never left open.
        with fitz.open(data1) as doc:
            pages = [page.get_text("text") for page in doc]
    except Exception as e:
        # Deliberately best-effort: callers receive "" instead of an exception.
        print(f"Error reading PDF: {e}")
        return ""
    return "\n".join(pages).strip()

def clean_text(text):
    """Normalize whitespace and colon/newline runs in extracted text.

    Two substitutions are applied in order:

    1. Any run of two or more whitespace characters becomes a single space.
    2. Any run of colons and/or newlines becomes a colon followed by a space.
    """
    substitutions = (
        (r"\s{2,}", " "),
        (r"[:\n]+", ": "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def extract_values(text):
    """Extract patient details and lipid-panel values from report text.

    The text is first normalized with ``clean_text``. Each field is then
    searched for (case-insensitively) with an ordered list of candidate
    patterns; the first pattern that matches wins. Stricter patterns come
    first and the most permissive pattern is kept last as a fallback.

    Args:
        text: Raw text extracted from the report.

    Returns:
        A dict mapping field names ("Name", "Age", "Gender",
        "Total Cholesterol", "HDL Cholesterol", "LDL Cholesterol",
        "Triglycerides") to the extracted value. Whole numbers are returned
        as ``int``, decimal numbers as ``float``, everything else as ``str``.
        Fields that are not found are omitted and a warning is printed.
    """
    text = clean_text(text)  # Pre-process text to fix formatting issues
    data = {}

    # Debugging: Print extracted text sample
    print("Extracted Text Sample:", text[:1500])

    # Lab results may be printed with decimals (e.g. "195.00"). Capturing
    # digits only would pick up the fractional part ("00") as the value.
    number = r"(\d+(?:\.\d+)?)"

    patterns = {
        "Name": (
            # Layout where the value precedes its label:
            # "Mr. ARUN PATORKAR: Name Lab No." -- without this the fallback
            # below captures the following labels ("Lab No") as the name.
            r"\b(?:Mr|Mrs|Ms|Dr)\.\s*([A-Za-z]{2,}(?:\s+[A-Za-z]{2,})+)\s*:?\s*Name\b",
            # Layout where the label precedes the value: "NameMr. John Smith".
            r"Name(?:Mr\.|Mrs\.|Ms\.|Dr\.)?\s*([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
        ),
        "Age": (
            # Prefer a standalone number that actually carries an age unit.
            r"\b(\d{1,3})\s*(?:Years|Yrs|Yr)\b",
            # Then a number directly following the "Age" label
            # (the word boundary keeps "Page 1" from matching).
            r"\bAge\s*:?\s*(\d{1,3})\b",
            # Last resort: the first 1-3 digit run anywhere in the text.
            r"(\d{1,3})\s*(?:Years|Yr|Yrs)?",
        ),
        "Gender": (
            # Whole words first so "Other" is not found inside "Mother".
            r"\b(Male|Female|Other)\b",
            r"(Male|Female|Other)",
        ),
        "Total Cholesterol": (number + r"\s*Cholesterol\s*Total",),
        "HDL Cholesterol": (number + r"\s*HDL\s*Cholesterol",),
        "LDL Cholesterol": (number + r"\s*LDL\s*Cholesterol",),
        "Triglycerides": (number + r"\s*Triglycerides",),
    }

    for key, candidates in patterns.items():
        attempts = (re.search(p, text, re.IGNORECASE) for p in candidates)
        match = next((m for m in attempts if m), None)
        if match is None:
            print(f"Warning: {key} not found!")
            continue

        value = match.group(1).strip()
        if value.isdigit():
            data[key] = int(value)  # Convert whole numbers to integers
        elif re.fullmatch(r"\d+\.\d+", value):
            data[key] = float(value)  # Keep decimal lab values numeric
        else:
            data[key] = value

    return data

def format_output(data):
    """Render extracted fields as text, one "key value" pair per line.

    Pairs appear in the dict's iteration order; an empty dict yields an
    empty string.
    """
    lines = []
    for field, extracted in data.items():
        lines.append(f"{field} {extracted}")
    return "\n".join(lines)

# Use the correct PDF file path
# Raw string: "\s" and "\d" are not valid escape sequences, so a plain string
# literal triggers a SyntaxWarning; the raw form yields the identical path.
pdf_path = r"D:\sem 8 project\data1.pdf"  # Replace with actual PDF path
pdf_text = extract_text_from_pdf(pdf_path)
user_data = extract_values(pdf_text)

formatted_output = format_output(user_data)
print("\n" + formatted_output)  # Print the formatted result


Extracted Text Sample: Report Status Male: 43 Years: Age: Gender: Reported P: 5/2/2025 11: 20: 00AM: SELF: 477013620: Mr. ARUN PATORKAR: Name Lab No. Ref By Collected A/c Status 5/2/2025 12: 10: 52PM: Final: LPL-SHREE ICU & MRC PVT LTD: C/O Shree Hospital 747,MANGALWAR PETH, Karad Dist. Satara Pin 415110: Collected at Processed at LPL-KARAD (SHREE I.C.U. & M.R.C. PVT. LTD.): C/O Shree Hospital 747,MANGALWAR PETH, Karad Dist. Satara Pin 415110 PETH, KARAD - 415110: KARAD Test Report Test Name: Results: Units: Bio. Ref. Interval: GLUCOSE, FASTING (F): (Hexokinase): Glucose, Fasting: mg/dL: 70.00 - 100.00: 163.60: SNR01-Sample Not Received: Urine,Glucose: *477013620*: Page 1 of 4 Report Status Male: 43 Years: Age: Gender: Reported P: 5/2/2025 11: 20: 00AM: SELF: 477013620: Mr. ARUN PATORKAR: Name Lab No. Ref By Collected A/c Status 5/2/2025 12: 10: 52PM: Final: Collected at : Processed at : LPL-KARAD (SHREE I.C.U. & M.R.C. PVT. LTD.): C/O Shree Hospital 747,MANGALWAR PETH, Karad Dist. Sat

  pdf_path = "D:\sem 8 project\data1.pdf"  # Replace with actual PDF path
