In [65]:
import json
import os
import re
from datetime import datetime

import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path


In [66]:
# Machine-specific locations. Each value can be overridden with an environment
# variable so the notebook runs on other machines without editing this cell;
# when the variable is unset, the original Windows path is used as the fallback.

# Path to the Poppler installation (override: POPPLER_PATH)
pop_path = os.environ.get(
    "POPPLER_PATH", r'C:\Program Files\poppler-24.08.0\Library\bin'
)

# Path to Tesseract OCR file (override: TESSERACT_CMD)
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

# Path to the sample PDF file (override: MEDICAL_BILL_PDF)
pdf_path = os.environ.get(
    "MEDICAL_BILL_PDF", r'D:\semi 5\SenZmate\Sample_For_Assignment.pdf'
)

In [67]:
# Helper function to extract text using OCR from an image
def extract_text_from_image(image):
    """Run OCR on a page image after binarising and lightly cleaning it.

    Args:
        image: Colour image array. It is converted to greyscale with
            ``cv2.COLOR_BGR2GRAY``, so BGR channel order is expected.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the
        pre-processed image.
    """
    structuring_element = np.ones((2, 2), np.uint8)

    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Inverted binary threshold: pixels above 150 become 0, all others 255.
    binary = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY_INV)[1]

    # One dilation followed by one erosion, then an explicit morphological
    # close, all with the same 2x2 kernel.
    cleaned = cv2.erode(
        cv2.dilate(binary, structuring_element, iterations=1),
        structuring_element,
        iterations=1,
    )
    cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_CLOSE, structuring_element)

    return pytesseract.image_to_string(cleaned)

In [68]:
# Helper function to extract table-like structures by detecting prices
def extract_table_from_text(text):
    """Build table rows from OCR text by picking out lines that carry a price.

    A line is ignored when any of the following holds:
      * it contains a date-like token (day, month and year separated by
        ``/``, ``.`` or ``-``);
      * it contains the substring ``Subtotal``;
      * it has no number with two digits after the decimal point;
      * it has fewer than three whitespace-separated tokens.

    Args:
        text: OCR output for one page, with lines separated by newlines.

    Returns:
        A list of dicts with the keys ``Code`` (first token), ``DESCRIPTION``
        (the tokens in between, joined by single spaces; empty for a
        three-token line), ``QTY`` (second-to-last token) and ``TOTAL``
        (last token). All values are strings.
    """
    date_like = re.compile(r'\b(\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4})\b')
    price_like = re.compile(r'\b\d+\.\d{2}\b')

    rows = []
    for raw_line in text.split('\n'):
        # Dated lines and subtotal lines are never treated as table rows.
        if date_like.search(raw_line) or 'Subtotal' in raw_line:
            continue
        if not price_like.search(raw_line):
            continue

        tokens = raw_line.split()
        if len(tokens) < 3:
            continue

        # First token is the code, the last two are quantity and total;
        # whatever sits between them is the description.
        code, *description_words, quantity, total = tokens
        rows.append({
            "Code": code,
            "DESCRIPTION": ' '.join(description_words),
            "QTY": quantity,
            "TOTAL": total
        })
    return rows


In [69]:
# Helper function to extract key-value pairs for non-table content, including dates
def extract_key_values(text):
    """Extract labelled fields and dates from the OCR text of one page.

    Every line is scanned for the labels listed in ``keywords``. When a label
    is found, the value is stored under the label's display name:

      * for ``Hospital``: everything after the last colon, stripped;
      * for other labels on a line containing a colon: the first
        whitespace-separated token after the first colon, or ``""`` when
        nothing follows the colon;
      * for other labels on a line without a colon: the last token of the line.

    All date-like tokens (``d/m/y`` with ``/``, ``.`` or ``-`` separators) are
    collected in order of appearance. The earliest parseable date becomes
    ``Visit Date`` and the latest becomes ``Tax Invoice Date``; these computed
    values replace anything captured from a label with the same name.

    Args:
        text: OCR output for one page, with lines separated by newlines.

    Returns:
        A dict that always contains ``Date Found`` (list of the raw date
        strings), ``Visit Date`` and ``Tax Invoice Date`` (``dd/mm/YYYY``
        strings, or ``None`` when no date could be parsed), plus one entry
        per label that was found.
    """
    # "Date Found" is deliberately not a label here: that key is reserved for
    # the list of detected dates, and storing a string under it used to crash
    # the later list update.
    keywords = {
        "Patient Name": "Patient Name",
        "National ID": "Patient National ID",
        "Hospital": "Hospital / clinic",
        "GST REG NO": "GST Reg No",
        "Visit Date": "Visit Date",
        "Tax Invoice Date": "Tax Invoice Date",
        "NET AMOUNT PAYABLE": "Total Amount Payable",
        "Invoice No": "Tax invoice number/Invoice No",
    }
    date_pattern = re.compile(r'\b(\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4})\b')

    key_values = {}
    dates_found = []

    for line in text.split("\n"):
        for key, display_name in keywords.items():
            if key not in line:
                continue
            if key == "Hospital":
                key_values[display_name] = line.split(":")[-1].strip()
            elif ":" in line:
                # Guard against a label with nothing after the colon.
                tokens = line.split(":")[1].split()
                key_values[display_name] = tokens[0] if tokens else ""
            else:
                key_values[display_name] = line.split()[-1].strip()

        dates_found.extend(date_pattern.findall(line))

    key_values["Date Found"] = dates_found

    # Parse once, after all lines are read; tokens that look like dates but
    # are not valid calendar dates are kept in "Date Found" but ignored here.
    parsed_dates = sorted(
        parsed
        for parsed in map(_parse_bill_date, dates_found)
        if parsed is not None
    )

    if parsed_dates:
        key_values["Visit Date"] = parsed_dates[0].strftime("%d/%m/%Y")
        key_values["Tax Invoice Date"] = parsed_dates[-1].strftime("%d/%m/%Y")
    else:
        key_values["Visit Date"] = None
        key_values["Tax Invoice Date"] = None

    return key_values


def _parse_bill_date(date_text):
    """Parse a day-month-year token into a datetime, or return None if invalid.

    The three parts may be separated by ``/``, ``.`` or ``-`` in any mix.
    Two-digit years are read with ``%y`` and four-digit years with ``%Y``;
    anything else (for example a three-digit year or day 31 of February)
    yields ``None`` instead of raising.
    """
    parts = re.split(r'[/.-]', date_text)
    if len(parts) != 3:
        return None
    day, month, year = parts
    year_format = "%y" if len(year) == 2 else "%Y"
    try:
        return datetime.strptime(f"{day}/{month}/{year}", "%d/%m/" + year_format)
    except ValueError:
        return None

In [70]:
# Main function to extract data from the medical bill PDF
def extract_data_from_pdf(pdf_path):
    """Render each page of a PDF, OCR it and collect the extracted data.

    Pages are rendered at 200 DPI through ``convert_from_path`` using the
    module-level ``pop_path`` as the Poppler location.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list with one dict per page, in page order, holding ``Page_Number``
        (starting at 1), ``Table`` (result of ``extract_table_from_text``) and
        ``Key_Values`` (result of ``extract_key_values``).
    """
    rendered_pages = convert_from_path(pdf_path, 200, poppler_path=pop_path)

    def _describe_page(page_number, rendered_page):
        # The rendered page is converted to an array and its channels swapped
        # from RGB to BGR before being handed to the OCR helper.
        bgr_page = cv2.cvtColor(np.array(rendered_page), cv2.COLOR_RGB2BGR)
        page_text = extract_text_from_image(bgr_page)
        return {
            "Page_Number": page_number,
            "Table": extract_table_from_text(page_text),
            "Key_Values": extract_key_values(page_text)
        }

    return [
        _describe_page(page_number, rendered_page)
        for page_number, rendered_page in enumerate(rendered_pages, start=1)
    ]

In [71]:
# Extract data and format in JSON
# Run the whole pipeline on the configured PDF; `data` is the list of
# per-page dicts returned by extract_data_from_pdf.
data = extract_data_from_pdf(pdf_path)
# Serialise with a 4-space indent. json.dumps escapes non-ASCII characters by
# default, so the resulting string is plain ASCII.
output_json = json.dumps(data, indent=4)

# Save JSON output to a file
# The relative name resolves against the current working directory, and "w"
# overwrites any existing output.json.
with open("output.json", "w") as f:
    f.write(output_json)

print("Data extraction complete. JSON saved as output.json.")

Data extraction complete. JSON saved as output.json.
