In [22]:
import os
import re
from PIL import Image, ImageEnhance
import pytesseract

In [23]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [24]:
DATASET_PATH = r"C:\Users\DELL\OneDrive\Desktop\bajaj\lab_reports_samples"

# Enhance image for better OCR
def preprocess_image(image: Image.Image) -> Image.Image:
    image = image.convert('L')
    image = ImageEnhance.Contrast(image).enhance(2.0)
    image = ImageEnhance.Sharpness(image).enhance(2.0)
    return image


In [25]:
from PIL import Image, ImageEnhance

def preprocess_image(image: Image.Image) -> Image.Image:
    image = image.convert('L')  # Convert to grayscale
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.0)  # Increase contrast
    enhancer = ImageEnhance.Sharpness(image)
    image = enhancer.enhance(2.0)  # Increase sharpness
    return image


In [26]:
import pytesseract

def extract_text_from_image(image: Image.Image) -> str:
    config = r'--oem 3 --psm 6'
    return pytesseract.image_to_string(image, config=config)


In [27]:
import re

def check_out_of_range(value: str, ref_range: str) -> bool:
    try:
        numbers = re.findall(r'[\d.]+', ref_range)
        if len(numbers) == 2:
            lower, upper = float(numbers[0]), float(numbers[1])
            val = float(value)
            return val < lower or val > upper
    except:
        pass
    return False


In [49]:
def parse_lab_test_results(text: str):
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    results = []

    pattern = re.compile(
        r'^(.*?)\s+([\d.]+)\s+([^\d]*\d+[^\d]*\s*-\s*[^\d]*\d+[^\d]*)$'
    )

    for line in lines:
        match = pattern.search(line)
        if match:
            test_name = match.group(1).strip()
            value = match.group(2).strip()
            ref_range = match.group(3).strip()
            out_of_range = check_out_of_range(value, ref_range)

            results.append({
                "lab_test_name": test_name,
                "observed_value": value,
                "bio_reference_range": ref_range,
                "lab_test_out_of_range": out_of_range
            })

    return results


In [37]:
import os
from PIL import Image

# Path to your input image
input_path = r"C:\Users\DELL\OneDrive\Desktop\bajaj\lab_reports_samples\lbmaske\AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png"
# Path to save the enhanced output
output_path = r"C:\Users\DELL\OneDrive\Desktop\bajaj\lab_reports_samples\lbmaske\enhanced_output.png"

try:
    with Image.open(input_path) as img:
        print(f"Enhancing image: {os.path.basename(input_path)}")
        enhanced_img = preprocess_image(img)

        # Save the enhanced image
        enhanced_img.save(output_path)
        print(f"Enhanced image saved to: {output_path}")

        # Optionally show the enhanced image
        enhanced_img.show()

except Exception as e:
    print(f"Error enhancing image: {e}")


Enhancing image: AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png
Enhanced image saved to: C:\Users\DELL\OneDrive\Desktop\bajaj\lab_reports_samples\lbmaske\enhanced_output.png


In [50]:
import os
from PIL import Image

# Full path to your test image
file_path = r"C:\Users\DELL\OneDrive\Desktop\bajaj\lab_reports_samples\lbmaske\AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png"

try:
    with Image.open(file_path) as img:
        print(f"\nProcessing: {os.path.basename(file_path)}")
        img = preprocess_image(img)
        text = extract_text_from_image(img)
        results = parse_lab_test_results(text)

        if results:
            for r in results:
                print(f"Test Name: {r['lab_test_name']}")
                print(f"Observed Value: {r['observed_value']}")
                print(f"Reference Range: {r['bio_reference_range']}")
                print(f"Out of Range: {r['lab_test_out_of_range']}")
                print("---")
        else:
            print("No lab test data found.")
except Exception as e:
    print(f"Error processing {file_path}: {e}")



Processing: AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png
Test Name: Total W.B.C. Count :
Observed Value: 10560
Reference Range: [H] ful 4000-10000 /ul
Out of Range: True
---
Test Name: Neutrophils :
Observed Value: 87.7
Reference Range: [H) % 40-70 %
Out of Range: True
---
Test Name: Lymphocytes :
Observed Value: 59
Reference Range: [L] % 20-40 %
Out of Range: True
---
Test Name: Eosinophils >
Observed Value: 0.7
Reference Range: LL % 1-6%
Out of Range: True
---
Test Name: Monocytes >
Observed Value: 685
Reference Range: % 2-10 %
Out of Range: True
---
Test Name: Basophils >
Observed Value: 0.2
Reference Range: % 0-1%
Out of Range: False
---
Test Name: Platelet Count :
Observed Value: 370
Reference Range: ful 150-450 /uL
Out of Range: False
---
Test Name: { #) IMMATURE PLATELET FRACTION:
Observed Value: 2.90
Reference Range: % 0-5 %
Out of Range: False
---
Test Name: Neutrophils (abs) :
Observed Value: 9261.12
Reference Range: [H] ful 1575-8800 /uL
Out of Range: True
---

In [51]:
import os
from PIL import Image
import pandas as pd

# Full path to your test image
file_path = r"C:\Users\DELL\OneDrive\Desktop\bajaj\lab_reports_samples\lbmaske\AHD-0425-PA-0008061_E-mahendrasinghdischargecard_250427_1114@E.pdf_page_27.png"

try:
    with Image.open(file_path) as img:
        print(f"\nProcessing: {os.path.basename(file_path)}")
        img = preprocess_image(img)
        text = extract_text_from_image(img)
        results = parse_lab_test_results(text)

        if results:
            df = pd.DataFrame(results)
            print("\nStructured Output:\n")
            print(df)
        else:
            print("No lab test data found.")
except Exception as e:
    print(f"Error processing {file_path}: {e}")



Processing: AHD-0425-PA-0008061_E-mahendrasinghdischargecard_250427_1114@E.pdf_page_27.png

Structured Output:

                 lab_test_name observed_value    bio_reference_range  \
0     Packed Cell Volume (HCT)            425                % 40-50   
1        Mean Cell Volume(MCV)           86.7              fl 83-101   
2   Mean Cell Hemoglobin( MCH}           24.7               pe 27-33   
3      Mean Cell Hb Conc(MCHC)           28.5               % 32 -38   
4              Total WBC Count          13600  cells/cumm 4000-11000   
5                  Neutrophils             66                % 50-70   
6                  Lymphocytes             24               % 20 -40   
7                  Eosinophils              4                  % 1-6   
8                    Monocytes             06                 % 0-10   
9   Absolute Neutrophils Count           8976        /cumm 2000-7000   
10  Absolute Lymphocytes Count           3264        /cumm 1000-3000   
11    Absolute Monocyte

In [47]:
import os
import re
from PIL import Image
import pytesseract
import numpy as np
import cv2

# If using Windows and Tesseract is not in PATH, uncomment and set the path:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Helper: Preprocess image for better OCR
def preprocess_image(pil_image):
    img = np.array(pil_image.convert("RGB"))
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return Image.fromarray(thresh)

# Helper: OCR to text
def extract_text_from_image(pil_image):
    return pytesseract.image_to_string(pil_image)

# Helper: Check if observed value is out of reference range
def check_out_of_range(value, ref_range):
    try:
        value = float(value)
        numbers = [float(n) for n in re.findall(r"[-+]?\d*\.\d+|\d+", ref_range)]
        if len(numbers) == 2:
            return value < numbers[0] or value > numbers[1]
        else:
            return False
    except:
        return False

# Parser: Extract lab test data from text
def parse_lab_test_results(text):
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    results = []

    # Pattern to capture: test name, value, unit, reference range
    pattern = re.compile(r'^(.*?)([:\-]?)\s*([\d.]+)\s*(\[\w*\])?\s*([a-zA-Z/%μulUL\d]+)?\s*([<>=]*\s*[\d.]+[-–]\s*[\d.]+)?')

    for line in lines:
        match = pattern.match(line)
        if match:
            test_name = match.group(1).strip(" :.-")
            test_value = match.group(3)
            unit = match.group(5) or ""
            ref_range = match.group(6) or ""
            out_of_range = check_out_of_range(test_value, ref_range)

            results.append({
                "test_name": test_name,
                "test_value": test_value,
                "bio_reference_range": ref_range,
                "test_unit": unit,
                "lab_test_out_of_range": out_of_range
            })

    return {
        "is_success": True,
        "data": results
    }

# MAIN EXECUTION BLOCK
file_path = r"C:\Users\DELL\OneDrive\Desktop\bajaj\lab_reports_samples\lbmaske\AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png"

try:
    with Image.open(file_path) as img:
        print(f"\nProcessing: {os.path.basename(file_path)}")
        img = preprocess_image(img)
        text = extract_text_from_image(img)
        results = parse_lab_test_results(text)

        if results and results.get("data"):
            for r in results["data"]:
                print(f"Test Name: {r['test_name']}")
                print(f"Observed Value: {r['test_value']}")
                print(f"Reference Range: {r['bio_reference_range']}")
                print(f"Test Unit: {r['test_unit']}")
                print(f"Out of Range: {r['lab_test_out_of_range']}")
                print("---")
        else:
            print("No lab test data found.")
except Exception as e:
    print(f"Error processing {file_path}: {e}")



Processing: AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png
Test Name: / ¢ Atul S
Observed Value: .
Reference Range: 
Test Unit: Vadhavkar
Out of Range: False
---
Test Name: a f
Observed Value: 6.
Reference Range: 
Test Unit: Sc
Out of Range: False
---
Test Name: S H R E cE
Observed Value: .
Reference Range: 
Test Unit: pon
Out of Range: False
---
Test Name: 
Observed Value: 02
Reference Range: 
Test Unit: CE
Out of Range: False
---
Test Name: Time
Observed Value: 8
Reference Range: 
Test Unit: 
Out of Range: False
---
Test Name: m
Observed Value: 7
Reference Range: 
Test Unit: 
Out of Range: False
---
Test Name: — tC ———_
Observed Value: 7
Reference Range: 
Test Unit: 
Out of Range: False
---
Test Name: a
Observed Value: .
Reference Range: 
Test Unit: 
Out of Range: False
---
Test Name: Shree Hospital IPD Time: |
Observed Value: 2
Reference Range: 
Test Unit: 
Out of Range: False
---
Test Name: Sample Id
Observed Value: 10436879
Reference Range: 
Test Unit: Report
Out of R