In [14]:
import pytesseract
import cv2
import os
import numpy as np
import json

# Initialize Tesseract (Ensure it's installed and path is set)
# You may need to set the Tesseract executable path depending on your OS (e.g., Windows).
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # For Windows

def preprocess_and_ocr_tesseract(image_path):
    """
    Loads an image, performs OCR using Tesseract, and returns structured results.
    
    Args:
        image_path (str): Path to the input image file.
    
    Returns:
        list: A list of dictionaries, where each dictionary contains:
              'text' (str): The detected text string.
              'confidence' (float): The confidence score of the detection.
              'box' (list): A list of four [x, y] coordinates representing the
                            bounding box corners (top-left, top-right, bottom-right, bottom-left).
        None: If the image cannot be read or OCR fails.
    """
    if not os.path.exists(image_path):
        print(f"Error: Image file not found at {image_path}")
        return None

    # Load the image
    image = cv2.imread(image_path)

    # Perform OCR using Tesseract
    try:
        print(f"Processing image: {image_path}")
        
        # 'image_to_data' provides detailed info about each word (text, box, and confidence)
        ocr_result = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
        
        # Check if any text was detected
        if len(ocr_result['text']) == 0:
            print("No text detected in the image.")
            return []
        
        # Process the OCR result
        extracted_data = []
        for i in range(len(ocr_result['text'])):
            if int(ocr_result['conf'][i]) > 0:  # Ignore bad confidence scores
                text = ocr_result['text'][i]
                confidence = float(ocr_result['conf'][i])
                box = [
                    [ocr_result['left'][i], ocr_result['top'][i]], 
                    [ocr_result['left'][i] + ocr_result['width'][i], ocr_result['top'][i]],
                    [ocr_result['left'][i] + ocr_result['width'][i], ocr_result['top'][i] + ocr_result['height'][i]],
                    [ocr_result['left'][i], ocr_result['top'][i] + ocr_result['height'][i]]
                ]
                extracted_data.append({
                    "text": text,
                    "confidence": confidence,
                    "box": box
                })

        print(f"OCR complete. Detected {len(extracted_data)} text blocks.")
        return extracted_data

    except Exception as e:
        print(f"Error during OCR processing for {image_path}: {e}")
        return None


# Example Usage
if __name__ == "__main__":
    image_path = r"C:\Users\Admin\Downloads\BK\lab_reports_samples\lbmaske\GUR-0425-PA-0052331_Q-gomati201590001compressed_250422_1706@F.pdf_page_53.png"  # Make sure this file exists
    extracted_results = preprocess_and_ocr_tesseract(image_path)

    if extracted_results is not None:
        print(f"\n--- Extracted Text and Coordinates for {image_path} ---")
        # Print results in a readable JSON format
        print(json.dumps(extracted_results, indent=2))

        # Save OCR results to a JSON file
        json_output_path = 'ocr_results.json'
        with open(json_output_path, 'w') as json_file:
            json.dump(extracted_results, json_file, indent=2)
        print(f"\nOCR results saved to: {json_output_path}")

        # Optional: Visualize bounding boxes
        try:
            image = cv2.imread(image_path)
            boxes = [line['box'] for line in extracted_results]
            txts = [line['text'] for line in extracted_results]
            scores = [line['confidence'] for line in extracted_results]

            # Draw boxes and texts on the image
            for box, text in zip(boxes, txts):
                cv2.polylines(image, [np.array(box, dtype=np.int32)], isClosed=True, color=(0, 255, 0), thickness=2)
                cv2.putText(image, text, (box[0][0], box[0][1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

            output_image_path = 'ocr_output_visualization_tesseract.png'
            cv2.imwrite(output_image_path, image)
            print(f"\nOCR visualization saved to: {output_image_path}")
        
        except Exception as e:
            print(f"Error during visualization: {e}")
    else:
        print("OCR process failed.")


Processing image: C:\Users\Admin\Downloads\BK\lab_reports_samples\lbmaske\GUR-0425-PA-0052331_Q-gomati201590001compressed_250422_1706@F.pdf_page_53.png
OCR complete. Detected 166 text blocks.

--- Extracted Text and Coordinates for C:\Users\Admin\Downloads\BK\lab_reports_samples\lbmaske\GUR-0425-PA-0052331_Q-gomati201590001compressed_250422_1706@F.pdf_page_53.png ---
[
  {
    "text": " ",
    "confidence": 95.0,
    "box": [
      [
        0,
        2321
      ],
      [
        959,
        2321
      ],
      [
        959,
        2328
      ],
      [
        0,
        2328
      ]
    ]
  },
  {
    "text": "HEALTHCARE",
    "confidence": 95.0,
    "box": [
      [
        222,
        196
      ],
      [
        373,
        196
      ],
      [
        373,
        209
      ],
      [
        222,
        209
      ]
    ]
  },
  {
    "text": "SERVICES",
    "confidence": 95.0,
    "box": [
      [
        402,
        195
      ],
      [
        514,
        195
      ]

In [15]:
import re
import json
import os # Added for file path operations
from collections import defaultdict
import statistics # For calculating median if needed

# --- Configuration ---
# Adjust these tolerances based on your document's font size and spacing
Y_TOLERANCE = 10  # Max vertical distance between baselines to be considered same line
X_TOLERANCE = 40  # Max horizontal distance to consider columns aligned (can be tricky)
MIN_CONFIDENCE = 60 # Ignore OCR results below this confidence

# --- Regular Expressions (Refine these heavily based on more examples) ---
# Regex to identify numeric values (potentially with <, > symbols)
# Allows integers, decimals, and optional leading < or > followed by space
REGEX_VALUE = re.compile(r"^(?:[<>]\s?)?\d+(\.\d+)?$")
# Regex for common reference range patterns
REGEX_RANGE = re.compile(
    r"^\d+(\.\d+)?\s*-\s*\d+(\.\d+)?$"  # e.g., 0.5 - 1.5 or 10 - 20
    r"|^<\s*\d+(\.\d+)?"               # e.g., < 10
    r"|^>\s*\d+(\.\d+)?"               # e.g., > 50
    r"|^UP TO\s+\d+(\.\d+)?"           # e.g., UP TO 41
    r"|^\d+\s*-\s*\d+$"                 # e.g., 0-10 (tolerant spacing)
    # Add more patterns as needed, e.g., age-specific ranges require more logic
)
# Simple list for potential units (expand significantly)
KNOWN_UNITS = {"%", "g/dl", "mg/dl", "iu/l", "u/l", "/ul", "x10^3/ul",
               "cells/ul", "fl", "pg", "gm/dl", "mm/hr", "me/l", "ng/"}
# Keywords to filter out (headers, footers, noise) - case-insensitive
FILTER_KEYWORDS = {'report', 'patient', 'doctor', 'hospital', 'clinic', 'date',
                   'sample', 'serum', 'blood', 'specimen', 'page', 'regd', 'lab',
                   'signature', 'technician', 'authorized', 'printed', 'cygnus',
                   'sterling', 'ruby', 'hall', 'good', 'life', 'oscar', 'diagnostic',
                   'superspeciality', 'medical', 'foundation', 'test', 'result',
                   'unit', 'range', 'investigation', 'biochemistry', 'haematology',
                   'header', 'footer', 'department', 'method', 'parameter', 'sr no',
                   'uhid', 'ip no', 'ref by', 'end of the report'}

# --- Helper Functions ---
def get_box_center(box):
    """Calculates the center (x, y) of a bounding box."""
    x_coords = [p[0] for p in box]
    y_coords = [p[1] for p in box]
    center_x = sum(x_coords) / 4
    center_y = sum(y_coords) / 4
    return center_x, center_y

def get_box_y_range(box):
    """Gets the min and max y coordinates of a bounding box."""
    y_coords = [p[1] for p in box]
    return min(y_coords), max(y_coords)

def get_box_x_start(box):
    """Gets the leftmost x coordinate (top-left x)."""
    return box[0][0]

def parse_numeric_value(value_str):
    """Extracts the numeric part from a value string (handles <, >)."""
    match = re.search(r"(\d+(\.\d+)?)", value_str)
    return float(match.group(1)) if match else None

def parse_range(range_str):
    """Attempts to parse low and high values from a range string."""
    range_str = range_str.replace("UP TO", "<") # Normalize "UP TO"
    low, high = None, None
    # Pattern: num - num
    match = re.match(r"^\s*(\d+(\.\d+)?)\s*-\s*(\d+(\.\d+)?)\s*$", range_str)
    if match:
        low = float(match.group(1))
        high = float(match.group(3))
        return low, high
    # Pattern: < num
    match = re.match(r"^\s*<\s*(\d+(\.\d+)?)\s*$", range_str)
    if match:
        high = float(match.group(1))
        return low, high # low remains None
    # Pattern: > num
    match = re.match(r"^\s*>\s*(\d+(\.\d+)?)\s*$", range_str)
    if match:
        low = float(match.group(1))
        return low, high # high remains None

    # Fallback for simple number ranges like 0-6
    match = re.match(r"^\s*(\d+)\s*-\s*(\d+)\s*$", range_str)
    if match:
        low = float(match.group(1))
        high = float(match.group(2))
        return low, high

    # If parsing fails, return Nones
    return low, high

def is_out_of_range(value_str, range_str):
    """Checks if a value is outside the reference range."""
    value = parse_numeric_value(value_str)
    if value is None:
        return None # Cannot determine if value is not numeric

    low, high = parse_range(range_str)

    if low is not None and high is not None:
        return not (low <= value <= high)
    elif high is not None: # Only upper bound exists (e.g., < 10)
        return value >= high
    elif low is not None: # Only lower bound exists (e.g., > 50)
        return value <= low
    else:
        return None # Cannot determine if range is not parsable

# --- Main Processing Function ---
def extract_lab_data_from_ocr(ocr_results):
    """
    Processes OCR results (list of dicts with 'text', 'confidence', 'box')
    to extract structured lab data using heuristics and regex.

    Args:
        ocr_results (list): Output from PaddleOCR or similar.

    Returns:
        list: A list of dictionaries for each extracted lab test.
    """
    if not ocr_results:
        return []

    # 1. Filter low confidence and common noise keywords
    filtered_blocks = []
    for block in ocr_results:
        # Ensure block has expected keys before accessing them
        if not all(k in block for k in ['text', 'confidence', 'box']):
            print(f"Warning: Skipping malformed block: {block}")
            continue

        text_lower = block['text'].lower().strip()
        # Basic confidence filter
        if block['confidence'] < MIN_CONFIDENCE:
            continue
        # Filter common irrelevant keywords
        is_noise = False
        for keyword in FILTER_KEYWORDS:
             # Use 'in' for partial matches, adjust if too aggressive
            if keyword in text_lower:
                is_noise = True
                break
        # Filter very short text unless it's a known unit or potentially a value part
        if not is_noise and len(text_lower) <= 1 and text_lower not in KNOWN_UNITS and not text_lower.isdigit():
             is_noise = True

        if not is_noise and text_lower: # Also check if text is not empty after stripping
             filtered_blocks.append(block)

    if not filtered_blocks:
        return []

    # 2. Group blocks into lines based on vertical proximity (baseline/center)
    # Sort by Y coordinate first to process lines top-down
    filtered_blocks.sort(key=lambda b: get_box_y_range(b.get('box'))[0])

    lines = []
    current_line = []
    if filtered_blocks:
        current_line.append(filtered_blocks[0])
        last_y_min, last_y_max = get_box_y_range(filtered_blocks[0]['box'])

        for i in range(1, len(filtered_blocks)):
            block = filtered_blocks[i]
            y_min, y_max = get_box_y_range(block['box'])
            # Check vertical overlap or proximity with the *last block added* to the line
            # Check if the current block's vertical range is close to the last block's range
            if abs(y_min - last_y_min) < Y_TOLERANCE or abs(y_max - last_y_max) < Y_TOLERANCE or \
               (y_min >= last_y_min - Y_TOLERANCE and y_min <= last_y_max + Y_TOLERANCE): # Overlap check
                current_line.append(block)
                # Update the line's y-range bounds with the latest block
                last_y_min = min(last_y_min, y_min)
                last_y_max = max(last_y_max, y_max)
            else:
                # Start a new line
                if current_line:
                    lines.append(sorted(current_line, key=lambda b: get_box_x_start(b['box'])))
                current_line = [block]
                last_y_min, last_y_max = y_min, y_max

        if current_line: # Add the last line
            lines.append(sorted(current_line, key=lambda b: get_box_x_start(b['box'])))

    # 3. Process each line to identify components
    extracted_tests = []
    for line in lines:
        # Skip very short lines unlikely to contain full results
        if len(line) < 2:
            continue

        potential_name = []
        potential_value = None
        potential_unit = None
        potential_range = None

        # Simple positional and pattern matching logic (ASSUMES Name, Value, Unit, Range order)
        possible_entities = [] # Store (type, text, block)
        for block in line:
            text = block['text'].strip()
            if not text: continue # Skip empty blocks

            # Check patterns in order of distinctiveness
            if REGEX_RANGE.match(text):
                possible_entities.append(("RANGE", text, block))
            elif REGEX_VALUE.match(text):
                possible_entities.append(("VALUE", text, block))
            elif text.lower() in KNOWN_UNITS or ('/' in text and len(text) < 10): # Basic unit heuristic
                 possible_entities.append(("UNIT", text, block))
            else:
                # Tentatively consider everything else as part of the name
                possible_entities.append(("NAME_PART", text, block))

        # Try to assemble the result based on identified entities
        found_name = []
        found_value = None
        found_unit = None
        found_range = None

        # Combine consecutive NAME_PARTs at the beginning
        name_indices = [i for i, (type, _, _) in enumerate(possible_entities) if type == "NAME_PART"]
        if name_indices:
            consecutive_name_parts = []
            last_index = -1
            for i, index in enumerate(name_indices):
                # Ensure the name part is actually at the beginning or follows the previous one
                if index == 0 or index == last_index + 1:
                    consecutive_name_parts.append(possible_entities[index][1])
                    last_index = index
                else:
                    # Stop if name parts are not contiguous from the start
                    break
            if consecutive_name_parts:
                 found_name = [" ".join(consecutive_name_parts)]


        for type, text, block in possible_entities:
            if type == "VALUE" and not found_value:
                found_value = text
            elif type == "UNIT" and not found_unit:
                # Simple check: avoid assigning a unit if it's identical to the value
                # (e.g., OCR reading '100' as value and '%' as unit, but also reading '%' separately)
                if found_value is None or text != found_value:
                    found_unit = text
            elif type == "RANGE" and not found_range:
                found_range = text

        # Create result if Name, Value, and Range are found (Unit is optional)
        if found_name and found_value and found_range:
            test_name = found_name[0]
            test_value = found_value
            test_unit = found_unit if found_unit else ""
            bio_ref_range = found_range

            # Basic filter: Ensure name is not just numbers or symbols AND is reasonably long
            # Also ensure value is not identical to range (sometimes OCR duplicates)
            if len(test_name) > 1 and not re.match(r"^[0-9\s.<>-]+$", test_name) and test_value != bio_ref_range:
                out_of_range = is_out_of_range(test_value, bio_ref_range)

                extracted_tests.append({
                    "test_name": test_name.strip(),
                    "test_value": test_value,
                    "bio_reference_range": bio_ref_range,
                    "test_unit": test_unit,
                    "lab_test_out_of_range": out_of_range
                })

    return extracted_tests

# --- Main Execution ---
if __name__ == "__main__":
    # Specify the path to your JSON file containing OCR results
    ocr_json_file_path = 'ocr_results.json' # CHANGE THIS to your actual file path

    ocr_results = None
    # --- Load OCR results from JSON file ---
    try:
        if not os.path.exists(ocr_json_file_path):
            print(f"Error: JSON file not found at {ocr_json_file_path}")
        else:
            with open(ocr_json_file_path, 'r', encoding='utf-8') as f:
                ocr_results = json.load(f)
            print(f"Successfully loaded OCR results from {ocr_json_file_path}")

    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {ocr_json_file_path}. Check file format.")
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")

    # --- Run the extraction if OCR results were loaded successfully ---
    if ocr_results:
        final_data = extract_lab_data_from_ocr(ocr_results)

        # --- Print the final structured data ---
        print("\n--- Extracted Lab Data ---")
        if final_data:
            print(json.dumps(final_data, indent=2))
        else:
            print("No lab data could be extracted based on the defined rules.")
        print("--------------------------")
    else:
        print("Could not proceed with extraction as OCR results were not loaded.")

Successfully loaded OCR results from ocr_results.json

--- Extracted Lab Data ---
[
  {
    "test_name": "CRP",
    "test_value": "21.3",
    "bio_reference_range": "0-6",
    "test_unit": "",
    "lab_test_out_of_range": true
  }
]
--------------------------


------------------------------
Starting OCR Process...
Processing image with Tesseract: C:\Users\Admin\Downloads\BK\lab_reports_samples\lbmaske\GUR-0425-PA-0052331_Q-gomati201590001compressed_250422_1706@F.pdf_page_53.png
OCR complete. Detected 128 valid text words.
------------------------------
Raw OCR results saved to: C:\Users\Admin\Downloads\BK\Approach 2\ocr_results.json
OCR visualization saved to: C:\Users\Admin\Downloads\BK\Approach 2\ocr_output_optimized.png

------------------------------
Starting Data Extraction Process...
Filtered down to 77 blocks for analysis.
Grouped into 21 potential data lines.
------------------------------

--- Extracted Lab Data (Structured) ---
No structured lab data could be extracted based on the rules.
---------------------------------------
