In [34]:
import pytesseract
import cv2
from PIL import Image
from IPython.display import display, Image as IPyImage
import json


In [35]:
def preprocess_image(image_path, block_size=15, c=11):
    """Load an image and binarise it with adaptive mean thresholding.

    Args:
        image_path (str): Path of the image file to read with OpenCV.
        block_size (int): Side length of the pixel neighbourhood used to
            compute each local threshold (OpenCV requires an odd value > 1).
            Default is 15.
        c (int | float): Constant subtracted from the neighbourhood mean.
            Default is 11.

    Returns:
        tuple: ``(image, thresh)`` where ``image`` is the array returned by
        ``cv2.imread`` and ``thresh`` is the single-channel binary image.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    if image is None:
        raise FileNotFoundError(f"❌ Could not load image from: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Adaptive thresholding handles lighting variations across the page.
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, block_size, c
    )
    return image, thresh

def run_tesseract_ocr(thresh_img, fx=1, fy=1):
    """Run Tesseract on an image and return one entry per detected element.

    Args:
        thresh_img: Image passed straight to ``pytesseract.image_to_data``.
        fx: Horizontal scaling factor; not used here, only returned so the
            caller can forward it to the drawing helpers.
        fy: Vertical scaling factor; same pass-through behaviour as ``fx``.

    Returns:
        tuple: ``(results, fx, fy)`` where ``results`` is a list of
        ``(bbox, text, conf)`` tuples. ``bbox`` is four ``[x, y]`` corners in
        the order top-left, top-right, bottom-right, bottom-left; ``text`` is
        stripped of surrounding whitespace; ``conf`` is an ``int``.
        Entries whose text is empty are NOT filtered out here.
    """
    data = pytesseract.image_to_data(thresh_img, output_type=pytesseract.Output.DICT)
    results = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        # NOTE(review): depending on the pytesseract version the confidence
        # appears to arrive as an int, a float, or a numeric string such as
        # '96.5'; int(float(...)) accepts all of these whereas int('96.5')
        # raises ValueError.
        conf = int(float(data['conf'][i]))
        x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
        bbox = [[x, y], [x + w, y], [x + w, y + h], [x, y + h]]
        results.append((bbox, text, conf))
    return results, fx, fy  # Return scaling factors too

def draw_group_annotations(image_path, groups, fx=1.0, fy=1.0, output_path="group_annotated.png"):
    """Draw one merged bounding box (and its joined text) per OCR group.

    Args:
        image_path (str): Image to annotate; it is re-read from disk.
        groups (list): List of groups, each a list of ``(bbox, text, conf)``
            tuples where ``bbox`` is a sequence of ``[x, y]`` points.
        fx (float): Horizontal factor the bbox coordinates are divided by
            before drawing.
        fy (float): Vertical factor the bbox coordinates are divided by
            before drawing.
        output_path (str): Where the annotated image is written.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
        OSError: If OpenCV reports that writing ``output_path`` failed.
    """
    image = cv2.imread(image_path)
    if image is None:  # cv2.imread returns None instead of raising
        raise FileNotFoundError(f"❌ Could not load image from: {image_path}")

    for idx, group in enumerate(groups, 1):
        # Collect every corner of every box in the group, scaled back.
        xs, ys = [], []
        for bbox, _, _ in group:
            for x, y in bbox:
                xs.append(int(x / fx))
                ys.append(int(y / fy))

        if not xs:
            # An empty group has no extent to draw; skip it rather than crash.
            continue

        # Plain Python ints keep the OpenCV drawing calls happy and avoid
        # depending on NumPy for a simple min/max.
        x_min, x_max = min(xs), max(xs)
        y_min, y_max = min(ys), max(ys)
        height = y_max - y_min

        # Draw merged box
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)

        # Combine text
        merged_text = " ".join(text for _, text, _ in group)
        cv2.putText(image, merged_text, (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        print(f"[Group {idx}] {merged_text} : height={height}")

    # cv2.imwrite returns False on failure instead of raising.
    if not cv2.imwrite(output_path, image):
        raise OSError(f"Could not write annotated image to: {output_path}")
    print(f"✅ Group-annotated image saved: {output_path}")
    display(Image.open(output_path))

def group_adjacent_by_largest(ocr_results, tolerance=10, distance_threshold=50):
    """Group the tallest OCR block with every block of similar height.

    Args:
        ocr_results (list): ``(bbox, text, conf)`` tuples where ``bbox`` is
            four ``[x, y]`` corners (index 0 = top-left, index 2 =
            bottom-right).
        tolerance (int): Maximum allowed height difference from the tallest
            block. If the tallest block's text contains a comma this is
            replaced by ``max_height - 50``.
        distance_threshold (int): Currently unused; the distance check is
            disabled, so grouping is by height only. Kept for interface
            compatibility.

    Returns:
        list: ``[]`` when there is no non-blank block, otherwise a list
        holding a single group: ``[[(bbox, text, conf), ...]]`` with the
        tallest block first and the others in input order.
    """
    blocks = []
    for bbox, text, conf in ocr_results:
        # Blank entries carry no text to group and would otherwise be able to
        # win the "largest block" selection purely by box size.
        if not text.strip():
            continue
        blocks.append({
            "text": text,
            "height": abs(bbox[2][1] - bbox[0][1]),
            # Carry the original tuple along so no second lookup is needed.
            "result": (bbox, text, conf),
        })

    if not blocks:
        return []

    # Step 1: Find the block with the highest height
    largest_block = max(blocks, key=lambda b: b["height"])
    max_height = largest_block["height"]

    # Step 2: Adjust tolerance if needed
    if "," in largest_block["text"]:
        tolerance = max_height - 50

    print(f"📏 Largest block: {largest_block['text']} (height: {max_height})")
    print(f"🎯 Using tolerance: {tolerance}")

    # Step 3: Keep every other block whose height is within tolerance.
    group = [largest_block]
    for block in blocks:
        # Identity, not equality: only the selected block itself is skipped.
        if block is largest_block:
            continue
        if abs(block["height"] - max_height) <= tolerance:
            group.append(block)

    # Step 4: Return grouped blocks in (bbox, text, conf) format
    return [[block["result"] for block in group]]


def draw_ocr_boxes(image_path, ocr_results, fx=1.0, fy=1.0, output_path="ocr_annotated.png"):
    """Draw every OCR box and its text on the image, then save and display it.

    Args:
        image_path (str): Image to annotate; it is re-read from disk.
        ocr_results (list): ``(bbox, text, conf)`` tuples where ``bbox`` is
            four ``[x, y]`` corners (index 0 = top-left, index 2 =
            bottom-right).
        fx (float): Horizontal factor the bbox coordinates are divided by
            before drawing.
        fy (float): Vertical factor the bbox coordinates are divided by
            before drawing.
        output_path (str): Where the annotated image is written.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
        OSError: If OpenCV reports that writing ``output_path`` failed.
    """
    image = cv2.imread(image_path)
    if image is None:  # cv2.imread returns None instead of raising
        raise FileNotFoundError(f"❌ Could not load image from: {image_path}")

    for idx, (bbox, text, conf) in enumerate(ocr_results, 1):
        scaled_bbox = [[int(x / fx), int(y / fy)] for x, y in bbox]
        x_min, y_min = scaled_bbox[0]
        x_max, y_max = scaled_bbox[2]

        # Draw box
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
        # Draw text slightly above the box
        cv2.putText(image, text, (x_min, y_min - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

        print(f"[{idx}] {text} (conf: {conf})")

    # cv2.imwrite returns False on failure instead of raising.
    if not cv2.imwrite(output_path, image):
        raise OSError(f"Could not write annotated image to: {output_path}")
    print(f"✅ OCR annotated image saved to {output_path}")
    display(Image.open(output_path))

import numpy as np

def clean_and_structure_ocr_results(raw_ocr_results, confidence_threshold=None):
    """
    Cleans and structures raw OCR results into a list of dictionaries
    suitable for an NLP API prompt.

    Args:
        raw_ocr_results (list): The raw output from your OCR engine,
                                 e.g., [([[x1,y1],...], 'text', confidence), ...]
        confidence_threshold (int | float | None): Minimum confidence score
                                 for a text entry to be included. Default is
                                 None, which disables confidence filtering.

    Returns:
        list: A list of dictionaries, each representing a clean text block
              with its text, bbox (x, y, w, h), font_height (the bbox height)
              and confidence. Sorted by Y-coordinate then X-coordinate.
    """
    cleaned_data = []

    for coords_list, text, confidence in raw_ocr_results:
        stripped_text = text.strip()

        # 1. Filter out empty text and, when requested, low confidence scores
        if not stripped_text:
            continue
        if confidence_threshold is not None and float(confidence) < confidence_threshold:
            continue

        # 2. Convert the corner list to (x, y, w, h) via the min/max extents,
        #    so the result does not depend on the order of the corners.
        x_coords = [p[0] for p in coords_list]
        y_coords = [p[1] for p in coords_list]
        bbox_x, bbox_y = min(x_coords), min(y_coords)
        bbox_width = max(x_coords) - bbox_x
        bbox_height = max(y_coords) - bbox_y

        # Skip degenerate boxes (zero width or height).
        if bbox_width <= 0 or bbox_height <= 0:
            continue

        cleaned_data.append({
            "text": stripped_text,
            "bbox": [bbox_x, bbox_y, bbox_width, bbox_height],
            # 3. True font size is complex; bbox height is used as a proxy.
            "font_height": bbox_height,
            "confidence": confidence,  # Keep confidence for further analysis
        })

    # 4. Sort top to bottom, then left to right, to follow document flow.
    cleaned_data.sort(key=lambda item: (item['bbox'][1], item['bbox'][0]))

    return cleaned_data

def draw_annotations_from_json(image_path, json_data, fx=1.0, fy=1.0, output_path="annotated_articles.png"):
    """Draw headline / subheadline boxes described by a JSON document.

    Args:
        image_path (str): Image to annotate; it is re-read from disk.
        json_data (str | bytes | dict): Either JSON text or an already parsed
            object of the form
            ``{"values": [{"type": ..., "bbox": [x, y, w, h]}, ...]}``.
            Entries whose ``type`` is not ``"headline"``/``"subheadline"`` or
            whose ``bbox`` is missing or not four values are skipped.
        fx (float): Horizontal factor the bbox values are divided by.
        fy (float): Vertical factor the bbox values are divided by.
        output_path (str): Where the annotated image is written.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
        ValueError: If ``json_data`` is not valid JSON or its top level is
            not an object.
        OSError: If OpenCV reports that writing ``output_path`` failed.
    """
    # Load image
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"❌ Could not load image from: {image_path}")

    # Parse JSON (already-parsed objects are accepted as-is)
    if isinstance(json_data, (str, bytes, bytearray)):
        try:
            articles = json.loads(json_data)
        except json.JSONDecodeError as exc:
            # Chain the original error so the offending position is not lost.
            raise ValueError("❌ Invalid JSON data") from exc
    else:
        articles = json_data
    if not isinstance(articles, dict):
        raise ValueError('❌ Invalid JSON data: expected an object with a "values" list')

    # Annotation types mapped to (BGR colour, line thickness)
    parts = {
        "headline": ((255, 0, 0), 2),       # Blue
        "subheadline": ((0, 255, 0), 2),    # Green
    }

    for article in articles.get("values") or []:
        if not isinstance(article, dict):
            continue
        article_type = article.get("type")
        bbox = article.get("bbox")  # [x, y, w, h]

        if article_type not in parts or not bbox or len(bbox) != 4:
            continue

        color, thickness = parts[article_type]
        x, y, w, h = bbox

        # Scale bbox back to original image size
        x = int(x / fx)
        y = int(y / fy)
        w = int(w / fx)
        h = int(h / fy)

        cv2.rectangle(image, (x, y), (x + w, y + h), color, thickness)
        cv2.putText(
            image,
            article_type.capitalize(),
            (x, y - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            color,
            2
        )

    # cv2.imwrite returns False on failure instead of raising.
    if not cv2.imwrite(output_path, image):
        raise OSError(f"Could not write annotated image to: {output_path}")
    print(f"✅ Annotated image saved to: {output_path}")
    # Render inline like the other drawing helpers instead of launching an
    # external viewer with Image.show().
    display(Image.open(output_path))


In [None]:
# Cell 3: Run the pipeline
# OCR the page, send the cleaned text blocks to Gemini, and draw the
# headline / subheadline boxes it returns.
import google.generativeai as genai
import os
from dotenv import load_dotenv
import json

load_dotenv()
API_KEY = os.getenv("API_KEY")  # read from the environment / .env, never hardcoded
image_path = "./Label Studio/imgs/page_1.png"  # Replace with your image path

original_img, thresholded = preprocess_image(image_path)
ocr_results, fx, fy = run_tesseract_ocr(thresholded)
cleaned_results = clean_and_structure_ocr_results(ocr_results)
genai.configure(api_key=API_KEY)

model = genai.GenerativeModel("models/gemini-2.5-flash")

# Example of the exact response structure the model is asked to follow.
json_sample = {
    "values": [
        {
            "type": "headline",
            "text": "nLOUSe tO OMEN DicamM to pulDllic",
            "bbox": [155, 1720, 3460, 313]
        },
        {
            "type": "subheadline",
            "text": "Senate minority bloc backs more transparency to end corruption",
            "bbox": [205, 2075, 3334, 129]
        }
    ]
}

# Not used below; kept so that any later cell relying on these names still works.
processed_img = Image.open(image_path)
prompt = []

# Send a prompt
response = model.generate_content(f"""Here is a list of text blocks/lines extracted from a newspaper page, along with their approximate bounding box dimensions (x, y, width, height) and estimated font heights. Please identify the main headline and any associated subheadlines for each distinct article. Group them logically. Prioritize lines with larger font heights and top-of-column positions as potential headlines.

Data:
{json.dumps(cleaned_results, indent=2)}

Based on this data, please provide the identified main headlines and their subheadlines (if available), put the whole headline inside a big bounding box, do the same with subheadlines(if they exist). Please only send json file following the structure (without any text, just the json):
{json.dumps(json_sample)}
""")

# The reply is usually wrapped in a Markdown ```json fence. str.strip("```json")
# removes a *set of characters*, not that prefix/suffix, and fails as soon as
# the reply ends with a newline after the closing fence. Slice out the
# outermost JSON object instead.
response_text = response.text
json_start = response_text.find("{")
json_end = response_text.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError("Model response does not contain a JSON object")

draw_annotations_from_json(image_path, response_text[json_start:json_end + 1], fx=fx, fy=fy)
# Uncomment to inspect the raw OCR boxes instead:
# draw_ocr_boxes(image_path, ocr_results, fx=fx, fy=fy)






✅ Annotated image saved to: annotated_articles.png
