In [3]:
from PIL import Image
import os

PAGE_IMAGES_FOLDER = "D:\OCR session"

In [4]:
%pip install scikit-image

Note: you may need to restart the kernel to use updated packages.


---
## Tilt correction

---

In [5]:
import cv2
import numpy as np
from skimage.transform import hough_line, hough_line_peaks
from skimage.feature import canny

WATERMARK_THRESHOLD_LOW = 175
WATERMARK_THRESHOLD_HIGH = 250

# Model for orientation skew correction
EAST_MODEL = "frozen_east_text_detection.pb"
ANGLE_TOLLERANCE = 0.25
MIN_CONFIDENCE = 0.5
MARGIN_TOLLERANCE = 9
EAST_WIDTH = 1280
EAST_HEIGHT = 1280
ALIGN = False
ALIGN_MODE = 'FAST'

class Orientation:

    def __init__(self, image, file_properties, conf_threshold=50, lang='eng'):

        # self.image_path     = image_path
        self.image = image
        self.file_properties = file_properties
        # self.lines          = lines
        self.conf_threshold = int(conf_threshold)

        self.timer = {'net': 0, 'restore': 0, 'nms': 0}
        self.text = {}
        self.lang = lang

        # self.re_orient()

    def rotate_bound(self, image, angle):
        # grab the dimensions of the image and then determine the
        # center
        (h, w) = image.shape[:2]
        (cX, cY) = (w / 2, h / 2)

        # grab the rotation matrix (applying the negative of the
        # angle to rotate clockwise), then grab the sine and cosine
        # (i.e., the rotation components of the matrix)
        M = cv2.getRotationMatrix2D((cX, cY), -angle, 1.0)
        cos = np.abs(M[0, 0])
        sin = np.abs(M[0, 1])

        # compute the new bounding dimensions of the image
        nW = int((h * sin) + (w * cos))
        nH = int((h * cos) + (w * sin))

        # adjust the rotation matrix to take into account translation
        M[0, 2] += (nW / 2) - cX
        M[1, 2] += (nH / 2) - cY

        # perform the actual rotation and return the image
        return cv2.warpAffine(image, M, (nW, nH), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    def east_detect(self, image, args):

        # orig = image.copy()
        (H, W) = image.shape[:2]

        (newW, newH) = (args["width"], args["height"])
        rW = W / float(newW)
        rH = H / float(newH)

        image = cv2.resize(image, (newW, newH))
        (H, W) = image.shape[:2]

        layerNames = [
            "feature_fusion/Conv_7/Sigmoid",
            "feature_fusion/concat_3"]

        # print("[INFO] loading EAST text detector...")
        net = cv2.dnn.readNet(args["east"])

        blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),
                                     (123.68, 116.78, 103.94), swapRB=True, crop=False)
        # start = time.time()
        net.setInput(blob)
        (scores, geometry) = net.forward(layerNames)
        # end = time.time()

        # print("[INFO] text detection took {:.6f} seconds".format(end - start))

        # confidence scores
        (numRows, numCols) = scores.shape[2:4]
        angl = []

        for y in range(0, numRows):

            scoresData = scores[0, 0, y]
            anglesData = geometry[0, 4, y]

            for x in range(0, numCols):
                if scoresData[x] < args["min_confidence"]:
                    continue

                angle = anglesData[x]
                angl.append(angle*180/(np.pi))

        return np.median(angl)

    def east(self, image, args):

        # image = cv2.imread(image_path)
        angle = Orientation.east_detect(self, image, args)
        # print("angle*********",angle)

        return image, angle

    def hough_transforms(self, image):

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        thresh = cv2.GaussianBlur(gray, (11, 11), 0)
        edges = canny(thresh)
        tested_angles = np.deg2rad(np.arange(0.1, 180.0))
        h, theta, d = hough_line(edges, theta=tested_angles)
        accum, angles, dists = hough_line_peaks(h, theta, d)

        return accum, angles, dists

    def east_hough_line(self, image, args):
        image, angle = Orientation.east(self, image, args)
        h, theta, d = Orientation.hough_transforms(self, image)
        theta = np.rad2deg(np.pi/2-theta)
        # theta = np.rad2deg(theta-np.pi/2)
        margin = args['margin_tollerance']
        low_thresh = angle-margin
        high_thresh = angle+margin
        filter_theta = theta[theta > low_thresh]
        filter_theta = filter_theta[filter_theta < high_thresh]

        return image, np.median(filter_theta)

    def re_orient_east(self):
        lang = 'hi'

        args = {
            "image": self.image,
            "east": EAST_MODEL,
            "min_confidence": MIN_CONFIDENCE,
            "margin_tollerance": MARGIN_TOLLERANCE,
            "width": EAST_WIDTH,
            "height": EAST_HEIGHT
        }

        image, angle = Orientation.east_hough_line(self, args['image'], args)

        if abs(angle) > ANGLE_TOLLERANCE:
            image = Orientation.rotate_bound(self, image, angle)
            # print(self.image_path)
            # image_path = Orientation(self.image_path)
            # cv2.imwrite(f'{self.image_path}', image)

        print("Angle detectd is  {} ".format(angle))

        return image, angle

In [6]:
%pip install doclayout-yolo

Note: you may need to restart the kernel to use updated packages.


In [7]:
%pip show doclayout-yolo


Name: doclayout_yolo
Version: 0.0.2
Summary: DocLayout-YOLO: an effecient and robust document layout analysis method.
Home-page: 
Author: Zhiyuan Zhao, Hengrui Kang, Bin Wang, Conghui He
Author-email: 
License: AGPL-3.0
Location: c:\users\asus\appdata\local\programs\python\python310\lib\site-packages
Requires: albumentations, matplotlib, opencv-python, pandas, pillow, psutil, py-cpuinfo, pyyaml, requests, scipy, seaborn, thop, torch, torchvision, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [1]:
from ultralytics import YOLO

model_path = r"D:\OCR\Pipeline\yolov12l-doclaynet.pt"
model = YOLO(model_path)

---
## Layout detection

---

In [2]:
from PIL import Image, ImageOps
import os
import numpy as np
import cv2
from collections import Counter, defaultdict
from skimage.transform import hough_line, hough_line_peaks
from skimage.feature import canny
from doclayout_yolo import YOLOv10  # Ensure doclayout-yolo is installed

# ---------- Model Setup ----------
model_path = r"D:\OCR\Pipeline\yolov12l-doclaynet.pt"
model = YOLO(model_path)

def iou(box1, box2):
    x1, y1, x2, y2 = box1
    x1_p, y1_p, x2_p, y2_p = box2
    xi1, yi1 = max(x1, x1_p), max(y1, y1_p)
    xi2, yi2 = min(x2, x2_p), min(y2, y2_p)
    iw = max(0.0, xi2 - xi1)
    ih = max(0.0, yi2 - yi1)
    inter = iw * ih
    area1 = max(0.0, (x2 - x1) * (y2 - y1))
    area2 = max(0.0, (x2_p - x1_p) * (y2_p - y1_p))
    union = area1 + area2 - inter
    return inter / union if union > 0 else 0.0

def perform_prediction(image, model, imgsz=1024, conf=0.4, iou=0.45, device="cpu"):
    return model.predict(image, imgsz=imgsz, conf=conf, iou=iou, device=device)

def extract_bboxes(det_res_list):
    boxes_info = []
    for result in det_res_list:
        names = result.names
        for box in result.boxes:
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            label_idx = int(box.cls.item())
            confidence = float(box.conf.item())
            label = names.get(label_idx, str(label_idx))
            boxes_info.append({"bbox": [x1, y1, x2, y2], "label": label, "confidence": confidence})
    return boxes_info

def nms_classwise(boxes, iou_thresh=0.3):  # Less aggressive
    out = []
    labels = set([b["label"] for b in boxes])
    for label in labels:
        group = [b for b in boxes if b["label"] == label]
        group = sorted(group, key=lambda x: x["confidence"], reverse=True)
        keep = []
        suppressed = [False] * len(group)
        for i in range(len(group)):
            if suppressed[i]:
                continue
            keep.append(group[i])
            for j in range(i + 1, len(group)):
                if suppressed[j]:
                    continue
                if iou(group[i]["bbox"], group[j]["bbox"]) > iou_thresh:
                    suppressed[j] = True
        out.extend(keep)
    return out

def deduplicate_boxes(boxes, iou_thresh=0.2):  # Less aggressive
    out = []
    labels = set([b["label"] for b in boxes])
    for label in labels:
        group = [b for b in boxes if b["label"] == label]
        group = sorted(group, key=lambda x: x["confidence"], reverse=True)
        keep = []
        while group:
            ref = group.pop(0)
            keep.append(ref)
            group = [g for g in group if iou(ref["bbox"], g["bbox"]) < iou_thresh]
        out.extend(keep)
    return out

def merge_text_boxes(boxes, merge_iou=0.1):  # Only mildly merge
    text_boxes = [b for b in boxes if b["label"].lower() == "text"]
    other_boxes = [b for b in boxes if b["label"].lower() != "text"]
    if not text_boxes:
        return boxes
    used = [False] * len(text_boxes)
    merged = []
    for i in range(len(text_boxes)):
        if used[i]:
            continue
        bx = text_boxes[i]["bbox"].copy()
        conf = text_boxes[i]["confidence"]
        used[i] = True
        for j in range(i+1, len(text_boxes)):
            if used[j]:
                continue
            if iou(bx, text_boxes[j]["bbox"]) >= merge_iou:
                bx = [
                    min(bx[0], text_boxes[j]["bbox"][0]),
                    min(bx[1], text_boxes[j]["bbox"][1]),
                    max(bx[2], text_boxes[j]["bbox"][2]),
                    max(bx[3], text_boxes[j]["bbox"][3]),
                ]
                conf = max(conf, text_boxes[j]["confidence"])
                used[j] = True
        merged.append({"bbox": bx, "label": "Text", "confidence": conf})
    return other_boxes + merged

def resolve_cross_class_conflicts(boxes, cross_iou_thresh=0.9999, class_priority=None):
    if class_priority is None:
        class_priority = defaultdict(lambda: 0)
        class_priority.update({"Text": 6, "Table": 5, "Picture": 4,
                              "Section-header": 3, "List-item": 2, "Caption": 1})
    boxes_sorted = sorted(boxes, key=lambda x: x["confidence"], reverse=True)
    keep = []
    removed = [False] * len(boxes_sorted)
    for i in range(len(boxes_sorted)):
        if removed[i]:
            continue
        a = boxes_sorted[i]
        keep.append(a)
        for j in range(i+1, len(boxes_sorted)):
            if removed[j]:
                continue
            b = boxes_sorted[j]
            pair_iou = iou(a["bbox"], b["bbox"])
            if pair_iou >= cross_iou_thresh:
                if b["confidence"] > a["confidence"]:
                    keep[-1] = b
                    removed[i] = True
                    removed[j] = True
                elif abs(b["confidence"] - a["confidence"]) < 1e-6:
                    if class_priority.get(b["label"],0) > class_priority.get(a["label"],0):
                        keep[-1] = b
                removed[j] = True
    uniq = []
    seen = set()
    for item in keep:
        key = tuple([round(x, 2) for x in item["bbox"]]) + (item["label"],)
        if key not in seen:
            uniq.append(item)
            seen.add(key)
    return uniq

def visualize_bboxes(image, boxes, title=None, save_path=None):
    img = np.array(image.convert("RGB"))
    label_colors = {}
    palette = [(0,255,0),(0,128,255),(255,0,0),(255,128,0),(128,0,255),(0,200,200)]
    for i,b in enumerate(sorted(list(set([x["label"] for x in boxes])))):
        label_colors[b] = palette[i % len(palette)]
    for b in boxes:
        x1,y1,x2,y2 = map(int, b["bbox"])
        lbl = b["label"]
        conf = b["confidence"]
        color = label_colors.get(lbl, (0,255,0))
        cv2.rectangle(img, (x1,y1),(x2,y2), color, 2)
        txt = f"{lbl} {conf:.2f}"
        cv2.putText(img, txt, (x1, max(0,y1-8)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 2, lineType=cv2.LINE_AA)
    out = Image.fromarray(img)
    if save_path:
        out.save(save_path)
    return out

def process_image_with_yolo(model, image_path,
                            conf=0.5, model_iou=0.45,
                            pre_conf_thresh=0.5,
                            class_nms_iou=0.3,      # Lowered
                            merge_text_iou=0.1,     # Lowered
                            cross_iou_thresh=0.9999,
                            final_conf_thresh=0.5,
                            dedup_iou=0.2,          # Lowered
                            visualize=True,
                            out_before="before.png", out_after="after.png"):

    pil_img = Image.open(image_path).convert("RGB")
    raw_results = perform_prediction(pil_img, model, imgsz=1024, conf=conf, iou=model_iou, device="cpu")

    boxes = extract_bboxes(raw_results)
    counter_before = Counter([b["label"] for b in boxes])
    print("Before:", dict(counter_before), " total:", len(boxes))
    if visualize:
        visualize_bboxes(pil_img, boxes, save_path=out_before)

    # 1) Pre-filter
    boxes = [b for b in boxes if b["confidence"] >= pre_conf_thresh]
    print("Post pre-filter:", Counter([b["label"] for b in boxes]))

    # 2) Class-wise NMS
    boxes = nms_classwise(boxes, iou_thresh=class_nms_iou)
    print("Post NMS:", Counter([b["label"] for b in boxes]))

    # 3) Merge text (only if 'Text' boxes exist)
    if any(b["label"].lower() == "text" for b in boxes):
        boxes = merge_text_boxes(boxes, merge_iou=merge_text_iou)
    print("Post text merge:", Counter([b["label"] for b in boxes]))

    # 4) Cross-class resolution
    boxes = resolve_cross_class_conflicts(boxes, cross_iou_thresh=cross_iou_thresh)
    print("Post cross-class:", Counter([b["label"] for b in boxes]))

    # 5) Extra deduplication per class
    boxes = deduplicate_boxes(boxes, iou_thresh=dedup_iou)
    print("Post deduplication:", Counter([b["label"] for b in boxes]))

    # 6) Final filter
    boxes = [b for b in boxes if b["confidence"] >= final_conf_thresh]
    counter_after = Counter([b["label"] for b in boxes])
    print("After:", dict(counter_after), " total:", len(boxes))
    if visualize:
        visualize_bboxes(pil_img, boxes, save_path=out_after)

    return boxes


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import json
from doclayout_yolo import YOLO

# Load model
model = YOLO("D:\OCR\Pipeline\yolov12l-doclaynet.pt")   # update with your model path

# Run pipeline
boxes = process_image_with_yolo(
    model=model,
    image_path= "D:\OCR session\pic 14.jpg",   # replace with your image
    conf=0.5,
    model_iou=0.45,
    visualize=True
)

# Convert result to JSON
boxes_json = json.dumps(boxes, indent=4)

# Print JSON
print(boxes_json)

# Optionally save to file
with open("output_boxes.json", "w") as f:
    f.write(boxes_json)



0: 672x1024 1 Caption, 2 Page-footers, 2 Pictures, 1 Section-header, 20 Texts, 5759.3ms
Speed: 22.1ms preprocess, 5759.3ms inference, 7.7ms postprocess per image at shape (1, 3, 672, 1024)
Before: {'Text': 20, 'Picture': 2, 'Caption': 1, 'Section-header': 1, 'Page-footer': 2}  total: 26
Post pre-filter: Counter({'Text': 20, 'Picture': 2, 'Page-footer': 2, 'Caption': 1, 'Section-header': 1})
Post NMS: Counter({'Text': 20, 'Picture': 2, 'Page-footer': 2, 'Caption': 1, 'Section-header': 1})
Post text merge: Counter({'Text': 20, 'Picture': 2, 'Page-footer': 2, 'Caption': 1, 'Section-header': 1})
Post cross-class: Counter({'Text': 20, 'Picture': 2, 'Page-footer': 2, 'Caption': 1, 'Section-header': 1})
Post deduplication: Counter({'Text': 20, 'Picture': 2, 'Page-footer': 2, 'Caption': 1, 'Section-header': 1})
After: {'Text': 20, 'Picture': 2, 'Caption': 1, 'Section-header': 1, 'Page-footer': 2}  total: 26
[
    {
        "bbox": [
            245.37193298339844,
            428.941955566406

In [9]:
# Final "after.png" result:
cleaned_img = Image.open("after.png")
cleaned_img.show()

In [10]:
print(model.info())


YOLOv12l summary: 488 layers, 26,397,585 parameters, 0 gradients, 89.5 GFLOPs
(488, 26397585, 0, 89.4527744)


---
## Saving paragraph crops

---

In [15]:
# === Fixed cropping + wrapper that uses your existing pipeline functions ===
import os
from PIL import Image

# Set this to the folder that contains your images (change if necessary)
IMAGES_DIR = r"D:\OCR session"     # <-- update if your images are in a different folder
OUTPUT_DIR_ROOT = r"D:\OCR\Pipeline\Paragraph_crops"  # where crops will be saved

# If you already defined these functions in previous cells, this wrapper will use them:
# perform_prediction, extract_bboxes, nms_classwise, merge_text_boxes,
# resolve_cross_class_conflicts, deduplicate_boxes
# Also make sure `model` (YOLOv10 instance) is already created in your session.

def get_layout_bboxes_yolo(page_image, model,
                           pre_conf_thresh=0.5,
                           class_nms_iou=0.3,
                           merge_text_iou=0.1,
                           cross_iou_thresh=0.9999,
                           dedup_iou=0.2,
                           final_conf_thresh=0.5):
    """
    Returns cleaned layout boxes for a PIL page_image using existing pipeline functions.
    Assumes perform_prediction() and pipeline helper functions are defined in the session.
    """
    # 1) run model (perform_prediction accepts PIL image)
    raw_results = perform_prediction(page_image, model, imgsz=1024, conf=0.5, iou=0.45, device="cpu")

    # 2) extract boxes
    boxes = extract_bboxes(raw_results)

    # 3) pre-filter by confidence
    boxes = [b for b in boxes if b["confidence"] >= pre_conf_thresh]

    # 4) class-wise NMS
    boxes = nms_classwise(boxes, iou_thresh=class_nms_iou)

    # 5) merge text fragments (if any)
    if any(b["label"].lower() == "text" for b in boxes):
        boxes = merge_text_boxes(boxes, merge_iou=merge_text_iou)

    # 6) resolve perfect cross-class conflicts
    boxes = resolve_cross_class_conflicts(boxes, cross_iou_thresh=cross_iou_thresh)

    # 7) extra deduplication per class
    boxes = deduplicate_boxes(boxes, iou_thresh=dedup_iou)

    # 8) final confidence filter
    boxes = [b for b in boxes if b["confidence"] >= final_conf_thresh]

    # Return list of dicts: {"bbox":[x1,y1,x2,y2], "label":..., "confidence":...}
    return boxes


def crop_text_areas(page_image):
    """
    Takes a PIL page_image and returns list of PIL image crops for text areas.
    Uses get_layout_bboxes_yolo(...) to get cleaned layout boxes.
    """
    # call wrapper with the global `model` (must exist)
    bboxes = get_layout_bboxes_yolo(page_image, model)

    text_elements_list = []
    for item in bboxes:
        # correct unpacking order: x1, y1, x2, y2
        x1, y1, x2, y2 = map(int, item["bbox"])

        # skip figures/tables (you asked to crop text areas only)
        if item["label"].lower() not in ["figure", "table"]:
            # crop expects (left, upper, right, lower) => (x1,y1,x2,y2)
            crop = page_image.crop((x1, y1, x2, y2))
            # store crop and associated metadata (label, confidence)
            text_elements_list.append({
                "crop": crop,
                "label": item["label"],
                "confidence": item["confidence"],
                "bbox": [x1, y1, x2, y2]
            })
    return text_elements_list


def save_crops_for_page(page_image_filename):
    """
    Input: filename (just the file name, not full path) inside IMAGES_DIR.
    Saves crops to OUTPUT_DIR_ROOT/<filename_without_ext>/crop_i.jpg
    """
    print(f">>> Processing image: {page_image_filename}")
    image_path = os.path.join(IMAGES_DIR, page_image_filename)
    page_image = Image.open(image_path).convert("RGB")

    text_elements_crops = crop_text_areas(page_image)
    print(f"Total crops: {len(text_elements_crops)}")

    output_dir = os.path.join(OUTPUT_DIR_ROOT, os.path.splitext(page_image_filename)[0])
    os.makedirs(output_dir, exist_ok=True)

    for i, item in enumerate(text_elements_crops):
        print(f"Saving image crop {i}... label={item['label']} conf={item['confidence']:.3f}")
        crop = item["crop"]
        crop.save(os.path.join(output_dir, f"crop_{i}.jpg"))


# === Main loop: process all images in IMAGES_DIR ===
if __name__ == "__main__":
    # filter for common image extensions
    total_filenames = [f for f in os.listdir(IMAGES_DIR)
                       if f.lower().endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp"))]
    print(f"Total page images: {len(total_filenames)}")

    for filename in total_filenames:
        save_crops_for_page(filename)


Total page images: 25
>>> Processing image: pic 1.png

0: 288x1024 1 Table, 1470.4ms
Speed: 7.6ms preprocess, 1470.4ms inference, 0.0ms postprocess per image at shape (1, 3, 288, 1024)
Total crops: 0
>>> Processing image: pic 10.jpg

0: 1024x672 2 Pictures, 1 Section-header, 5 Texts, 3410.7ms
Speed: 14.7ms preprocess, 3410.7ms inference, 2.0ms postprocess per image at shape (1, 3, 1024, 672)
Total crops: 8
Saving image crop 0... label=Picture conf=0.897
Saving image crop 1... label=Picture conf=0.894
Saving image crop 2... label=Text conf=0.949
Saving image crop 3... label=Text conf=0.920
Saving image crop 4... label=Text conf=0.889
Saving image crop 5... label=Text conf=0.793
Saving image crop 6... label=Text conf=0.738
Saving image crop 7... label=Section-header conf=0.805
>>> Processing image: pic 11.png

0: 352x1024 2 Tables, 1342.1ms
Speed: 0.0ms preprocess, 1342.1ms inference, 0.0ms postprocess per image at shape (1, 3, 352, 1024)
Total crops: 0
>>> Processing image: pic 12.jpg

