# Preprocessing Pipeline: YOLOv8 Panels → CRAFT Text Regions → PaddleOCR

This notebook processes comic page images by:
- Detecting panels with YOLOv8 (bounding boxes)
- Optionally detecting text regions in each panel with CRAFT (disabled by default; see `USE_CRAFT`)
- Extracting text per page (aggregating all bubbles) with PaddleOCR

- Input pages: `C:\Users\uanus\Box\AML Comic Project\2`
- Output panels: `C:\Users\uanus\Box\AML Comic Project\2_images`
- Output text: `C:\Users\uanus\Box\AML Comic Project\2_text`

Notes:
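- To train a custom panel detector, you can adapt the YOLO training steps from this beginner guide: `https://medium.com/@nandinilreddy/implementing-yolov8-in-detail-for-beginners-9a5d3b0fe30a`. This notebook only runs inference with pretrained weights, but `PanelDetector` accepts a `weights` argument so custom weights can be plugged in.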


In [1]:
# Global toggles
# USE_CRAFT is read by the dependency-install cell (to decide whether to
# pip-install craft-text-detector) and by the imports cell (to decide whether
# to import it), so run this cell first.
USE_CRAFT = False  # Set to True to enable CRAFT; otherwise PaddleOCR handles detection+recognition

In [2]:
# Unified dependency install (run once). Restart kernel if imports fail.
import sys
import subprocess
import platform


def run(cmd: list):
    """Echo *cmd* shell-style, then execute it.

    Raises ``subprocess.CalledProcessError`` when the command exits non-zero.
    """
    command_line = " ".join(cmd)
    print("$", command_line)
    subprocess.check_call(cmd)


def has(module_name: str) -> bool:
    """Return True when *module_name* can be imported in this interpreter.

    The module is actually imported (not just located), so a package that is
    installed but fails while importing is reported as missing.
    """
    try:
        __import__(module_name)
    except Exception:
        return False
    return True

# Upgrade packaging tools
run([sys.executable, '-m', 'pip', 'install', '-U', 'pip', 'setuptools', 'wheel'])

# Core deps helpful for PaddleOCR and CV.
# Installation is best-effort (one failing package must not abort the others),
# but a failure is reported so that a later ImportError can be traced back here.
for pkg in [
    'numpy<2', 'opencv-python<5', 'shapely', 'scikit-image', 'pyclipper', 'rapidfuzz', 'tqdm'
]:
    try:
        run([sys.executable, '-m', 'pip', 'install', pkg])
    except subprocess.CalledProcessError as exc:
        print(f"WARNING: could not install {pkg!r} (pip exit code {exc.returncode}); continuing.")

# PaddlePaddle CPU (Windows) + PaddleOCR
# NOTE(review): paddlepaddle is only installed automatically on Windows; on other
# platforms it presumably has to be installed manually before PaddleOCR works - confirm.
if platform.system() == 'Windows' and not has('paddle'):
    run([sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.6.2'])
if not has('paddleocr'):
    # prefer a stable paddleocr version; fall back to an older pin if that fails
    try:
        run([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', 'paddleocr==2.8.1'])
    except subprocess.CalledProcessError:
        run([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', 'paddleocr==2.7.3'])

# Ultralytics for YOLOv8
if not has('ultralytics'):
    run([sys.executable, '-m', 'pip', 'install', 'ultralytics'])

# Optional: CRAFT only if enabled (PyPI release first, then the GitHub source)
if USE_CRAFT and not has('craft_text_detector'):
    try:
        run([sys.executable, '-m', 'pip', 'install', 'craft-text-detector==0.4.6'])
    except subprocess.CalledProcessError:
        try:
            run([sys.executable, '-m', 'pip', 'install', 'git+https://github.com/faustomorales/craft-text-detector.git'])
        except subprocess.CalledProcessError:
            print('CRAFT install skipped; set USE_CRAFT=False or install manually from clovaai/CRAFT-pytorch.')

print('Dependency check complete. Restart kernel if imports still fail.')


$ c:\Users\uanus\anaconda3\python.exe -m pip install -U pip setuptools wheel
$ c:\Users\uanus\anaconda3\python.exe -m pip install numpy<2
$ c:\Users\uanus\anaconda3\python.exe -m pip install opencv-python<5
$ c:\Users\uanus\anaconda3\python.exe -m pip install shapely
$ c:\Users\uanus\anaconda3\python.exe -m pip install scikit-image
$ c:\Users\uanus\anaconda3\python.exe -m pip install pyclipper
$ c:\Users\uanus\anaconda3\python.exe -m pip install rapidfuzz
$ c:\Users\uanus\anaconda3\python.exe -m pip install tqdm
$ c:\Users\uanus\anaconda3\python.exe -m pip install ultralytics
Dependency check complete. Restart kernel if imports still fail.


In [3]:
# Imports and configuration
from pathlib import Path
from typing import List, Tuple, Optional, Dict
import os
import json
import math
import cv2
import numpy as np

from ultralytics import YOLO

# Conditional imports guarded by availability and flags
# CRAFT is imported only when USE_CRAFT is set; any failure (including
# USE_CRAFT not being defined yet) leaves CRAFT_AVAILABLE False.
CRAFT_AVAILABLE = False
try:
    if USE_CRAFT:
        from craft_text_detector import (Craft, get_prediction)
        CRAFT_AVAILABLE = True
except Exception:
    CRAFT_AVAILABLE = False

# PaddleOCR is always attempted; PADDLE_AVAILABLE records whether the import worked.
PADDLE_AVAILABLE = False
try:
    from paddleocr import PaddleOCR
    PADDLE_AVAILABLE = True
except Exception:
    PADDLE_AVAILABLE = False

# Folder configuration (adjust as needed)
# INPUT_DIR holds the page images; panel crops go to PANELS_DIR and the
# per-page text files plus the summary JSON go to TEXT_DIR.
INPUT_DIR = Path(r"C:\Users\uanus\Box\AML Comic Project\2")
PANELS_DIR = Path(r"C:\Users\uanus\Box\AML Comic Project\2_images")
TEXT_DIR = Path(r"C:\Users\uanus\Box\AML Comic Project\2_text")

# Create output directories
PANELS_DIR.mkdir(parents=True, exist_ok=True)
TEXT_DIR.mkdir(parents=True, exist_ok=True)

# Inference device config
# NOTE(review): this reports the CUDA support of the installed OpenCV build, not
# of PyTorch or Paddle; stock `opencv-python` wheels are presumably CPU-only, so
# this likely always yields 'cpu' - confirm whether a torch/paddle check is wanted.
DEVICE = 'cuda' if cv2.cuda.getCudaEnabledDeviceCount() > 0 else 'cpu'

# Echo the effective configuration so a mis-set path or flag is visible up front.
print(f"Input: {INPUT_DIR}")
print(f"Panels out: {PANELS_DIR}")
print(f"Text out: {TEXT_DIR}")
print(f"Device: {DEVICE}")
print(f"USE_CRAFT: {USE_CRAFT}, CRAFT_AVAILABLE: {CRAFT_AVAILABLE}, PADDLE_AVAILABLE: {PADDLE_AVAILABLE}")


OSError: [WinError 127] The specified procedure could not be found. Error loading "c:\Users\uanus\anaconda3\Lib\site-packages\torch\lib\shm.dll" or one of its dependencies.

In [None]:
# Utility functions

def ensure_rgb(img: np.ndarray) -> np.ndarray:
    """Normalise an image to three channels.

    Grayscale (2-D) input is expanded to three identical channels and
    4-channel input has its alpha channel dropped; anything else is returned
    unchanged. Despite the name, the channel order of colour input is not
    swapped: the 4-channel branch converts BGRA to BGR.

    Raises:
        ValueError: if *img* is None.
    """
    if img is None:
        raise ValueError("Empty image")
    if img.ndim == 2:
        return cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    has_alpha = img.shape[2] == 4
    return cv2.cvtColor(img, cv2.COLOR_BGRA2BGR) if has_alpha else img

def sort_panels_reading_order(boxes: List[Tuple[int,int,int,int]]) -> List[Tuple[int,int,int,int]]:
    """Sort panels roughly left-to-right, top-to-bottom.

    Panels are grouped into rows by their top edge and each row is ordered by
    its left edge. A row is anchored at the top edge of its first (highest)
    panel; every panel whose top lies less than ``row_tol`` pixels below that
    anchor joins the row. Anchoring on an actual panel avoids the artefact of
    fixed ``y // row_tol`` buckets, where two panels a couple of pixels apart
    could be split into different rows by a bucket boundary.

    Args:
        boxes: list of (x1, y1, x2, y2)

    Returns:
        A new list with the same boxes in reading order (the input itself is
        returned when it is empty).
    """
    if not boxes:
        return boxes
    # Row tolerance: 3% of the mean panel height, but never below 10 px.
    row_tol = max(10, int(0.03 * np.mean([b[3] - b[1] for b in boxes])))

    rows = []
    for box in sorted(boxes, key=lambda b: (b[1], b[0])):
        # rows[-1][0] is the anchor (top-most panel) of the current row.
        if rows and box[1] - rows[-1][0][1] < row_tol:
            rows[-1].append(box)
        else:
            rows.append([box])

    ordered = []
    for row in rows:
        ordered.extend(sorted(row, key=lambda b: b[0]))
    return ordered

def clip_box_to_image(box, w, h):
    """Clamp an (x1, y1, x2, y2) box to a ``w`` x ``h`` image.

    The top-left corner is raised to at least 0 and the bottom-right corner
    is lowered to at most (w-1, h-1). Returns the clamped 4-tuple.
    """
    left, top, right, bottom = box
    left = max(0, left)
    top = max(0, top)
    right = min(w - 1, right)
    bottom = min(h - 1, bottom)
    return left, top, right, bottom


def save_panel_crop(page_path: Path, panel_idx: int, crop: np.ndarray) -> Path:
    """Write one panel crop into PANELS_DIR and return the file path.

    The file is named ``<page stem>_panel_<two-digit index>.jpg``. The return
    value of ``cv2.imwrite`` is not inspected.
    """
    stem = page_path.stem
    destination = PANELS_DIR / f"{stem}_panel_{panel_idx:02d}.jpg"
    cv2.imwrite(str(destination), crop)
    return destination


def aggregate_text_lines(results) -> str:
    """Join recognised text lines from PaddleOCR detections into one string.

    results format: list of [box, (text, conf)] entries. An entry contributes
    its text only when the text is non-empty and the confidence is present
    and at least 0.2. Entries that do not have this shape are skipped.
    Returns the kept lines joined by newlines, stripped; "" for empty input.
    """
    kept = []
    for entry in results or ():
        try:
            text, conf = entry[1][0], entry[1][1]
            accepted = bool(text) and conf is not None and conf >= 0.2
        except Exception:
            # Malformed detection: ignore it and carry on with the rest.
            continue
        if accepted:
            kept.append(text)
    return "\n".join(kept).strip()


In [None]:
# YOLOv8: Panel detection wrapper
class PanelDetector:
    """Thin wrapper around an Ultralytics YOLO model that returns panel boxes.

    Args:
        weights: path/name of the YOLO weights; 'yolov8n.pt' when omitted.
        conf: confidence threshold passed to the model.
        iou: IoU threshold passed to the model.
    """

    def __init__(self, weights: Optional[str] = None, conf: float = 0.25, iou: float = 0.5):
        # Use a generic pretrained model (e.g., yolov8n) as placeholder
        # Replace with your fine-tuned panel-detector weights when ready
        self.model = YOLO(weights or 'yolov8n.pt')
        self.conf = conf
        self.iou = iou

    def predict(self, image_bgr: np.ndarray) -> List[Tuple[int,int,int,int]]:
        """Detect panels in one page image.

        Returns a list of (x1, y1, x2, y2) boxes made of plain Python ints,
        clipped to the image, with boxes covering less than 1% of the page
        dropped, sorted by ``sort_panels_reading_order``.
        """
        h, w = image_bgr.shape[:2]
        min_area = 0.01 * w * h
        results = self.model.predict(source=image_bgr, conf=self.conf, iou=self.iou, verbose=False, device=0 if DEVICE=='cuda' else None)
        boxes = []
        for r in results:
            if r.boxes is None:
                continue
            for b in r.boxes.xyxy.cpu().numpy():
                # Convert to built-in ints (truncating, like astype(int)):
                # NumPy integer scalars would later make json.dump raise
                # "Object of type int64 is not JSON serializable".
                x1, y1, x2, y2 = (int(v) for v in b[:4])
                x1, y1, x2, y2 = clip_box_to_image((x1,y1,x2,y2), w, h)
                # Optional: filter tiny boxes
                if (x2-x1)*(y2-y1) < min_area:
                    continue
                boxes.append((x1,y1,x2,y2))
        return sort_panels_reading_order(boxes)


In [None]:
# CRAFT: Text region detection (optional). If CRAFT import fails, this becomes a no-op.
class TextRegionDetector:
    """Optional CRAFT text-region detector.

    When ``craft_text_detector`` cannot be imported or its model cannot be
    created, the instance is marked unavailable and ``predict`` returns an
    empty list instead of raising.

    Args:
        cuda: forwarded to ``Craft``; defaults to whether DEVICE is 'cuda'.
    """

    def __init__(self, cuda: bool = (DEVICE=='cuda')):
        try:
            from craft_text_detector import Craft, get_prediction
            self._Craft = Craft
            self._get_prediction = get_prediction
            self.craft = self._Craft(output_dir=None, cuda=cuda)
            self.available = True
        except Exception as e:
            print(f"CRAFT not available ({e}). Proceeding without text-region detection.")
            self.craft = None
            self.available = False

    def predict(self, image_bgr: np.ndarray) -> List[np.ndarray]:
        """Return list of polygons (np.ndarray Nx2) for detected text regions.
        If CRAFT isn't available, return an empty list to allow downstream OCR on full panels.
        """
        if not self.available:
            return []
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        prediction_result = self._get_prediction(
            image=image_rgb,
            craft_net=self.craft.craft_net,
            text_threshold=0.7,
            link_threshold=0.4,
            low_text=0.4,
            cuda=self.craft.cuda,
        )
        polys = prediction_result.get('boxes')
        # 'boxes' may be a NumPy array, whose truth value is ambiguous
        # (`polys or []` raises ValueError for more than one element), so test
        # for emptiness explicitly and hand back a real list.
        if polys is None or len(polys) == 0:
            return []
        return list(polys)

    def release(self):
        """Unload the CRAFT models; a no-op when CRAFT is unavailable."""
        if self.available and self.craft is not None:
            self.craft.unload_craftnet_model()
            self.craft.unload_refinenet_model()

In [None]:
# PaddleOCR: OCR wrapper
class OCRExtractor:
    """PaddleOCR wrapper that turns an image into a single block of text.

    Args:
        lang: PaddleOCR language code.
        use_angle_cls: whether to load and apply the text-angle classifier.
    """

    def __init__(self, lang: str = 'en', use_angle_cls: bool = True):
        # Remembered so extract_text only requests angle classification when
        # the classifier was actually enabled at construction time.
        self.use_angle_cls = use_angle_cls
        # Use CPU by default on Windows unless CUDA Paddle is set up
        self.ocr = PaddleOCR(use_angle_cls=use_angle_cls, lang=lang, use_gpu=(DEVICE=='cuda'))

    def extract_text(self, image_bgr: np.ndarray) -> str:
        """Run OCR on *image_bgr* and return the recognised lines joined by newlines.

        Returns "" when nothing is detected. Filtering by confidence is
        delegated to ``aggregate_text_lines``.
        """
        # NOTE(review): the image is converted to RGB before OCR; confirm this is
        # the channel order PaddleOCR expects for ndarray input.
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        result = self.ocr.ocr(image_rgb, cls=self.use_angle_cls)
        # result is list per image; when single image provided, it's [[det]]
        if not result:
            return ""
        # Flatten and aggregate (a per-image entry may be None/empty when no text is found)
        flat = [det for det_list in result if det_list for det in det_list]
        return aggregate_text_lines(flat)


In [None]:
# End-to-end processing pipeline

def process_page_image(page_path: Path, panel_detector: PanelDetector, text_detector: TextRegionDetector, ocr: OCRExtractor) -> Dict:
    """Process one page: detect panels, save crops, run OCR per panel,
    aggregate text over the page, and write .txt.

    Args:
        page_path: image file of the page.
        panel_detector: object whose ``predict(img)`` yields (x1, y1, x2, y2) boxes.
        text_detector: currently unused (the CRAFT call is disabled); kept so
            the signature stays stable for callers.
        ocr: object whose ``extract_text(crop)`` returns the text of a panel.

    Returns:
        A dict with metadata and saved file paths, or
        ``{"page": ..., "error": ...}`` when the image cannot be read.
        All numbers in the dict are plain Python ints so it can be dumped to JSON.
    """
    img = cv2.imread(str(page_path))
    if img is None:
        return {"page": str(page_path), "error": "Failed to read image"}

    img = ensure_rgb(img)
    h, w = img.shape[:2]

    # 1) Panel detection
    panel_boxes = panel_detector.predict(img)

    # 2) Save panel crops and run OCR per panel
    page_text_lines = []
    saved_panels = []

    for idx, (x1, y1, x2, y2) in enumerate(panel_boxes):
        crop = img[y1:y2, x1:x2]
        if crop.size == 0:
            # Degenerate box: an empty crop cannot be written or OCR'd.
            # idx is still consumed, so file names keep matching box indices.
            continue
        out_path = save_panel_crop(page_path, idx, crop)
        saved_panels.append(str(out_path))

        # Text regions (optional future use: crop to text regions before OCR)
        # For simplicity, pass whole panel to OCR, which is robust enough; CRAFT kept for extensibility
        # polygons = text_detector.predict(crop)
        text = ocr.extract_text(crop)
        if text:
            page_text_lines.append(text)

    # 3) Aggregate all panel texts into a single page-level .txt
    page_text = "\n\n".join([t for t in page_text_lines if t.strip()])
    txt_out = TEXT_DIR / f"{page_path.stem}.txt"
    with open(txt_out, 'w', encoding='utf-8') as f:
        f.write(page_text)

    # Detector output may contain NumPy integer scalars, which json.dump
    # rejects; normalise to built-in ints while keeping the tuple shape.
    json_safe_boxes = [tuple(int(v) for v in box) for box in panel_boxes]

    return {
        "page": str(page_path),
        "size": [int(w), int(h)],
        "num_panels": len(panel_boxes),
        "panel_boxes": json_safe_boxes,
        "panel_images": saved_panels,
        "text_file": str(txt_out),
    }


In [None]:
# Run the pipeline on the specified input folder
from tqdm import tqdm


def _json_default(obj):
    """json.dump fallback: convert NumPy scalars/arrays to built-in types."""
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Gather page images (jpg/jpeg/png/bmp), ignoring sub-directories
image_exts = {'.jpg', '.jpeg', '.png', '.bmp'}
page_paths = sorted([p for p in INPUT_DIR.iterdir() if p.is_file() and p.suffix.lower() in image_exts])
print(f"Found {len(page_paths)} pages in {INPUT_DIR}")

panel_detector = PanelDetector(weights=None, conf=0.25, iou=0.5)
text_detector = TextRegionDetector(cuda=(DEVICE=='cuda'))
ocr = OCRExtractor(lang='en', use_angle_cls=True)

results = []
try:
    for page in tqdm(page_paths):
        res = process_page_image(page, panel_detector, text_detector, ocr)
        results.append(res)
finally:
    # Free the CRAFT models (no-op when CRAFT is unavailable), even if a page fails.
    text_detector.release()

# Save a summary JSON. Panel boxes may carry NumPy integers, which the default
# encoder rejects; without the fallback the dump would fail only after every
# page had already been processed.
summary_path = TEXT_DIR / 'processing_summary.json'
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, default=_json_default)

print(f"Saved summary to {summary_path}")
print("Done.")
