## MonReader - part 2

----

#### Goal:
In this part of the project we will experiment with different OCR algorithms to read pages from two books: 'The Chamber' by John Grisham (in English) and 'A onda que se ergueu no mar' by Ruy Castro (in Portuguese).


----


#### Imports and Environment

In [13]:
from pathlib import Path
import cv2
import numpy as np
from PIL import Image
import csv
import re
from tqdm import tqdm
import pandas as pd

In [2]:
# Project layout, rooted at the current working directory:
# book data under data/books, Step 1 outputs under work/A_ingest.
BASE = Path.cwd()
DATA_DIR = BASE.joinpath("data")
BOOK_DIR = DATA_DIR.joinpath("books")
WORK_DIR = BASE.joinpath("work", "A_ingest")

# One folder per book.
ENG_BOOK_DIR = BOOK_DIR.joinpath("The_Chamber-John_Grisham")
POR_BOOK_DIR = BOOK_DIR.joinpath("A_onda_que_se_ergueu_no_mar-Ruy_Castro")

# Page images of each book are read from its "images" subfolder.
ENG_IMG_DIR = ENG_BOOK_DIR.joinpath("images")
POR_IMG_DIR = POR_BOOK_DIR.joinpath("images")

# Create every directory up front; existing directories are left untouched.
for p in (
    BOOK_DIR,
    WORK_DIR,
    ENG_BOOK_DIR,
    POR_BOOK_DIR,
    ENG_IMG_DIR,
    POR_IMG_DIR,
):
    p.mkdir(parents=True, exist_ok=True)


### Step 1 - Ingestion & Page conditioning

In [8]:
# Helper functions
def natural_key(p: Path):
    """
    Build a sort key that orders filenames the way a person would.

    The file stem is split into alternating text and digit runs; digit runs
    become integers and text runs are lower-cased, so 'page2' sorts before
    'page10' and letter case is ignored.
    """
    key = []
    for token in re.split(r'(\d+)', p.stem):
        if token.isdigit():
            key.append(int(token))
        else:
            key.append(token.lower())
    return key

def collect_pages(img_dir: Path) -> list[Path]:
    """
    Collect page images (PNG/JPG/JPEG/TIF/TIFF/BMP/WEBP) from a directory.

    Args:
        img_dir: Directory to scan (non-recursive).

    Returns:
        A naturally sorted list of image file paths. An empty list is
        returned when the directory does not exist.

    The extension check is case-insensitive, so files such as 'IMG_01.JPG'
    are found on case-sensitive filesystems too, and only regular files are
    returned. Files whose stems give the same natural key (e.g. 'page1.jpg'
    and 'page1.png') are ordered by full filename so the result is stable
    from run to run.
    """
    allowed_suffixes = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}

    # iterdir() raises on a missing directory; keep the "no pages" result instead.
    if not img_dir.is_dir():
        return []

    pages = {
        entry
        for entry in img_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in allowed_suffixes
    }
    # The filename is a tie-breaker: set iteration order is not stable across runs.
    return sorted(pages, key=lambda page: (natural_key(page), page.name))

def _clip_small_angle(a_deg: float, limit: float = 15.0) -> float:
    """
    Fold a line angle into [-90, 90) degrees and clamp it to [-limit, +limit].

    Line orientations repeat every 180 degrees, so the angle is first reduced
    modulo 180 into [-90, 90). Anything still outside the allowed band is
    clamped to the nearest bound, keeping its sign (e.g. -20 -> -limit).

    Args:
        a_deg: Angle in degrees (any range).
        limit: Half-width of the allowed band in degrees.

    Returns:
        The folded, clamped angle as a Python float.
    """
    folded = ((a_deg + 90.0) % 180.0) - 90.0  # map to [-90, 90)
    # No further +/-180 shifts: an angle already in [-90, 90) has no
    # equivalent closer to zero, and shifting would flip the clamp's sign.
    return float(np.clip(folded, -limit, limit))

def estimate_skew_angle_hough(gray: np.ndarray, limit: float = 15.0) -> float:
    """
    Estimate page skew from Hough lines fitted to text baselines.

    Args:
        gray: Single-channel page image; non-uint8 input is min-max
            normalized to uint8 first.
        limit: Maximum plausible skew in degrees. Detected lines whose
            baseline angle lies outside [-limit, +limit] are ignored.

    Returns:
        The median baseline angle in degrees, computed as (theta - 90) where
        theta is the Hough normal angle. Falls back to
        estimate_skew_angle_projection(gray) when no usable line is found.

    NOTE(review): with OpenCV's image coordinates (y pointing down) a
    positive value should correspond to text descending toward the right,
    i.e. the counter-clockwise rotation to pass to cv2.getRotationMatrix2D
    to level the page -- confirm on a page with known skew.
    """
    if gray.dtype != np.uint8:
        gray = cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    # Ignore a small border (5%) to avoid page frames dominating
    h, w = gray.shape[:2]
    bx = int(w * 0.05)
    by = int(h * 0.05)
    roi = gray[by:h - by, bx:w - bx]

    # Binarize + invert (text white)
    _, bw = cv2.threshold(roi, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    inv = 255 - bw

    # Light horizontal closing to connect text lines
    k = max(1, h // 200)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k * 3 + 1, 1))
    closed = cv2.morphologyEx(inv, cv2.MORPH_CLOSE, kernel, iterations=1)

    # Edges + Hough
    edges = cv2.Canny(closed, 50, 150, apertureSize=3, L2gradient=True)
    lines = cv2.HoughLines(
        edges,
        rho=1,
        theta=np.pi / 180,
        threshold=max(80, int(min(h, w) * 0.1)),
    )

    if lines is None:
        return estimate_skew_angle_projection(gray)  # fallback

    # HoughLines gives theta as the normal angle; baseline angle = theta - 90.
    # Only the first 200 lines are considered, to limit the influence of noise.
    theta_deg = np.degrees(lines[:200, 0, 1])
    baseline_deg = (theta_deg % 180.0) - 90.0  # fold into [-90, 90)

    # Discard lines that cannot be text baselines (page edges, gutter, rules).
    # Clamping them to the limit instead would drag the median toward +/-limit.
    candidates = baseline_deg[np.abs(baseline_deg) <= limit]
    if candidates.size == 0:
        return estimate_skew_angle_projection(gray)

    # Robust aggregate
    return float(np.median(candidates))

def estimate_skew_angle_projection(gray: np.ndarray) -> float:
    """
    Fallback skew estimator based on a brute-force small-angle sweep.

    The page is binarized (text white), rotated by each candidate angle from
    -10 to +10 degrees in 0.5 degree steps, and scored by the variance of its
    row sums. The candidate with the highest score is returned, i.e. the
    rotation angle (as passed to cv2.getRotationMatrix2D) that scored best.
    Images whose longest side exceeds 1000 px are downscaled first for speed.
    """
    if gray.dtype != np.uint8:
        gray = cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    # Shrink large pages so the longest side is 1000 px; never upscale.
    scale = 1000 / max(gray.shape[:2])
    small = gray
    if scale < 1.0:
        small = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)

    _, binary = cv2.threshold(small, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    inv = 255 - binary

    # Size and rotation center are the same for every candidate angle.
    rows, cols = inv.shape[:2]
    center = (cols / 2, rows / 2)

    best_angle, best_score = 0.0, -1.0
    for candidate in np.linspace(-10.0, 10.0, 41):  # step 0.5°
        rotation = cv2.getRotationMatrix2D(center, candidate, 1.0)
        rotated = cv2.warpAffine(
            inv,
            rotation,
            (cols, rows),
            flags=cv2.INTER_LINEAR,
            borderMode=cv2.BORDER_REPLICATE,
        )
        # Row sums: well-aligned text gives sharp peaks, hence high variance.
        score = rotated.sum(axis=1).astype(np.float32).var()
        if score > best_score:
            best_angle, best_score = candidate, score
    return float(best_angle)


In [9]:
def rotate_image(img: np.ndarray, angle_deg: float) -> np.ndarray:
    """
    Rotate an image about its center, keeping the original canvas size.

    angle_deg is forwarded unchanged to cv2.getRotationMatrix2D (scale 1.0).
    Pixels that fall outside the source are filled by replicating the border.
    """
    height, width = img.shape[:2]
    center = (width / 2, height / 2)
    matrix = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    return cv2.warpAffine(
        img,
        matrix,
        (width, height),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )

def detect_main_text_bbox(gray: np.ndarray) -> tuple[int,int,int,int] | None:
    """
    Find a coarse bounding box for the main text region of a grayscale page.

    The page is Otsu-binarized and inverted (dark pixels become white),
    lightly closed with a square kernel scaled to the image height, and the
    bounding rectangle of the largest external contour is returned.

    Returns (x, y, w, h), or None when no contour is found.

    NOTE(review): the largest contour may be a dark page border or shadow
    rather than the text block, in which case the box covers most of the
    image -- verify on sample pages.
    """
    if gray.dtype != np.uint8:
        gray = cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    ink = 255 - binary

    # Kernel side grows with page height (odd, at least 3 px).
    radius = max(1, gray.shape[0] // 300)
    side = radius * 2 + 1
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (side, side))
    merged = cv2.morphologyEx(ink, cv2.MORPH_CLOSE, kernel, iterations=1)

    contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return None

    largest = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest)
    return (int(x), int(y), int(w), int(h))

def save_overlay(img_bgr: np.ndarray, bbox: tuple[int,int,int,int] | None) -> np.ndarray:
    """Draw a green rectangle over the main text region (if available)."""
    vis = img_bgr.copy()
    if bbox is not None:
        x, y, w, h = bbox
        cv2.rectangle(vis, (x, y), (x+w, y+h), (0, 255, 0), 2)
    return vis


In [21]:
# Main runner 
def process_image_folder(lang: str, img_dir: Path, work_dir: Path):
    """
    Run Phase A1 (ingestion & page conditioning) for a language, images only.

    Args:
        lang: Language label; used as the output subfolder name and logged.
        img_dir: Directory containing the page images.
        work_dir: Root output directory; results go to work_dir/<lang>/<page stem>/.

    Saves, per page:
      - page_original.png
      - page_deskewed.png
      - overlay_text_region.png
    Appends per-page metadata to <work_dir>/ingest_log.csv. The log is opened
    in append mode, so re-running the function adds a new set of rows.
    """
    out_lang_dir = work_dir / lang
    out_lang_dir.mkdir(parents=True, exist_ok=True)

    pages = collect_pages(img_dir)
    log_rows = []

    for page_path in tqdm(pages, desc=f"[{lang}] ingest (images)"):
        # Context manager releases the file handle as soon as pixels are copied.
        with Image.open(page_path) as pil:
            rgb = np.array(pil.convert("RGB"))
        bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

        # Estimate skew & deskew.
        # The estimate is applied as-is, NOT negated: the projection fallback
        # returns the cv2 rotation angle that best levels the text, and the
        # Hough estimate (theta - 90) follows the same sign convention, so
        # negating it would rotate the page further away from level.
        angle = estimate_skew_angle_hough(gray)
        deskewed_bgr = rotate_image(bgr, angle)

        # Text bbox on deskewed image
        gray_deskew = cv2.cvtColor(deskewed_bgr, cv2.COLOR_BGR2GRAY)
        bbox = detect_main_text_bbox(gray_deskew)
        overlay = save_overlay(deskewed_bgr, bbox)

        # Per-page output dir
        page_out = out_lang_dir / page_path.stem
        page_out.mkdir(parents=True, exist_ok=True)

        # Save artifacts
        cv2.imwrite(str(page_out / "page_original.png"), bgr)
        cv2.imwrite(str(page_out / "page_deskewed.png"), deskewed_bgr)
        cv2.imwrite(str(page_out / "overlay_text_region.png"), overlay)

        H, W = gray.shape[:2]

        # Missing bbox is logged as empty cells.
        bx, by, bw, bh = bbox if bbox is not None else (None, None, None, None)

        log_rows.append({
            "language": lang,
            "page_path": str(page_path),
            "out_dir": str(page_out),
            "width": W,
            "height": H,
            "skew_angle_deg": float(angle),
            "bbox_x": bx, "bbox_y": by, "bbox_w": bw, "bbox_h": bh
        })

    # Append to CSV log
    log_csv = work_dir / "ingest_log.csv"
    headers = ["language","page_path","out_dir","width","height","skew_angle_deg","bbox_x","bbox_y","bbox_w","bbox_h"]
    # Write the header for a new file, and also for an existing but empty one.
    write_header = (not log_csv.exists()) or log_csv.stat().st_size == 0
    with open(log_csv, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        if write_header:
            writer.writeheader()
        writer.writerows(log_rows)

    print(f"Done: {lang}. Pages: {len(pages)}. Log -> {log_csv}")



In [11]:
# Run Step 1
# Process the English book first, then the Portuguese one, into the same work dir.
for lang_name, source_dir in (
    ("english", ENG_IMG_DIR),
    ("portuguese", POR_IMG_DIR),
):
    process_image_folder(lang_name, source_dir, WORK_DIR)

print("Step1 (images-only) complete.")

[english] ingest (images): 100%|██████████| 12/12 [00:05<00:00,  2.16it/s]


Done: english. Pages: 12. Log -> e:\Devs\pyEnv-1\Apziva\MonReader\work\A_ingest\ingest_log.csv


[portuguese] ingest (images): 100%|██████████| 16/16 [00:07<00:00,  2.12it/s]

Done: portuguese. Pages: 16. Log -> e:\Devs\pyEnv-1\Apziva\MonReader\work\A_ingest\ingest_log.csv
Step1 (images-only) complete.





In [15]:
# Load the Step 1 ingestion log into a DataFrame for inspection.
log_csv = WORK_DIR.joinpath("ingest_log.csv")
df = pd.read_csv(filepath_or_buffer=log_csv)

In [23]:
# Leave out the long path columns and display the last 28 logged rows.
columns_to_exclude = ['page_path', 'out_dir']
df.drop(labels=columns_to_exclude, axis="columns").tail(28)

Unnamed: 0,language,width,height,skew_angle_deg,bbox_x,bbox_y,bbox_w,bbox_h
56,english,2048,1536,0.0,0,0,2048,1536
57,english,2048,1536,0.0,0,0,2048,1536
58,english,2048,1536,0.0,0,0,2048,1536
59,english,2048,1536,0.0,0,0,2048,1536
60,english,2048,1536,0.0,0,0,2048,1536
61,english,2048,1536,-1.0,0,0,2048,1536
62,english,2048,1536,0.0,0,0,2048,1536
63,english,2048,1536,0.0,0,0,2048,1536
64,english,2048,1536,0.999992,0,0,2048,1536
65,english,2048,1536,0.999992,634,0,1414,1536


#### Step 1 – Ingestion & Page Conditioning Summary
The ingestion stage successfully collected, standardized, and deskewed all book pages, producing well-aligned images with minimal skew (mostly within ±2°) and consistent text-region detection. The resulting dataset is geometrically clean and fully logged, establishing a solid foundation for the next step, binarization, where we will isolate text from the background while preserving fine details and diacritics.

----

### Step 2 - Binarization