# 01 - Data Preparation
This notebook documents how raw PDFs/images are converted to YOLO format (images + labels).

In [None]:
# Setup imports
# Placeholder setup cell: it imports Path and prints a scaffold reminder.
# The cells below carry their own imports, so nothing later depends on this cell.
from pathlib import Path
print('Notebook scaffold - add your data prep code here')

# Dataset Preparation
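The following cells contain the scripts used to prepare the dataset: the first converts every page of each PDF in `pdfs/` to a PNG image in `all_images/`, and the second splits the image/label pairs found in `images/` and `labels/` 80/20 into YOLOv8-ready `train` and `val` folders under `newspaper_yolo/`.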

In [None]:
# Convert PDFs to images (from convert_pdfs.py)
import os
from pdf2image import convert_from_path

# Folder scanned for input PDFs and folder that receives one PNG per page.
PDF_FOLDER = 'pdfs'
OUTPUT_FOLDER = 'all_images'

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# os.listdir raises FileNotFoundError on a missing folder; treat that case like an
# empty folder so the cell reports the problem instead of aborting a Run All.
if os.path.isdir(PDF_FOLDER):
    # os.listdir returns names in arbitrary order; sort so every run converts and
    # logs the PDFs in the same sequence.
    pdf_files = sorted(f for f in os.listdir(PDF_FOLDER) if f.lower().endswith('.pdf'))
else:
    print(f"PDF folder '{PDF_FOLDER}' not found.")
    pdf_files = []
print(f"Found {len(pdf_files)} PDF(s).")

if not pdf_files:
    print("No PDFs found. Exiting.")
else:
    for pdf_file in pdf_files:
        pdf_path = os.path.join(PDF_FOLDER, pdf_file)
        # The output stem is the same for every page of this PDF.
        pdf_stem = os.path.splitext(pdf_file)[0]
        print(f"Converting: {pdf_file}")
        try:
            images = convert_from_path(pdf_path)
            # Pages are numbered from 1 in the output file names.
            for page_number, image in enumerate(images, start=1):
                image_name = f"{pdf_stem}_page_{page_number}.png"
                image_path = os.path.join(OUTPUT_FOLDER, image_name)
                image.save(image_path, 'PNG')
            print(f"Saved {len(images)} image(s) from {pdf_file}")
        except Exception as e:
            # Best effort: one unreadable PDF must not stop the remaining conversions.
            print(f"Error converting {pdf_file}: {e}")

In [None]:
# Prepare YOLO dataset from images and labels (from dataset.py)
import os
import shutil
import random

# Source folders with all images / YOLO label files, and the dataset root to fill.
all_images_dir = 'images'
all_labels_dir = 'labels'
base_output = 'newspaper_yolo'

# '.jpg' is tried first so existing label/.jpg pairs resolve exactly as before; the
# other suffixes let PNG images (the format the PDF conversion cell writes) and
# other common formats be paired with their labels too.
image_exts = ['.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp']
# Fixed seed: together with the sorted listing below, re-running this cell copies
# the same files to the same split instead of mixing a new random split into the
# files left over from the previous run.
split_seed = 42

for split in ['train', 'val']:
    os.makedirs(os.path.join(base_output, 'images', split), exist_ok=True)
    os.makedirs(os.path.join(base_output, 'labels', split), exist_ok=True)

# os.listdir order is arbitrary, so sort before shuffling with a fixed seed.
label_files = sorted(f for f in os.listdir(all_labels_dir) if f.endswith('.txt'))
image_label_pairs = []
unmatched_labels = []

for label_file in label_files:
    # splitext strips only the final '.txt'; str.replace would rewrite every
    # occurrence of '.txt' inside the file name.
    stem = os.path.splitext(label_file)[0]
    label_path = os.path.join(all_labels_dir, label_file)
    candidates = [os.path.join(all_images_dir, stem + ext) for ext in image_exts]
    img_path = next((c for c in candidates if os.path.exists(c)), None)

    if img_path is None:
        unmatched_labels.append(label_file)
        continue
    image_label_pairs.append((img_path, label_path))

if unmatched_labels:
    print(f"[WARN] {len(unmatched_labels)} label file(s) have no matching image and were skipped.")

# A private Random instance keeps the split reproducible without touching the
# global random state used by other cells.
random.Random(split_seed).shuffle(image_label_pairs)
split_idx = int(len(image_label_pairs) * 0.8)  # first 80% train, remainder val
train_pairs = image_label_pairs[:split_idx]
val_pairs = image_label_pairs[split_idx:]

for split, pairs in (('train', train_pairs), ('val', val_pairs)):
    for img, label in pairs:
        shutil.copy(img, os.path.join(base_output, 'images', split))
        shutil.copy(label, os.path.join(base_output, 'labels', split))

print("Dataset prepared for YOLOv8 training.")

# Unified Dataset Builder and Annotation Utilities
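The next cells embed the project's utilities for merging the per-category datasets into a single unified dataset and for adding newly annotated images to it. The builder is defined with `DRY_RUN = True`, so calling `build_unified()` only prints the planned copies and a `data.yaml` preview until that flag is set to `False`.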

In [None]:
# unified_dataset/build_unified_dataset.py content
import shutil
from pathlib import Path
import random
import yaml

# Category folders that build_unified() looks for under newspaper_yolo/.
CATEGORIES = [
    "AdminForm",
    "BookCover",
    "Invoice",
    "BusinessCard",
    "Newspaper",
]

# Image suffixes accepted by the builder (compared against the lower-cased suffix).
IMAGE_EXTS = {
    ".jpg", ".jpeg", ".png", ".bmp",
    ".tif", ".tiff", ".webp",
}

# Root folder of the merged dataset.
DEST = Path("newspaper_yolo/unified_dataset")

# Share of matched pairs sent to the validation split, and the shuffle seed.
VAL_FRACTION = 0.2
RANDOM_SEED = 42

# While True the builder only prints what it would do; set to False to create
# folders, copy files and write data.yaml.
DRY_RUN = True

# Class names written to data.yaml; each name's list position becomes its class id.
NAMES = [
    "Header",
    "Title",
    "Text",
    "Table",
    "Image",
    "Footer",
    "Stamp or Signature",
    "Caption",
    "Keyvalue",
    "List-item",
    "Check-box",
    "Formulas",
]


def safe_mkdir(p: Path):
    """Create directory ``p`` (with parents) unless it exists or DRY_RUN is set.

    Args:
        p: Directory path to create.

    Returns:
        None. In dry-run mode, or when ``p`` already exists, nothing is created.
    """
    if p.exists() or DRY_RUN:
        return
    p.mkdir(parents=True, exist_ok=True)


def build_unified():
    """Merge each category's train images/labels into one train/val dataset under DEST.

    For every name in CATEGORIES, images in ``newspaper_yolo/<cat>/images/train``
    whose stem has a matching ``.txt`` in ``newspaper_yolo/<cat>/labels/train`` are
    collected. The pairs are shuffled with RANDOM_SEED, the first VAL_FRACTION of
    them go to the ``val`` split and the rest to ``train``, and each file is copied
    with its name prefixed by ``<cat>_``. Finally a ``data.yaml`` describing the
    dataset is written to DEST.

    When DRY_RUN is true nothing is created or copied; the planned copies and a
    preview of ``data.yaml`` are printed instead.

    Files already present under DEST are never removed by this function.

    Returns:
        None.
    """
    random.seed(RANDOM_SEED)
    images_train = DEST / "images" / "train"
    images_val = DEST / "images" / "val"
    labels_train = DEST / "labels" / "train"
    labels_val = DEST / "labels" / "val"
    for d in [images_train, images_val, labels_train, labels_val]:
        safe_mkdir(d)

    matched_pairs = []
    for cat in CATEGORIES:
        cat_root = Path("newspaper_yolo") / cat
        img_dir = cat_root / "images" / "train"
        lbl_dir = cat_root / "labels" / "train"
        if not img_dir.exists() or not lbl_dir.exists():
            print(f"[WARN] Skipping {cat}: {img_dir} / {lbl_dir} missing")
            continue
        label_stems = {p.stem for p in lbl_dir.glob("*.txt")}
        # iterdir() yields entries in arbitrary order, so a seeded shuffle alone
        # is not reproducible; sorting fixes the input order before shuffling.
        for img in sorted(img_dir.iterdir()):
            # Skip sub-directories that happen to carry an image-like suffix;
            # copying them would fail later.
            if not img.is_file() or img.suffix.lower() not in IMAGE_EXTS:
                continue
            if img.stem in label_stems:
                lbl = lbl_dir / f"{img.stem}.txt"
                # Prefix with the category so equal stems from different
                # categories cannot overwrite each other in the merged folders.
                new_stem = f"{cat}_{img.stem}"
                matched_pairs.append((img, lbl, new_stem))

    print(f"Found {len(matched_pairs)} matched pairs")
    random.shuffle(matched_pairs)
    val_count = int(len(matched_pairs) * VAL_FRACTION)

    # After shuffling, the first val_count pairs form the validation split.
    for index, (img, lbl, new_stem) in enumerate(matched_pairs):
        if index < val_count:
            img_out = images_val / (new_stem + img.suffix.lower())
            lbl_out = labels_val / (new_stem + ".txt")
        else:
            img_out = images_train / (new_stem + img.suffix.lower())
            lbl_out = labels_train / (new_stem + ".txt")
        if DRY_RUN:
            print(f"COPY {img} -> {img_out}")
            print(f"COPY {lbl} -> {lbl_out}")
        else:
            shutil.copy2(img, img_out)
            shutil.copy2(lbl, lbl_out)

    data_yaml = {
        "path": str(DEST.resolve()),
        "train": "images/train",
        "val": "images/val",
        "nc": len(NAMES),
        "names": {i: n for i, n in enumerate(NAMES)},
    }
    if DRY_RUN:
        # Use the same dumper as the real write so the preview matches the file.
        print("data.yaml preview:\n", yaml.safe_dump(data_yaml, allow_unicode=True, sort_keys=False))
    else:
        with open(DEST / "data.yaml", "w", encoding="utf-8") as f:
            yaml.safe_dump(data_yaml, f, allow_unicode=True, sort_keys=False)

# Example: build_unified()

In [None]:
# unified_dataset/add_new_annotations.py content
from pathlib import Path
import shutil

# Root of the unified dataset that receives the new annotations.
BASE = Path('newspaper_yolo/unified_dataset')

# Incoming image/label folders and the train-split folders they are copied into.
SRC_IMG = BASE.joinpath('new_annotations', 'images')
SRC_LBL = BASE.joinpath('new_annotations', 'labels')
DST_IMG = BASE.joinpath('images', 'train')
DST_LBL = BASE.joinpath('labels', 'train')

# Image suffixes considered when adding files (compared lower-cased).
ALLOWED_IMG_EXT = {
    '.jpg', '.jpeg', '.png', '.bmp',
    '.webp', '.tif', '.tiff',
}

def next_free_stem(stem: str) -> str:
    """Return ``stem`` or the first ``<stem>_vN`` (N >= 2) unused in the destination.

    A candidate counts as taken when DST_LBL contains ``<candidate>.txt`` or
    DST_IMG contains ``<candidate><ext>`` for any ``ext`` in ALLOWED_IMG_EXT.
    Every allowed suffix is checked, not just ``.jpg``/``.png``, so the result
    can never collide with an existing image of another accepted format.

    Args:
        stem: Desired file stem (name without suffix).

    Returns:
        ``stem`` itself if it is free, otherwise ``<stem>_v2``, ``<stem>_v3``, ...
    """
    def is_taken(candidate: str) -> bool:
        # One label per stem, but the image may use any allowed suffix.
        if (DST_LBL / f"{candidate}.txt").exists():
            return True
        return any((DST_IMG / f"{candidate}{ext}").exists() for ext in ALLOWED_IMG_EXT)

    i = 2
    new = stem
    while is_taken(new):
        new = f"{stem}_v{i}"
        i += 1
    return new

# Running totals reported in the final summary line.
added = 0
skipped = 0

# Both destination folders must exist before anything is copied into them.
DST_IMG.mkdir(parents=True, exist_ok=True)
DST_LBL.mkdir(parents=True, exist_ok=True)

# A missing source folder simply means there is nothing to add.
pending_images = sorted(SRC_IMG.iterdir()) if SRC_IMG.exists() else []

for img in pending_images:
    suffix = img.suffix.lower()
    if suffix not in ALLOWED_IMG_EXT:
        # Entries that are not accepted images are ignored and not counted.
        continue

    stem = img.stem
    lbl = SRC_LBL / f"{stem}.txt"
    if not lbl.exists():
        print(f"[SKIP] No label for {img.name}")
        skipped += 1
        continue

    # The stem is in use when the destination already holds its label or an
    # image with that stem under any allowed suffix.
    stem_in_use = (DST_LBL / f"{stem}.txt").exists() or any(
        (DST_IMG / f"{stem}{ext}").exists() for ext in ALLOWED_IMG_EXT
    )
    if stem_in_use:
        out_stem = next_free_stem(stem)
        print(f"[RENAME] {stem} -> {out_stem}")
    else:
        out_stem = stem

    # The image keeps its (lower-cased) suffix; the label is always .txt.
    out_img = DST_IMG / f"{out_stem}{suffix}"
    out_lbl = DST_LBL / f"{out_stem}.txt"
    shutil.copy2(img, out_img)
    shutil.copy2(lbl, out_lbl)
    added += 1

print(f"Done. Added {added} image/label pairs, skipped {skipped}.")