# Dataset Unification & Active Learning

This notebook consolidates your dataset across categories and helps select uncertain samples for annotation using your real scripts.
It embeds and runs code from:
- newspaper_yolo/unified_dataset/build_unified_dataset.py
- newspaper_yolo/unified_dataset/add_new_annotations.py
- newspaper_yolo/select_uncertain.py

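Safe defaults are used where available to avoid unintended file copies during exploration: the build cell sets `DRY_RUN = True` and leaves its `build_unified()` call commented out, so nothing is written until you opt in. The other two cells have no dry-run switch and copy files as soon as they are run.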

In [None]:
# build_unified_dataset.py (safe DRY_RUN)
import shutil
from pathlib import Path
import random
import yaml

# Source categories; each one is looked up under newspaper_yolo/<category>/
# with images/train and labels/train sub-folders.
CATEGORIES = ["AdminForm", "BookCover", "Invoice", "BusinessCard", "Newspaper"]
# File suffixes (compared lower-cased) that count as images.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}
# Root folder of the merged dataset.
DEST = Path("newspaper_yolo/unified_dataset")
# Share of the matched image/label pairs assigned to the validation split.
VAL_FRACTION = 0.2
# Seed for the shuffle that decides which pairs go to validation.
RANDOM_SEED = 42
# When True, planned actions are printed instead of touching the filesystem.
DRY_RUN = True

# Class names written to data.yaml; the position in the list is the class index.
NAMES = [
    "Header","Title","Text","Table","Image","Footer",
    "Stamp or Signature","Caption","Keyvalue","List-item","Check-box","Formulas",
]

def safe_mkdir(p: Path):
    """Create directory ``p`` and its parents, unless it exists or DRY_RUN is set."""
    # Nothing to do for an existing path; in dry-run mode nothing may be created.
    if p.exists() or DRY_RUN:
        return
    p.mkdir(parents=True, exist_ok=True)

def build_unified():
    """Merge every category's labelled training images into one train/val dataset.

    For each entry of CATEGORIES, images under
    ``newspaper_yolo/<cat>/images/train`` whose suffix is in IMAGE_EXTS and
    that have a same-stem ``.txt`` file under ``labels/train`` are collected.
    The pairs are shuffled with RANDOM_SEED; the first ``VAL_FRACTION`` of them
    go to the ``val`` split and the rest to ``train`` below DEST, with output
    names prefixed by the category (``<cat>_<stem>``). A ``data.yaml``
    describing the dataset is then written to DEST.

    When DRY_RUN is true nothing is created or copied: the planned copies and
    a preview of ``data.yaml`` are printed instead.
    """
    images_train = DEST / "images" / "train"
    images_val = DEST / "images" / "val"
    labels_train = DEST / "labels" / "train"
    labels_val = DEST / "labels" / "val"
    for d in [images_train, images_val, labels_train, labels_val]:
        safe_mkdir(d)

    matched_pairs = []
    for cat in CATEGORIES:
        cat_root = Path("newspaper_yolo") / cat
        img_dir = cat_root / "images" / "train"
        lbl_dir = cat_root / "labels" / "train"
        if not img_dir.exists() or not lbl_dir.exists():
            print(f"[WARN] Skipping {cat}: {img_dir} / {lbl_dir} missing")
            continue
        label_stems = {p.stem for p in lbl_dir.glob("*.txt")}
        # iterdir() yields entries in arbitrary, filesystem-dependent order;
        # sorting makes the seeded shuffle below give the same split everywhere.
        for img in sorted(img_dir.iterdir()):
            if img.suffix.lower() not in IMAGE_EXTS:
                continue
            if img.stem in label_stems:
                lbl = lbl_dir / f"{img.stem}.txt"
                # Prefix with the category so stems from different categories
                # cannot collide in the merged folders.
                new_stem = f"{cat}_{img.stem}"
                matched_pairs.append((img, lbl, new_stem))

    print(f"Found {len(matched_pairs)} matched pairs")
    # A private RNG keeps the split reproducible without reseeding the global
    # random module as a side effect.
    random.Random(RANDOM_SEED).shuffle(matched_pairs)
    val_count = int(len(matched_pairs) * VAL_FRACTION)

    for idx, (img, lbl, new_stem) in enumerate(matched_pairs):
        # The first val_count shuffled pairs form the validation split.
        if idx < val_count:
            img_out_dir, lbl_out_dir = images_val, labels_val
        else:
            img_out_dir, lbl_out_dir = images_train, labels_train
        img_out = img_out_dir / (new_stem + img.suffix.lower())
        lbl_out = lbl_out_dir / (new_stem + ".txt")
        if DRY_RUN:
            print(f"COPY {img} -> {img_out}")
            print(f"COPY {lbl} -> {lbl_out}")
        else:
            shutil.copy2(img, img_out)
            shutil.copy2(lbl, lbl_out)

    data_yaml = {
        "path": str(DEST.resolve()),
        "train": "images/train",
        "val": "images/val",
        "nc": len(NAMES),
        "names": {i: n for i, n in enumerate(NAMES)},
    }
    if DRY_RUN:
        # Same dumper as the real write, so the preview matches the file.
        print("data.yaml preview:\n", yaml.safe_dump(data_yaml, allow_unicode=True, sort_keys=False))
    else:
        with open(DEST / "data.yaml", "w", encoding="utf-8") as f:
            yaml.safe_dump(data_yaml, f, allow_unicode=True, sort_keys=False)

# build_unified()

In [None]:
# add_new_annotations.py
from pathlib import Path
import shutil

# Root of the unified dataset that new annotations are merged into.
BASE = Path('newspaper_yolo/unified_dataset')
# Incoming, freshly annotated images and their label files.
SRC_IMG = BASE / 'new_annotations' / 'images'
SRC_LBL = BASE / 'new_annotations' / 'labels'
# Destination: the training split of the unified dataset.
DST_IMG = BASE / 'images' / 'train'
DST_LBL = BASE / 'labels' / 'train'

# File suffixes (compared lower-cased) accepted as images.
ALLOWED_IMG_EXT = {'.jpg', '.jpeg', '.png', '.bmp', '.webp', '.tif', '.tiff'}

def next_free_stem(stem: str, img_dir=None, lbl_dir=None, img_exts=None) -> str:
    """Return ``stem`` or the first ``<stem>_vN`` (N >= 2) unused in the destination.

    A candidate counts as used when ``<candidate>.txt`` exists in the label
    directory, or ``<candidate><ext>`` exists in the image directory for ANY
    accepted image extension (not just ``.jpg``/``.png``), so the returned
    stem never clashes with an existing image of another format.

    Args:
        stem: Base file stem to start from.
        img_dir: Image directory to check; defaults to ``DST_IMG``.
        lbl_dir: Label directory to check; defaults to ``DST_LBL``.
        img_exts: Iterable of dotted image suffixes to check; defaults to
            ``ALLOWED_IMG_EXT``.

    Returns:
        ``stem`` itself if it is free, otherwise ``stem_v2``, ``stem_v3``, ...
    """
    # Defaults are resolved at call time so the module-level settings are used.
    img_dir = DST_IMG if img_dir is None else img_dir
    lbl_dir = DST_LBL if lbl_dir is None else lbl_dir
    img_exts = ALLOWED_IMG_EXT if img_exts is None else img_exts

    def is_taken(candidate):
        if (lbl_dir / f"{candidate}.txt").exists():
            return True
        return any((img_dir / f"{candidate}{ext}").exists() for ext in img_exts)

    i = 2
    new = stem
    while is_taken(new):
        new = f"{stem}_v{i}"
        i += 1
    return new

# Counters for the final report.
added = 0
skipped = 0

# Both destination folders must exist before the first copy.
DST_IMG.mkdir(parents=True, exist_ok=True)
DST_LBL.mkdir(parents=True, exist_ok=True)

# A missing source folder simply means there is nothing to merge.
source_images = sorted(SRC_IMG.iterdir()) if SRC_IMG.exists() else []

for img in source_images:
    suffix = img.suffix.lower()
    if suffix not in ALLOWED_IMG_EXT:
        continue

    stem = img.stem
    lbl = SRC_LBL / (stem + ".txt")
    if not lbl.exists():
        # Images without a label file are reported and left out.
        print(f"[SKIP] No label for {img.name}")
        skipped += 1
        continue

    # The stem is in use if the destination already holds a label or an image
    # (of any accepted format) with that stem.
    stem_in_use = (DST_LBL / (stem + ".txt")).exists() or any(
        (DST_IMG / (stem + ext)).exists() for ext in ALLOWED_IMG_EXT
    )
    if stem_in_use:
        out_stem = next_free_stem(stem)
        print(f"[RENAME] {stem} -> {out_stem}")
    else:
        out_stem = stem

    shutil.copy2(img, DST_IMG / (out_stem + suffix))
    shutil.copy2(lbl, DST_LBL / (out_stem + ".txt"))
    added += 1

print(f"Done. Added {added} image/label pairs, skipped {skipped}.")

In [None]:
# select_uncertain.py (active learning)
import os, glob, statistics, shutil
from pathlib import Path

# Base folder; every path below is built from it.
SCRIPT_DIR = Path('newspaper_yolo')
# Folder scanned for predicted label .txt files; the last field of each line
# is read as a confidence value further down.
PRED_DIR = SCRIPT_DIR / 'al_runs' / 'predict_pool' / 'labels'
# Pool of images, searched recursively and matched to labels by file stem.
IMG_SRC  = SCRIPT_DIR / 'pool_newspaper'
# Folder the selected images are copied into; created right away if missing.
OUT_DIR  = SCRIPT_DIR / 'to_annotate_active'
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Settings can be overridden through environment variables.
# Maximum number of images to select.
N = int(os.environ.get("SELECT_N", 40))
# Inclusive bounds on an image's mean confidence for it to be a candidate.
TH_LO = float(os.environ.get("TH_LO", 0.2))
TH_HI = float(os.environ.get("TH_HI", 0.4))
# Set INCLUDE_NO_DET=1 to allow images with a score of 0.0 as fill-ins.
INCLUDE_NO_DET = os.environ.get("INCLUDE_NO_DET", "0") == "1"

print(f"[INFO] PRED_DIR: {PRED_DIR}")
print(f"[INFO] IMG_SRC : {IMG_SRC}")
print(f"[INFO] OUT_DIR : {OUT_DIR}")

# Select up to N pool images whose mean predicted confidence is lowest within
# [TH_LO, TH_HI], topping up with even less confident images when needed, and
# copy them to OUT_DIR for annotation.
if not PRED_DIR.exists():
    print("[WARN] Predicted labels directory not found (run pool prediction first)")
else:
    # Index pool images by lower-cased stem. Suffixes are compared
    # case-insensitively because a "*.jpg" glob misses "IMG.JPG" on
    # case-sensitive filesystems; the accepted formats are the same as
    # IMAGE_EXTS / ALLOWED_IMG_EXT used by the other cells.
    POOL_IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}
    pool_map = {}
    # Sorted so the chosen file is deterministic when stems repeat.
    for p in sorted(IMG_SRC.rglob("*")):
        suffix = p.suffix.lower()
        if suffix not in POOL_IMG_EXTS or not p.is_file():
            continue
        stem = p.stem.lower()
        # A .jpg always wins over other formats sharing the same stem.
        if stem not in pool_map or suffix == ".jpg":
            pool_map[stem] = p

    print(f"[INFO] Images detected in pool: {len(pool_map)}")

    candidats, hors_plage, no_det = [], [], []
    label_files = sorted(glob.glob(str(PRED_DIR / "*.txt")))
    print(f"[INFO] Predicted label files found: {len(label_files)}")

    for lbl in label_files:
        confs = []
        with open(lbl, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                sp = line.split()
                # Only lines with at least six fields carry a trailing
                # confidence value; shorter lines are ignored.
                if len(sp) >= 6:
                    try:
                        confs.append(float(sp[-1]))
                    except ValueError:
                        # Malformed confidence: skip this line only.
                        pass
        # An image's score is the mean confidence of its detections;
        # 0.0 stands for "no usable detection".
        score = statistics.mean(confs) if confs else 0.0
        stem = Path(lbl).stem.lower()
        img_path = pool_map.get(stem)
        if not img_path:
            # Prediction without a matching pool image: nothing to copy.
            continue
        if score == 0.0:
            if INCLUDE_NO_DET:
                no_det.append((score, img_path))
        elif TH_LO <= score <= TH_HI:
            candidats.append((score, img_path))
        else:
            hors_plage.append((score, img_path))

    # Least confident first.
    candidats.sort(key=lambda x: x[0])
    if len(candidats) < N:
        # Top up with images scoring below the window (never above it).
        below = [(s, p) for (s, p) in hors_plage if 0.0 < s < TH_LO]
        below.sort(key=lambda x: x[0])
        supplement = N - len(candidats)
        candidats.extend(below[:supplement])
    if len(candidats) < N and INCLUDE_NO_DET and no_det:
        # Last resort: images without any detection, if allowed.
        no_det.sort(key=lambda x: x[0])
        supplement = N - len(candidats)
        candidats.extend(no_det[:supplement])

    candidats = candidats[:N]

    copied = 0
    for score, img in candidats:
        dst = OUT_DIR / img.name
        try:
            shutil.copy2(img, dst)
        except OSError as e:
            # Best effort: report the failed copy and carry on with the rest.
            print(f"[WARN] Copy failed for {img}: {e}")
        else:
            copied += 1

    print(f"[SUMMARY] Selected {len(candidats)} requested = {N}")
    print(f"[SUMMARY] Copied to {OUT_DIR}: {copied}")