# Step 3 — Merge YOLO datasets

Merge multiple YOLO-format sources into one dataset:

- single class: `fish` (id `0`)
- keeps background-only images (empty label files)
- optional dedup (path / average hash (aHash) / MD5)
- stratified train/val/test split by source + fish/background


## Dataset manifests (`data.yaml`)

If a source folder doesn't have a `data.yaml`, we write a minimal one next to it.
This keeps each dataset self-contained and easy to inspect.


In [1]:
from __future__ import annotations

import os
from pathlib import Path

# Project paths (override via env vars if needed)
# - DATASET_ROOT: folder that contains the source datasets
#   (defaults to the current working directory)
# - EXPORT_ROOT: output folder for the merged dataset
#   (defaults to DATASET_ROOT / "Merged")
DATASET_ROOT = Path(os.getenv("DATASET_ROOT", Path.cwd())).resolve()
EXPORT_ROOT = Path(os.getenv("EXPORT_ROOT", DATASET_ROOT / "Merged")).resolve()
# Created eagerly, as a side effect of running this cell.
EXPORT_ROOT.mkdir(parents=True, exist_ok=True)

# Split settings
RANDOM_SEED = int(os.getenv("RANDOM_SEED", "42"))
TEST_SIZE = float(os.getenv("TEST_SIZE", "0.10"))
# VAL_SIZE is a fraction of the FULL dataset: stratified_splits() rescales it
# by 1 / (1 - TEST_SIZE) before splitting the non-test remainder.
VAL_SIZE = float(os.getenv("VAL_SIZE", "0.20"))

# Dedup toggle (path duplicates are always removed)
# NOTE(review): no dedup step that reads DO_DEDUP is visible in this notebook;
# confirm where (or whether) deduplication actually happens.
DO_DEDUP = os.getenv("DO_DEDUP", "0").lower() in {"1", "true", "yes"}

# Source datasets (paths are relative to DATASET_ROOT)
# Each entry maps a dataset id to its images folder and its YOLO labels folder.
DATASETS = {
    "AquaCoop":  {"images": "AquaCoop/images",            "labels": "AquaCoop/labels"},
    "OzFish":    {"images": "OzFish/images",              "labels": "OzFish/labels"},
    "aquarium":  {"images": "aquarium/images",            "labels": "aquarium/labels"},
    "deepfish":  {"images": "deepfish/images",            "labels": "deepfish/labels"},
    "f4k":       {"images": "f4k/images",                 "labels": "f4k/labels"},
    "fish_416":  {"images": "fish_416/images",            "labels": "fish_416/labels"},
    "fishclef":  {"images": "fishclef/images",            "labels": "fishclef/labels"},
    "luderick":  {"images": "luderick/images",            "labels": "luderick/labels"},
    "deepfish_negatives": {"images": "negatives/deepfish_negatives/images", "labels": "negatives/deepfish_negatives/labels"},
}

def _infer_dataset_base(images_rel: str, labels_rel: str) -> str:
    # First folder is usually the dataset name
    img0 = Path(images_rel).parts[0] if images_rel else ""
    lab0 = Path(labels_rel).parts[0] if labels_rel else ""
    return img0 or lab0

def ensure_data_yaml(dataset_name: str, cfg: dict) -> Path:
    """Create a minimal YOLO `data.yaml` for a source dataset if it is missing.

    Args:
        dataset_name: Dataset id. Currently unused; kept so callers can pass it.
        cfg: Dataset config with ``"images"`` and ``"labels"`` entries, both
            relative to ``DATASET_ROOT``.

    Returns:
        Path of the manifest. The file does not exist on return when the
        dataset folder itself is absent; callers can test ``.exists()`` to
        report that case instead of getting a ``FileNotFoundError``.
    """
    base = _infer_dataset_base(cfg["images"], cfg["labels"])
    ds_dir = DATASET_ROOT / base
    yml = ds_dir / "data.yaml"

    # Never overwrite an existing manifest, and never create dataset folders:
    # a missing folder means the dataset is not available on this machine.
    if yml.exists() or not ds_dir.is_dir():
        return yml

    # NOTE(review): train/val/test are written exactly as given in cfg (i.e.
    # relative to DATASET_ROOT) while `path` is "."; confirm how the consumer
    # of this manifest resolves relative paths.
    content = (
        "path: .\n"
        f"train: {cfg['images']}\n"
        f"val: {cfg['images']}\n"
        f"test: {cfg['images']}\n"
        "names:\n"
        "  0: fish\n"
    )
    yml.write_text(content, encoding="utf-8")
    return yml

# Make sure every source dataset has a manifest, then report one line per
# dataset (useful as a quick sanity check of the configured paths).
manifests = {
    ds_key: ensure_data_yaml(ds_key, ds_cfg)
    for ds_key, ds_cfg in DATASETS.items()
}
created = [
    (ds_key, str(manifest), manifest.exists())
    for ds_key, manifest in manifests.items()
]

for ds_key, manifest_str, present in created:
    status = "OK" if present else "MISSING"
    print(f"{ds_key:18s} -> {manifest_str}  [{status}]")


AquaCoop           -> /Users/MarcoPiccolo/Desktop/GitHub_Test/AquaCoop/data.yaml  [OK]
OzFish             -> /Users/MarcoPiccolo/Desktop/GitHub_Test/OzFish/data.yaml  [OK]
aquarium           -> /Users/MarcoPiccolo/Desktop/GitHub_Test/aquarium/data.yaml  [OK]
deepfish           -> /Users/MarcoPiccolo/Desktop/GitHub_Test/deepfish/data.yaml  [OK]
f4k                -> /Users/MarcoPiccolo/Desktop/GitHub_Test/f4k/data.yaml  [OK]
fish_416           -> /Users/MarcoPiccolo/Desktop/GitHub_Test/fish_416/data.yaml  [OK]
fishclef           -> /Users/MarcoPiccolo/Desktop/GitHub_Test/fishclef/data.yaml  [OK]
luderick           -> /Users/MarcoPiccolo/Desktop/GitHub_Test/luderick/data.yaml  [OK]
deepfish_negatives -> /Users/MarcoPiccolo/Desktop/GitHub_Test/negatives/data.yaml  [OK]


## Diagnostics helpers

Small checkpoints to spot missing labels, empty splits, or unexpected drops.
Turn them on when you need to debug the pipeline.


In [2]:

def checkpoint_extended(df, name):
    """Print a detailed diagnostic summary of an index or split DataFrame.

    A section is printed only for columns that are present: ``dataset_id``,
    ``has_fish``, ``n_boxes``, ``image_path`` and ``label_path``.

    Args:
        df: DataFrame to summarise.
        name: Title printed in the section header.
    """
    print(f"\n=== {name} ===")
    print(f"Total images: {len(df):,}")

    if len(df) == 0:
        print("[WARN] DataFrame is empty.")
        return

    def _is_unset(value) -> bool:
        # "No label" can show up as None, NaN, "" or Path(""); note that
        # str(Path("")) is ".", so the empty Path must be matched explicitly.
        if value is None or (isinstance(value, float) and value != value):
            return True
        return str(value) in ("", ".")

    # Dataset distribution
    if "dataset_id" in df.columns:
        print("\nImages per dataset:")
        display(df["dataset_id"].value_counts().rename("n_images").to_frame())

    # Positive / negative balance
    if "has_fish" in df.columns:
        print("\nPositive vs negative:")
        display(df["has_fish"].value_counts().rename("n_images").to_frame())

    # Boxes per image
    if "n_boxes" in df.columns:
        print("\nBoxes per image (summary):")
        display(df["n_boxes"].describe().to_frame(name="value"))

        zero_boxes = (df["n_boxes"] == 0).sum()
        print(f"Images with zero boxes: {zero_boxes:,}")

    # Path sanity (accepts both Path objects and plain strings)
    if "image_path" in df.columns:
        missing_imgs = sum(not Path(p).exists() for p in df["image_path"])
        print(f"Missing image files on disk: {missing_imgs}")

    if "label_path" in df.columns:
        # Only labels that are set but absent on disk count as missing;
        # background-only images without a label file are not an error.
        missing_lbls = sum(
            (not _is_unset(p)) and not Path(p).exists() for p in df["label_path"]
        )
        print(f"Missing label files on disk: {missing_lbls}")


## Dataset inventory (before merge)

This section summarizes the number of images and label files available in each dataset folder before building the unified index.


In [3]:

from pathlib import Path
import pandas as pd

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

def _count_images(images_dir: Path) -> int:
    if not images_dir.exists():
        return 0
    return sum(1 for p in images_dir.rglob("*") if p.is_file() and p.suffix.lower() in IMG_EXTS)

def _count_labels(labels_dir: Path) -> int:
    if not labels_dir.exists():
        return 0
    return sum(1 for p in labels_dir.rglob("*.txt") if p.is_file())

def _count_pairs(images_dir: Path, labels_dir: Path):
    if not images_dir.exists():
        img_stems = set()
    else:
        img_stems = {p.stem for p in images_dir.rglob("*") if p.is_file() and p.suffix.lower() in IMG_EXTS}
    if not labels_dir.exists():
        lbl_stems = set()
    else:
        lbl_stems = {p.stem for p in labels_dir.rglob("*.txt") if p.is_file()}

    matched = len(img_stems & lbl_stems)
    missing_labels = len(img_stems - lbl_stems)
    orphan_labels = len(lbl_stems - img_stems)
    return matched, missing_labels, orphan_labels

# One inventory row per configured dataset: raw file counts plus the
# stem-based image/label pairing statistics.
rows = []
for ds_name, cfg in DATASETS.items():
    images_dir, labels_dir = (DATASET_ROOT / cfg[key] for key in ("images", "labels"))
    matched, missing_labels, orphan_labels = _count_pairs(images_dir, labels_dir)
    rows.append(dict(
        dataset=ds_name,
        images_dir=str(images_dir),
        labels_dir=str(labels_dir),
        n_images=_count_images(images_dir),
        n_label_files=_count_labels(labels_dir),
        matched_image_label=matched,
        images_missing_label=missing_labels,
        orphan_label_files=orphan_labels,
    ))

inventory_df = pd.DataFrame(rows).sort_values("dataset").reset_index(drop=True)
display(inventory_df)

# Column-wise totals across all datasets.
print("Totals:")
count_columns = [
    "n_images",
    "n_label_files",
    "matched_image_label",
    "images_missing_label",
    "orphan_label_files",
]
display(inventory_df[count_columns].sum().to_frame(name="total"))


Unnamed: 0,dataset,images_dir,labels_dir,n_images,n_label_files,matched_image_label,images_missing_label,orphan_label_files
0,AquaCoop,/Users/MarcoPiccolo/Desktop/GitHub_Test/AquaCo...,/Users/MarcoPiccolo/Desktop/GitHub_Test/AquaCo...,1238,1238,1238,0,0
1,OzFish,/Users/MarcoPiccolo/Desktop/GitHub_Test/OzFish...,/Users/MarcoPiccolo/Desktop/GitHub_Test/OzFish...,350,350,350,0,0
2,aquarium,/Users/MarcoPiccolo/Desktop/GitHub_Test/aquari...,/Users/MarcoPiccolo/Desktop/GitHub_Test/aquari...,637,637,637,0,0
3,deepfish,/Users/MarcoPiccolo/Desktop/GitHub_Test/deepfi...,/Users/MarcoPiccolo/Desktop/GitHub_Test/deepfi...,4505,4505,4505,0,0
4,deepfish_negatives,/Users/MarcoPiccolo/Desktop/GitHub_Test/negati...,/Users/MarcoPiccolo/Desktop/GitHub_Test/negati...,2012,2012,2012,0,0
5,f4k,/Users/MarcoPiccolo/Desktop/GitHub_Test/f4k/im...,/Users/MarcoPiccolo/Desktop/GitHub_Test/f4k/la...,794,794,794,0,0
6,fish_416,/Users/MarcoPiccolo/Desktop/GitHub_Test/fish_4...,/Users/MarcoPiccolo/Desktop/GitHub_Test/fish_4...,680,680,680,0,0
7,fishclef,/Users/MarcoPiccolo/Desktop/GitHub_Test/fishcl...,/Users/MarcoPiccolo/Desktop/GitHub_Test/fishcl...,14273,14273,14273,0,0
8,luderick,/Users/MarcoPiccolo/Desktop/GitHub_Test/luderi...,/Users/MarcoPiccolo/Desktop/GitHub_Test/luderi...,4276,4276,4276,0,0


Totals:


Unnamed: 0,total
n_images,28765
n_label_files,28765
matched_image_label,28765
images_missing_label,0
orphan_label_files,0


## Pipeline checkpoints

The cells below print dataset sizes at key checkpoints so you can see exactly where the dataset shrinks (if it does).


In [4]:

def checkpoint(df, name):
    """Print a short size summary of `df` under the heading `name`.

    Shows the row count, then per-value counts for the `dataset_id` and
    `has_fish` columns when those columns exist.
    """
    print(f"\n=== {name} ===")
    print(f"Total images: {len(df):,}")
    for column in ("dataset_id", "has_fish"):
        if column in df.columns:
            display(df[column].value_counts().rename("n_images").to_frame())


## 1) Setup


In [5]:

from __future__ import annotations

import os
import re
import shutil
import hashlib
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import os
import numpy as np

# Seed for repeatable splits
# Re-reads the RANDOM_SEED env var, falling back to the RANDOM_SEED value that
# is already defined in this session (the first cell must have been run).
RANDOM_SEED = int(os.getenv("RANDOM_SEED", str(RANDOM_SEED)))
# Seeds NumPy's global RNG. stratified_splits() additionally receives the seed
# explicitly through its `seed` argument.
np.random.seed(RANDOM_SEED)


## 2) Configuration

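Set `DATASET_ROOT` as an environment variable (recommended), or run the notebook from the folder that contains the datasets: when the variable is unset, `DATASET_ROOT` defaults to the current working directory.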

Expected per-dataset structure (flexible via config below):

- images directory (e.g., `images/` or `all_frames/`)
- labels directory (YOLO bbox `.txt`, one per image)


## Class inventory (before remapping)

This cell scans YOLO label files in each dataset and reports which `class_id` values are present and how frequently they occur.
Run this **before** setting the remapping rules.


In [7]:

from collections import Counter
import pandas as pd
from pathlib import Path

def scan_dataset_classes(dataset_root: Path, datasets: dict) -> Tuple[pd.DataFrame, Dict[str, Counter]]:
    """Inventory the YOLO class ids present in each dataset's label files.

    Every ``*.txt`` file under a dataset's labels directory is read and the
    first token of each non-blank line is counted as a class id (parsed with
    ``int(float(token))``). A summary table and per-class counts are printed.

    Args:
        dataset_root: Folder that the ``"labels"`` paths are relative to.
        datasets: Mapping of dataset id to a config dict with a ``"labels"`` key.

    Returns:
        ``(df, per_dataset_counts)`` where ``df`` has one row per dataset
        (sorted by name) and ``per_dataset_counts`` maps each dataset id to a
        ``Counter`` of class id -> number of boxes. ``n_bad`` adds up
        unreadable files and lines whose class id cannot be parsed.
    """
    columns = [
        "dataset",
        "labels_dir",
        "n_label_files",
        "n_empty",
        "n_bad",
        "unique_class_ids",
        "n_unique",
        "n_boxes",
    ]
    rows = []
    per_dataset_counts: Dict[str, Counter] = {}

    for ds_name, cfg in datasets.items():
        labels_dir = dataset_root / cfg["labels"]
        counter: Counter = Counter()
        n_files = n_empty = n_bad = 0

        # A missing labels directory simply produces an all-zero row.
        label_files = labels_dir.rglob("*.txt") if labels_dir.exists() else ()
        for lp in label_files:
            if not lp.is_file():
                continue
            n_files += 1
            try:
                txt = lp.read_text(encoding="utf-8", errors="ignore").strip()
            except OSError:
                n_bad += 1
                continue

            if not txt:
                n_empty += 1
                continue

            for line in txt.splitlines():
                parts = line.split()
                if not parts:
                    continue
                try:
                    counter[int(float(parts[0]))] += 1
                except (ValueError, OverflowError):
                    # Malformed class id: counted here, rejected later by the
                    # strict label parser.
                    n_bad += 1

        per_dataset_counts[ds_name] = counter
        rows.append({
            "dataset": ds_name,
            "labels_dir": str(labels_dir),
            "n_label_files": n_files,
            "n_empty": n_empty,
            "n_bad": n_bad,
            "unique_class_ids": sorted(counter.keys()),
            "n_unique": len(counter),
            "n_boxes": int(sum(counter.values())),
        })

    # Explicit columns keep sort_values() working when `datasets` is empty.
    df = pd.DataFrame(rows, columns=columns).sort_values("dataset").reset_index(drop=True)

    print("Class IDs found per dataset:")
    display(df)

    print("Detailed counts:")
    for ds in df["dataset"].tolist():
        c = per_dataset_counts.get(ds, Counter())
        if not c:
            print(f"  [{ds}] (no classes found)")
            continue
        print(f"  [{ds}] class_id -> boxes")
        for k, v in sorted(c.items()):
            print(f"    {k}: {v}")

    return df, per_dataset_counts

# Run the inventory over every configured dataset; the function prints its
# report and returns (summary DataFrame, {dataset id: Counter of class ids}).
class_inventory_df, class_counts = scan_dataset_classes(DATASET_ROOT, DATASETS)


Class IDs found per dataset:


Unnamed: 0,dataset,labels_dir,n_label_files,n_empty,n_bad,unique_class_ids,n_unique,n_boxes
0,AquaCoop,/Users/MarcoPiccolo/Desktop/GitHub_Test/AquaCo...,1238,0,0,[0],1,13693
1,OzFish,/Users/MarcoPiccolo/Desktop/GitHub_Test/OzFish...,350,0,0,[0],1,7540
2,aquarium,/Users/MarcoPiccolo/Desktop/GitHub_Test/aquari...,637,0,0,"[0, 1, 2, 3, 4, 5, 6]",7,4821
3,deepfish,/Users/MarcoPiccolo/Desktop/GitHub_Test/deepfi...,4505,0,0,[0],1,15463
4,deepfish_negatives,/Users/MarcoPiccolo/Desktop/GitHub_Test/negati...,2012,2012,0,[],0,0
5,f4k,/Users/MarcoPiccolo/Desktop/GitHub_Test/f4k/la...,794,0,0,[0],1,3054
6,fish_416,/Users/MarcoPiccolo/Desktop/GitHub_Test/fish_4...,680,0,0,"[12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 2...",13,1582
7,fishclef,/Users/MarcoPiccolo/Desktop/GitHub_Test/fishcl...,14273,0,0,[0],1,22627
8,luderick,/Users/MarcoPiccolo/Desktop/GitHub_Test/luderi...,4276,0,0,"[0, 1]",2,9429


Detailed counts:
  [AquaCoop] class_id -> boxes
    0: 13693
  [OzFish] class_id -> boxes
    0: 7540
  [aquarium] class_id -> boxes
    0: 2673
    1: 694
    2: 516
    3: 284
    4: 354
    5: 116
    6: 184
  [deepfish] class_id -> boxes
    0: 15463
  [deepfish_negatives] (no classes found)
  [f4k] class_id -> boxes
    0: 3054
  [fish_416] class_id -> boxes
    12: 2
    14: 1
    15: 6
    16: 116
    17: 267
    18: 117
    19: 155
    20: 225
    21: 1
    22: 482
    23: 1
    24: 207
    25: 2
  [fishclef] class_id -> boxes
    0: 22627
  [luderick] class_id -> boxes
    0: 125
    1: 9304


In [8]:
# Map each dataset to the class ids that count as "fish".
# - None: every class id in that dataset is kept.
# - list: only boxes whose class id is in the list are kept.
# Datasets missing from this mapping are looked up with a default of None by
# load_and_harmonize_labels(), i.e. they behave like an explicit None entry.
# Whatever is kept is later rewritten to the single class id 0.

POSITIVE_CLASS_IDS = {
    "AquaCoop": None,
    "OzFish": None,
    "deepfish": None,
    "f4k": None,
    "fishclef": None,
    "fish_416": None,
    "luderick": None,

    # aquarium classes include non-fish; keep fish + shark
    # Update these ids if your inventory step shows different numbers.
    # NOTE(review): the id -> name mapping is not visible in this notebook;
    # confirm that 0 and 4 are the fish and shark classes.
    "aquarium": [0, 4],
}


## 3) Utilities


In [9]:

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

def is_image(p: Path) -> bool:
    """Tell whether `p` has a known image extension (case-insensitive)."""
    extension = p.suffix.lower()
    return extension in IMG_EXTS

def md5_file(path: Path, chunk_size: int = 1 << 20) -> str:
    """Return the hex MD5 digest of a file, read in `chunk_size`-byte pieces."""
    digest = hashlib.md5()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: stream.read(chunk_size), b""):
            digest.update(piece)
    return digest.hexdigest()

def average_hash(img: Image.Image, hash_size: int = 8) -> str:
    """Compute an average hash (aHash) of an image as a hex string.

    The image is converted to grayscale, resized to `hash_size` x `hash_size`
    with bilinear resampling, and each pixel contributes one bit: 1 when it is
    above the mean pixel value, 0 otherwise. Bits are read row by row, most
    significant first, and zero-padded to `hash_size * hash_size // 4` hex digits.
    """
    thumbnail = img.convert("L").resize((hash_size, hash_size), Image.Resampling.BILINEAR)
    pixels = np.asarray(thumbnail, dtype=np.float32)
    above_mean = (pixels > pixels.mean()).astype(np.uint8).ravel()

    # Render the bit vector as text, then reinterpret it as a base-2 integer.
    bit_string = "".join(map(str, above_mean.tolist()))
    n_hex_digits = hash_size * hash_size // 4
    return format(int(bit_string, 2), f"0{n_hex_digits}x")

YOLO_LINE_RE = re.compile(r"^\s*(\d+)\s+([0-9.eE+-]+)\s+([0-9.eE+-]+)\s+([0-9.eE+-]+)\s+([0-9.eE+-]+)\s*$")

def parse_yolo_label_file(label_path: Path) -> List[Tuple[int, float, float, float, float]]:
    """Parse a YOLO bounding-box label file into `(cls, x, y, w, h)` tuples.

    Args:
        label_path: Path of the `.txt` label file.

    Returns:
        One tuple per non-blank line. An empty list is returned when the path
        is not a regular file (missing file, or a directory such as the
        ``Path("")`` "no label" sentinel) or when the file has no content.

    Raises:
        ValueError: If a non-blank line is not exactly an integer class id
            followed by four numeric fields.
    """
    # is_file() rather than exists(): a directory must not reach read_text().
    if not label_path.is_file():
        return []

    text = label_path.read_text(encoding="utf-8", errors="ignore")
    rows = []
    for line in text.splitlines():
        # Blank lines (including ones between boxes) carry no annotation.
        if not line.strip():
            continue
        m = YOLO_LINE_RE.match(line)
        if m is None:
            raise ValueError(f"Invalid YOLO label line: '{line}'")
        cls_token, *coord_tokens = m.groups()
        x, y, w, h = map(float, coord_tokens)
        rows.append((int(cls_token), x, y, w, h))
    return rows

def validate_yolo_rows(rows: List[Tuple[int, float, float, float, float]]) -> None:
    """Raise `ValueError` on the first row that is not a valid normalized box.

    Checks, in order: all four values are finite, the center lies in [0, 1],
    and width/height lie in (0, 1]. The class id is not checked here.
    """
    def _in_closed_unit(value: float) -> bool:
        return 0.0 <= value <= 1.0

    def _in_half_open_unit(value: float) -> bool:
        return 0.0 < value <= 1.0

    for _cls, cx, cy, bw, bh in rows:
        if not np.all(np.isfinite([cx, cy, bw, bh])):
            raise ValueError("Non-finite YOLO values")
        if not (_in_closed_unit(cx) and _in_closed_unit(cy)):
            raise ValueError("Center coords out of [0,1]")
        if not (_in_half_open_unit(bw) and _in_half_open_unit(bh)):
            raise ValueError("Width/height must be in (0,1]")

# --- Robust YOLO label sanitation (handles occasional non-normalized boxes) ---

def _normalize_rows_if_pixel(rows: List[Tuple[int, float, float, float, float]], image_path: Path) -> List[Tuple[int, float, float, float, float]]:
    """If values look like pixel units (common issue in some exports), normalize them to [0,1]."""
    if not rows:
        return rows
    max_v = max(max(abs(x), abs(y), abs(w), abs(h)) for _, x, y, w, h in rows)
    if max_v <= 1.5:
        return rows  # already normalized (or close enough)

    with Image.open(image_path) as im:
        img_w, img_h = im.size

    normed = []
    for cls, x, y, w, h in rows:
        normed.append((cls, x / img_w, y / img_h, w / img_w, h / img_h))
    return normed

def sanitize_yolo_rows(
    rows: List[Tuple[int, float, float, float, float]],
    *,
    clip: bool = True,
    eps: float = 1e-6,
) -> List[Tuple[int, float, float, float, float]]:
    """Return only the usable YOLO rows, optionally clipping them into range.

    Rows with a non-finite value are always dropped. With `clip` enabled the
    center is clamped to [0, 1] and width/height to [eps, 1]; with it disabled
    any row whose center is outside [0, 1] or whose size is outside (0, 1] is
    dropped instead. Class ids are passed through unchanged.
    """
    kept: List[Tuple[int, float, float, float, float]] = []
    for cls, x, y, w, h in rows:
        if not np.all(np.isfinite([x, y, w, h])):
            continue

        if clip:
            x, y = (float(np.clip(v, 0.0, 1.0)) for v in (x, y))
            w, h = (float(np.clip(v, eps, 1.0)) for v in (w, h))

        # Final range check; only rows that pass are kept.
        center_ok = 0.0 <= x <= 1.0 and 0.0 <= y <= 1.0
        size_ok = 0.0 < w <= 1.0 and 0.0 < h <= 1.0
        if center_ok and size_ok:
            kept.append((cls, x, y, w, h))
    return kept


## 4) Build the unified index (scan + validation + class harmonization)

The index tracks:
- `image_path`
- `label_path` (may be missing for background-only images)
- `dataset_id`
- `has_fish` (True if at least one box remains after filtering)


In [10]:
@dataclass
class Record:
    """One image of the unified index, as produced by `build_index`."""

    # Path of the image file found under the dataset's images directory.
    image_path: Path
    # Path of the matching label file, or Path("") when no label file exists.
    # NOTE: str(Path("")) is ".", so truthiness/str checks do not detect it.
    label_path: Path
    # Key of the source dataset in DATASETS.
    dataset_id: str
    # True when at least one box remains after filtering.
    has_fish: bool
    # Number of boxes that remain after filtering.
    n_boxes: int

def find_images(images_dir: Path) -> List[Path]:
    """List every image file under `images_dir` (recursively), sorted by path.

    Returns an empty list when the directory does not exist.
    """
    if not images_dir.exists():
        return []
    matches = [
        candidate
        for candidate in images_dir.rglob("*")
        if candidate.is_file() and is_image(candidate)
    ]
    matches.sort()
    return matches

def expected_label_path(labels_dir: Path, image_path: Path) -> Path:
    """Return where the label for `image_path` is expected to live.

    The label sits directly inside `labels_dir` and reuses the image's stem
    with a `.txt` extension; the image's own folder is ignored.
    """
    label_name = image_path.stem + ".txt"
    return labels_dir.joinpath(label_name)

def load_and_harmonize_labels(dataset_id: str, label_path: Path, image_path: Path) -> Tuple[List[Tuple[int,float,float,float,float]], bool]:
    """Load one label file and reduce it to single-class (`0` = fish) boxes.

    Steps: parse the YOLO file, normalize pixel-unit boxes using the image
    size, clip/drop invalid rows, keep only the class ids listed for this
    dataset in `POSITIVE_CLASS_IDS` (all ids when the entry is None or absent),
    then rewrite every remaining class id to 0.

    Args:
        dataset_id: Key used to look up the dataset's positive class ids.
        label_path: YOLO label file to read.
        image_path: Image the labels belong to (read only for its size).

    Returns:
        `(rows, has_fish)` where `has_fish` is True when at least one box
        remains.
    """
    # Parse YOLO labels
    rows = parse_yolo_label_file(label_path)

    # If values look like pixel units, normalize using image size
    rows = _normalize_rows_if_pixel(rows, image_path)

    # Clip / drop invalid rows
    rows = sanitize_yolo_rows(rows, clip=True)

    # Dataset-specific positive class filter
    allowed = POSITIVE_CLASS_IDS.get(dataset_id, None)
    if allowed is not None:
        # Build the lookup set once instead of once per row.
        allowed_ids = set(allowed)
        rows = [r for r in rows if r[0] in allowed_ids]

    # Remap all remaining boxes to class 0 (fish)
    rows = [(0, x, y, w, h) for (_, x, y, w, h) in rows]
    has_fish = len(rows) > 0
    return rows, has_fish

def build_index() -> pd.DataFrame:
    """Scan every configured dataset and build the unified image index.

    For each dataset in `DATASETS`, every image is paired with its expected
    label file. Images without a label file are kept as background images
    (no boxes, `label_path` set to `Path("")`). Datasets without images are
    skipped with a warning.

    Returns:
        DataFrame with one row per image and the `Record` fields as columns.

    Raises:
        RuntimeError: If a label file cannot be processed, or if no image was
            found in any dataset.
    """
    records: List[Record] = []

    for ds_name, cfg in DATASETS.items():
        images_dir = DATASET_ROOT / cfg["images"]
        labels_dir = DATASET_ROOT / cfg["labels"]

        imgs = find_images(images_dir)
        if not imgs:
            print(f"[WARN] No images found for '{ds_name}' in {images_dir}")
            continue
        if not labels_dir.exists():
            print(f"[WARN] Labels dir not found for '{ds_name}': {labels_dir}")

        for img_path in imgs:
            lbl_path = expected_label_path(labels_dir, img_path)

            # Defaults describe a background-only image with no label file.
            rows, has_fish = [], False
            try:
                label_found = lbl_path.exists()
                if label_found:
                    rows, has_fish = load_and_harmonize_labels(ds_name, lbl_path, img_path)
            except Exception as e:
                raise RuntimeError(
                    f"Label error in dataset '{ds_name}' for image '{img_path.name}' (label: '{lbl_path.name}'): {e}"
                ) from e

            records.append(Record(
                image_path=img_path,
                label_path=lbl_path if label_found else Path(""),
                dataset_id=ds_name,
                has_fish=has_fish,
                n_boxes=len(rows),
            ))

    df = pd.DataFrame([vars(record) for record in records])
    if df.empty:
        raise RuntimeError("Index is empty. Check DATASETS config and DATASET_ROOT.")
    return df

# Build the unified index once; later cells work from it.
index_df = build_index()
# Not the last expression of the cell, so this preview is not rendered.
index_df.head()

# merged_df is the frame used by the master index, the split and the export.
# In the cells shown it is an unmodified copy of index_df.
merged_df = index_df.copy()


# Both summaries describe the same, unfiltered index.
checkpoint(index_df, 'After build_index (pre-dedup)')
checkpoint_extended(index_df, 'After build_index (raw scan)')



=== After build_index (pre-dedup) ===
Total images: 28,765


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,14273
deepfish,4505
luderick,4276
deepfish_negatives,2012
AquaCoop,1238
f4k,794
fish_416,680
aquarium,637
OzFish,350


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,26492
False,2273



=== After build_index (raw scan) ===
Total images: 28,765

Images per dataset:


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,14273
deepfish,4505
luderick,4276
deepfish_negatives,2012
AquaCoop,1238
f4k,794
fish_416,680
aquarium,637
OzFish,350



Positive vs negative:


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,26492
False,2273



Boxes per image (summary):


Unnamed: 0,value
count,28765.0
mean,2.656527
std,4.27051
min,0.0
25%,1.0
50%,1.0
75%,3.0
max,100.0


Images with zero boxes: 2,273
Missing image files on disk: 0
Missing label files on disk: 0


## 5) Write the master index

The master index is useful for auditability and to reproduce splits without rescanning.


In [11]:

EXPORT_ROOT.mkdir(parents=True, exist_ok=True)
master_index_path = EXPORT_ROOT / "master_index.csv"

# Store paths relative to DATASET_ROOT for portability
def rel(p: Path) -> str:
    """Return `p` relative to DATASET_ROOT, or `str(p)` when that is not possible."""
    try:
        return str(Path(p).resolve().relative_to(DATASET_ROOT.resolve()))
    except (ValueError, OSError, RuntimeError, TypeError):
        # Best effort: paths outside DATASET_ROOT (or unresolvable ones) are
        # written unchanged.
        return str(p)

def _label_rel(p) -> str:
    """Return the relative label path, or "" for the "no label" sentinel."""
    # str(Path("")) is ".", which is truthy, so the sentinel must be matched
    # explicitly; otherwise it would be written as a directory path.
    return "" if str(p) in ("", ".") else rel(p)

out_df = merged_df.copy()
out_df["image_rel"] = out_df["image_path"].apply(rel)
out_df["label_rel"] = out_df["label_path"].apply(_label_rel)
out_df[["image_rel", "label_rel", "dataset_id", "has_fish", "n_boxes"]].to_csv(master_index_path, index=False)

print(f"Wrote: {master_index_path}")
out_df[["dataset_id", "has_fish"]].value_counts()


Wrote: /Users/MarcoPiccolo/Desktop/GitHub_Test/Merged/master_index.csv


dataset_id          has_fish
fishclef            True        14273
deepfish            True         4505
luderick            True         4276
deepfish_negatives  False        2012
AquaCoop            True         1238
f4k                 True          794
fish_416            True          680
aquarium            True          376
OzFish              True          350
aquarium            False         261
Name: count, dtype: int64

## 6) Stratified train/val/test split (two-step)

Stratification key = `dataset_id` + `has_fish`


In [12]:

from sklearn.model_selection import StratifiedShuffleSplit

def stratified_splits(df: pd.DataFrame, test_size: float, val_size: float, seed: int = 42):
    """Split `df` into train/val/test, stratified by dataset and fish/background.

    The split is done in two steps with `StratifiedShuffleSplit`: the test set
    is carved out first, then the validation set is taken from the remainder.
    Both `test_size` and `val_size` are fractions of the FULL dataset; the
    validation fraction is rescaled by `1 / (1 - test_size)` for the second
    step.

    Args:
        df: Index with `dataset_id` and `has_fish` columns.
        test_size: Fraction of all rows that goes to the test split, in (0, 1).
        val_size: Fraction of all rows that goes to the validation split, in (0, 1).
        seed: Random state used for both shuffles.

    Returns:
        `(train, val, test)` DataFrames, each with a fresh 0..n-1 index.

    Raises:
        ValueError: If a fraction is outside (0, 1) or the two fractions leave
            no room for a training split.
    """
    if not (0.0 < test_size < 1.0 and 0.0 < val_size < 1.0):
        raise ValueError(
            f"test_size and val_size must both be in (0, 1); got {test_size} and {val_size}"
        )
    if test_size + val_size >= 1.0:
        raise ValueError(
            f"test_size + val_size must be < 1 to leave a training split; got {test_size + val_size}"
        )

    def _strat_key(frame: pd.DataFrame) -> pd.Series:
        # One stratum per (dataset, has_fish) combination, e.g. "aquarium__1".
        return frame["dataset_id"].astype(str) + "__" + frame["has_fish"].astype(int).astype(str)

    df = df.copy().reset_index(drop=True)

    # Split out test
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    train_val_idx, test_idx = next(sss1.split(df, _strat_key(df)))
    train_val = df.iloc[train_val_idx].reset_index(drop=True)
    test = df.iloc[test_idx].reset_index(drop=True)

    # Split train/val from remaining
    val_frac_of_tv = val_size / (1.0 - test_size)
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=val_frac_of_tv, random_state=seed)
    train_idx, val_idx = next(sss2.split(train_val, _strat_key(train_val)))
    train = train_val.iloc[train_idx].reset_index(drop=True)
    val = train_val.iloc[val_idx].reset_index(drop=True)

    return train, val, test

# Split the full index; TEST_SIZE and VAL_SIZE are fractions of all images.
train_df, val_df, test_df = stratified_splits(merged_df, TEST_SIZE, VAL_SIZE, RANDOM_SEED)

print("Split sizes:")
print("  train:", len(train_df))
print("  val:  ", len(val_df))
print("  test: ", len(test_df))

# Quick checks
# The ten largest (dataset_id, has_fish) strata of each split.
display(train_df[["dataset_id", "has_fish"]].value_counts().head(10))
display(val_df[["dataset_id", "has_fish"]].value_counts().head(10))
display(test_df[["dataset_id", "has_fish"]].value_counts().head(10))

# Short summary first, then the detailed one, for each split.
checkpoint(train_df, 'Train split')
checkpoint(val_df, 'Val split')
checkpoint(test_df, 'Test split')
checkpoint_extended(train_df, 'Train split (final)')
checkpoint_extended(val_df, 'Validation split (final)')
checkpoint_extended(test_df, 'Test split (final)')


Split sizes:
  train: 20135
  val:   5753
  test:  2877


dataset_id          has_fish
fishclef            True        9991
deepfish            True        3153
luderick            True        2993
deepfish_negatives  False       1409
AquaCoop            True         866
f4k                 True         556
fish_416            True         476
aquarium            True         263
OzFish              True         245
aquarium            False        183
Name: count, dtype: int64

dataset_id          has_fish
fishclef            True        2855
deepfish            True         901
luderick            True         855
deepfish_negatives  False        402
AquaCoop            True         248
f4k                 True         159
fish_416            True         136
aquarium            True          75
OzFish              True          70
aquarium            False         52
Name: count, dtype: int64

dataset_id          has_fish
fishclef            True        1427
deepfish            True         451
luderick            True         428
deepfish_negatives  False        201
AquaCoop            True         124
f4k                 True          79
fish_416            True          68
aquarium            True          38
OzFish              True          35
aquarium            False         26
Name: count, dtype: int64


=== Train split ===
Total images: 20,135


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,9991
deepfish,3153
luderick,2993
deepfish_negatives,1409
AquaCoop,866
f4k,556
fish_416,476
aquarium,446
OzFish,245


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,18543
False,1592



=== Val split ===
Total images: 5,753


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,2855
deepfish,901
luderick,855
deepfish_negatives,402
AquaCoop,248
f4k,159
fish_416,136
aquarium,127
OzFish,70


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,5299
False,454



=== Test split ===
Total images: 2,877


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,1427
deepfish,451
luderick,428
deepfish_negatives,201
AquaCoop,124
f4k,79
fish_416,68
aquarium,64
OzFish,35


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,2650
False,227



=== Train split (final) ===
Total images: 20,135

Images per dataset:


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,9991
deepfish,3153
luderick,2993
deepfish_negatives,1409
AquaCoop,866
f4k,556
fish_416,476
aquarium,446
OzFish,245



Positive vs negative:


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,18543
False,1592



Boxes per image (summary):


Unnamed: 0,value
count,20135.0
mean,2.651502
std,4.245225
min,0.0
25%,1.0
50%,1.0
75%,3.0
max,100.0


Images with zero boxes: 1,592
Missing image files on disk: 0
Missing label files on disk: 0

=== Validation split (final) ===
Total images: 5,753

Images per dataset:


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,2855
deepfish,901
luderick,855
deepfish_negatives,402
AquaCoop,248
f4k,159
fish_416,136
aquarium,127
OzFish,70



Positive vs negative:


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,5299
False,454



Boxes per image (summary):


Unnamed: 0,value
count,5753.0
mean,2.695811
std,4.445616
min,0.0
25%,1.0
50%,1.0
75%,3.0
max,95.0


Images with zero boxes: 454
Missing image files on disk: 0
Missing label files on disk: 0

=== Test split (final) ===
Total images: 2,877

Images per dataset:


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,1427
deepfish,451
luderick,428
deepfish_negatives,201
AquaCoop,124
f4k,79
fish_416,68
aquarium,64
OzFish,35



Positive vs negative:


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,2650
False,227



Boxes per image (summary):


Unnamed: 0,value
count,2877.0
mean,2.613139
std,4.08653
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,66.0


Images with zero boxes: 227
Missing image files on disk: 0
Missing label files on disk: 0


## 8) Export final dataset structure

Output layout:

```
EXPORT_ROOT/
  train/
    images/
    labels/
  val/
    images/
    labels/
  test/
    images/
    labels/
```

Notes:
- If a source label file is missing or empty, an empty `.txt` is created.
- All boxes are written with class id `0`.


In [13]:
# merged_df is the whole index (in the cells shown, a copy of index_df), so
# this prints whole-dataset totals rather than per-split figures.
checkpoint_extended(merged_df, 'Before export (post-split aggregation)')


def write_yolo_label(rows: List[Tuple[int,float,float,float,float]], out_path: Path) -> None:
    """Write YOLO rows to `out_path`, one box per line with 6 decimal places.

    An empty (or missing) row list produces an empty file, which marks the
    image as background-only.
    """
    payload = "".join(
        f"{cls} {x:.6f} {y:.6f} {w:.6f} {h:.6f}\n" for cls, x, y, w, h in (rows or ())
    )
    out_path.write_text(payload, encoding="utf-8")

def _claim_output_stem(claimed: dict, img_src: Path, dataset_id: str) -> str:
    """Reserve an output file stem for `img_src` that no other source uses.

    The image's own stem is preferred. If another source already owns it in
    this export, the stem is prefixed with the dataset id, and numbered if that
    is taken as well. Keys are case-folded so stems that differ only by case
    do not collide on case-insensitive file systems.
    """
    base = img_src.stem
    candidate = base
    attempt = 0
    # setdefault() claims a free stem and returns the current owner otherwise.
    while claimed.setdefault(candidate.casefold(), img_src) != img_src:
        attempt += 1
        candidate = f"{dataset_id}__{base}" if attempt == 1 else f"{dataset_id}__{base}_{attempt}"
    return candidate

def export_split(df: pd.DataFrame, split_name: str) -> None:
    """Copy one split's images and write their single-class labels.

    Files go to `EXPORT_ROOT/<split_name>/images` and `.../labels`. Every image
    gets a label file with the same stem; it is empty when the image has no
    label file or no box survives filtering. Images that already exist at the
    destination are not copied again, while label files are always rewritten.

    When two sources in the same split share a file stem, the later one is
    exported under a dataset-prefixed name so that neither image ends up paired
    with the other's labels.

    Args:
        df: Split index with `image_path`, `label_path` and `dataset_id` columns.
        split_name: Name of the split folder (e.g. "train").
    """
    img_out = EXPORT_ROOT / split_name / "images"
    lbl_out = EXPORT_ROOT / split_name / "labels"
    img_out.mkdir(parents=True, exist_ok=True)
    lbl_out.mkdir(parents=True, exist_ok=True)

    # case-folded output stem -> source image that owns it within this call
    claimed = {}

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Export {split_name}"):
        img_src = Path(row["image_path"])
        lbl_src = Path(row["label_path"])
        out_stem = _claim_output_stem(claimed, img_src, str(row["dataset_id"]))

        # Copy image
        dst_img = img_out / f"{out_stem}{img_src.suffix}"
        if not dst_img.exists():
            shutil.copy2(img_src, dst_img)

        # Write label. is_file() (not truthiness or exists()) is what rejects
        # the Path("") "no label" sentinel, which resolves to a directory.
        if lbl_src.is_file():
            rows, _has_fish = load_and_harmonize_labels(row["dataset_id"], lbl_src, img_src)
        else:
            rows = []
        write_yolo_label(rows, lbl_out / f"{out_stem}.txt")

# Safety: do not delete existing exports automatically
# (files left over from an earlier run therefore stay in the export folders).
print(f"Export root: {EXPORT_ROOT}")



=== Before export (post-split aggregation) ===
Total images: 28,765

Images per dataset:


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,14273
deepfish,4505
luderick,4276
deepfish_negatives,2012
AquaCoop,1238
f4k,794
fish_416,680
aquarium,637
OzFish,350



Positive vs negative:


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,26492
False,2273



Boxes per image (summary):


Unnamed: 0,value
count,28765.0
mean,2.656527
std,4.27051
min,0.0
25%,1.0
50%,1.0
75%,3.0
max,100.0


Images with zero boxes: 2,273
Missing image files on disk: 0
Missing label files on disk: 0
Export root: /Users/MarcoPiccolo/Desktop/GitHub_Test/Merged


In [14]:
# Whole-dataset summary (merged_df is the full index, not a single split).
checkpoint_extended(merged_df, 'Before export (post-split aggregation)')


# Write each split into its own folder under EXPORT_ROOT.
export_split(train_df, "train")
export_split(val_df, "val")
export_split(test_df, "test")

print("Export complete.")



=== Before export (post-split aggregation) ===
Total images: 28,765

Images per dataset:


Unnamed: 0_level_0,n_images
dataset_id,Unnamed: 1_level_1
fishclef,14273
deepfish,4505
luderick,4276
deepfish_negatives,2012
AquaCoop,1238
f4k,794
fish_416,680
aquarium,637
OzFish,350



Positive vs negative:


Unnamed: 0_level_0,n_images
has_fish,Unnamed: 1_level_1
True,26492
False,2273



Boxes per image (summary):


Unnamed: 0,value
count,28765.0
mean,2.656527
std,4.27051
min,0.0
25%,1.0
50%,1.0
75%,3.0
max,100.0


Images with zero boxes: 2,273
Missing image files on disk: 0
Missing label files on disk: 0


Export train: 100%|██████████| 20135/20135 [00:34<00:00, 591.41it/s]
Export val: 100%|██████████| 5753/5753 [00:08<00:00, 655.91it/s]
Export test: 100%|██████████| 2877/2877 [00:04<00:00, 683.01it/s]

Export complete.



