# Dataset Audit and Export

This notebook performs a structured audit of multiple computer vision datasets and prepares them for unified downstream use.

## Purpose

The goal of this notebook is to validate dataset structure, identify inconsistencies, and export cleaned datasets in a standardized format suitable for training and evaluation.

In [1]:
import os, glob, hashlib, math, shutil
from pathlib import Path
import pandas as pd
import numpy as np
from PIL import Image

# ----------------- Configuration -----------------
ROOT = Path("/Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/Original_Datasets")  # Root folder containing all datasets

# Map dataset keys to subfolders under ROOT
SOURCE_DATASETS = {
    'luderick': 'InstanceSegmented_Luderick_dataset_yolov11',
    'fish_416': 'fish_dataset_416x416',
    'aquarium': 'Aquarium_Combined_roboflow_v6i.yolov8',
    'f4k': 'f4k_detection_tracking',
    'fishclef': 'fishclef_2015_release',
    'deepfish': 'Deepfish_Annotation',
    'AquaCoop': 'Vasconcelos_Thesis_DS/Seabass_Aquaculture_640',
    'OzFish': 'Vasconcelos_Thesis_DS/OzFish_cleaned',
    'deepfish_negatives': 'Deepfish_Annotation/Negative_samples',
}

# Datasets with sparse annotations (do not treat unlabeled frames as negatives)
SPARSE_SOURCES = {'f4k', 'fishclef'}

# Image extensions to scan (case-insensitive)
IMG_EXTS = {'.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff'}

# ----------------- Outputs -----------------
EXPORT_DIR          = Path('/Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test')
PICTURES_KEEP_CSV   = EXPORT_DIR / 'pictures_keep.csv'

PICTURES_REMOVE_CSV = EXPORT_DIR / 'pictures_remove.csv'
REVIEWED_NEG_CSV    = EXPORT_DIR / 'negatives_kept.csv'

# Preferred source order for choosing representatives when duplicates exist
PREFERRED_SOURCE_ORDER = [
    "deepfish_negatives", "deepfish", "fishclef", "f4k",
    "luderick", "fish_416", "aquarium", "AquaCoop_OzFish"
]

# Random seed for reproducibility (affects any sampling/random operations)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Also set Python's built-in random module for any operations that might use it
import random
random.seed(RANDOM_SEED)

# ----------------- Path Validation -----------------
def check_paths(root: Path, dataset_map: dict):
    """Validate ROOT and dataset folder existence."""
    print(f"[INFO] Checking dataset paths under: {root}")
    if not root.exists():
        raise FileNotFoundError(f"[ERROR] ROOT directory does not exist: {root}")

    missing = []
    for name, rel_path in dataset_map.items():
        full_path = root / rel_path
        if full_path.exists():
            print(f"{name:18s} ‚Üí OK  ({full_path})")
        else:
            print(f"{name:18s} ‚Üí MISSING  ({full_path})")
            missing.append(name)

    if missing:
        print(f"\n[WARN] Missing dataset folders: {', '.join(missing)}")
    else:
        print("\n[INFO] All dataset paths verified successfully.")

# ----------------- Main Entry -----------------
if __name__ == "__main__":
    check_paths(ROOT, SOURCE_DATASETS)


[INFO] Checking dataset paths under: /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/Original_Datasets
luderick           ‚Üí OK  (/Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/Original_Datasets/InstanceSegmented_Luderick_dataset_yolov11)
fish_416           ‚Üí OK  (/Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/Original_Datasets/fish_dataset_416x416)
aquarium           ‚Üí OK  (/Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/Original_Datasets/Aquarium_Combined_roboflow_v6i.yolov8)
f4k                ‚Üí OK  (/Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/Original_Datasets/f4k_detection_tracking)
fishclef           ‚Üí OK  (/Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/Original_Datasets/fishclef_2015_release)
deepfish           ‚Üí OK  (/Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/Original_Datasets/Deepfish_Annotation)
AquaCoop           ‚Üí OK  (/Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/Original_Datasets/Vasconcelos_Thesis_DS/Seab

## 1) Define Dataset Size

In [2]:
LABEL_EXTS = {'.txt', '.xml', '.json'}  # extend if you use other label formats

# Skip any dataset path that looks like the unified export
SKIP_KEYWORD = 'UNIFIED_DATASET_FISH'

rows = []
for name, rel in SOURCE_DATASETS.items():
    ds_path = Path(ROOT) / rel

    # Explicitly skip any dataset that points to the unified dataset export
    if SKIP_KEYWORD in str(ds_path):
        print(f"[INFO] Skipping dataset '{name}' because path contains '{SKIP_KEYWORD}': {ds_path}")
        rows.append({
            'dataset': name,
            'path': str(ds_path),
            'exists': False,
            'n_images': 0,
            'n_labels': 0,
            'n_bboxes': 0,
        })
        continue

    if not ds_path.exists():
        rows.append({
            'dataset': name,
            'path': str(ds_path),
            'exists': False,
            'n_images': 0,
            'n_labels': 0,
            'n_bboxes': 0,
        })
        continue

    n_images = 0
    n_labels = 0
    # Count of bounding boxes (total non-empty lines in .txt YOLO label files)
    n_bboxes = 0

    for p in ds_path.rglob('*'):
        if not p.is_file():
            continue
        suf = p.suffix.lower()
        if suf in IMG_EXTS:
            n_images += 1
        elif suf in LABEL_EXTS:
            n_labels += 1
            # If it's a plain YOLO .txt label file, count non-empty lines as bboxes
            if suf == '.txt':
                try:
                    with open(p, 'r', encoding='utf-8') as lf:
                        for ll in lf:
                            if ll.strip():
                                n_bboxes += 1
                except Exception:
                    # If we can't read the label file, skip counting but keep the label file count
                    pass

    rows.append({
        'dataset': name,
        'path': str(ds_path),
        'exists': True,
        'n_images': n_images,
        'n_labels': n_labels,
        'n_bboxes': n_bboxes,
    })

# Build DataFrame with only the columns required by the pipeline
df_sizes = pd.DataFrame(rows)
# Include n_bboxes in display columns
display_cols = ['dataset', 'path', 'exists', 'n_images', 'n_labels', 'n_bboxes']
if not df_sizes.empty:
    display(df_sizes[display_cols].sort_values('n_images', ascending=False).reset_index(drop=True))

# Save to EXPORT_DIR if present, else save locally
try:
    out_dir = Path(EXPORT_DIR)
except NameError:
    out_dir = None

out_csv = Path('dataset_sizes.csv') if out_dir is None else (out_dir / 'dataset_sizes.csv')
if out_dir is not None:
    out_dir.mkdir(parents=True, exist_ok=True)

# Write the CSV with the reduced columns so it integrates with the pipeline
# Ensure n_bboxes is included in the written CSV
cols_to_write = [c for c in display_cols if c in df_sizes.columns]
df_sizes.to_csv(out_csv, index=False, columns=cols_to_write)
print(f"Wrote dataset sizes to: {out_csv}")

Unnamed: 0,dataset,path,exists,n_images,n_labels,n_bboxes
0,f4k,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,77235,917,3460
1,fishclef,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,53196,14809,23294
2,deepfish,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,6517,6518,15464
3,luderick,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,4276,8554,18881
4,deepfish_negatives,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,2012,2012,0
5,fish_416,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,1350,1352,3183
6,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,1250,1250,13840
7,aquarium,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,638,640,4854
8,OzFish,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,350,350,7540


Wrote dataset sizes to: /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/dataset_sizes.csv


## 2) Helper functions

In [3]:
def _suffix_ok(path: Path):
    try:
        return path.suffix.lower() in IMG_EXTS
    except Exception:
        return False

def iter_images(root: Path):
    # Recursive scan with case-insensitive filter by suffix
    files = [Path(p) for p in glob.glob(str(root / '**' / '*'), recursive=True)]
    # Sort for deterministic order across different systems/filesystems
    return sorted([str(p) for p in files if p.is_file() and _suffix_ok(p)])

def safe_open_image(p: Path):
    """Return (width, height, format) or (None, None, None) on failure."""
    try:
        with Image.open(p) as im:
            im.verify()   # quick structural check
        with Image.open(p) as im2:
            im2.load()    # force decode
            return im2.width, im2.height, im2.format
    except Exception:
        return None, None, None

def find_label_for(img_path: Path, dataset_key: str, dataset_root: Path):
    """Find YOLO .txt label path for an image."""
    # DeepFish variants: labels next to images
    if dataset_key in ['deepfish', 'deepfish_negatives']:
        return img_path.with_suffix('.txt')

    # Typical YOLO trees
    try:
        rel = img_path.relative_to(dataset_root)
    except Exception:
        rel = img_path.name

    candidates = []
    for labdir in ['labels', 'labels_bbox', 'annotations']:
        parts = list(rel.parts) if isinstance(rel, Path) else [rel]
        if isinstance(rel, Path) and 'images' in parts:
            idx = parts.index('images')
            parts[idx] = labdir
            label_rel = Path(*parts).with_suffix('.txt')
        else:
            label_rel = Path(labdir) / (Path(img_path.stem).name + '.txt')
        candidates.append(dataset_root / label_rel)

    for c in candidates:
        if c.exists():
            return c
    return None

def parse_yolo_label(path: Path):
    """Return (n_lines, n_valid, n_bad, n_oob_or_zero_area)."""
    n_lines = n_valid = n_bad = n_oob_za = 0
    if path is None or not path.exists():
        return 0, 0, 0, 0
    try:
        with open(path, 'r') as f:
            for line in f:
                s = line.strip().split()
                if len(s) != 5:
                    n_bad += 1; n_lines += 1; continue
                try:
                    _cid = int(float(s[0])); x, y, w, h = map(float, s[1:])
                except Exception:
                    n_bad += 1; n_lines += 1; continue
                if (0 <= x <= 1) and (0 <= y <= 1) and (0 < w <= 1) and (0 < h <= 1):
                    n_valid += 1
                else:
                    n_oob_za += 1
                n_lines += 1
    except Exception:
        # Treat unreadable label as 1 bad line so it can be flagged upstream if needed
        return 1, 0, 1, 0
    return n_lines, n_valid, n_bad, n_oob_za

def average_hash_hex(img_path: Path, hash_size=8):
    try:
        with Image.open(img_path) as im:
            im = im.convert('L').resize((hash_size, hash_size), Image.BILINEAR)
            px = np.asarray(im, dtype=np.float32)
            m = px.mean()
            bits = (px > m).astype(np.uint8).flatten()
            value = 0
            for b in bits:
                value = (value << 1) | int(b)
            return f"{value:0{hash_size*hash_size//4}x}"
    except Exception:
        return None

def file_md5(path: Path):
    try:
        h = hashlib.md5()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                h.update(chunk)
        return h.hexdigest()
    except Exception:
        return None

def normalized_label_md5(p: Path):
    try:
        lines = []
        with open(p, 'r') as f:
            for line in f:
                s = ' '.join(line.strip().split())
                if s:
                    lines.append(s)
        lines = sorted(lines)
        blob = '\n'.join(lines).encode('utf-8')
        return hashlib.md5(blob).hexdigest()
    except Exception:
        return None

def is_negatives_subpath(path: Path):
    """Heuristics to recognize a 'negatives' subfolder name in path parts."""
    return any('negative' in part.lower() for part in path.parts)

def safe_exists(p):
    try:
        return Path(str(p)).exists()
    except Exception:
        return False


## 3) Audit datasets

This step scans each dataset for images, locates YOLO label files, validates label lines, and assigns a `status`.

**Statuses**
- `ok` ‚Äî image readable; label either not required (negatives/sparse unlabeled) or has valid boxes
- `image_corrupted` ‚Äî image cannot be opened/decoded
- `label_missing_unexpected` ‚Äî label is expected but missing (not sparse source)
- `sparse_unlabeled_frame` ‚Äî sparse dataset frame without label (excluded from training; not a negative)
- `label_all_invalid` ‚Äî label exists but has no valid boxes


In [4]:
SCHEMA = {
    'source': None, 'status': None,
    'image_path': None, 'label_path': None,
    'label_exists': False,
    'n_label_lines': 0, 'n_valid_boxes': 0, 'n_bad_format': 0, 'n_oob_or_zero_area': 0,
    'img_width': None, 'img_height': None, 'img_format': None,
    'expected_labels': None, 'sparse_source': None
}

records = []

# Process datasets in sorted order for reproducibility
for key, sub in sorted(SOURCE_DATASETS.items()):
    dataset_root = ROOT / sub
    expected = (key != 'deepfish_negatives')
    sparse = key in SPARSE_SOURCES

    if not dataset_root.exists():
        rec = SCHEMA.copy()
        rec.update({'source': key, 'status': 'dataset_missing',
                    'expected_labels': expected, 'sparse_source': sparse})
        records.append(rec)
        print(f'[WARN] Dataset missing: {dataset_root}')
        continue

    imgs = iter_images(dataset_root)
    if len(imgs) == 0:
        rec = SCHEMA.copy()
        rec.update({'source': key, 'status': 'no_images_found',
                    'expected_labels': expected, 'sparse_source': sparse})
        records.append(rec)
        print(f'[WARN] No images found under: {dataset_root}')
        continue

    print(f'[INFO] Auditing {key} ({len(imgs)} files) ...')
    for ip in imgs:
        ipath = Path(ip)
        # Avoid overlap: don't let 'deepfish' ingest the negatives subfolder
        if key == 'deepfish' and is_negatives_subpath(ipath):
            continue

        w, h, fmt = safe_open_image(ipath)
        lp = find_label_for(ipath, key, dataset_root)
        label_exists = bool(lp and lp.exists())
        n_lines, n_valid, n_bad, n_oob = parse_yolo_label(lp) if label_exists else (0,0,0,0)

        status = 'ok'
        if w is None:
            status = 'image_corrupted'
        elif expected and not label_exists and not sparse:
            status = 'label_missing_unexpected'
        elif expected and sparse and not label_exists:
            status = 'sparse_unlabeled_frame'
        elif label_exists and n_lines > 0 and n_valid == 0:
            status = 'label_all_invalid'

        rec = SCHEMA.copy()
        rec.update({'source': key, 'status': status,
                    'image_path': str(ipath), 'label_path': str(lp) if lp else None,
                    'label_exists': label_exists,
                    'n_label_lines': n_lines, 'n_valid_boxes': n_valid,
                    'n_bad_format': n_bad, 'n_oob_or_zero_area': n_oob,
                    'img_width': w, 'img_height': h, 'img_format': fmt,
                    'expected_labels': expected, 'sparse_source': sparse})
        records.append(rec)

audit_df = pd.DataFrame.from_records(records, columns=list(SCHEMA.keys()))
print(f"[OK] Audit complete. Rows = {len(audit_df)}")
display(audit_df.head())

[INFO] Auditing AquaCoop (1250 files) ...
[INFO] Auditing OzFish (350 files) ...
[INFO] Auditing aquarium (638 files) ...
[INFO] Auditing deepfish (6517 files) ...
[INFO] Auditing deepfish_negatives (2012 files) ...
[INFO] Auditing f4k (77235 files) ...
[INFO] Auditing fish_416 (1350 files) ...
[INFO] Auditing fishclef (53196 files) ...
[INFO] Auditing luderick (4276 files) ...
[OK] Audit complete. Rows = 144812


Unnamed: 0,source,status,image_path,label_path,label_exists,n_label_lines,n_valid_boxes,n_bad_format,n_oob_or_zero_area,img_width,img_height,img_format,expected_labels,sparse_source
0,AquaCoop,ok,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,1,1,0,0,640.0,640.0,JPEG,True,False
1,AquaCoop,ok,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,7,7,0,0,640.0,640.0,JPEG,True,False
2,AquaCoop,ok,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,8,8,0,0,640.0,640.0,JPEG,True,False
3,AquaCoop,ok,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,3,3,0,0,640.0,640.0,JPEG,True,False
4,AquaCoop,ok,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,True,11,11,0,0,640.0,640.0,JPEG,True,False



## 4) Deduplication (avoid overlap, dedup by path and content)

We now build a consolidated list of images across sources, avoid the negatives overlap in `deepfish`,
and deduplicate by:
- **Absolute path** using a preferred source order
- **Content** using **aHash + MD5** (exact duplicates)


In [5]:
# Build scan list
rows = []
# Process datasets in sorted order for reproducibility
for key, sub in sorted(SOURCE_DATASETS.items()):
    dataset_root = ROOT / sub
    if not dataset_root.exists():
        continue
    imgs = iter_images(dataset_root)
    for ip in imgs:
        ipath = Path(ip)
        # Avoid overlap from DeepFish: skip negatives under the main deepfish key
        if key == 'deepfish' and is_negatives_subpath(ipath):
            continue
        lp = find_label_for(ipath, key, dataset_root)
        rows.append({'source': key, 'image_path': str(ipath), 'label_path': str(lp) if lp else None})

scan_df = pd.DataFrame(rows)
if scan_df.empty:
    raise SystemExit("No images found. Check ROOT/SOURCE_DATASETS paths.")

# Keep only existing image files
scan_df['image_path'] = scan_df['image_path'].astype(str)
scan_df = scan_df[scan_df['image_path'].apply(lambda p: Path(p).exists())].copy()

# (A) Deduplicate by absolute path with preferred source order
pr_map = {s: i for i, s in enumerate(PREFERRED_SOURCE_ORDER)}
scan_df['source_priority'] = scan_df['source'].map(pr_map).fillna(len(PREFERRED_SOURCE_ORDER)).astype(int)

dedup_by_path = (
    scan_df.sort_values(['image_path', 'source_priority'])
           .drop_duplicates(subset=['image_path'], keep='first')
           .drop(columns=['source_priority'])
           .copy()
)
dedup_by_path['keep_reason'] = 'unique_or_preferred_source_by_path'

# (B) Deduplicate by content: compute hashes
print('[INFO] Computing image hashes (aHash + MD5) ...')
dedup_by_path['ahash']   = dedup_by_path['image_path'].apply(lambda p: average_hash_hex(Path(p)))
dedup_by_path['img_md5'] = dedup_by_path['image_path'].apply(lambda p: file_md5(Path(p)))

# Identify exact duplicate content clusters
dups_mask = dedup_by_path['ahash'].notna() & dedup_by_path['img_md5'].notna()
img_dups = (
    dedup_by_path[dups_mask]
    .groupby(['ahash','img_md5'], dropna=True)
    .filter(lambda g: len(g) > 1)
    .sort_values(['ahash','img_md5','source','image_path'])
)

print(f"[INFO] Exact duplicate image rows = {len(img_dups)}" )

def _priority(row):
    return (pr_map.get(row.get('source'), len(PREFERRED_SOURCE_ORDER)),
            len(str(row.get('image_path',''))),
            str(row.get('image_path','')))

keep_rows = []
# Process groups in sorted order for reproducibility
for (ah, md5), g in sorted(dedup_by_path.groupby(['ahash','img_md5'], dropna=True)):
    g = g.copy()
    if len(g) == 1:
        g['keep_reason'] = 'unique_content'
        keep_rows.append(g.iloc[0])
    else:
        best_idx = min(g.index, key=lambda i: _priority(g.loc[i]))
        g.loc[best_idx, 'keep_reason'] = f'content_cluster_keeper:{str(ah)[:6]}::{str(md5)[:8]}'
        keep_rows.append(g.loc[best_idx])

# Rows with missing hashes (keep as-is, but mark reason)
nan_hash = dedup_by_path[dedup_by_path['ahash'].isna() | dedup_by_path['img_md5'].isna()].copy()
if not nan_hash.empty:
    nan_hash['keep_reason'] = 'hash_missing_keep'

final_keep_dedup = pd.concat([pd.DataFrame(keep_rows), nan_hash], ignore_index=True)
final_keep_dedup = final_keep_dedup.drop_duplicates(subset=['image_path']).sort_values(['source','image_path']).reset_index(drop=True)

print(f"[OK] Dedup done. Rows = {len(final_keep_dedup)}")
display(final_keep_dedup.head())

[INFO] Computing image hashes (aHash + MD5) ...
[INFO] Exact duplicate image rows = 29749
[OK] Dedup done. Rows = 129388


Unnamed: 0,source,image_path,label_path,keep_reason,ahash,img_md5
0,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffff1f00000000,58d7f7a4f7e3ce167487285d15d6cdc8
1,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffff7c00000000,29f76e7dce1d0408075e6579b7159880
2,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffff7f00000000,fce35dcc67b6519f6f313916e8e3192a
3,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffffff00000000,0ef4734bb5cfb59f5df9d576cb918cfb
4,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffff1f00000000,39096b6380bab777cf6708b19a521434



## 5) Build **supervised-ready** keep/remove lists

Merge with audit statuses and filter:
- **Keep**:
  - Positives: `status == "ok"` and `n_valid_boxes > 0`
  - Negatives: images from the designated **negatives** dataset (`deepfish_negatives`) and (optionally) from `negatives_kept.csv`
- **Remove**: everything else, including **bad-for-training**:
  - `sparse_unlabeled_frame`, `image_corrupted`, `label_missing_unexpected`, `label_all_invalid`


In [6]:

# Join deduped list to audit for statuses and label counts
audit_df['image_path'] = audit_df['image_path'].astype(str)
final_keep_dedup['image_path'] = final_keep_dedup['image_path'].astype(str)

fk_aud = final_keep_dedup.merge(
    audit_df[['image_path','source','status','n_valid_boxes','expected_labels']],
    on=['image_path','source'],
    how='left'
)

bad_status = {'sparse_unlabeled_frame','image_corrupted','label_missing_unexpected','label_all_invalid'}

# Positives (with valid boxes)
pos_mask = (fk_aud['status'] == 'ok') & (fk_aud.get('n_valid_boxes', 0).fillna(0) > 0)

# Negatives: from explicit negatives dataset + optionally reviewed negatives list
neg_mask = (fk_aud['source'] == 'deepfish_negatives')
if REVIEWED_NEG_CSV.exists():
    kept_negs = pd.read_csv(REVIEWED_NEG_CSV)
    kept_negs['image_path'] = kept_negs['image_path'].astype(str)
    fk_aud['is_kept_review_neg'] = fk_aud['image_path'].isin(kept_negs['image_path'])
    neg_mask = neg_mask | fk_aud['is_kept_review_neg']

not_bad = ~fk_aud['status'].isin(bad_status)

supervised_keep = fk_aud[(not_bad) & (pos_mask | neg_mask)].copy()
supervised_keep = supervised_keep.reset_index(drop=True)

# Everything else is remove
supervised_remove = fk_aud[~fk_aud.index.isin(supervised_keep.index)].copy().reset_index(drop=True)

# Save exactly two CSV files (as requested)
# Ensure export directory exists before saving CSV files
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

(supervised_keep[['source','image_path','label_path','status','n_valid_boxes','keep_reason']]
 .sort_values(['source','image_path'])
 .to_csv(PICTURES_KEEP_CSV, index=False))

(supervised_remove[['source','image_path','label_path','status','n_valid_boxes','keep_reason']]
 .sort_values(['source','image_path'])
 .to_csv(PICTURES_REMOVE_CSV, index=False))

# User-facing summary
pos_kept = int(((supervised_keep['status'] == 'ok') & (supervised_keep.get('n_valid_boxes', 0).fillna(0) > 0)).sum())
neg_kept = int((supervised_keep['source'] == 'deepfish_negatives').sum() + supervised_keep.get('is_kept_review_neg', pd.Series(False, index=supervised_keep.index)).sum())
excluded_bad = int((fk_aud['status'].isin(bad_status)).sum())

print('[OK] Wrote:', PICTURES_KEEP_CSV)
print('[OK] Wrote:', PICTURES_REMOVE_CSV)
print('\nSummary:')
print(' Positives kept:', pos_kept)
print(' Negatives kept:', neg_kept)
print(' Excluded (bad statuses):', excluded_bad)

display(supervised_keep.head())
display(supervised_remove.head())

[OK] Wrote: /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/pictures_keep.csv
[OK] Wrote: /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/pictures_remove.csv

Summary:
 Positives kept: 26753
 Negatives kept: 2012
 Excluded (bad statuses): 100622


Unnamed: 0,source,image_path,label_path,keep_reason,ahash,img_md5,status,n_valid_boxes,expected_labels
0,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffff1f00000000,58d7f7a4f7e3ce167487285d15d6cdc8,ok,1,True
1,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffff7c00000000,29f76e7dce1d0408075e6579b7159880,ok,7,True
2,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffff7f00000000,fce35dcc67b6519f6f313916e8e3192a,ok,8,True
3,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffffff00000000,0ef4734bb5cfb59f5df9d576cb918cfb,ok,3,True
4,AquaCoop,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,unique_content,ffffff1f00000000,39096b6380bab777cf6708b19a521434,ok,11,True


Unnamed: 0,source,image_path,label_path,keep_reason,ahash,img_md5,status,n_valid_boxes,expected_labels
0,f4k,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,,content_cluster_keeper:7f7f3f::35cb51d8,7f7f3f3e1e0e0c08,35cb51d82f971b5cd276dc8f34d2e4d3,sparse_unlabeled_frame,0,True
1,f4k,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,,unique_content,7f7f3f3e1e0e0c08,f54e08b0d2c475b64e40d476adaf2451,sparse_unlabeled_frame,0,True
2,f4k,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,,content_cluster_keeper:7f7f3f::408fb950,7f7f3f3f1e0e0c08,408fb9505db9bf0cb8b9a7129b6861f7,sparse_unlabeled_frame,0,True
3,f4k,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,,unique_content,7f7f3f1e1e0e0828,ff5c8dddd0486fa11f7e376d0aa61d91,sparse_unlabeled_frame,0,True
4,f4k,/Users/Marco/Desktop/SmartFISHER/SmartFISHER_D...,,content_cluster_keeper:7f7f3f::f58ef3dd,7f7f3f1e1e0e0808,f58ef3dd3fad46b3309506d51c8f95af,sparse_unlabeled_frame,0,True


## 6) Export **supervised-ready** dataset (images + labels to keep)

This will create the folder defined in `EXPORT_DIR` and copy:
- All **kept** images organized by dataset with sequential naming (datasetname_00001.jpg, etc.)
- Their YOLO **labels** when available (for negatives, labels may not exist by design)
- **data.yaml** files from each dataset to preserve class definitions and configurations
- **Negative datasets** are placed under `negatives/` subfolder structure
- A **consolidated data.yaml** file is created at the root level combining all class information

**Export structure:**
```
supervised_ready_dataset/
‚îú‚îÄ‚îÄ data.yaml                           # Consolidated dataset config
‚îú‚îÄ‚îÄ luderick/
‚îÇ   ‚îú‚îÄ‚îÄ images/
‚îÇ   ‚îÇ   ‚îú‚îÄ‚îÄ luderick_00001.jpg
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ luderick_00002.jpg
‚îÇ   ‚îú‚îÄ‚îÄ labels/
‚îÇ   ‚îÇ   ‚îú‚îÄ‚îÄ luderick_00001.txt
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ luderick_00002.txt
‚îÇ   ‚îî‚îÄ‚îÄ data.yaml                       # Original dataset config
‚îú‚îÄ‚îÄ fish_416/
‚îÇ   ‚îú‚îÄ‚îÄ images/
‚îÇ   ‚îÇ   ‚îú‚îÄ‚îÄ fish_416_00001.jpg
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ fish_416_00002.jpg
‚îÇ   ‚îú‚îÄ‚îÄ labels/
‚îÇ   ‚îÇ   ‚îú‚îÄ‚îÄ fish_416_00001.txt
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ fish_416_00002.txt
‚îÇ   ‚îî‚îÄ‚îÄ data.yaml                       # Original dataset config
‚îî‚îÄ‚îÄ negatives/
    ‚îî‚îÄ‚îÄ deepfish_negatives/
        ‚îú‚îÄ‚îÄ images/
        ‚îÇ   ‚îú‚îÄ‚îÄ deepfish_negatives_00001.jpg
        ‚îÇ   ‚îî‚îÄ‚îÄ deepfish_negatives_00002.jpg
        ‚îú‚îÄ‚îÄ labels/
        ‚îÇ   ‚îî‚îÄ‚îÄ deepfish_negatives_00001.txt
        ‚îî‚îÄ‚îÄ data.yaml                   # Original dataset config (if exists)
```

In [7]:
# =============================================================================
# Export Dataset: Create per-dataset folders with images/, labels/, data.yaml
# =============================================================================

# Initialize export directory
EXPORT_DIR.mkdir(parents=True, exist_ok=True)
if any(EXPORT_DIR.iterdir()):
    print(f'[INFO] Export directory exists: {EXPORT_DIR}.\n'
          f'       Files may be overwritten.')

def copy_file_safe(src: Path, dst: Path) -> bool:
    """Safely copy file with error handling and directory creation."""
    try:
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dst)
        return True
    except Exception as e:
        print(f'[WARN] Failed to copy {src} -> {dst}: {e}')
        return False

def copy_dataset_yaml(source: str, base_dir: Path) -> bool:
    """Copy or create dataset YAML file. For Deepfish, convert .txt classes to YAML."""
    original_root = ROOT / SOURCE_DATASETS.get(source, '')

    # Try to find existing YAML files
    yaml_candidates = ['data.yaml', 'data.yml', 'dataset.yaml', 'dataset.yml']
    for yaml_name in yaml_candidates:
        yaml_path = original_root / yaml_name
        if yaml_path.exists():
            dst_path = base_dir / 'data.yaml'  # Standardize to data.yaml
            if copy_file_safe(yaml_path, dst_path):
                print(f'[INFO] Copied {yaml_name} for {source}')
                return True
            # If copy failed, try next candidate instead of breaking
            continue

    # Special handling for Deepfish datasets - look for .txt files with class names
    if 'deepfish' in source.lower():
        # Common class file names in Deepfish datasets
        class_file_candidates = [
            'classes.txt', 'class_names.txt', 'labels.txt',
            'object_classes.txt', 'categories.txt'
        ]
        for class_filename in class_file_candidates:
            class_file_path = original_root / class_filename
            if class_file_path.exists():
                try:
                    # Read class names from .txt file
                    with open(class_file_path, 'r', encoding='utf-8') as f:
                        class_names = [line.strip() for line in f if line.strip()]

                    if class_names:
                        # Create data.yaml content
                        yaml_lines = [
                            f"# Dataset configuration for {source}",
                            f"# Generated from {class_filename}",
                            f"path: {base_dir}",
                            "train: images",
                            "val: images  # Configure train/val split as needed",
                            "test: ''",
                            "",
                            "# Class names (adjust IDs if needed)",
                            "names:",
                        ]
                        for i, class_name in enumerate(class_names):
                            yaml_lines.append(f"  {i}: {class_name}")
                        yaml_lines.append(f"\nnc: {len(class_names)}  # Number of classes\n")

                        dst_path = base_dir / 'data.yaml'
                        dst_path.parent.mkdir(parents=True, exist_ok=True)
                        with open(dst_path, 'w', encoding='utf-8') as f:
                            f.write('\n'.join(yaml_lines))

                        print(f'[INFO] Created data.yaml for {source} from {class_filename} '
                              f'({len(class_names)} classes)')
                        return True

                except Exception as e:
                    print(f'[WARN] Could not process {class_file_path}: {e}')
                    continue

    return False

# Process each unique dataset source
stats = {'images': 0, 'labels': 0, 'yamls': 0}
dataset_counters = {}

for source in supervised_keep['source'].unique():
    dataset_rows = supervised_keep[supervised_keep['source'] == source].copy()
    dataset_counters[source] = 0

    # Determine dataset type and create appropriate directory structure
    is_negative = (
        source == 'deepfish_negatives'
        or dataset_rows.get('is_kept_review_neg',
                            pd.Series(False, index=dataset_rows.index)).any()
    )

    # Create base directory path (negatives go under negatives/ subfolder)
    base_dir = EXPORT_DIR / ('negatives' / Path(source) if is_negative else Path(source))
    images_dir = base_dir / 'images'
    labels_dir = base_dir / 'labels'

    # Create directory structure
    images_dir.mkdir(parents=True, exist_ok=True)
    labels_dir.mkdir(parents=True, exist_ok=True)

    # Copy dataset configuration file (best-effort)
    if copy_dataset_yaml(source, base_dir):
        stats['yamls'] += 1

    print(f'[INFO] Processing {source}: {len(dataset_rows)} files -> {base_dir}')

    # Process all files for this dataset
    for _, row in dataset_rows.iterrows():
        image_path = Path(row['image_path'])
        if not image_path.exists():
            continue

        # Generate sequential filename
        dataset_counters[source] += 1
        file_number = f"{dataset_counters[source]:05d}"
        img_ext = image_path.suffix.lower()

        new_img_name = f"{source}_{file_number}{img_ext}"
        new_lbl_name = f"{source}_{file_number}.txt"

        # Copy image file
        if copy_file_safe(image_path, images_dir / new_img_name):
            stats['images'] += 1

        # Copy label file if it exists
        label_path = row.get('label_path')
        if isinstance(label_path, str) and label_path.strip():
            label_file = Path(label_path)
            if label_file.exists():
                if copy_file_safe(label_file, labels_dir / new_lbl_name):
                    stats['labels'] += 1

# Create consolidated dataset configuration
def create_consolidated_yaml() -> bool:
    """Create a consolidated data.yaml combining all dataset classes."""
    try:
        import yaml

        consolidated_config = {
            'path': str(EXPORT_DIR),
            'train': '',  # To be configured by user based on their training setup
            'val': '',    # To be configured by user based on their validation setup
            'test': '',   # Optional test set path
            'names': {},
            'nc': 0,
        }

        # Collect class information from all datasets (excluding negatives)
        all_classes = {}
        class_counter = 0

        for source in supervised_keep['source'].unique():
            if source == 'deepfish_negatives':
                continue

            # Figure out where the exported YAML would be
            is_negative = (
                source == 'deepfish_negatives'
                or supervised_keep[supervised_keep['source'] == source]
                   .get('is_kept_review_neg', pd.Series(False)).any()
            )
            export_yaml_path = (
                EXPORT_DIR / 'negatives' / source / 'data.yaml'
                if is_negative else EXPORT_DIR / source / 'data.yaml'
            )

            # Try exported data.yaml first, then fall back to original dataset
            yaml_sources = [export_yaml_path]
            dataset_root = ROOT / SOURCE_DATASETS.get(source, '')
            for yaml_name in ['data.yaml', 'data.yml', 'dataset.yaml', 'dataset.yml']:
                yaml_sources.append(dataset_root / yaml_name)

            for yaml_path in yaml_sources:
                if yaml_path.exists():
                    try:
                        with open(yaml_path, 'r', encoding='utf-8') as f:
                            dataset_config = yaml.safe_load(f) or {}
                        names = dataset_config.get('names')
                        if names:
                            # Handle both dict and list formats
                            if isinstance(names, dict):
                                class_list = list(names.values())
                            else:
                                class_list = list(names)
                            for class_name in class_list:
                                # Prefix with source to avoid conflicts
                                prefixed_name = f"{source}_{class_name}"
                                all_classes[class_counter] = prefixed_name
                                class_counter += 1
                        break
                    except Exception as e:
                        print(f'[WARN] Could not parse {yaml_path}: {e}')
                        continue

        # Use default class if no classes found
        if not all_classes:
            all_classes = {0: 'fish'}

        consolidated_config['names'] = all_classes
        consolidated_config['nc'] = len(all_classes)

        # Write consolidated configuration
        consolidated_path = EXPORT_DIR / 'data.yaml'
        with open(consolidated_path, 'w', encoding='utf-8') as f:
            yaml.dump(consolidated_config, f, default_flow_style=False, sort_keys=False)

        print(f'[INFO] Created consolidated data.yaml with {len(all_classes)} classes')
        return True

    except ImportError:
        print('[WARN] PyYAML not available - install with: pip install pyyaml')
        return False
    except Exception as e:
        print(f'[WARN] Could not create consolidated data.yaml: {e}')
        return False

# Generate consolidated configuration
if create_consolidated_yaml():
    stats['yamls'] += 1

# Final summary
print('\n[SUCCESS] Export completed successfully!')
print(f'  Images copied: {stats["images"]}')
print(f'  Labels copied: {stats["labels"]}')
print(f'  YAML configs:  {stats["yamls"]}')
print(f'  Dataset file counts: {dict(dataset_counters)}')
print(f'  Export location: {EXPORT_DIR}')

[INFO] Export directory exists: /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test.
       Files may be overwritten.
[INFO] Copied data.yaml for AquaCoop
[INFO] Processing AquaCoop: 1238 files -> /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/AquaCoop
[INFO] Copied data.yaml for OzFish
[INFO] Processing OzFish: 350 files -> /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/OzFish
[INFO] Copied data.yaml for aquarium
[INFO] Processing aquarium: 637 files -> /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/aquarium
[INFO] Created data.yaml for deepfish from classes.txt (1 classes)
[INFO] Processing deepfish: 4505 files -> /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/deepfish
[INFO] Processing deepfish_negatives: 2012 files -> /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/negatives/deepfish_negatives
[INFO] Processing f4k: 794 files -> /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/

## 7) Notes & Tips

**Configuration:**
- **Adjust `SOURCE_DATASETS`**: Ensure all dataset paths are correct and the negatives dataset key matches your structure
- **Sparse sources** (`SPARSE_SOURCES`): Unlabeled frames are excluded from training (not treated as negatives)

**Label discovery:**
- For non-standard layouts, extend the search paths in `find_label_for()` function
- DeepFish variants expect labels next to images; other datasets use typical YOLO structure

**Deduplication strategy:**
- **Path-based**: Uses preferred source order for identical file paths
- **Content-based**: Uses aHash + MD5 for exact duplicate detection
- For near-duplicates, consider adding Hamming distance threshold on aHash

**Export structure:**
- **Per-dataset organization**: Each dataset gets its own folder with `images/`, `labels/`, and `data.yaml`
- **Sequential naming**: Files renamed as `datasetname_00001.jpg`, `datasetname_00002.jpg`, etc.
- **Negatives separation**: Negative datasets placed under `negatives/` subfolder
- **Consolidated config**: Root-level `data.yaml` merges all dataset classes with source prefixes

**Post-processing recommendations:**
- Review the consolidated `data.yaml` and adjust train/val/test paths as needed
- Consider splitting datasets into train/validation sets based on your requirements
- Verify class mappings and remove source prefixes if desired for cleaner class names

## 8) Dataset Size Comparison: Before vs After Cleaning

This section compares the original dataset sizes (before audit and cleaning) with the final exported dataset sizes (after deduplication, filtering, and export).

In [8]:
# ================================================================================
# Dataset Size Analysis: Before (Original) vs After (Exported/Cleaned)
# ================================================================================

print("=" * 80)
print("DATASET SIZE COMPARISON: BEFORE vs AFTER CLEANING (PER DATASET)")
print("=" * 80)

# ---------- BEFORE: Original Dataset Sizes ----------
if 'df_sizes' not in globals():
    print("‚ö†Ô∏è  df_sizes not found. Please run cell 2 first.")
else:
    df_before = df_sizes[['dataset', 'n_images', 'n_labels', 'n_bboxes']].copy()
    df_before.columns = ['dataset', 'before_images', 'before_labels', 'before_bboxes']
    
    # ---------- AFTER: Exported Dataset Sizes ----------
    export_rows = []
    export_root = Path(EXPORT_DIR)
    
    if not export_root.exists():
        print(f"‚ö†Ô∏è  Export directory does not exist: {export_root}")
        print("Please run cell 6 (Export) first.")
    else:
        # Scan exported datasets
        for child in sorted(export_root.iterdir()):
            if child.name.startswith('.') or child.is_file():
                continue
                
            if child.name == 'negatives':
                # Handle negatives subfolder
                for neg_dataset in sorted(child.iterdir()):
                    if not neg_dataset.is_dir():
                        continue
                    images_dir = neg_dataset / 'images'
                    labels_dir = neg_dataset / 'labels'
                    
                    n_imgs = len(list(images_dir.glob('*'))) if images_dir.exists() else 0
                    n_lbls = len(list(labels_dir.glob('*.txt'))) if labels_dir.exists() else 0
                    n_boxes = 0
                    
                    if labels_dir.exists():
                        for lbl_file in labels_dir.glob('*.txt'):
                            try:
                                with open(lbl_file, 'r', encoding='utf-8') as f:
                                    n_boxes += sum(1 for line in f if line.strip())
                            except Exception:
                                pass
                    
                    # Map back to original dataset name
                    original_name = neg_dataset.name.replace('negatives/', '')
                    export_rows.append({
                        'dataset': original_name,
                        'after_images': n_imgs,
                        'after_labels': n_lbls,
                        'after_bboxes': n_boxes
                    })
            else:
                # Regular dataset
                images_dir = child / 'images'
                labels_dir = child / 'labels'
                
                n_imgs = len(list(images_dir.glob('*'))) if images_dir.exists() else 0
                n_lbls = len(list(labels_dir.glob('*.txt'))) if labels_dir.exists() else 0
                n_boxes = 0
                
                if labels_dir.exists():
                    for lbl_file in labels_dir.glob('*.txt'):
                        try:
                            with open(lbl_file, 'r', encoding='utf-8') as f:
                                n_boxes += sum(1 for line in f if line.strip())
                        except Exception:
                            pass
                
                export_rows.append({
                    'dataset': child.name,
                    'after_images': n_imgs,
                    'after_labels': n_lbls,
                    'after_bboxes': n_boxes
                })
        
        df_after = pd.DataFrame(export_rows)
        
        if not df_after.empty:
            # Merge before and after dataframes
            df_comparison = df_before.merge(df_after, on='dataset', how='outer').fillna(0)
            
            # Calculate differences and percentages
            df_comparison['images_removed'] = df_comparison['before_images'] - df_comparison['after_images']
            df_comparison['labels_removed'] = df_comparison['before_labels'] - df_comparison['after_labels']
            df_comparison['bboxes_removed'] = df_comparison['before_bboxes'] - df_comparison['after_bboxes']
            
            df_comparison['images_retained_%'] = (df_comparison['after_images'] / df_comparison['before_images'] * 100).fillna(0).round(1)
            df_comparison['labels_retained_%'] = (df_comparison['after_labels'] / df_comparison['before_labels'] * 100).fillna(0).round(1)
            df_comparison['bboxes_retained_%'] = (df_comparison['after_bboxes'] / df_comparison['before_bboxes'] * 100).fillna(0).round(1)
            
            # Convert to int where appropriate
            int_cols = ['before_images', 'before_labels', 'before_bboxes', 
                       'after_images', 'after_labels', 'after_bboxes',
                       'images_removed', 'labels_removed', 'bboxes_removed']
            for col in int_cols:
                df_comparison[col] = df_comparison[col].astype(int)
            
            # Sort by before_images descending
            df_comparison = df_comparison.sort_values('before_images', ascending=False).reset_index(drop=True)
            
            # Display detailed comparison
            print("\nüìä PER-DATASET COMPARISON:")
            print("-" * 80)
            display(df_comparison)
            
            # ---------- TOTALS SUMMARY ----------
            print("\n" + "=" * 80)
            print("üìà OVERALL TOTALS:")
            print("-" * 80)
            
            before_total_images = int(df_comparison['before_images'].sum())
            before_total_labels = int(df_comparison['before_labels'].sum())
            before_total_bboxes = int(df_comparison['before_bboxes'].sum())
            
            after_total_images = int(df_comparison['after_images'].sum())
            after_total_labels = int(df_comparison['after_labels'].sum())
            after_total_bboxes = int(df_comparison['after_bboxes'].sum())
            
            img_reduction = before_total_images - after_total_images
            lbl_reduction = before_total_labels - after_total_labels
            box_reduction = before_total_bboxes - after_total_bboxes
            
            img_pct = (img_reduction / before_total_images * 100) if before_total_images > 0 else 0
            lbl_pct = (lbl_reduction / before_total_labels * 100) if before_total_labels > 0 else 0
            box_pct = (box_reduction / before_total_bboxes * 100) if before_total_bboxes > 0 else 0
            
            print(f"\nBEFORE CLEANING:")
            print(f"  Images:         {before_total_images:,}")
            print(f"  Label Files:    {before_total_labels:,}")
            print(f"  Bounding Boxes: {before_total_bboxes:,}")
            
            print(f"\nAFTER CLEANING:")
            print(f"  Images:         {after_total_images:,}")
            print(f"  Label Files:    {after_total_labels:,}")
            print(f"  Bounding Boxes: {after_total_bboxes:,}")
            
            print(f"\nREMOVED:")
            print(f"  Images:         {img_reduction:,} ({img_pct:.1f}%)")
            print(f"  Label Files:    {lbl_reduction:,} ({lbl_pct:.1f}%)")
            print(f"  Bounding Boxes: {box_reduction:,} ({box_pct:.1f}%)")
            
            print(f"\nRETAINED:")
            print(f"  Images:         {after_total_images:,} ({100-img_pct:.1f}%)")
            print(f"  Label Files:    {after_total_labels:,} ({100-lbl_pct:.1f}%)")
            print(f"  Bounding Boxes: {after_total_bboxes:,} ({100-box_pct:.1f}%)")
            
            # Save detailed comparison to CSV
            comparison_csv = export_root / 'dataset_size_comparison_detailed.csv'
            df_comparison.to_csv(comparison_csv, index=False)
            print(f"\n‚úÖ Saved detailed comparison to: {comparison_csv}")
            
            # Also save summary CSV
            summary_csv = export_root / 'dataset_size_comparison_summary.csv'
            summary_df = pd.DataFrame({
                'Metric': ['Images', 'Labels', 'Bounding Boxes'],
                'Before': [before_total_images, before_total_labels, before_total_bboxes],
                'After': [after_total_images, after_total_labels, after_total_bboxes],
                'Removed': [img_reduction, lbl_reduction, box_reduction],
                'Removed_%': [f"{img_pct:.1f}%", f"{lbl_pct:.1f}%", f"{box_pct:.1f}%"],
                'Retained_%': [f"{100-img_pct:.1f}%", f"{100-lbl_pct:.1f}%", f"{100-box_pct:.1f}%"]
            })
            summary_df.to_csv(summary_csv, index=False)
            print(f"‚úÖ Saved summary to: {summary_csv}")
        else:
            print("‚ö†Ô∏è  No exported datasets found.")

print("\n" + "=" * 80)

DATASET SIZE COMPARISON: BEFORE vs AFTER CLEANING (PER DATASET)

üìä PER-DATASET COMPARISON:
--------------------------------------------------------------------------------


Unnamed: 0,dataset,before_images,before_labels,before_bboxes,after_images,after_labels,after_bboxes,images_removed,labels_removed,bboxes_removed,images_retained_%,labels_retained_%,bboxes_retained_%
0,f4k,77235,917,3460,794,794,3054,76441,123,406,1.0,86.6,88.3
1,fishclef,53196,14809,23294,14273,14273,22627,38923,536,667,26.8,96.4,97.1
2,deepfish,6517,6518,15464,4505,4505,15463,2012,2013,1,69.1,69.1,100.0
3,luderick,4276,8554,18881,4276,4276,9429,0,4278,9452,100.0,50.0,49.9
4,deepfish_negatives,2012,2012,0,2012,2012,0,0,0,0,100.0,100.0,0.0
5,fish_416,1350,1352,3183,680,680,1582,670,672,1601,50.4,50.3,49.7
6,AquaCoop,1250,1250,13840,1238,1238,13693,12,12,147,99.0,99.0,98.9
7,aquarium,638,640,4854,637,637,4821,1,3,33,99.8,99.5,99.3
8,OzFish,350,350,7540,350,350,7540,0,0,0,100.0,100.0,100.0



üìà OVERALL TOTALS:
--------------------------------------------------------------------------------

BEFORE CLEANING:
  Images:         146,824
  Label Files:    36,402
  Bounding Boxes: 90,516

AFTER CLEANING:
  Images:         28,765
  Label Files:    28,765
  Bounding Boxes: 78,209

REMOVED:
  Images:         118,059 (80.4%)
  Label Files:    7,637 (21.0%)
  Bounding Boxes: 12,307 (13.6%)

RETAINED:
  Images:         28,765 (19.6%)
  Label Files:    28,765 (79.0%)
  Bounding Boxes: 78,209 (86.4%)

‚úÖ Saved detailed comparison to: /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/dataset_size_comparison_detailed.csv
‚úÖ Saved summary to: /Users/Marco/Desktop/SmartFISHER/SmartFISHER_DATASET/GitHub_Test/dataset_size_comparison_summary.csv

