# Phase 0-2: Dataset Cleaning

Validate labels, find orphans, and remove unfixable files.

In [None]:
import os
import shutil
from pathlib import Path
from datetime import datetime

from tqdm import tqdm

print(f"Imports OK. CWD: {Path().resolve()}")

In [None]:
# Machine-specific location of the raw data; point PROJECT_ROOT at the right
# folder before running this notebook on another machine.
PROJECT_ROOT = "/Users/tyreecruse/Desktop/CS230/Project/Data/Original"
# Alternative: treat the current working directory as the project root.
# PROJECT_ROOT = os.getcwd()

# Settings for the cleaning run.
CONFIG = {
    # Root folder that holds the images/ and labels/ subdirectories.
    "dataset_path": os.path.join(PROJECT_ROOT, "master_dataset_pool"),
    # One switch per category of file that may be deleted.
    "remove_options": {
        "remove_empty_labels": True,
        "remove_wrong_format": True,
        "remove_invalid_format": True,
        "remove_orphan_images": True,
        "remove_orphan_labels": True,
        "remove_corresponding_files": True,
    },
    # When True, the dataset is copied into backup_dir before cleaning.
    "create_backup": False,
    "backup_dir": os.path.join(PROJECT_ROOT, "dataset_backups"),
}

# Echo the active configuration so the run is self-describing.
print("Config")
print("------")
print(f"dataset_path: {CONFIG['dataset_path']}")
print("removal options:")
for key, enabled in CONFIG["remove_options"].items():
    if enabled:
        flag = "yes"
    else:
        flag = "no"
    print(f"  {flag:3} {' '.join(key.split('_'))}")

dataset_path = Path(CONFIG["dataset_path"])
images_dir = dataset_path / "images"
labels_dir = dataset_path / "labels"

# Report how many files are present before any cleaning happens.
if not dataset_path.exists():
    print(f"\n[warning] dataset not found at {dataset_path}")
else:
    n_img = sum(1 for _ in images_dir.glob("*")) if images_dir.exists() else 0
    n_lbl = sum(1 for _ in labels_dir.glob("*.txt")) if labels_dir.exists() else 0
    print(f"\nDataset found at {dataset_path}")
    print(f"  images: {n_img:,}")
    print(f"  labels: {n_lbl:,}")

In [None]:
def create_backup(dataset_path, backup_dir):
    """Snapshot the dataset directory before it is cleaned.

    The whole ``dataset_path`` tree is copied to
    ``backup_dir/<dataset name>_backup_precleaning_<YYYYmmdd_HHMMSS>``;
    ``backup_dir`` is created first if it does not exist.

    Args:
        dataset_path: Directory to copy (str or Path).
        backup_dir: Directory that will contain the timestamped copy.

    Returns:
        Path of the newly created backup directory.
    """
    source = Path(dataset_path)
    target_root = Path(backup_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    # Second-resolution timestamp keeps successive backups from colliding.
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = target_root / (source.name + "_backup_precleaning_" + stamp)

    print(f"\nCreating backup at {destination}")
    shutil.copytree(source, destination)
    return destination

In [None]:
def validate_label_file(label_path):
    """Return (is_valid, reason) for a YOLO-style label file.

    Every non-blank line must consist of exactly five whitespace-separated
    fields: an integer class id followed by four floats (x, y, w, h), each
    of which must lie in the closed interval [0, 1]. The class id must be
    non-negative.

    Args:
        label_path: Path (str or Path) of the label file to check.

    Returns:
        ``(True, "valid")`` when every line passes, otherwise
        ``(False, reason)`` where ``reason`` is one of ``"empty_file"``,
        ``"wrong_format"``, ``"unparseable"``, ``"invalid_values"`` or
        ``"error: <message>"`` when the file could not be read.
    """
    label_path = Path(label_path)
    try:
        text = label_path.read_text().strip()
    except (OSError, ValueError) as exc:
        # OSError: missing/unreadable file. ValueError: covers
        # UnicodeDecodeError raised for files that are not valid text.
        return False, f"error: {exc}"

    if not text:
        return False, "empty_file"

    for line in text.splitlines():
        line = line.strip()
        if not line:
            # Blank lines between annotations are tolerated.
            continue

        parts = line.split()
        if len(parts) != 5:
            return False, "wrong_format"

        try:
            class_id = int(parts[0])
            coords = [float(value) for value in parts[1:]]
        except ValueError:
            return False, "unparseable"

        # A class id is an index into the class list, so it cannot be negative.
        if class_id < 0:
            return False, "invalid_values"

        # NaN fails every comparison, so it is rejected here as well.
        if not all(0 <= value <= 1 for value in coords):
            return False, "invalid_values"

    return True, "valid"

In [None]:
def find_orphans(images_dir, labels_dir):
    """Return (orphan_images, orphan_labels) given image/label directories.

    An image and a label belong together when they share the same file stem.
    Images are files in ``images_dir`` whose extension is .jpg, .jpeg, .png
    or .bmp (compared case-insensitively, so ``photo.JPG`` counts); labels
    are the ``*.txt`` files in ``labels_dir``.

    Args:
        images_dir: Directory holding the image files (str or Path).
        labels_dir: Directory holding the label files (str or Path).

    Returns:
        A tuple ``(orphan_images, orphan_labels)`` of sorted lists of Paths:
        every image without a label of the same stem, and every label
        without an image of the same stem.
    """
    images_dir = Path(images_dir)
    labels_dir = Path(labels_dir)

    image_exts = {".jpg", ".jpeg", ".png", ".bmp"}

    # Filter on the lowercased suffix instead of globbing per extension:
    # pattern matching is case-sensitive on POSIX, so "*.jpg" would miss
    # "a.JPG" and its label would wrongly be reported as an orphan.
    image_files = sorted(
        p for p in images_dir.glob("*") if p.suffix.lower() in image_exts
    )
    label_files = sorted(labels_dir.glob("*.txt"))

    image_stems = {p.stem for p in image_files}
    label_stems = {p.stem for p in label_files}

    # Iterating the file lists (rather than a stem -> path dict) reports
    # every unlabelled image even when several share a stem, and keeps the
    # output order deterministic.
    orphan_images = [p for p in image_files if p.stem not in label_stems]
    orphan_labels = [p for p in label_files if p.stem not in image_stems]

    return orphan_images, orphan_labels

In [None]:
def clean_dataset(dataset_path, config):
    """Validate labels, find orphans, and remove selected files.

    Works on ``dataset_path/images`` and ``dataset_path/labels`` in three
    steps: (1) validate every ``*.txt`` label, (2) find images and labels
    without a counterpart, (3) delete whatever the options select.

    Args:
        dataset_path: Dataset root directory (str or Path).
        config: Mapping with a ``"remove_options"`` dict holding the flags
            ``remove_empty_labels``, ``remove_wrong_format``,
            ``remove_invalid_format``, ``remove_orphan_images``,
            ``remove_orphan_labels`` and ``remove_corresponding_files``.

    Returns:
        Stats dict with removal counters (``labels_removed``,
        ``images_removed``), per-reason counters (``empty_labels``,
        ``wrong_format``, ``unparseable``, ``invalid_values``,
        ``orphan_images``, ``orphan_labels``) and ``files_to_remove``, a
        list of ``(kind, path, reason)`` tuples. Per-reason counters are
        only updated when the matching removal option is enabled.

    Raises:
        FileNotFoundError: If ``dataset_path`` does not exist.
    """
    dataset_path = Path(dataset_path)
    images_dir = dataset_path / "images"
    labels_dir = dataset_path / "labels"

    stats = {
        "labels_removed": 0,
        "images_removed": 0,
        "empty_labels": 0,
        "wrong_format": 0,
        "unparseable": 0,
        "invalid_values": 0,
        "orphan_images": 0,
        "orphan_labels": 0,
        "files_to_remove": [],
    }

    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset path does not exist: {dataset_path}")

    options = config["remove_options"]

    # Failure reason -> (option that enables removal, counter to bump).
    # Reasons not listed here (e.g. "error: ..." read failures) are left alone.
    removal_rules = {
        "empty_file": ("remove_empty_labels", "empty_labels"),
        "wrong_format": ("remove_wrong_format", "wrong_format"),
        "unparseable": ("remove_invalid_format", "unparseable"),
        "invalid_values": ("remove_invalid_format", "invalid_values"),
    }

    # Step 1: validate label files
    labels_to_remove = []
    if labels_dir.exists():
        label_files = list(labels_dir.glob("*.txt"))
        print(f"\nChecking {len(label_files)} label files...")

        for label_path in tqdm(label_files, desc="labels", unit="file"):
            is_valid, reason = validate_label_file(label_path)
            if is_valid:
                continue

            rule = removal_rules.get(reason)
            if rule is None:
                continue

            option_key, stat_key = rule
            if options[option_key]:
                stats[stat_key] += 1
                labels_to_remove.append(label_path)
                stats["files_to_remove"].append(("label", label_path, reason))

        print(f"  invalid labels scheduled for removal: {len(labels_to_remove)}")

    # Step 2: find orphaned files
    orphan_images, orphan_labels = find_orphans(images_dir, labels_dir)
    print(f"\nOrphans: {len(orphan_images)} images, {len(orphan_labels)} labels")

    if options["remove_orphan_images"]:
        stats["orphan_images"] = len(orphan_images)
        for img in orphan_images:
            stats["files_to_remove"].append(("image", img, "orphan"))

    if options["remove_orphan_labels"]:
        stats["orphan_labels"] = len(orphan_labels)
        # Set mirror of labels_to_remove: O(1) duplicate checks instead of
        # scanning the list once per orphan label.
        scheduled = set(labels_to_remove)
        for lbl in orphan_labels:
            stats["files_to_remove"].append(("label", lbl, "orphan"))
            if lbl not in scheduled:
                scheduled.add(lbl)
                labels_to_remove.append(lbl)

    # Step 3: apply removals
    print("\nApplying removals...")

    # Remove labels and optionally corresponding images
    if labels_to_remove:
        image_exts = [".jpg", ".jpeg", ".png", ".bmp"]
        # Read the flag before deleting anything so that a missing key
        # fails up front instead of once per file.
        remove_corresponding = options["remove_corresponding_files"]

        for label_path in tqdm(labels_to_remove, desc="labels", unit="file"):
            try:
                label_path.unlink()
            except OSError as exc:
                print(f"[warn] error removing {label_path}: {exc}")
                continue
            stats["labels_removed"] += 1

            if not remove_corresponding:
                continue

            # Only the first matching extension is removed, as before.
            for ext in image_exts:
                image_path = images_dir / f"{label_path.stem}{ext}"
                if image_path.exists():
                    try:
                        image_path.unlink()
                    except OSError as exc:
                        # Report the image, not the (already removed) label.
                        print(f"[warn] error removing {image_path}: {exc}")
                    else:
                        stats["images_removed"] += 1
                    break

    # Remove orphan images (if not already removed)
    if options["remove_orphan_images"]:
        for image_path in tqdm(orphan_images, desc="orphan images", unit="file"):
            try:
                if image_path.exists():
                    image_path.unlink()
                    stats["images_removed"] += 1
            except OSError as exc:
                print(f"[warn] error removing {image_path}: {exc}")

    return stats

In [None]:
def main():
    """Run the cleaning pipeline driven by the module-level CONFIG.

    Optionally backs up the dataset first (``CONFIG["create_backup"]``),
    then cleans it and prints a summary. Any exception raised along the way
    is caught, printed and recorded instead of propagating.

    Returns:
        Dict that may contain ``"backup_path"`` (str), ``"clean_stats"``
        (the stats dict from ``clean_dataset``) and ``"error"`` (str).
    """
    results = {}
    dataset_dir = Path(CONFIG["dataset_path"])

    # (display label, stats key, print even when the count is zero)
    summary_rows = (
        ("labels removed", "labels_removed", True),
        ("images removed", "images_removed", True),
        ("empty labels", "empty_labels", False),
        ("wrong format", "wrong_format", False),
        ("unparseable", "unparseable", False),
        ("invalid values", "invalid_values", False),
        ("orphan images", "orphan_images", False),
        ("orphan labels", "orphan_labels", False),
    )

    try:
        if CONFIG.get("create_backup", False):
            results["backup_path"] = str(
                create_backup(dataset_dir, CONFIG["backup_dir"])
            )

        stats = clean_dataset(dataset_dir, CONFIG)
        results["clean_stats"] = stats

        print("\nCleaning summary")
        print("----------------")
        for label, key, always_shown in summary_rows:
            count = stats[key]
            if always_shown or count:
                # Labels are left-padded to 14 columns so the colons line up.
                print(f"  {label:<14} : {count:,}")

        print(f"\nDataset location : {dataset_dir}")
        if "backup_path" in results:
            print(f"Backup location  : {results['backup_path']}")

    except Exception as exc:
        # Top-level boundary: report the failure and hand it back to the caller.
        print(f"\nDataset cleaning failed: {exc}")
        results["error"] = str(exc)

    return results


if __name__ == "__main__":
    cleaning_results = main()

In [None]:
# Re-derive the dataset locations so this cell can be run on its own.
dataset_path = Path(CONFIG["dataset_path"])
images_dir = dataset_path / "images"
labels_dir = dataset_path / "labels"

print("\nFinal dataset state")
print("-------------------")

if not dataset_path.exists():
    print(f"  dataset not found at {dataset_path}")
else:
    image_files = list(images_dir.glob("*")) if images_dir.exists() else []
    label_files = list(labels_dir.glob("*.txt")) if labels_dir.exists() else []

    n_img = len(image_files)
    n_lbl = len(label_files)
    print(f"  images : {n_img:,}")
    print(f"  labels : {n_lbl:,}")

    if n_img:
        # Ratio of label files to image files, as a percentage.
        cov = 100 * n_lbl / n_img
        print(f"  label/image: {cov:.1f}%")

        # List the first few file names as a quick sanity check.
        print("\n  sample images:")
        for p in image_files[:5]:
            print("   -", p.name)
        if n_img > 5:
            print(f"   ... (+{n_img - 5:,} more)")

# Show impact from the last run, if available
if isinstance(globals().get("cleaning_results"), dict):
    stats = cleaning_results.get("clean_stats", {})
    if stats:
        print("\nCleaning impact")
        print("---------------")
        total_removed = stats.get("labels_removed", 0) + stats.get("images_removed", 0)
        print(f"  labels removed : {stats.get('labels_removed', 0):,}")
        print(f"  images removed : {stats.get('images_removed', 0):,}")
        print(f"  total removed  : {total_removed:,}")