In [3]:
from pathlib import Path
import shutil
from ultralytics.data.converter import convert_coco

# 1) Point to your original COCO root
coco_root = Path("/mnt/ssd2/santana-coco/data/coco")

# 2) Make a detection-only annotations folder (instances_* only)
det_ann_dir = coco_root / "annotations_det"
det_ann_dir.mkdir(exist_ok=True)

for name in ["instances_train2017.json", "instances_val2017.json"]:
    src = coco_root / "annotations" / name
    dst = det_ann_dir / name
    if dst.exists():
        print("Already exists:", dst)
        continue
    # Fail with an explicit message instead of a bare copy error when the
    # original annotation file is not where we expect it.
    if not src.exists():
        raise FileNotFoundError(f"Missing {src}")
    shutil.copy(src, dst)
    print("Copied", src, "->", dst)

# 3) Choose a directory where YOLO labels will be saved
save_dir = coco_root / "coco_yolo_raw"  # this will be created

# 4) Run the conversion -- but only once, so that re-running this cell is safe.
#    After a successful conversion the labels live in
#    save_dir/labels/train2017 and save_dir/labels/val2017.
#    NOTE(review): Ultralytics' convert_coco is understood to switch to an
#    incremented directory name when save_dir already exists, which would make
#    the "Done" message below point at the wrong place -- confirm for the
#    installed version.
label_dirs = [save_dir / "labels" / split for split in ("train2017", "val2017")]

if all(label_dir.is_dir() for label_dir in label_dirs):
    print("Reusing existing YOLO labels under:", save_dir)
elif save_dir.exists():
    # Something is there but it is not a finished conversion; do not guess.
    raise RuntimeError(
        f"{save_dir} exists but does not look like a converted dataset.\n"
        f"Delete or rename it and re-run if needed."
    )
else:
    convert_coco(
        labels_dir=str(det_ann_dir),
        save_dir=str(save_dir),
        use_segments=False,
        use_keypoints=False,
        cls91to80=True,  # COCO 91 → YOLO 80-class mapping
    )
    print("Done. YOLO labels should be under:", save_dir)


Already exists: /mnt/ssd2/santana-coco/data/coco/annotations_det/instances_train2017.json
Already exists: /mnt/ssd2/santana-coco/data/coco/annotations_det/instances_val2017.json
Annotations /mnt/ssd2/santana-coco/data/coco/annotations_det/instances_train2017.json: 100% ━━━━━━━━━━━━ 117266/117266 15.2Kit/s 7.7s0.1s
Annotations /mnt/ssd2/santana-coco/data/coco/annotations_det/instances_val2017.json: 100% ━━━━━━━━━━━━ 4952/4952 15.6Kit/s 0.3s0.0s
COCO data converted successfully.
Results saved to /mnt/ssd2/santana-coco/data/coco/coco_yolo_raw
Done. YOLO labels should be under: /mnt/ssd2/santana-coco/data/coco/coco_yolo_raw


In [5]:
# === Cell 1: Config & imports ==========================================

from pathlib import Path
import random
import shutil

import yaml
from ultralytics.data.converter import convert_coco

# ---- EDIT THESE ARGUMENTS AS YOU LIKE ----

# Path to your original COCO root (from your bash script)
COCO_ROOT = Path("/mnt/ssd2/santana-coco/data/coco")

# How many images you want in train / val
TRAIN_COUNT = 500
VAL_COUNT = 100

# Output folder for the subset YOLO dataset.
# The name is derived from the counts above so it cannot go stale when they
# are edited; assign any other Path here to override it.
OUT_ROOT = COCO_ROOT.parent / f"coco_subset_{TRAIN_COUNT}_{VAL_COUNT}"

# Random seed for sampling
SEED = 0

# If True: copy files. If False: create symlinks (saves disk space).
COPY_FILES = False

# -----------------------------------------------------------------------

# Class names written to the dataset YAML under "names".
COCO80_NAMES = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
    "truck", "boat", "traffic light", "fire hydrant", "stop sign",
    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
    "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
    "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
    "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
    "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
    "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
    "teddy bear", "hair drier", "toothbrush",
]

# The dataset YAML produced by this notebook declares 80 classes, so an
# accidental edit of the list above should fail here rather than at training time.
assert len(COCO80_NAMES) == 80, f"expected 80 class names, got {len(COCO80_NAMES)}"
assert len(set(COCO80_NAMES)) == len(COCO80_NAMES), "duplicate class name in COCO80_NAMES"

print("COCO_ROOT:", COCO_ROOT)
print("OUT_ROOT:", OUT_ROOT)

# === Cell 2: Helper functions ==========================================

def ensure_detection_annotations(coco_root: Path) -> Path:
    """
    Mirror the two ``instances_*2017.json`` files into ``coco_root/annotations_det``.

    Files already present in the target directory are left untouched, so the
    function can be called repeatedly.

    Args:
        coco_root: Dataset root that contains the ``annotations`` directory.

    Returns:
        Path of the ``annotations_det`` directory.

    Raises:
        FileNotFoundError: If one of the source annotation files is missing.
    """
    source_dir = coco_root / "annotations"
    target_dir = coco_root / "annotations_det"
    target_dir.mkdir(exist_ok=True)

    for split in ("train2017", "val2017"):
        filename = f"instances_{split}.json"
        original = source_dir / filename
        if not original.exists():
            raise FileNotFoundError(f"Missing {original}")

        duplicate = target_dir / filename
        if duplicate.exists():
            print(f"Found existing {duplicate}")
            continue

        shutil.copy(original, duplicate)
        print(f"Copied {original} -> {duplicate}")

    return target_dir


def ensure_yolo_labels(det_ann_dir: Path, coco_root: Path) -> Path:
    """
    Run convert_coco once to create YOLO labels under coco_root/coco_yolo_raw.

    If labels already exist there, reuse them. "Exist" means that both
    ``labels/train2017`` and ``labels/val2017`` are directories holding at
    least one ``.txt`` file, so empty directories left behind by an
    interrupted conversion are not mistaken for a finished one.

    Args:
        det_ann_dir: Directory with the detection annotation JSON files that
            is handed to ``convert_coco``.
        coco_root: Dataset root; the labels live in ``coco_root/coco_yolo_raw``.

    Returns:
        Path of the ``coco_yolo_raw`` directory.

    Raises:
        RuntimeError: If ``coco_yolo_raw`` exists but does not contain label
            files for both splits.
    """
    yolo_raw_dir = coco_root / "coco_yolo_raw"
    split_label_dirs = [
        yolo_raw_dir / "labels" / split for split in ("train2017", "val2017")
    ]

    # any() stops at the first match, so this does not list whole directories.
    if all(d.is_dir() and any(d.glob("*.txt")) for d in split_label_dirs):
        print(f"Reusing existing YOLO labels at {yolo_raw_dir}")
        return yolo_raw_dir

    # Never convert into a directory that already exists: we cannot tell what
    # it contains, and the caller expects the labels at exactly this path.
    if yolo_raw_dir.exists():
        raise RuntimeError(
            f"{yolo_raw_dir} exists but does not look like a converted dataset.\n"
            f"Delete or rename it and re-run if needed."
        )

    print("Converting COCO annotations -> YOLO labels...")
    convert_coco(
        labels_dir=str(det_ann_dir),
        save_dir=str(yolo_raw_dir),
        use_segments=False,
        use_keypoints=False,
        cls91to80=True,
    )
    print(f"Done. YOLO labels written to: {yolo_raw_dir}")
    return yolo_raw_dir


def link_or_copy(src: Path, dst: Path, do_copy: bool):
    """
    Make ``dst`` available as a copy of, or a symlink to, ``src``.

    Parent directories of ``dst`` are created as needed. An existing ``dst``
    is left untouched, except for a dangling symlink, which is replaced.

    Args:
        src: Existing source file.
        dst: Destination path to create.
        do_copy: True to copy the file, False to create a symlink.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)

    # exists() follows symlinks and therefore reports False for a dangling
    # link; without this branch symlink_to() would fail with FileExistsError.
    if dst.is_symlink() and not dst.exists():
        dst.unlink()
    elif dst.exists():
        return

    if do_copy:
        shutil.copy(src, dst)
    else:
        # A relative link target is interpreted relative to the directory of
        # the link itself, not the current working directory, so store an
        # absolute target to keep the link valid for relative ``src`` paths.
        dst.symlink_to(src.resolve())


def sample_subset(
    labels_dir: Path,
    images_src_dir: Path,
    images_dst_dir: Path,
    labels_dst_dir: Path,
    count: int,
    seed: int,
    copy_files: bool,
):
    """
    Sample 'count' label files from labels_dir and create subset images/labels.

    Sampling is driven by the label files, so every chosen image has a label
    file. NOTE(review): whether each label file holds at least one box depends
    on how the labels were generated -- confirm if that matters downstream.

    Args:
        labels_dir: Directory containing the source ``*.txt`` label files.
        images_src_dir: Directory containing ``<stem>.jpg`` for each label.
        images_dst_dir: Directory that receives the selected images.
        labels_dst_dir: Directory that receives the selected label files.
        count: Number of samples; capped at the number of label files.
        seed: Seed for the selection; the same seed and the same set of label
            files give the same selection.
        copy_files: True to copy files, False to symlink them.

    Raises:
        RuntimeError: If labels_dir contains no ``*.txt`` files.
        FileNotFoundError: If a selected label has no matching ``.jpg``.
    """
    # Sorted so the population order, and therefore the sample, is deterministic.
    all_labels = sorted(labels_dir.glob("*.txt"))
    if not all_labels:
        raise RuntimeError(f"No label files found in {labels_dir}")

    if count > len(all_labels):
        print(
            f"Requested {count} but only {len(all_labels)} annotated images available. "
            f"Using {len(all_labels)} instead."
        )
        count = len(all_labels)

    # A private generator draws the same sample as random.seed(seed) followed
    # by random.sample(...), without reseeding the module-level generator that
    # other code in the session may rely on.
    subset_labels = random.Random(seed).sample(all_labels, count)

    print(
        f"Creating subset with {count} images from {images_src_dir.name} "
        f"(labels in {labels_dir.name})"
    )

    # Validate every image/label pair before touching the destination, so a
    # missing image cannot leave a half-written subset behind.
    pairs = []
    for label_path in subset_labels:
        stem = label_path.stem  # e.g. 000000000139
        img_path = images_src_dir / f"{stem}.jpg"
        if not img_path.exists():
            # COCO images are .jpg; you can add .png fallback here if needed
            raise FileNotFoundError(f"Image not found for label {label_path}: {img_path}")
        pairs.append((img_path, label_path))

    for img_path, label_path in pairs:
        link_or_copy(img_path, images_dst_dir / img_path.name, copy_files)
        link_or_copy(label_path, labels_dst_dir / label_path.name, copy_files)


def write_yolo_yaml(out_root: Path):
    """
    Write the dataset description ``coco_subset.yaml`` into ``out_root``.

    The file records the resolved dataset root, the relative train/val image
    directories, an empty test entry, the class count and the class names,
    in that key order.

    Args:
        out_root: Root directory of the subset dataset; must already exist.

    Returns:
        Path of the written YAML file.
    """
    dataset_cfg = dict(
        path=str(out_root.resolve()),
        train="images/train",
        val="images/val",
        test="",
        nc=80,
        names=COCO80_NAMES,
    )

    target = out_root / "coco_subset.yaml"
    with target.open("w", encoding="utf-8") as stream:
        # sort_keys=False keeps the key order defined above
        yaml.safe_dump(dataset_cfg, stream, sort_keys=False, allow_unicode=True)

    print(f"Wrote YOLO data yaml to: {target}")
    return target


# === Cell 3: Run subset creation =======================================

# 0) Basic sanity checks: the source dataset must exist and the destination
#    must be absent or empty before anything is written.
if not COCO_ROOT.exists():
    raise FileNotFoundError(f"COCO_ROOT does not exist: {COCO_ROOT}")

out_root_in_use = OUT_ROOT.exists() and any(OUT_ROOT.iterdir())
if out_root_in_use:
    raise RuntimeError(
        f"OUT_ROOT {OUT_ROOT} already exists and is not empty.\n"
        f"Choose a new OUT_ROOT or clear it first."
    )

# Echo the effective settings so the cell output documents the run.
for setting_label, setting_value in (
    ("Using COCO_ROOT:", COCO_ROOT),
    ("Using OUT_ROOT:", OUT_ROOT),
    ("TRAIN_COUNT:", TRAIN_COUNT),
    ("VAL_COUNT:", VAL_COUNT),
    ("SEED:", SEED),
    ("COPY_FILES:", COPY_FILES),
):
    print(setting_label, setting_value)

# 1) Ensure we have detection-only annotations
det_ann_dir = ensure_detection_annotations(COCO_ROOT)

# 2) Ensure / create YOLO label files from COCO
yolo_raw_dir = ensure_yolo_labels(det_ann_dir, COCO_ROOT)

# Source locations, one per split
labels_root_src = yolo_raw_dir / "labels"
labels_train_src = labels_root_src / "train2017"
labels_val_src = labels_root_src / "val2017"
images_train_src = COCO_ROOT / "train2017"
images_val_src = COCO_ROOT / "val2017"

# 3) Subset directory structure (directories are created while files are linked/copied)
images_train_dst = OUT_ROOT / "images" / "train"
images_val_dst = OUT_ROOT / "images" / "val"
labels_train_dst = OUT_ROOT / "labels" / "train"
labels_val_dst = OUT_ROOT / "labels" / "val"

# 4) Sample and create subsets: train first, then val with a different seed
split_jobs = [
    dict(
        labels_dir=labels_train_src,
        images_src_dir=images_train_src,
        images_dst_dir=images_train_dst,
        labels_dst_dir=labels_train_dst,
        count=TRAIN_COUNT,
        seed=SEED,
    ),
    dict(
        labels_dir=labels_val_src,
        images_src_dir=images_val_src,
        images_dst_dir=images_val_dst,
        labels_dst_dir=labels_val_dst,
        count=VAL_COUNT,
        seed=SEED + 1,
    ),
]
for split_job in split_jobs:
    sample_subset(copy_files=COPY_FILES, **split_job)

# 5) Write YOLO yaml
yaml_path = write_yolo_yaml(OUT_ROOT)

print("\nDONE.")
print("Subset dataset created at:", OUT_ROOT)
print("YOLO data config:", yaml_path)


COCO_ROOT: /mnt/ssd2/santana-coco/data/coco
OUT_ROOT: /mnt/ssd2/santana-coco/data/coco_subset_500_100
Using COCO_ROOT: /mnt/ssd2/santana-coco/data/coco
Using OUT_ROOT: /mnt/ssd2/santana-coco/data/coco_subset_500_100
TRAIN_COUNT: 500
VAL_COUNT: 100
SEED: 0
COPY_FILES: False
Found existing /mnt/ssd2/santana-coco/data/coco/annotations_det/instances_train2017.json
Found existing /mnt/ssd2/santana-coco/data/coco/annotations_det/instances_val2017.json
Reusing existing YOLO labels at /mnt/ssd2/santana-coco/data/coco/coco_yolo_raw
Creating subset with 500 images from train2017 (labels in train2017)
Creating subset with 100 images from val2017 (labels in val2017)
Wrote YOLO data yaml to: /mnt/ssd2/santana-coco/data/coco_subset_500_100/coco_subset.yaml

DONE.
Subset dataset created at: /mnt/ssd2/santana-coco/data/coco_subset_500_100
YOLO data config: /mnt/ssd2/santana-coco/data/coco_subset_500_100/coco_subset.yaml
