In [None]:
import os
import subprocess
import json
import pandas as pd
from tqdm import tqdm

# Root folder under which every dataset directory is created
base_dir = "../data"

# Class names to keep, written in title case for readability and lower-cased
# once here so they can be compared against lower-case category names.
classes = [
    label.lower()
    for label in (
        "Bicycle", "Car", "Motorcycle", "Airplane", "Bus", "Train", "Truck", "Boat",
        "Traffic Light", "Stop Sign", "Parking Meter", "Bench",
        "Bird", "Cat", "Dog", "Horse", "Sheep", "Cow", "Elephant", "Bear", "Zebra",
        "Backpack", "Umbrella", "Handbag", "Tie",
        "Skis", "Sports Ball", "Kite", "Tennis Racket",
        "Bottle", "Wine Glass", "Cup", "Knife", "Spoon", "Bowl",
        "Banana", "Apple", "Orange", "Broccoli", "Hot Dog", "Pizza", "Donut",
        "Chair", "Couch", "Potted Plant", "Bed", "Dining Table", "Toilet",
        "TV", "Laptop", "Mouse", "Remote", "Keyboard", "Cell Phone",
        "Microwave", "Oven", "Toaster", "Sink", "Refrigerator",
        "Book", "Clock", "Vase", "Teddy Bear", "Hair Drier",
    )
]

In [None]:
def create_dataset_structure(dataset_name):
    """Build the ``<split>/<class>`` directory tree for one dataset.

    A folder is created for every class in the module-level ``classes`` list
    under each of the ``train``, ``val`` and ``test`` splits, inside
    ``base_dir/dataset_name``. Folders that already exist are left alone.

    Returns the path of the dataset's root folder.
    """
    root = os.path.join(base_dir, dataset_name)
    split_names = ("train", "val", "test")

    # Flatten the split x class grid into the list of leaf folders to create.
    leaf_dirs = [
        os.path.join(root, split_name, class_name)
        for split_name in split_names
        for class_name in classes
    ]
    for leaf in leaf_dirs:
        os.makedirs(leaf, exist_ok=True)

    return root


In [None]:
def download_coco():
    """Download and extract the COCO 2017 train/val images and annotations.

    Each archive is fetched with ``wget`` into ``coco_dir``, unpacked there
    with ``unzip`` and then deleted. Requires the ``wget`` and ``unzip``
    executables on PATH and the module-level ``coco_dir`` to be defined.

    Raises:
        subprocess.CalledProcessError: if a download or an extraction exits
            with a non-zero status. The archive that was being processed is
            still removed.
    """
    print("🚀 Downloading COCO dataset...")

    urls = {
        "train": "http://images.cocodataset.org/zips/train2017.zip",
        "val": "http://images.cocodataset.org/zips/val2017.zip",
        "annotations": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    }

    # wget -O cannot create missing parent folders, so make sure the target exists.
    os.makedirs(coco_dir, exist_ok=True)

    for split, url in urls.items():
        zip_path = os.path.join(coco_dir, f"{split}.zip")
        try:
            # check=True: stop on the first failure instead of unzipping a
            # broken archive and reporting success afterwards.
            subprocess.run(["wget", "-O", zip_path, url], check=True)
            subprocess.run(["unzip", "-q", "-d", coco_dir, zip_path], check=True)
        finally:
            # Never leave a (possibly partial) archive behind.
            if os.path.exists(zip_path):
                os.remove(zip_path)

    print("✅ COCO dataset downloaded and extracted!")

In [None]:
def move_images_to_class_folders(split):
    """Move the COCO images of one split into per-class folders.

    Reads ``annotations/instances_{split}2017.json`` under ``coco_dir``, keeps
    the annotations whose category name is listed in ``classes`` and moves
    every referenced image from ``{coco_dir}/{split}2017/`` to
    ``{coco_dir}/{split}/{category name}/``.

    A file can only be moved once, so an image annotated with several of the
    wanted categories is placed in the folder of the first such annotation in
    the file; its remaining annotations are ignored.

    Args:
        split: Split name used to build the paths, e.g. ``"train"`` or ``"val"``.

    Relies on the module-level ``coco_dir`` and ``classes`` being defined.
    Prints a one-line summary instead of one line per image.
    """
    annotations_file = os.path.join(coco_dir, "annotations", f"instances_{split}2017.json")

    # Load COCO annotations
    with open(annotations_file, "r", encoding="utf-8") as f:
        coco_data = json.load(f)

    # Map category ID to category name, keeping only the required classes
    wanted_names = set(classes)
    category_map = {
        cat["id"]: cat["name"]
        for cat in coco_data["categories"]
        if cat["name"] in wanted_names
    }

    # Index image ID -> file name once, instead of scanning the whole image
    # list for every annotation. setdefault keeps the first entry per ID.
    file_names = {}
    for img in coco_data["images"]:
        file_names.setdefault(img["id"], img["file_name"])

    # First wanted category of each image, in annotation order. Later
    # annotations of the same image could never be honoured anyway because the
    # source file is gone after the first move.
    image_category = {}
    for ann in coco_data["annotations"]:
        if ann["category_id"] in category_map:
            image_category.setdefault(ann["image_id"], ann["category_id"])

    if not image_category:
        print(f"⚠️ No images found for {split}! Check annotation file.")
        return

    src_dir = os.path.join(coco_dir, f"{split}2017")
    split_dir = os.path.join(coco_dir, split)
    ready_folders = set()
    moved = missing = unknown = 0

    for image_id, category_id in tqdm(image_category.items(), desc=f"Moving {split} images"):
        file_name = file_names.get(image_id)
        if file_name is None:
            # Annotation refers to an image ID absent from the "images" list
            unknown += 1
            continue

        src_path = os.path.join(src_dir, file_name)
        # Skip files that are not there (e.g. already moved by an earlier run)
        if not os.path.exists(src_path):
            missing += 1
            continue

        dest_folder = os.path.join(split_dir, category_map[category_id])
        if dest_folder not in ready_folders:
            # Ensure class folder exists (once per class)
            os.makedirs(dest_folder, exist_ok=True)
            ready_folders.add(dest_folder)

        os.rename(src_path, os.path.join(dest_folder, file_name))
        moved += 1

    print(f"✅ {split}: moved {moved} images into {split_dir}")
    if missing:
        print(f"⚠️ {split}: {missing} source files were missing from {src_dir}")
    if unknown:
        print(f"⚠️ {split}: {unknown} annotated image IDs had no file entry")

In [None]:
# Create the COCO folder tree under base_dir and keep its root path; the
# download and move functions read this module-level `coco_dir`.
coco_dir = create_dataset_structure("COCO")

In [None]:
# Show the dataset root path as this cell's output.
coco_dir

In [None]:
# Uncomment the call below to download and extract the COCO 2017 images and annotations into coco_dir.

#download_coco()         


In [None]:
# Sort the train2017 images into <coco_dir>/train/<class>/ using instances_train2017.json.
move_images_to_class_folders("train")  # Move Train images


In [None]:
# Sort the val2017 images into <coco_dir>/val/<class>/ using instances_val2017.json.
move_images_to_class_folders("val") 