In [1]:
import os
import random
import shutil

def create_ade20k_subset(
    src_root,              # e.g. "./data/ADE20K_traevl/images/ADE/training"
    dest_root,             # e.g. "./data/ADE20K_subset/train"
    subset_size=10000,
    image_exts=(".jpg", ".png")
):
    """
    1) Recursively gather all image file paths from ADE20K's 'training' folder
       which itself has multiple subfolders.
    2) Randomly sample up to 'subset_size' of them.
    3) Copy each image and its matching label file to a new subset folder structure.
    
    Note: This assumes your label files have the same name but with "_seg.png"
          or something similar. If the naming is different, adapt the code accordingly.
    """
    os.makedirs(dest_root, exist_ok=True)

    # 1) Gather all images recursively
    image_files = []
    for root, dirs, files in os.walk(src_root):
        for fname in files:
            # If it's an image we care about
            if fname.lower().endswith(image_exts):
                full_path = os.path.join(root, fname)
                image_files.append(full_path)

    print(f"Found {len(image_files)} total images in {src_root}.")

    # 2) Randomly sample
    if len(image_files) > subset_size:
        sampled_files = random.sample(image_files, subset_size)
    else:
        sampled_files = image_files

    # 3) Copy images + corresponding labels
    for img_path in sampled_files:
        # Rebuild a relative path from src_root
        rel_path = os.path.relpath(img_path, src_root)
        # Destination image path
        dest_img_path = os.path.join(dest_root, rel_path)
        os.makedirs(os.path.dirname(dest_img_path), exist_ok=True)

        # Copy the image
        shutil.copy2(img_path, dest_img_path)

        # Attempt to copy the matching label (if it exists)
        # e.g., "ADE_train_00001472.jpg" -> "ADE_train_00001472_seg.png"
        base, ext = os.path.splitext(img_path)
        seg_name  = base + "_seg.png"  # or whatever your label naming is
        if os.path.exists(seg_name):
            rel_seg_path = os.path.relpath(seg_name, src_root)
            dest_seg_path = os.path.join(dest_root, rel_seg_path)
            os.makedirs(os.path.dirname(dest_seg_path), exist_ok=True)
            shutil.copy2(seg_name, dest_seg_path)

    print(f"Subset of size {len(sampled_files)} created at {dest_root}")


In [2]:

src_training = "./data/ADE20K_traevl/images/ADE/training"
dest_training_subset = "./data/ADE20K_subset/train"

src_validation = "./data/ADE20K_traevl/images/ADE/validation"
dest_validation_subset = "./data/ADE20K_subset/validation"


create_ade20k_subset(
    src_root=src_training,
    dest_root=dest_training_subset,
    subset_size=10000,
    image_exts=(".jpg", ".png")
)

create_ade20k_subset(
    src_root=src_validation,
    dest_root=dest_validation_subset,
    subset_size=10000,
    image_exts=(".jpg", ".png")
)


Found 701150 total images in ./data/ADE20K_traevl/images/ADE/training.
Subset of size 10000 created at ./data/ADE20K_subset/train
Found 79076 total images in ./data/ADE20K_traevl/images/ADE/validation.
Subset of size 10000 created at ./data/ADE20K_subset/validation
