In [None]:
from pathlib import Path
import os
import glob
import pandas as pd
from pathlib import Path
import shutil
from collections import Counter

In [10]:
coco_classes = {
    0: 'person',
    1: 'bicycle',
    2: 'car',
    3: 'motorcycle',
    4: 'airplane',
    5: 'bus',
    6: 'train',
    7: 'truck',
    8: 'boat',
    9: 'traffic light',
    10: 'fire hydrant',
    11: 'stop sign',
    12: 'parking meter',
    13: 'bench',
    14: 'bird',
    15: 'cat',
    16: 'dog',
    17: 'horse',
    18: 'sheep',
    19: 'cow',
    20: 'elephant',
    21: 'bear',
    22: 'zebra',
    23: 'giraffe',
    24: 'backpack',
    25: 'umbrella',
    26: 'handbag',
    27: 'tie',
    28: 'suitcase',
    29: 'frisbee',
    30: 'skis',
    31: 'snowboard',
    32: 'sports ball',
    33: 'kite',
    34: 'baseball bat',
    35: 'baseball glove',
    36: 'skateboard',
    37: 'surfboard',
    38: 'tennis racket',
    39: 'bottle',
    40: 'wine glass',
    41: 'cup',
    42: 'fork',
    43: 'knife',
    44: 'spoon',
    45: 'bowl',
    46: 'banana',
    47: 'apple',
    48: 'sandwich',
    49: 'orange',
    50: 'broccoli',
    51: 'carrot',
    52: 'hot dog',
    53: 'pizza',
    54: 'donut',
    55: 'cake',
    56: 'chair',
    57: 'couch',
    58: 'potted plant',
    59: 'bed',
    60: 'dining table',
    61: 'toilet',
    62: 'tv',
    63: 'laptop',
    64: 'mouse',
    65: 'remote',
    66: 'keyboard',
    67: 'cell phone',
    68: 'microwave',
    69: 'oven',
    70: 'toaster',
    71: 'sink',
    72: 'refrigerator',
    73: 'book',
    74: 'clock',
    75: 'vase',
    76: 'scissors',
    77: 'teddy bear',
    78: 'hair drier',
    79: 'toothbrush'
}

In [None]:
# Paths
train_image_path = Path.cwd() / 'coco' / 'images' / 'train2017'
val_image_path = Path.cwd() / 'coco' / 'images' / 'val2017'
train_labels_path = Path.cwd() / 'coco' / 'labels' / 'train2017'
val_labels_path = Path.cwd() / 'coco' / 'labels' / 'val2017'


In [None]:
def get_class_counts(labels_path):
    """Count instances of each class using pandas value_counts"""
    all_classes = []
    for label_file in labels_path.glob('*.txt'):
        with open(label_file, 'r') as f:
            for line in f:
                if line.strip():
                    class_id = int(line.split()[0])
                    all_classes.append(class_id)
    
    return pd.Series(all_classes).value_counts()

# Get counts
train_counts = get_class_counts(train_labels_path)
val_counts = get_class_counts(val_labels_path)


In [83]:
# Filter subsets using pandas indexing
print(train_counts[(train_counts > 6000) & (train_counts < 6500)].index.tolist())
print(train_counts[(train_counts > 2600) & (train_counts < 2900)].index.tolist())

[27, 67, 74, 49, 32, 55, 44, 28, 37, 5]
[52, 66, 31, 29, 72]


In [66]:
# Label IDs to keep
labels_to_use_big = [27, 67, 74, 49, 32, 55, 44, 28, 37, 5] # 10 classes, 6000 instances
labels_to_use_small = [52, 66, 31, 29, 72] # 5 classes, 2800 instances

In [64]:
def extract_coco_subset(labels_path, output_dir_images, output_dir_labels, labels_to_use):

    # Statistics
    stats = {
        'total_processed': 0,
        'images_with_target_classes': 0,
        'class_distribution': Counter()
    }

    labels_paths = glob.glob(str(labels_path / '*.txt'))

    for label_idx, label_path in enumerate(labels_paths):
        label_path = Path(label_path)
        
        # Read and filter labels
        filtered_labels = []
        with open(label_path, 'r') as f:
            for line in f:
                class_id = int(line.split()[0])
                if class_id in labels_to_use:
                    filtered_labels.append(line)
                    stats['class_distribution'][class_id] += 1
        
        stats['total_processed'] += 1
        
        # If we found any matching labels, save label and image
        if filtered_labels:
            # Save filtered labels
            output_label = output_dir_labels / f'{label_path.stem}.txt'
            with open(output_label, 'w') as f:
                f.writelines(filtered_labels)
            
            # Copy corresponding image
            image_path = label_path.parent.parent.parent / 'images' / label_path.parent.name / f'{label_path.stem}.jpg'
            
            # Alternative if you prefer string replace:
            # image_path = Path(str(label_path).replace('/labels/', '/images/').replace('.txt', '.jpg'))
            
            if image_path.exists():
                output_image = output_dir_images / f'{label_path.stem}.jpg'
                shutil.copy(image_path, output_image)
                stats['images_with_target_classes'] += 1
                print(f'✓ {label_path.stem}: {len(filtered_labels)} objects')
            else:
                print(f'✗ Image not found: {image_path}')

    # Print summary
    print(f'\n{"="*50}')
    print(f'Summary:')
    print(f'{"="*50}')
    print(f'Total labels checked: {stats["total_processed"]}')
    print(f'Images with target classes: {stats["images_with_target_classes"]}')
    print(f'\nClass distribution:')
    for class_id, count in sorted(stats['class_distribution'].items()):
        print(f'  Class {class_id}: {count} instances')

In [None]:
# Create Coco subset 5 (small)
# Output directories 
output_dir = Path.cwd() / 'coco_5_classes'

# Output directories train
output_dir_labels_train = output_dir / 'labels' / 'train2017'
output_dir_images_train = output_dir / 'images' / 'train2017'

output_dir_labels_train.mkdir(parents=True, exist_ok=True)
output_dir_images_train.mkdir(parents=True, exist_ok=True)

# Output directories val
output_dir_labels_val = output_dir / 'labels' / 'val2017'
output_dir_images_val = output_dir / 'images' / 'val2017'

output_dir_labels_val.mkdir(parents=True, exist_ok=True)
output_dir_images_val.mkdir(parents=True, exist_ok=True)

# Extract COCO subset
extract_coco_subset(train_labels_path, output_dir_images_train, output_dir_labels_train, labels_to_use_small)
extract_coco_subset(val_labels_path, output_dir_images_val, output_dir_labels_val, labels_to_use_small)


✓ 000000560011: 1 objects
✓ 000000027620: 3 objects
✓ 000000428280: 1 objects
✓ 000000148620: 2 objects
✓ 000000056344: 1 objects
✓ 000000094944: 2 objects
✓ 000000468505: 6 objects
✓ 000000407868: 1 objects
✓ 000000333772: 2 objects
✓ 000000530975: 1 objects
✓ 000000319617: 1 objects
✓ 000000080273: 1 objects
✓ 000000017899: 1 objects
✓ 000000575970: 1 objects
✓ 000000172595: 2 objects
✓ 000000153229: 2 objects
✓ 000000529148: 1 objects
✓ 000000160556: 1 objects
✓ 000000283113: 2 objects
✓ 000000402118: 1 objects
✓ 000000350405: 1 objects
✓ 000000400082: 1 objects
✓ 000000093353: 1 objects
✓ 000000279730: 1 objects
✓ 000000097679: 1 objects
✓ 000000453341: 1 objects
✓ 000000178469: 1 objects
✓ 000000513567: 2 objects
✓ 000000491216: 1 objects
✓ 000000239857: 1 objects
✓ 000000367095: 1 objects
✓ 000000335658: 1 objects
✓ 000000020992: 1 objects
✓ 000000248314: 1 objects
✓ 000000009483: 1 objects
✓ 000000429598: 1 objects
✓ 000000006954: 2 objects
✓ 000000261036: 3 objects
✓ 0000004041

In [67]:
# Create Coco subset 10 (big)
# Output directories 
output_dir = Path.cwd() / 'coco_10_classes'

# Output directories train
output_dir_labels_train = output_dir / 'labels' / 'train2017'
output_dir_images_train = output_dir / 'images' / 'train2017'

output_dir_labels_train.mkdir(parents=True, exist_ok=True)
output_dir_images_train.mkdir(parents=True, exist_ok=True)

# Output directories val
output_dir_labels_val = output_dir / 'labels' / 'val2017'
output_dir_images_val = output_dir / 'images' / 'val2017'

output_dir_labels_val.mkdir(parents=True, exist_ok=True)
output_dir_images_val.mkdir(parents=True, exist_ok=True)

# Extract COCO subset
labels_to_use_big = [27, 67, 74, 49, 32, 55, 44, 28, 37, 5] # 10 classes, 6000 instances
extract_coco_subset(train_labels_path, output_dir_images_train, output_dir_labels_train, labels_to_use_big)
extract_coco_subset(val_labels_path, output_dir_images_val, output_dir_labels_val, labels_to_use_big)


✓ 000000295957: 1 objects
✓ 000000049648: 1 objects
✓ 000000550844: 5 objects
✓ 000000114229: 4 objects
✓ 000000027191: 3 objects
✓ 000000153162: 4 objects
✓ 000000201844: 1 objects
✓ 000000188766: 2 objects
✓ 000000314285: 4 objects
✓ 000000541949: 1 objects
✓ 000000231987: 1 objects
✓ 000000015645: 1 objects
✓ 000000431504: 1 objects
✓ 000000080448: 1 objects
✓ 000000468471: 1 objects
✓ 000000118413: 3 objects
✓ 000000274203: 1 objects
✓ 000000099119: 1 objects
✓ 000000418196: 2 objects
✓ 000000281922: 1 objects
✓ 000000534468: 2 objects
✓ 000000326237: 1 objects
✓ 000000469009: 1 objects
✓ 000000437613: 4 objects
✓ 000000356908: 5 objects
✓ 000000117046: 2 objects
✓ 000000086987: 4 objects
✓ 000000288906: 8 objects
✓ 000000262307: 3 objects
✓ 000000168692: 1 objects
✓ 000000148542: 2 objects
✓ 000000116358: 1 objects
✓ 000000109278: 1 objects
✓ 000000330455: 1 objects
✓ 000000463242: 1 objects
✓ 000000021286: 1 objects
✓ 000000443192: 1 objects
✓ 000000153604: 1 objects
✓ 0000005576