## Step 1: Setup & Imports

In [3]:
from pathlib import Path
import shutil
import random
from collections import defaultdict
from tqdm import tqdm
from PIL import Image
import numpy as np

# Seed Python's `random` module so the random sampling done later in this
# notebook (random.sample on the COCO images) draws the same sequence each run.
random.seed(42)

## Step 2: Configure Paths

In [4]:
# Inputs: the two datasets that get merged into the combined training set.
YAHOO_DATASET = Path("datasets/preprocessed/yahoo_human_balls/ready")
COCO_DATASET = Path("datasets/processed/coco_persons")  # Existing COCO processing

# Destination: a flat layout with one images/ folder and one labels/ folder.
OUTPUT_DATASET = Path("datasets/ready/combined_train")
OUTPUT_IMAGES = OUTPUT_DATASET.joinpath("images")
OUTPUT_LABELS = OUTPUT_DATASET.joinpath("labels")

# Make sure both destination folders exist (safe to re-run).
OUTPUT_LABELS.mkdir(parents=True, exist_ok=True)
OUTPUT_IMAGES.mkdir(parents=True, exist_ok=True)

print(f"‚úì Output dataset: {OUTPUT_DATASET}")

‚úì Output dataset: datasets/ready/combined_train


## Step 3: Define Class Mapping

Map source class names to unified class IDs:
- 0: red ball
- 1: human

In [5]:
# Unified label scheme: a class's ID is its position in CLASS_NAMES.
CLASS_NAMES = ["red ball", "human"]

# Name -> ID lookup derived from the list so the two can never disagree.
CLASS_MAPPING = {name: idx for idx, name in enumerate(CLASS_NAMES)}

print("Class mapping:")
for idx, name in enumerate(CLASS_NAMES):
    print(f"  {idx}: {name}")

Class mapping:
  0: red ball
  1: human


## Step 4: Copy Yahoo Dataset

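Copy every image from the `yahoo_human_balls` dataset and merge its per-class label files into a single label file per image, rewriting each class ID to the unified mapping.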

In [6]:
def copy_dataset_with_class_conversion(source_ready_path: Path, output_images: Path, output_labels: Path, class_mapping: dict):
    """
    Copy a dataset in the new format (ready/images + ready/labels/{class}/)
    to a unified format with class ID conversion.

    Every image found in ``source_ready_path / "images"`` is copied to
    ``output_images`` (existing destination files are left untouched). For each
    image that has at least one label file under
    ``source_ready_path / "labels" / <class name> / <image stem>.txt``, the
    label files of all classes are merged into
    ``output_labels / <image stem>.txt``; the first token of every non-blank
    line is replaced by the class ID from ``class_mapping`` and the remaining
    tokens are kept as they are.

    Args:
        source_ready_path: Directory containing ``images/`` and ``labels/``.
        output_images: Destination directory for images (created if missing).
        output_labels: Destination directory for merged label files (created
            if missing).
        class_mapping: Maps a class name (also the name of its label
            sub-directory) to the unified class ID.

    Returns:
        Number of images for which a merged label file was written, or 0 when
        the source images directory does not exist.
    """
    source_images = source_ready_path / "images"
    source_labels = source_ready_path / "labels"

    if not source_images.exists():
        print(f"‚ö†Ô∏è  No images found at {source_images}")
        return 0

    output_images.mkdir(parents=True, exist_ok=True)
    output_labels.mkdir(parents=True, exist_ok=True)

    # Match extensions case-insensitively in a single pass. Separate
    # "*.jpg" / "*.JPG" globs return each file twice where globbing is
    # case-insensitive, and miss mixed-case names such as "photo.Jpg" elsewhere.
    image_suffixes = {".jpg", ".jpeg", ".png"}
    images = sorted(
        path for path in source_images.iterdir()
        if path.is_file() and path.suffix.lower() in image_suffixes
    )

    print(f"üìÅ Found {len(images)} images")

    # Build image stem -> [(class_id, label file), ...]
    image_labels = defaultdict(list)

    for class_name, class_id in class_mapping.items():
        class_label_dir = source_labels / class_name
        if not class_label_dir.exists():
            continue

        txt_files = sorted(class_label_dir.glob("*.txt"))
        print(f"  - {class_name} ({class_id}): {len(txt_files)} label files")

        for txt_file in txt_files:
            image_labels[txt_file.stem].append((class_id, txt_file))

    # Copy images and merge labels
    copied_count = 0

    for img_path in tqdm(images, desc="Copying"):
        stem = img_path.stem

        dest_img = output_images / img_path.name
        if not dest_img.exists():
            shutil.copy(img_path, dest_img)

        # Images without any label file are still copied, but get no label file.
        if stem not in image_labels:
            continue

        merged_labels = []
        for class_id, txt_file in image_labels[stem]:
            with open(txt_file, 'r', encoding="utf-8") as f:
                for line in f:
                    parts = line.split()
                    if parts:
                        # Replace the source class ID with the unified one
                        parts[0] = str(class_id)
                        merged_labels.append(" ".join(parts))

        dest_txt = output_labels / f"{stem}.txt"
        dest_txt.write_text("\n".join(merged_labels), encoding="utf-8")
        copied_count += 1

    print(f"‚úÖ Copied {copied_count} image-label pairs")
    return copied_count

In [7]:
# Section banner, then merge the Yahoo dataset into the combined output.
print("\n" + "=" * 60)
print("üì¶ COPYING YAHOO_HUMAN_BALLS DATASET")
print("=" * 60)

yahoo_count = copy_dataset_with_class_conversion(
    source_ready_path=YAHOO_DATASET,
    output_images=OUTPUT_IMAGES,
    output_labels=OUTPUT_LABELS,
    class_mapping=CLASS_MAPPING,
)


üì¶ COPYING YAHOO_HUMAN_BALLS DATASET
üìÅ Found 495 images
  - red ball (0): 456 label files
  - human (1): 457 label files


Copying: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 495/495 [00:01<00:00, 374.46it/s]

‚úÖ Copied 495 image-label pairs





## Step 5: Add COCO Persons

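Copy COCO person images and labels into the combined dataset, prefixing each file name with `coco_` to avoid name clashes. Labels are copied unchanged, so they must already use class ID 1 (human).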

In [8]:
def copy_coco_persons(coco_dir: Path, output_images: Path, output_labels: Path, max_images: int = None):
    """
    Copy COCO person dataset (old format with images/ and labels/ directly).

    Only images that have a matching ``labels/<image stem>.txt`` are considered.
    Each selected pair is copied to ``output_images / "coco_<image name>"`` and
    ``output_labels / "coco_<image stem>.txt"``. Label files are copied
    verbatim, with no class-ID rewriting. Files that already exist at the
    destination are not overwritten.

    Args:
        coco_dir: Directory containing ``images/`` and ``labels/``.
        output_images: Destination directory for images (created if missing).
        output_labels: Destination directory for labels (created if missing).
        max_images: Upper bound on the number of image-label pairs to select.
            ``None`` means no limit; ``0`` selects nothing. When the bound is
            smaller than the number of available pairs, a random subset is
            drawn with ``random.sample``.

    Returns:
        Number of label files newly copied by this call (pairs whose label
        already existed at the destination are not counted), or 0 when the
        COCO directories are missing.

    Raises:
        ValueError: If ``max_images`` is negative (raised by ``random.sample``).
    """
    coco_images = coco_dir / "images"
    coco_labels = coco_dir / "labels"

    if not coco_images.exists() or not coco_labels.exists():
        print(f"‚ö†Ô∏è  COCO dataset not found at {coco_dir}")
        return 0

    output_images.mkdir(parents=True, exist_ok=True)
    output_labels.mkdir(parents=True, exist_ok=True)

    # Keep only images that actually have a label, *before* sampling, so the
    # max_images budget is not spent on images that would be skipped anyway.
    # Sorting makes the seeded random.sample below independent of the order in
    # which the filesystem happens to list the directory.
    image_suffixes = {".jpg", ".jpeg", ".png"}
    images = sorted(
        img_path for img_path in coco_images.iterdir()
        if img_path.is_file()
        and img_path.suffix.lower() in image_suffixes
        and (coco_labels / f"{img_path.stem}.txt").exists()
    )

    # `is not None` so that max_images=0 means "none" rather than "no limit".
    if max_images is not None:
        images = random.sample(images, min(len(images), max_images))

    print(f"üìÅ Found {len(images)} COCO images")

    copied_count = 0

    for img_path in tqdm(images, desc="Copying COCO"):
        stem = img_path.stem
        txt_path = coco_labels / f"{stem}.txt"

        # Prefix to avoid conflicts with files from other datasets
        dest_img = output_images / f"coco_{img_path.name}"
        if not dest_img.exists():
            shutil.copy(img_path, dest_img)

        # Labels are copied as-is; they are expected to already use class ID 1
        dest_txt = output_labels / f"coco_{stem}.txt"
        if not dest_txt.exists():
            shutil.copy(txt_path, dest_txt)
            copied_count += 1

    print(f"‚úÖ Copied {copied_count} COCO image-label pairs")
    return copied_count

In [9]:
# Section banner, then add the COCO person images to the combined output.
print("\n" + "=" * 60)
print("üì¶ ADDING COCO PERSONS")
print("=" * 60)

coco_count = copy_coco_persons(
    coco_dir=COCO_DATASET,
    output_images=OUTPUT_IMAGES,
    output_labels=OUTPUT_LABELS,
    max_images=500,  # Limit COCO images to balance dataset
)


üì¶ ADDING COCO PERSONS
üìÅ Found 200 COCO images


Copying COCO: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:00<00:00, 663.50it/s]

‚úÖ Copied 200 COCO image-label pairs





## Step 6: Dataset Statistics

In [10]:
# Count final dataset.
# Only files with an image extension are counted, so stray entries in the
# images folder (e.g. OS metadata files or sub-directories) do not inflate
# the total.
image_suffixes = {".jpg", ".jpeg", ".png"}
total_images = sum(
    1 for path in OUTPUT_IMAGES.iterdir()
    if path.is_file() and path.suffix.lower() in image_suffixes
)
label_files = sorted(OUTPUT_LABELS.glob("*.txt"))
total_labels = len(label_files)

# Count instances per class: one non-blank label line is one instance, and
# its first token is the class ID.
class_counts = defaultdict(int)

for txt_file in label_files:
    with open(txt_file, 'r', encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if parts:
                class_id = int(parts[0])
                class_counts[class_id] += 1

print("\n" + "="*60)
print("üìä FINAL DATASET STATISTICS")
print("="*60)
print(f"Total images: {total_images}")
print(f"Total labels: {total_labels}")
print("\nInstances per class:")
for class_id in sorted(class_counts):
    # Report IDs outside CLASS_NAMES instead of raising IndexError (or
    # silently wrapping around for a negative ID).
    class_name = CLASS_NAMES[class_id] if 0 <= class_id < len(CLASS_NAMES) else "unknown"
    print(f"  {class_id} ({class_name}): {class_counts[class_id]} instances")

print(f"\n‚úÖ Combined dataset ready at: {OUTPUT_DATASET}")
print(f"  - Images: {OUTPUT_IMAGES}")
print(f"  - Labels: {OUTPUT_LABELS}")


üìä FINAL DATASET STATISTICS
Total images: 695
Total labels: 695

Instances per class:
  0 (red ball): 461 instances
  1 (human): 1277 instances

‚úÖ Combined dataset ready at: datasets/ready/combined_train
  - Images: datasets/ready/combined_train/images
  - Labels: datasets/ready/combined_train/labels
