In [1]:
1+8

9

In [7]:
## STEP 1: IMPORTS AND CONFIGURATION

import os
import json
from datasets import load_dataset
from PIL import Image
from collections import defaultdict
from tqdm import tqdm
import random

# 25 Selected Classes (CORRECT indices from detection-datasets/coco)

SELECTED_CLASSES = {
    'person': 0,
    'bicycle': 1,
    'car': 2,
    'motorcycle': 3,
    'airplane': 4,
    'bus': 5,
    'train': 6,
    'truck': 7,
    'traffic light': 9,
    'stop sign': 11,
    'bench': 13,
    'bird': 14,
    'cat': 15,
    'dog': 16,
    'horse': 17,
    'cow': 19,
    'elephant': 20,
    'bottle': 39,
    'cup': 41,
    'bowl': 45,
    'pizza': 53,
    'cake': 55,
    'chair': 56,
    'couch': 57,
    'potted plant': 58,
    'bed': 59
}

IMAGES_PER_CLASS = 100
BASE_DIR = "smartvision_dataset"

In [8]:
## STEP 2: LOAD COCO DATASET FROM HUGGING FACE

print("üì• Loading COCO dataset in STREAMING mode (no download)...")
dataset = load_dataset("detection-datasets/coco", split="train", streaming=True)
print("‚úÖ Dataset loaded in streaming mode!")

üì• Loading COCO dataset in STREAMING mode (no download)...




‚úÖ Dataset loaded in streaming mode!


In [9]:
# STEP 3: COLLECT IMAGES FROM STREAM

print("\nüîç Starting image collection from COCO dataset stream...")
print(f"üéØ Target: {IMAGES_PER_CLASS} images per class")
print()

# Initialize storage for collected images
class_images = {class_name: [] for class_name in SELECTED_CLASSES.keys()}
class_counts = {class_name: 0 for class_name in SELECTED_CLASSES.keys()}

# Progress tracking
total_collected = 0
images_processed = 0
max_iterations = 50000  # Safety limit

print("‚è≥ Processing images from stream...")
print("üí° Progress updates every 100 images collected")
print()

try:
    # Iterate through streaming dataset
    for idx, item in enumerate(dataset):
        images_processed += 1

        # Progress update every 1000 images processed
        if images_processed % 1000 == 0:
            print(f"üìä Processed {images_processed} images | Collected {total_collected}/{len(SELECTED_CLASSES) * IMAGES_PER_CLASS}")

        # Safety check
        if images_processed >= max_iterations:
            print(f"‚ö†Ô∏è Reached safety limit of {max_iterations} iterations")
            break

        # Check if we have enough images for ALL classes
        if all(count >= IMAGES_PER_CLASS for count in class_counts.values()):
            print("üéâ Successfully collected enough images for ALL classes!")
            break

        # Safely get annotations from current image
        if 'objects' not in item:
            continue
            
        annotations = item['objects']
        
        # Check if categories exist in annotations
        if 'category' not in annotations:
            continue
            
        categories = annotations['category']

        # Ensure categories is iterable
        if not hasattr(categories, '__iter__'):
            categories = [categories]

        # Check if any of our target classes are in this image
        for cat_id in categories:
            for class_name, class_id in SELECTED_CLASSES.items():
                if cat_id == class_id and class_counts[class_name] < IMAGES_PER_CLASS:
                    
                    # Verify image data exists
                    if 'image' not in item:
                        continue

                    # Store the ACTUAL image data (not just index!)
                    class_images[class_name].append({
                        'image': item['image'],           # PIL Image object
                        'annotations': item['objects'],   # Annotations
                        'idx': images_processed           # For naming
                    })

                    class_counts[class_name] += 1
                    total_collected += 1

                    # Progress update every 100 collected
                    if total_collected % 100 == 0:
                        print(f"‚úì Collected {total_collected}/{len(SELECTED_CLASSES) * IMAGES_PER_CLASS} images")

                    break  # Only count once per class per image

except KeyboardInterrupt:
    print("\n‚ö†Ô∏è Collection interrupted by user")
except Exception as e:
    print(f"\n‚ùå Error during collection: {str(e)}")
    print(f"Last processed index: {images_processed}")

print()
print("="*60)
print("üìä COLLECTION COMPLETE:")
print("="*60)
print(f"Images Processed: {images_processed}")
print(f"Images Collected: {total_collected}")
print()

for class_name, count in sorted(class_counts.items()):
    status = "‚úÖ" if count >= IMAGES_PER_CLASS else "‚ö†Ô∏è"
    print(f"{status} {class_name:20s}: {count:3d} images")

print("="*60)

# Check if collection was successful
if total_collected < len(SELECTED_CLASSES) * IMAGES_PER_CLASS:
    print("\n‚ö†Ô∏è Warning: Not all target images were collected")
    missing = len(SELECTED_CLASSES) * IMAGES_PER_CLASS - total_collected
    print(f"Missing: {missing} images")


üîç Starting image collection from COCO dataset stream...
üéØ Target: 100 images per class

‚è≥ Processing images from stream...
üí° Progress updates every 100 images collected

‚úì Collected 100/2600 images
‚úì Collected 200/2600 images
‚úì Collected 300/2600 images
‚úì Collected 400/2600 images
‚úì Collected 500/2600 images
‚úì Collected 600/2600 images
‚úì Collected 700/2600 images
‚úì Collected 800/2600 images
‚úì Collected 900/2600 images
‚úì Collected 1000/2600 images
‚úì Collected 1100/2600 images
‚úì Collected 1200/2600 images
‚úì Collected 1300/2600 images
‚úì Collected 1400/2600 images
‚úì Collected 1500/2600 images
‚úì Collected 1600/2600 images
üìä Processed 1000 images | Collected 1682/2600
‚úì Collected 1700/2600 images
‚úì Collected 1800/2600 images
‚úì Collected 1900/2600 images
‚úì Collected 2000/2600 images
‚úì Collected 2100/2600 images
‚úì Collected 2200/2600 images
‚úì Collected 2300/2600 images
üìä Processed 2000 images | Collected 2364/2600
‚úì Collected 24

In [10]:
## STEP 4: CREATE FOLDER STRUCTURE

print("\nüìÅ Creating project folder structure...")
print()

# Create main directory
os.makedirs(BASE_DIR, exist_ok=True)

# Create subdirectories for Classification task
os.makedirs(f"{BASE_DIR}/classification/train", exist_ok=True)
os.makedirs(f"{BASE_DIR}/classification/val", exist_ok=True)
os.makedirs(f"{BASE_DIR}/classification/test", exist_ok=True)

# Create subdirectories for Detection task
os.makedirs(f"{BASE_DIR}/detection/images", exist_ok=True)
os.makedirs(f"{BASE_DIR}/detection/labels", exist_ok=True)

# Create class folders inside train/val/test
for class_name in SELECTED_CLASSES.keys():
    os.makedirs(f"{BASE_DIR}/classification/train/{class_name}", exist_ok=True)
    os.makedirs(f"{BASE_DIR}/classification/val/{class_name}", exist_ok=True)
    os.makedirs(f"{BASE_DIR}/classification/test/{class_name}", exist_ok=True)

print("‚úÖ Folder structure created successfully!")
print()
print("üìÇ Structure:")
print(f"""
{BASE_DIR}/
‚îú‚îÄ‚îÄ classification/
‚îÇ   ‚îú‚îÄ‚îÄ train/
‚îÇ   ‚îÇ   ‚îú‚îÄ‚îÄ person/
‚îÇ   ‚îÇ   ‚îú‚îÄ‚îÄ car/
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ ... (25 class folders)
‚îÇ   ‚îú‚îÄ‚îÄ val/
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ ... (25 class folders)
‚îÇ   ‚îî‚îÄ‚îÄ test/
‚îÇ       ‚îî‚îÄ‚îÄ ... (25 class folders)
‚îÇ
‚îî‚îÄ‚îÄ detection/
    ‚îú‚îÄ‚îÄ images/
    ‚îî‚îÄ‚îÄ labels/
""")


üìÅ Creating project folder structure...

‚úÖ Folder structure created successfully!

üìÇ Structure:

smartvision_dataset/
‚îú‚îÄ‚îÄ classification/
‚îÇ   ‚îú‚îÄ‚îÄ train/
‚îÇ   ‚îÇ   ‚îú‚îÄ‚îÄ person/
‚îÇ   ‚îÇ   ‚îú‚îÄ‚îÄ car/
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ ... (25 class folders)
‚îÇ   ‚îú‚îÄ‚îÄ val/
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ ... (25 class folders)
‚îÇ   ‚îî‚îÄ‚îÄ test/
‚îÇ       ‚îî‚îÄ‚îÄ ... (25 class folders)
‚îÇ
‚îî‚îÄ‚îÄ detection/
    ‚îú‚îÄ‚îÄ images/
    ‚îî‚îÄ‚îÄ labels/



In [11]:
## STEP 5: TRAIN/VAL/TEST SPLIT (70/15/15)

print("="*70)
print("üîÄ Preparing Train/Val/Test splits...")
print("üìä Split Ratio: 70% Train / 15% Val / 15% Test")
print("="*70)
print()

# Initialize metadata dictionary
metadata = {
    'total_images': 0,
    'classes': {},
    'splits': {'train': 0, 'val': 0, 'test': 0}
}

# Create split dictionaries for each class
train_data = {}
val_data = {}
test_data = {}

# Process each class
for class_name in SELECTED_CLASSES.keys():

    all_items = class_images.get(class_name, [])

    if not all_items:
        print(f"‚ö†Ô∏è Warning: No images found for {class_name}")
        continue

    # Calculate split indices
    n = len(all_items)
    train_split = int(0.7 * n)   # 70% for training
    val_split = int(0.85 * n)    # 15% for validation
    # Remaining 15% for test

    # Split the data
    train_data[class_name] = all_items[:train_split]
    val_data[class_name] = all_items[train_split:val_split]
    test_data[class_name] = all_items[val_split:]

    # Store split info in metadata
    metadata['classes'][class_name] = {
        'train': len(train_data[class_name]),
        'val': len(val_data[class_name]),
        'test': len(test_data[class_name]),
        'total': len(all_items)
    }

    metadata['splits']['train'] += len(train_data[class_name])
    metadata['splits']['val'] += len(val_data[class_name])
    metadata['splits']['test'] += len(test_data[class_name])
    metadata['total_images'] += len(all_items)

    print(f"{class_name:20s}: Train={len(train_data[class_name]):3d} | Val={len(val_data[class_name]):2d} | Test={len(test_data[class_name]):2d}")

üîÄ Preparing Train/Val/Test splits...
üìä Split Ratio: 70% Train / 15% Val / 15% Test

person              : Train= 70 | Val=15 | Test=15
bicycle             : Train= 70 | Val=15 | Test=15
car                 : Train= 70 | Val=15 | Test=15
motorcycle          : Train= 70 | Val=15 | Test=15
airplane            : Train= 70 | Val=15 | Test=15
bus                 : Train= 70 | Val=15 | Test=15
train               : Train= 70 | Val=15 | Test=15
truck               : Train= 70 | Val=15 | Test=15
traffic light       : Train= 70 | Val=15 | Test=15
stop sign           : Train= 70 | Val=15 | Test=15
bench               : Train= 70 | Val=15 | Test=15
bird                : Train= 70 | Val=15 | Test=15
cat                 : Train= 70 | Val=15 | Test=15
dog                 : Train= 70 | Val=15 | Test=15
horse               : Train= 70 | Val=15 | Test=15
cow                 : Train= 70 | Val=15 | Test=15
elephant            : Train= 70 | Val=15 | Test=15
bottle              : Train= 70 | Val=15 | 

In [12]:
import os
from PIL import Image
from tqdm import tqdm
import json

print("="*70)
print("üíæ STEP 6: SAVING IMAGES TO DISK")
print("="*70)
print()

# PART A: SAVE CLASSIFICATION IMAGES


print("üìÅ PART A: Saving Classification Images...")
print("   Format: Cropped objects, 224x224 pixels\n")

classification_stats = {'train': 0, 'val': 0, 'test': 0}

# Process each split
for split_name, split_data in [('train', train_data), ('val', val_data), ('test', test_data)]:

    print(f"üìÇ Processing {split_name.upper()} split...")

    # Process each class
    for class_name, items in tqdm(split_data.items(), desc=f"  {split_name}"):

        class_folder = f"{BASE_DIR}/classification/{split_name}/{class_name}"

        # Save each image
        for img_idx, item in enumerate(items):

            img = item['image']
            annotations = item['annotations']
            bboxes = annotations['bbox']
            categories = annotations['category']

            class_id = SELECTED_CLASSES[class_name]

            # Find bbox for this class
            for bbox, cat_id in zip(bboxes, categories):
                if cat_id == class_id:
                    x, y, w, h = bbox

                    try:
                        # Crop and resize
                        cropped_img = img.crop((x, y, x + w, y + h))
                        cropped_img = cropped_img.resize((224, 224), Image.LANCZOS)

                        # Save
                        img_filename = f"{class_name}_{split_name}_{img_idx:04d}.jpg"
                        img_path = os.path.join(class_folder, img_filename)
                        cropped_img.save(img_path, quality=95)

                        classification_stats[split_name] += 1

                    except Exception as e:
                        print(f"‚ö†Ô∏è Error: {class_name} image {img_idx}: {e}")

                    break

print()
print("="*70)
print("‚úÖ CLASSIFICATION IMAGES SAVED!")
print("="*70)
print(f"üìä Train: {classification_stats['train']} images")
print(f"üìä Val:   {classification_stats['val']} images")
print(f"üìä Test:  {classification_stats['test']} images")
print(f"üìä Total: {sum(classification_stats.values())} images")
print()

üíæ STEP 6: SAVING IMAGES TO DISK

üìÅ PART A: Saving Classification Images...
   Format: Cropped objects, 224x224 pixels

üìÇ Processing TRAIN split...


  train: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [01:16<00:00,  2.94s/it]


üìÇ Processing VAL split...


  val: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:16<00:00,  1.62it/s]


üìÇ Processing TEST split...


  test: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:13<00:00,  1.88it/s]


‚úÖ CLASSIFICATION IMAGES SAVED!
üìä Train: 1820 images
üìä Val:   390 images
üìä Test:  390 images
üìä Total: 2600 images




