# Image Dataset Splitter
This notebook splits images into train, validation, and test sets and saves them in a new folder structure.

## Import Required Libraries

In [18]:
import os
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split
import random
from tqdm import tqdm

# Seed Python's built-in `random` module so that the `random.shuffle` call
# used when splitting the images starts from a fixed RNG state.
random.seed(42)

## Configuration

In [19]:
# Source directory (change this to your dataset path).
SOURCE_DIR = 'RealWaste'

# Destination directory for the re-split dataset.
# NOTE: this is a machine-specific absolute path -- adjust it before running
# the notebook on another machine.
DEST_DIR = r'E:\CNN-A03\dataloader'

# Split ratios: fraction of every class that goes to each split.
TRAIN_RATIO = 0.7  # 70% for training
VAL_RATIO = 0.15   # 15% for validation
TEST_RATIO = 0.15  # 15% for testing

# Fail fast on a misconfigured split. The test set receives whatever is left
# after the train and validation slices, so ratios that do not add up to 1
# would otherwise go unnoticed.
_split_ratios = (TRAIN_RATIO, VAL_RATIO, TEST_RATIO)
assert all(0.0 <= ratio <= 1.0 for ratio in _split_ratios), \
    f"Split ratios must lie in [0, 1], got {_split_ratios}"
assert abs(sum(_split_ratios) - 1.0) < 1e-9, \
    f"Split ratios must sum to 1, got {sum(_split_ratios)}"

print(f"Source directory: {SOURCE_DIR}")
print(f"Destination directory: {DEST_DIR}")
print(f"Split ratios - Train: {TRAIN_RATIO}, Val: {VAL_RATIO}, Test: {TEST_RATIO}")

Source directory: RealWaste
Destination directory: E:\CNN-A03\dataloader
Split ratios - Train: 0.7, Val: 0.15, Test: 0.15


## Create Directory Structure

In [20]:
def create_directory_structure(base_dir, classes):
    """
    Build the ``<base_dir>/<split>/<class>`` folder tree for the train, val
    and test splits. Folders that already exist are left as they are.
    """
    split_names = ('train', 'val', 'test')
    for split_name in split_names:
        for label in classes:
            target = os.path.join(base_dir, split_name, label)
            os.makedirs(target, exist_ok=True)
    print(f"Created directory structure in {base_dir}")

# Get class names from the source directory.
# Class folders are read from <SOURCE_DIR>/train when that folder exists;
# otherwise they are the immediate subdirectories of SOURCE_DIR.
train_dir = os.path.join(SOURCE_DIR, 'train')
class_root = train_dir if os.path.isdir(train_dir) else SOURCE_DIR

# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the class order (and everything derived from it) stable from run to run.
classes = sorted(
    d for d in os.listdir(class_root)
    if os.path.isdir(os.path.join(class_root, d))
)

print(f"Found {len(classes)} classes: {classes}")
create_directory_structure(DEST_DIR, classes)

Found 9 classes: ['Cardboard', 'Food Organics', 'Glass', 'Metal', 'Miscellaneous Trash', 'Paper', 'Plastic', 'Textile Trash', 'Vegetation']
Created directory structure in E:\CNN-A03\dataloader


## Collect All Images from Source

In [21]:
def collect_images_from_source(source_dir, classes):
    """
    Collect the image paths of every class from the source directory.

    For each class, images are gathered from every
    ``<source_dir>/<split>/<class>`` folder that exists (split being train,
    val or test, in that order). When a class has no folder under any split,
    the flat layout ``<source_dir>/<class>`` is used instead. A source that
    only ships some of the split folders (e.g. train and test but no val) is
    therefore still picked up.

    Args:
        source_dir: Root folder of the dataset.
        classes: Iterable of class (sub-folder) names to look for.

    Returns:
        Dictionary with class names as keys and a list of image paths as
        values. Within each folder the paths are in sorted filename order, so
        the result does not depend on the order os.listdir() happens to use.
        Classes without any image map to an empty list.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    split_names = ('train', 'val', 'test')

    def list_images(folder):
        """Return the image paths directly inside folder, sorted by filename."""
        return [
            os.path.join(folder, file_name)
            for file_name in sorted(os.listdir(folder))
            if file_name.lower().endswith(image_extensions)
        ]

    images_by_class = {class_name: [] for class_name in classes}

    for class_name, image_paths in images_by_class.items():
        split_class_dirs = [
            os.path.join(source_dir, split, class_name) for split in split_names
        ]
        class_dirs = [folder for folder in split_class_dirs if os.path.isdir(folder)]
        if not class_dirs:
            # No split folder holds this class: fall back to the flat layout
            flat_dir = os.path.join(source_dir, class_name)
            class_dirs = [flat_dir] if os.path.isdir(flat_dir) else []

        for folder in class_dirs:
            image_paths.extend(list_images(folder))

    return images_by_class

images_by_class = collect_images_from_source(SOURCE_DIR, classes)

# Report how many images were found for each class, followed by the grand total
print("\nImage count per class:")
for class_name, images in images_by_class.items():
    print(f"  {class_name}: {len(images)} images")

total_images = sum(len(found) for found in images_by_class.values())
print(f"\nTotal images: {total_images}")


Image count per class:
  Cardboard: 461 images
  Food Organics: 411 images
  Glass: 420 images
  Metal: 790 images
  Miscellaneous Trash: 495 images
  Paper: 500 images
  Plastic: 921 images
  Textile Trash: 318 images
  Vegetation: 436 images

Total images: 4752


## Split and Copy Images

In [22]:
def _unique_filename(name, taken):
    """Return name, suffixed with _1, _2, ... if needed so it is not in taken, and record it."""
    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 0
    # normcase makes the comparison case-insensitive on platforms where file names are
    while os.path.normcase(candidate) in taken:
        counter += 1
        candidate = f"{stem}_{counter}{ext}"
    taken.add(os.path.normcase(candidate))
    return candidate


def split_and_copy_images(images_by_class, dest_dir, train_ratio, val_ratio, test_ratio):
    """
    Split images into train, val, and test sets and copy them to destination.

    Each class is shuffled with the global ``random`` module and cut into
    three consecutive slices: the first ``int(n * train_ratio)`` images go to
    train, the next ``int(n * val_ratio)`` to val, and the remainder to test.

    Args:
        images_by_class: Dictionary mapping class name to a list of image
            paths. The lists are not modified.
        dest_dir: Root folder receiving ``<split>/<class>/<image>``.
        train_ratio: Fraction of each class used for training.
        val_ratio: Fraction of each class used for validation.
        test_ratio: Fraction of each class used for testing; only used to
            check that the three ratios add up to 1.

    Returns:
        Dictionary with the number of images copied per split
        (keys 'train', 'val', 'test').

    Raises:
        ValueError: If a ratio is negative or the ratios do not sum to 1.

    Note:
        Files already present in the destination are not removed, so
        re-running with a different shuffle into the same folder can leave the
        same image in more than one split. Use a fresh destination folder.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios) or abs(sum(ratios) - 1.0) > 1e-6:
        raise ValueError(
            f"Split ratios must be non-negative and sum to 1, got "
            f"train={train_ratio}, val={val_ratio}, test={test_ratio}"
        )

    stats = {'train': 0, 'val': 0, 'test': 0}

    for class_name, images in tqdm(images_by_class.items(), desc="Processing classes"):
        if len(images) == 0:
            print(f"Warning: No images found for class {class_name}")
            continue

        # Shuffle a copy: shuffling in place would silently reorder the
        # caller's lists and make a second run start from a different order.
        shuffled = list(images)
        random.shuffle(shuffled)

        # Calculate split indices
        n_total = len(shuffled)
        n_train = int(n_total * train_ratio)
        n_val = int(n_total * val_ratio)

        splits = (
            ('train', shuffled[:n_train]),
            ('val', shuffled[n_train:n_train + n_val]),
            ('test', shuffled[n_train + n_val:]),
        )

        # Copy images to respective folders
        for split_name, split_images in splits:
            dest_class_dir = os.path.join(dest_dir, split_name, class_name)
            # Do not rely on an earlier cell having created the folder
            os.makedirs(dest_class_dir, exist_ok=True)

            # Images merged from several source folders may share a basename;
            # give later ones a numeric suffix instead of overwriting.
            used_names = set()
            for img_path in split_images:
                img_name = _unique_filename(os.path.basename(img_path), used_names)
                shutil.copy2(img_path, os.path.join(dest_class_dir, img_name))
                stats[split_name] += 1

    return stats

print("Starting to split and copy images...")
stats = split_and_copy_images(images_by_class, DEST_DIR, TRAIN_RATIO, VAL_RATIO, TEST_RATIO)

# Summary banner followed by the per-split and overall copy counts
banner = "="*50
total_copied = sum(stats.values())

print("\n" + banner)
print("SPLITTING COMPLETE!")
print(banner)
print(f"\nImages copied:")
print(f"  Training set: {stats['train']} images")
print(f"  Validation set: {stats['val']} images")
print(f"  Test set: {stats['test']} images")
print(f"  Total: {total_copied} images")
print(f"\nData saved in: {DEST_DIR}")

Starting to split and copy images...


Processing classes: 100%|██████████| 9/9 [00:02<00:00,  3.35it/s]


SPLITTING COMPLETE!

Images copied:
  Training set: 3323 images
  Validation set: 710 images
  Test set: 719 images
  Total: 4752 images

Data saved in: E:\CNN-A03\dataloader





## Verify the Split

In [23]:
def verify_split(dest_dir):
    """
    Print, for each of the train, val and test folders under dest_dir, the
    number of image files in every class folder and the total for the split.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    print("\nVerification of split:")
    print("="*50)

    for split in ('train', 'val', 'test'):
        split_dir = os.path.join(dest_dir, split)
        print(f"\n{split.upper()} SET:")

        # Only sub-folders count as classes; report them alphabetically
        class_names = sorted(
            entry for entry in os.listdir(split_dir)
            if os.path.isdir(os.path.join(split_dir, entry))
        )

        split_total = 0
        for class_name in class_names:
            file_names = os.listdir(os.path.join(split_dir, class_name))
            image_count = sum(
                1 for file_name in file_names
                if file_name.lower().endswith(image_suffixes)
            )
            print(f"  {class_name}: {image_count} images")
            split_total += image_count

        print(f"  Total: {split_total} images")

# Print the per-class image counts of each split found in the destination folder
verify_split(DEST_DIR)


Verification of split:

TRAIN SET:
  Cardboard: 322 images
  Food Organics: 287 images
  Glass: 294 images
  Metal: 553 images
  Miscellaneous Trash: 346 images
  Paper: 350 images
  Plastic: 644 images
  Textile Trash: 222 images
  Vegetation: 305 images
  Total: 3323 images

VAL SET:
  Cardboard: 69 images
  Food Organics: 61 images
  Glass: 63 images
  Metal: 118 images
  Miscellaneous Trash: 74 images
  Paper: 75 images
  Plastic: 138 images
  Textile Trash: 47 images
  Vegetation: 65 images
  Total: 710 images

TEST SET:
  Cardboard: 70 images
  Food Organics: 63 images
  Glass: 63 images
  Metal: 119 images
  Miscellaneous Trash: 75 images
  Paper: 75 images
  Plastic: 139 images
  Textile Trash: 49 images
  Vegetation: 66 images
  Total: 719 images


## Optional: Create a Summary Report