In [None]:
import os
import numpy as np
import shutil
from sklearn.model_selection import train_test_split

_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def _is_same_or_ancestor(candidate, target):
    """Return True if *candidate* is *target* itself or one of its parent folders."""
    candidate = os.path.normcase(os.path.abspath(candidate))
    target = os.path.normcase(os.path.abspath(target))
    return target == candidate or target.startswith(candidate.rstrip(os.sep) + os.sep)


def _copy_files(filenames, src_dir, dst_dir):
    """Copy every name in *filenames* from *src_dir* to *dst_dir* (metadata preserved)."""
    for filename in filenames:
        shutil.copy2(os.path.join(src_dir, filename), os.path.join(dst_dir, filename))


def split_dataset(source_path: str, output_path: str,
                  train_ratio: float = 0.8, random_state: int = 42) -> None:
    """
    Split an image dataset into train and validation sets, class by class.

    Every sub-folder of ``source_path`` is treated as one class. The images of
    each class are shuffled and split independently, so both splits keep
    (approximately) the class proportions of the original dataset. Files are
    copied, never moved; the source folder is left untouched.

    Parameters:
    -----------
    source_path : str
        Path to original dataset folder (one sub-folder per class)
    output_path : str
        Path where split dataset will be saved. Created if missing. May be
        located inside ``source_path``; it is then not treated as a class.
        Files already present in ``output_path`` are not removed.
    train_ratio : float
        Ratio of training data (default 0.8 for 80%), strictly between 0 and 1
    random_state : int
        Random seed for reproducibility

    Raises:
    -------
    ValueError
        If ``train_ratio`` is not strictly between 0 and 1.
    """
    if not 0 < train_ratio < 1:
        raise ValueError(
            f"train_ratio must be strictly between 0 and 1, got {train_ratio!r}"
        )

    train_pct = train_ratio * 100
    print("="*80)
    print(f"DATASET SPLITTING - TRAIN/VAL ({train_pct:g}:{100 - train_pct:g})")
    print("="*80)

    # Kept for callers that rely on the global NumPy RNG being seeded here;
    # the split itself is driven by `random_state` passed to train_test_split.
    np.random.seed(random_state)

    # Class folders, sorted for a stable order. The output folder (or a parent
    # of it) is skipped so that re-running with output_path inside source_path
    # does not turn the previous result into a bogus class.
    classes = sorted(
        d for d in os.listdir(source_path)
        if os.path.isdir(os.path.join(source_path, d))
        and not _is_same_or_ancestor(os.path.join(source_path, d), output_path)
    )

    print(f"\nClasses found: {classes}")

    # Create output directory structure
    for split in ['train', 'val']:
        for cls in classes:
            os.makedirs(os.path.join(output_path, split, cls), exist_ok=True)

    total_train = 0
    total_val = 0

    for cls in classes:
        print(f"\nProcessing class: {cls}")

        class_path = os.path.join(source_path, cls)
        # Sorted: os.listdir order is arbitrary, and the shuffle is only
        # reproducible if its input order is. Extension match is
        # case-insensitive so e.g. ".JPG" files are not silently dropped.
        all_files = sorted(
            f for f in os.listdir(class_path)
            if f.lower().endswith(_IMAGE_EXTENSIONS)
        )

        if not all_files:
            # Nothing to split; avoids train_test_split's ValueError on empty input.
            print("  No images found - skipping")
            continue

        if len(all_files) < 2:
            # A single image cannot be split; keep it for training.
            train_files, val_files = list(all_files), []
        else:
            train_files, val_files = train_test_split(
                all_files,
                # Rounded so float error in the subtraction (1 - 0.7 ->
                # 0.30000000000000004) cannot push an extra file into the
                # validation set when the size is rounded up.
                test_size=round(1 - train_ratio, 10),
                random_state=random_state,
            )

        print(f"  Total images: {len(all_files)}")
        print(f"  Training: {len(train_files)} ({len(train_files)/len(all_files)*100:.2f}%)")
        print(f"  Validation: {len(val_files)} ({len(val_files)/len(all_files)*100:.2f}%)")

        _copy_files(train_files, class_path, os.path.join(output_path, 'train', cls))
        _copy_files(val_files, class_path, os.path.join(output_path, 'val', cls))

        total_train += len(train_files)
        total_val += len(val_files)

    # Final summary
    print("\n" + "="*80)
    print("SPLITTING COMPLETE")
    print("="*80)
    print(f"\nTotal Training Images: {total_train}")
    print(f"Total Validation Images: {total_val}")
    print(f"Total Images: {total_train + total_val}")

    # Counts below are read back from disk, so they include any files that
    # were already present in the output folders before this run.
    print(f"\nDataset saved to: {output_path}")
    print("  Structure:")
    print("  ├── train/")
    for cls in classes:
        train_count = len(os.listdir(os.path.join(output_path, 'train', cls)))
        print(f"  │   ├── {cls}/ ({train_count} images)")
    print("  └── val/")
    for cls in classes:
        val_count = len(os.listdir(os.path.join(output_path, 'val', cls)))
        print(f"      └── {cls}/ ({val_count} images)")

    print("\n✓ Dataset ready for training!")


# Run the split
# Dataset root; each sub-folder in it is treated as one class.
source_path = r"C:\Users\USER\honors-final-project\models\malaria_dataset"
# Destination for the train/val copies. NOTE: this folder sits inside source_path.
output_path = r"C:\Users\USER\honors-final-project\models\malaria_dataset\split_data"

split_dataset(
    source_path=source_path,
    output_path=output_path,
    train_ratio=0.8,
    random_state=42,
)


DATASET SPLITTING - TRAIN/VAL (80:20)

Classes found: ['Parasitized', 'Uninfected']

Processing class: Parasitized
  Total images: 13779
  Training: 11023 (80.00%)
  Validation: 2756 (20.00%)

Processing class: Uninfected
  Total images: 13779
  Training: 11023 (80.00%)
  Validation: 2756 (20.00%)
