In [18]:
import os
import yaml
import shutil
import numpy as np
from collections import defaultdict, Counter
from sklearn.model_selection import StratifiedKFold
import random

# Configuration: where the source dataset lives, where the folds are written,
# how many folds to build, and the seed shared by every random operation.
dataset_path = "../../datasets/roboflow_2"
output_path = "../../datasets/k_fold_cv"
k_folds = 5
random_seed = 42

# Seed NumPy's and Python's global generators so shuffles repeat across runs.
np.random.seed(random_seed)
random.seed(random_seed)

print(
    f"Setting up {k_folds}-fold cross validation",
    f"Source dataset: {dataset_path}",
    f"Output directory: {output_path}",
    sep="\n",
)

# The dataset's data.yaml is read for its 'names' and 'nc' entries.
data_yaml_path = os.path.join(dataset_path, "data.yaml")
with open(data_yaml_path, 'r') as f:
    data_config = yaml.safe_load(f)

print(
    f"Dataset classes: {data_config.get('names', [])}",
    f"Number of classes: {data_config.get('nc', 0)}",
    sep="\n",
)

# Make sure the destination exists before any fold is written into it.
os.makedirs(output_path, exist_ok=True)

Setting up 5-fold cross validation
Source dataset: ../../datasets/roboflow_2
Output directory: ../../datasets/k_fold_cv
Dataset classes: ['glass', 'metal', 'organic', 'paper', 'plastic']
Number of classes: 5


In [19]:
def collect_dataset_files(dataset_path):
    """Collect all image/label pairs from the train, test and val directories.

    For each split, ``<dataset_path>/<split>/images`` is scanned for files with
    a .jpg/.jpeg/.png/.bmp extension (case-insensitive) and each image is
    paired with ``<dataset_path>/<split>/labels/<image stem>.txt``.

    Args:
        dataset_path: Root directory containing the split sub-directories.

    Returns:
        A list of dicts with the keys 'image_path', 'label_path', 'filename'
        and 'original_split'. Entries are ordered by split (train, test, val)
        and, within a split, by sorted file name. Images without a label file
        are reported with a warning and left out; splits whose images/ or
        labels/ directory is missing are reported and skipped.

    NOTE(review): 'filename' is the bare image name, so two images with the
    same name in different splits yield entries with the same 'filename'.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')
    all_files = []

    for split in ['train', 'test', 'val']:
        images_dir = os.path.join(dataset_path, split, 'images')
        labels_dir = os.path.join(dataset_path, split, 'labels')

        # isdir (rather than exists) also skips the case where a plain file
        # happens to carry the expected directory name.
        if not os.path.isdir(images_dir) or not os.path.isdir(labels_dir):
            print(f"Warning: {split} directory not found, skipping...")
            continue

        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # position of every file in the result stable, which is required for
        # index-based, seeded splitting to be reproducible.
        image_files = sorted(
            f for f in os.listdir(images_dir)
            if f.lower().endswith(image_extensions)
        )

        for img_file in image_files:
            # The label shares the image's stem and lives in labels/.
            base_name = os.path.splitext(img_file)[0]
            label_path = os.path.join(labels_dir, base_name + '.txt')

            if os.path.isfile(label_path):
                all_files.append({
                    'image_path': os.path.join(images_dir, img_file),
                    'label_path': label_path,
                    'filename': img_file,
                    'original_split': split
                })
            else:
                print(f"Warning: No label file found for {img_file}")

    return all_files

def analyze_class_distribution(label_path):
    """Return the class id of every annotation line in a label file.

    A line counts as an annotation when it has at least five
    whitespace-separated fields (class_id x y w h); its first field is
    converted with int(). A missing or zero-byte file yields an empty list.
    """
    # Nothing to parse when the file is absent or empty.
    if not os.path.exists(label_path) or os.path.getsize(label_path) <= 0:
        return []

    with open(label_path, 'r') as label_file:
        rows = [raw.strip().split() for raw in label_file]

    # Lines with fewer than five fields are ignored.
    return [int(fields[0]) for fields in rows if len(fields) >= 5]

# Gather every image/label pair from the source dataset.
print("Collecting dataset files...")
all_files = collect_dataset_files(dataset_path)
print(f"Found {len(all_files)} image-label pairs")

# Parse each label file once, keeping the per-image class ids (keyed by image
# file name) as well as a running tally over the whole dataset.
print("Analyzing class distributions...")
file_classes, overall_class_counts = {}, Counter()

for file_info in all_files:
    classes = file_classes[file_info['filename']] = analyze_class_distribution(
        file_info['label_path']
    )
    overall_class_counts.update(classes)

print(f"Overall class distribution: {dict(overall_class_counts)}")

Collecting dataset files...
Found 12012 image-label pairs
Analyzing class distributions...
Overall class distribution: {1: 4880, 3: 2855, 2: 1634, 4: 6730, 0: 1262}
Overall class distribution: {1: 4880, 3: 2855, 2: 1634, 4: 6730, 0: 1262}


In [20]:
def get_dominant_class_strategy(file_classes, all_files):
    """
    Build one stratification label per entry of all_files.

    The label is the class id occurring most often in that image's class list
    (looked up in file_classes by 'filename'); an image with no class ids gets
    the special background label -1.
    """
    def label_for(class_ids):
        # Images without any annotated object form their own stratum.
        if not class_ids:
            return -1
        (top_class, _count), = Counter(class_ids).most_common(1)
        return top_class

    return [label_for(file_classes[info['filename']]) for info in all_files]

def create_balanced_splits(all_files, stratification_labels, k_folds, val_splits=7, seed=None):
    """Create k stratified train/val/test splits for cross validation.

    The outer StratifiedKFold uses ``k_folds`` partitions, so each fold gets a
    distinct test partition and the test partitions together cover the whole
    dataset. (Previously the outer splitter was fixed at 5 partitions, which
    repeated folds for k_folds > 5 and left images untested for k_folds < 5.
    For k_folds == 5 the result is unchanged: 80% train+val / 20% test.)

    Within each fold, the non-test part is split again with a second
    StratifiedKFold of ``val_splits`` partitions; the first of those
    partitions becomes the validation set (1/7 of the non-test part by
    default, i.e. roughly 11% of all images when k_folds is 5).

    Args:
        all_files: Sequence of file records; only its length is used here.
        stratification_labels: One label per entry of all_files.
        k_folds: Number of folds to create (scikit-learn requires >= 2).
        val_splits: Partition count of the inner train/val splitter.
        seed: random_state for both splitters; when None, the notebook-level
            ``random_seed`` is used.

    Returns:
        A list of k_folds dicts, each with the keys 'train', 'val' and 'test'
        mapping to numpy arrays of indices into all_files.
    """
    if seed is None:
        # Fall back to the seed defined in the notebook's configuration cell.
        seed = random_seed

    file_indices = np.arange(len(all_files))
    labels = np.asarray(stratification_labels)

    # One outer partition per requested fold.
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)

    fold_splits = []
    for fold_idx, (train_val_indices, test_indices) in enumerate(skf.split(file_indices, labels)):
        # Carve the validation set out of the non-test part, again stratified.
        val_skf = StratifiedKFold(n_splits=val_splits, shuffle=True, random_state=seed)
        inner_splits = list(val_skf.split(train_val_indices, labels[train_val_indices]))

        # The inner splitter yields positions within train_val_indices, so map
        # them back to indices into all_files.
        train_indices_inner, val_indices_inner = inner_splits[0]
        train_indices = train_val_indices[train_indices_inner]
        val_indices = train_val_indices[val_indices_inner]

        fold_splits.append({
            'train': train_indices,
            'val': val_indices,
            'test': test_indices
        })

        total = len(train_indices) + len(val_indices) + len(test_indices)
        print(f"Fold {fold_idx + 1}: Train={len(train_indices)} ({len(train_indices)/total*100:.1f}%), "
              f"Val={len(val_indices)} ({len(val_indices)/total*100:.1f}%), "
              f"Test={len(test_indices)} ({len(test_indices)/total*100:.1f}%)")

    return fold_splits

# One stratification label per image: its dominant class.
print("Determining stratification strategy...")
stratification_labels = get_dominant_class_strategy(file_classes, all_files)

# Show how many images fall into each stratum before splitting.
strat_counts = Counter()
strat_counts.update(stratification_labels)
print(f"Stratification label distribution: {dict(strat_counts)}")

# Build the train/val/test index sets for every fold.
print(f"\nCreating {k_folds}-fold splits...")
fold_splits = create_balanced_splits(
    all_files,
    stratification_labels,
    k_folds,
)

Determining stratification strategy...
Stratification label distribution: {1: 3768, 3: 2628, 2: 1447, 4: 3220, 0: 949}

Creating 5-fold splits...
Fold 1: Train=8236 (68.6%), Val=1373 (11.4%), Test=2403 (20.0%)
Fold 2: Train=8236 (68.6%), Val=1373 (11.4%), Test=2403 (20.0%)
Fold 3: Train=8237 (68.6%), Val=1373 (11.4%), Test=2402 (20.0%)
Fold 4: Train=8237 (68.6%), Val=1373 (11.4%), Test=2402 (20.0%)
Fold 5: Train=8237 (68.6%), Val=1373 (11.4%), Test=2402 (20.0%)


In [21]:
def create_fold_directories(output_path, fold_idx):
    """Ensure the fold's directory tree exists and return the fold directory.

    Creates <output_path>/fold_<fold_idx>/{train,val,test}/{images,labels};
    directories that already exist are left as they are.
    """
    fold_dir = os.path.join(output_path, f"fold_{fold_idx}")

    # Every split gets an images/ and a labels/ directory.
    leaf_dirs = [
        os.path.join(fold_dir, split, subdir)
        for split in ('train', 'val', 'test')
        for subdir in ('images', 'labels')
    ]
    for leaf in leaf_dirs:
        os.makedirs(leaf, exist_ok=True)

    return fold_dir

def copy_files_for_split(all_files, indices, fold_dir, split_name):
    """Copy the selected image/label pairs into one split of a fold.

    Each index in `indices` selects a record from `all_files`. The image goes
    to <fold_dir>/<split_name>/images under its 'filename'; the label goes to
    <fold_dir>/<split_name>/labels as '<image stem>.txt'. Returns the number
    of pairs copied.
    """
    split_root = os.path.join(fold_dir, split_name)
    images_dir = os.path.join(split_root, 'images')
    labels_dir = os.path.join(split_root, 'labels')

    copied_count = 0
    for copied_count, idx in enumerate(indices, start=1):
        record = all_files[idx]
        image_name = record['filename']

        # Image keeps its original file name.
        shutil.copy2(record['image_path'], os.path.join(images_dir, image_name))

        # Label is named after the image stem, whatever the source label is called.
        stem, _ext = os.path.splitext(image_name)
        shutil.copy2(record['label_path'], os.path.join(labels_dir, stem + '.txt'))

    return copied_count

def create_data_yaml(fold_dir, data_config, fold_idx):
    """Write <fold_dir>/data.yaml for one fold and return its path.

    The file holds a shallow copy of data_config whose 'train', 'val' and
    'test' entries are replaced by the fold-relative image directories.
    data_config itself is not modified. fold_idx is accepted but not used.
    """
    # Start from the dataset's config and repoint the three split entries.
    fold_data_config = data_config.copy()
    fold_data_config.update({
        'train': 'train/images',
        'val': 'val/images',
        'test': 'test/images',
    })

    yaml_path = os.path.join(fold_dir, 'data.yaml')
    with open(yaml_path, 'w') as yaml_file:
        yaml.dump(fold_data_config, yaml_file, default_flow_style=False)

    return yaml_path

def analyze_fold_distribution(all_files, indices, file_classes):
    """Count class ids over the images selected by `indices`.

    Each index selects a record in all_files; its 'filename' is looked up in
    file_classes to obtain that image's class ids. Returns a plain dict
    mapping class id to the total number of occurrences.
    """
    # Lazily walk the class-id lists of the selected images.
    per_image_classes = (file_classes[all_files[i]['filename']] for i in indices)

    tally = Counter()
    for class_ids in per_image_classes:
        tally.update(class_ids)

    return dict(tally)

print("Creating fold directories and copying files...")

for fold_idx in range(k_folds):
    print(f"\nProcessing Fold {fold_idx + 1}/{k_folds}")

    # Fold directories are numbered from 1; fold_splits is indexed from 0.
    fold_dir = create_fold_directories(output_path, fold_idx + 1)
    splits = fold_splits[fold_idx]

    # Populate each split of the fold and report what went into it.
    for split_name in splits:
        indices = splits[split_name]

        copied_count = copy_files_for_split(all_files, indices, fold_dir, split_name)
        print(f"  {split_name}: copied {copied_count} files")

        class_dist = analyze_fold_distribution(all_files, indices, file_classes)
        print(f"    Class distribution: {class_dist}")

    # Each fold carries its own data.yaml pointing at its split directories.
    yaml_path = create_data_yaml(fold_dir, data_config, fold_idx + 1)
    print(f"  Created: {yaml_path}")

print(f"\nK-fold cross validation setup complete!")
print(f"Output directory: {output_path}")
print(f"Each fold contains train (~68%), val (~12%), test (20%) splits with balanced class distributions")

Creating fold directories and copying files...

Processing Fold 1/5
  train: copied 8236 files
    Class distribution: {3: 1974, 4: 4594, 2: 1116, 1: 3268, 0: 851}
  val: copied 1373 files
    Class distribution: {1: 597, 2: 186, 4: 768, 3: 315, 0: 160}
  train: copied 8236 files
    Class distribution: {3: 1974, 4: 4594, 2: 1116, 1: 3268, 0: 851}
  val: copied 1373 files
    Class distribution: {1: 597, 2: 186, 4: 768, 3: 315, 0: 160}
  test: copied 2403 files
    Class distribution: {4: 1368, 0: 251, 2: 332, 1: 1015, 3: 566}
  Created: ../../datasets/k_fold_cv/fold_1/data.yaml

Processing Fold 2/5
  test: copied 2403 files
    Class distribution: {4: 1368, 0: 251, 2: 332, 1: 1015, 3: 566}
  Created: ../../datasets/k_fold_cv/fold_1/data.yaml

Processing Fold 2/5
  train: copied 8236 files
    Class distribution: {4: 4690, 2: 1125, 0: 853, 3: 1946, 1: 3358}
  val: copied 1373 files
    Class distribution: {1: 559, 3: 340, 2: 189, 4: 754, 0: 169}
  train: copied 8236 files
    Class dis

In [22]:
# Final validation and summary: recount the images actually present on disk.
print("\n" + "="*60)
print("K-FOLD CROSS VALIDATION SUMMARY")
print("="*60)

total_files_processed = 0
for fold_idx in range(k_folds):
    fold_dir = os.path.join(output_path, f"fold_{fold_idx + 1}")

    # One image count per split, evaluated in train -> val -> test order.
    train_count, val_count, test_count = (
        sum(
            1
            for f in os.listdir(os.path.join(fold_dir, split, 'images'))
            if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))
        )
        for split in ('train', 'val', 'test')
    )

    fold_total = train_count + val_count + test_count
    total_files_processed += fold_total

    print(f"Fold {fold_idx + 1}:")
    print(f"  Train: {train_count} images ({train_count/fold_total*100:.1f}%)")
    print(f"  Val:   {val_count} images ({val_count/fold_total*100:.1f}%)")
    print(f"  Test:  {test_count} images ({test_count/fold_total*100:.1f}%)")
    print(f"  Total: {fold_total} images")
    print()

print(f"Original dataset: {len(all_files)} images")
# Integer average of the per-fold totals accumulated above.
print(f"Total processed: {total_files_processed // k_folds} images per fold")
print(f"Class names: {data_config.get('names', [])}")

# Confirm that every fold directory received its data.yaml.
print("\nDirectory structure verification:")
for fold_idx in range(k_folds):
    fold_dir = os.path.join(output_path, f"fold_{fold_idx + 1}")
    data_yaml_exists = os.path.exists(os.path.join(fold_dir, 'data.yaml'))
    print(f"Fold {fold_idx + 1}: data.yaml exists = {data_yaml_exists}")

print(f"\nSetup complete! You can now use each fold for cross-validation training.")
print(f"Each fold is located in: {output_path}/fold_X/")
print(f"Each fold contains a data.yaml file ready for YOLO training.")


K-FOLD CROSS VALIDATION SUMMARY
Fold 1:
  Train: 8236 images (68.6%)
  Val:   1373 images (11.4%)
  Test:  2403 images (20.0%)
  Total: 12012 images

Fold 2:
  Train: 8236 images (68.6%)
  Val:   1373 images (11.4%)
  Test:  2403 images (20.0%)
  Total: 12012 images

Fold 3:
  Train: 8237 images (68.6%)
  Val:   1373 images (11.4%)
  Test:  2402 images (20.0%)
  Total: 12012 images

Fold 4:
  Train: 8237 images (68.6%)
  Val:   1373 images (11.4%)
  Test:  2402 images (20.0%)
  Total: 12012 images

Fold 5:
  Train: 8237 images (68.6%)
  Val:   1373 images (11.4%)
  Test:  2402 images (20.0%)
  Total: 12012 images

Original dataset: 12012 images
Total processed: 12012 images per fold
Class names: ['glass', 'metal', 'organic', 'paper', 'plastic']

Directory structure verification:
Fold 1: data.yaml exists = True
Fold 2: data.yaml exists = True
Fold 3: data.yaml exists = True
Fold 4: data.yaml exists = True
Fold 5: data.yaml exists = True

Setup complete! You can now use each fold for cr