# K-Fold CV Dataset Subsampling

This notebook performs stratified random subsampling of the **k_fold_cv_augmented** dataset to create a smaller **k_fold_cv_subsampled** dataset. Each fold is processed independently: images are sampled from the fold's training set so that the original class distribution is approximately preserved, and the fold's validation set is copied unchanged.

In [None]:
# Import required libraries
import os
import sys
import shutil
from pathlib import Path
import yaml
import random
import numpy as np
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
# Suppress every warning for the rest of the notebook session
warnings.filterwarnings('ignore')

# Fixed seeds on both RNGs keep the sampling reproducible
np.random.seed(17)
random.seed(17)

# Plot appearance: reset the style first, then apply the palette and sizes
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Where the k-fold CV data is read from and written to
SOURCE_DIR = Path("datasets") / "k_fold_cv_augmented"
TARGET_DIR = Path("datasets") / "k_fold_cv_subsampled"

# Fold layout and per-fold size of the subsampled training set
NUM_FOLDS = 5
TARGET_TRAIN_SIZE_PER_FOLD = 10000

# Waste classes (list index == class id in the label files) and plot colours
CLASS_NAMES = ['glass', 'metal', 'organic', 'paper', 'plastic']
COLORS = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']

# Echo the configuration so the run is self-describing
print(
    "🚀 K-Fold CV Dataset Subsampling Setup Complete",
    f"📁 Source: {SOURCE_DIR}",
    f"📁 Target: {TARGET_DIR}",
    f"🔢 Number of folds: {NUM_FOLDS}",
    f"🎯 Target train size per fold: {TARGET_TRAIN_SIZE_PER_FOLD:,} images",
    f"🎯 Total target train size: {TARGET_TRAIN_SIZE_PER_FOLD * NUM_FOLDS:,} images",
    f"📊 Classes: {CLASS_NAMES}",
    sep="\n",
)

In [None]:
def analyze_kfold_dataset(dataset_dir, num_folds=None, class_names=None):
    """
    Analyze the training split of every fold of a k-fold CV dataset.

    For each ``fold_<i>/train`` directory the ``labels/*.txt`` files are
    parsed. A line counts as one annotation when it has at least five
    whitespace-separated fields and its first field is an integer class id
    inside ``class_names`` (presumably YOLO-format labels - confirm). Label
    stems that also have a file in ``images/`` are recorded as valid pairs.

    Args:
        dataset_dir: ``pathlib.Path`` of the dataset root holding ``fold_<i>``.
        num_folds: Number of folds to scan. Defaults to the module-level
            ``NUM_FOLDS``.
        class_names: Class names indexed by class id. Defaults to the
            module-level ``CLASS_NAMES``.

    Returns:
        Tuple ``(fold_stats, total_stats)``.

        ``fold_stats`` maps fold index to a dict with keys ``class_counts``
        (Counter of annotations per class name), ``file_to_classes`` (label
        stem -> set of class names), ``valid_images`` (stems having both label
        and image), ``total_annotations``, ``total_images`` and ``fold_dir``.

        ``total_stats`` aggregates the same counters over all folds; its
        ``file_to_classes`` keys and ``valid_images`` entries are prefixed
        with ``fold_<i>_``.

    Raises:
        FileNotFoundError: If ``dataset_dir`` does not exist.
    """
    if num_folds is None:
        num_folds = NUM_FOLDS
    if class_names is None:
        class_names = CLASS_NAMES

    print("🔍 ANALYZING K-FOLD CV DATASET")
    print("=" * 60)

    if not dataset_dir.exists():
        raise FileNotFoundError(f"Source directory not found: {dataset_dir}")

    fold_stats = {}
    total_stats = {
        'class_counts': Counter(),
        'file_to_classes': {},
        'valid_images': [],
        'total_annotations': 0,
        'total_images': 0
    }

    for fold_idx in range(num_folds):
        fold_dir = dataset_dir / f"fold_{fold_idx}"
        print(f"\n📁 Analyzing fold_{fold_idx}...")

        if not fold_dir.exists():
            print(f"⚠️  Fold_{fold_idx} not found, skipping...")
            continue

        train_dir = fold_dir / "train"
        labels_dir = train_dir / "labels"
        images_dir = train_dir / "images"

        if not labels_dir.exists() or not images_dir.exists():
            print(f"❌ Train directories not found for fold_{fold_idx}")
            continue

        # Count class occurrences for this fold
        fold_class_counts = Counter()
        fold_file_to_classes = {}

        label_files = list(labels_dir.glob("*.txt"))
        print(f"  📄 Found {len(label_files)} label files")

        for label_file in label_files:
            if label_file.stat().st_size == 0:
                continue  # Skip empty files

            # Count into a per-file Counter first, so a file that fails halfway
            # through contributes nothing instead of a partial count.
            file_counts = Counter()
            try:
                with open(label_file, 'r') as f:
                    for line in f:
                        parts = line.split()
                        if len(parts) < 5:
                            continue
                        class_id = int(parts[0])
                        if 0 <= class_id < len(class_names):
                            file_counts[class_names[class_id]] += 1
            except (OSError, ValueError) as e:
                # ValueError covers a non-integer class id and undecodable bytes
                print(f"    ❌ Error reading {label_file}: {e}")
                continue

            fold_class_counts.update(file_counts)
            fold_file_to_classes[label_file.stem] = set(file_counts)

        # An image is "valid" when a parsed label with the same stem exists.
        # Each stem is recorded once even if several files share it.
        image_files = list(images_dir.glob("*"))
        fold_valid_images = []
        seen_stems = set()
        for img_file in image_files:
            stem = img_file.stem
            if stem in fold_file_to_classes and stem not in seen_stems:
                seen_stems.add(stem)
                fold_valid_images.append(stem)

        fold_total_annotations = sum(fold_class_counts.values())

        print(f"  🖼️ Total images: {len(image_files)}")
        print(f"  ✅ Valid image-label pairs: {len(fold_valid_images)}")
        print(f"  📊 Total annotations: {fold_total_annotations}")

        print(f"  📈 Class Distribution for fold_{fold_idx}:")
        for class_name in class_names:
            count = fold_class_counts.get(class_name, 0)
            percentage = (count / fold_total_annotations * 100) if fold_total_annotations > 0 else 0
            print(f"     {class_name:8}: {count:5,} annotations ({percentage:5.1f}%)")

        # Store fold statistics
        fold_stats[fold_idx] = {
            'class_counts': fold_class_counts,
            'file_to_classes': fold_file_to_classes,
            'valid_images': fold_valid_images,
            'total_annotations': fold_total_annotations,
            'total_images': len(fold_valid_images),
            'fold_dir': fold_dir
        }

        # Update total statistics; stems are prefixed because the same stem may
        # appear in several folds
        total_stats['class_counts'].update(fold_class_counts)
        total_stats['file_to_classes'].update({f"fold_{fold_idx}_{k}": v for k, v in fold_file_to_classes.items()})
        total_stats['valid_images'].extend([f"fold_{fold_idx}_{img}" for img in fold_valid_images])
        total_stats['total_annotations'] += fold_total_annotations
        total_stats['total_images'] += len(fold_valid_images)

    print(f"\n🌟 OVERALL STATISTICS ACROSS ALL FOLDS:")
    print(f"  📊 Total annotations: {total_stats['total_annotations']:,}")
    print(f"  🖼️ Total valid images: {total_stats['total_images']:,}")
    print(f"  📈 Overall class distribution:")

    for class_name in class_names:
        count = total_stats['class_counts'].get(class_name, 0)
        percentage = (count / total_stats['total_annotations'] * 100) if total_stats['total_annotations'] > 0 else 0
        print(f"     {class_name:8}: {count:5,} annotations ({percentage:5.1f}%)")

    return fold_stats, total_stats

# Analyze the k-fold CV dataset when this cell runs. fold_stats is the input of
# the sampling step; both results are passed to the comparison plots.
fold_stats, total_stats = analyze_kfold_dataset(SOURCE_DIR)

In [None]:
def perform_stratified_sampling_per_fold(fold_stats, target_size_per_fold, class_names=None):
    """
    Greedily pick a class-balanced subset of training images for every fold.

    For each fold, a per-class target is derived from that class's share of
    the fold's annotations. Images are then selected one at a time: the class
    with the largest relative deficit is determined, and among the first 100
    not-yet-selected images containing it (in shuffled order) the image whose
    classes have the highest summed deficit is taken. Only stems listed in the
    fold's ``valid_images`` are eligible, so a label without an image file is
    never selected.

    Args:
        fold_stats: Mapping fold index -> dict with keys ``class_counts``,
            ``file_to_classes``, ``valid_images``, ``total_annotations`` and
            ``fold_dir``.
        target_size_per_fold: Maximum number of images to select per fold.
        class_names: Class names to balance over. Defaults to the module-level
            ``CLASS_NAMES``.

    Returns:
        Mapping fold index -> dict with keys ``selected_images`` (set of
        stems), ``target_class_counts``, ``achieved_class_counts`` (Counter,
        incremented once per class per selected image), ``sampling_ratio``
        (selected / eligible images) and ``fold_dir``. Folds without
        annotations are omitted.
    """
    from itertools import islice  # function-scope: not imported at file level

    if class_names is None:
        class_names = CLASS_NAMES

    print(f"\n🎲 PERFORMING STRATIFIED SAMPLING FOR EACH FOLD")
    print("=" * 60)

    fold_sampling_stats = {}

    for fold_idx, stats in fold_stats.items():
        print(f"\n🔄 Processing fold_{fold_idx}...")
        print("-" * 30)

        class_counts = stats['class_counts']
        file_to_classes = stats['file_to_classes']
        total_annotations = stats['total_annotations']

        if total_annotations == 0:
            print(f"⚠️  Skipping fold_{fold_idx}: No annotations found")
            continue

        # Calculate proportional targets for each class in this fold
        target_class_counts = {}
        print("🎯 Target class distribution:")
        for class_name in class_names:
            proportion = class_counts.get(class_name, 0) / total_annotations
            target_count = int(target_size_per_fold * proportion)
            target_class_counts[class_name] = target_count
            print(f"   {class_name:8}: {target_count:4,} images ({proportion*100:5.1f}%)")

        # Restrict the pool to stems that really have an image; a label-only
        # stem could not be copied later and would inflate the selected count.
        valid_stems = set(stats['valid_images'])
        eligible = [
            (img, classes) for img, classes in file_to_classes.items()
            if img in valid_stems
        ]
        random.shuffle(eligible)
        classes_of = dict(eligible)

        # Insertion-ordered dicts used as ordered sets: they keep the shuffled
        # order and allow O(1) removal, so candidates need no full rescan.
        remaining = dict.fromkeys(img for img, _ in eligible)
        class_pools = defaultdict(dict)
        for img, classes in eligible:
            for class_name in classes:
                class_pools[class_name][img] = None

        print(f"\n📊 Images per class in fold_{fold_idx}:")
        for class_name in class_names:
            count = len(class_pools[class_name])
            print(f"   {class_name:8}: {count:4,} images")

        selected_images = set()
        current_class_counts = Counter()

        # One image is removed from `remaining` per iteration, so this ends.
        while len(selected_images) < target_size_per_fold and remaining:
            # Relative deficit per class; negative once a class is over target
            class_deficits = {}
            for class_name in class_names:
                target = target_class_counts[class_name]
                if target > 0:
                    class_deficits[class_name] = (target - current_class_counts[class_name]) / target
                else:
                    class_deficits[class_name] = 0

            most_needed_class = max(class_deficits, key=class_deficits.get, default=None)

            # Fall back to any remaining image when the needed class is exhausted
            pool = class_pools.get(most_needed_class) or remaining

            best_img = None
            best_score = -float('inf')
            for img in islice(pool, 100):  # Limit search for efficiency
                score = sum(class_deficits.get(c, 0) for c in classes_of[img])
                if score > best_score:
                    best_score = score
                    best_img = img

            selected_images.add(best_img)
            del remaining[best_img]
            for class_name in classes_of[best_img]:
                current_class_counts[class_name] += 1
                del class_pools[class_name][best_img]

        print(f"\n✅ Selected {len(selected_images)} images for fold_{fold_idx}")
        print("📊 Achieved class distribution:")

        sampling_ratio = len(selected_images) / len(eligible) if eligible else 0

        fold_sampling_stats[fold_idx] = {
            'selected_images': selected_images,
            'target_class_counts': target_class_counts,
            'achieved_class_counts': current_class_counts,
            'sampling_ratio': sampling_ratio,
            'fold_dir': stats['fold_dir']
        }

        for class_name in class_names:
            target = target_class_counts[class_name]
            achieved = current_class_counts[class_name]
            ratio = achieved / target if target > 0 else 0
            print(f"   {class_name:8}: {achieved:4,} / {target:4,} ({ratio:5.1%})")

    # Overall summary (guarded: every fold may have been skipped)
    total_selected = sum(len(stats['selected_images']) for stats in fold_sampling_stats.values())
    average_selected = total_selected / len(fold_sampling_stats) if fold_sampling_stats else 0
    print(f"\n🌟 OVERALL SAMPLING SUMMARY:")
    print(f"  Total images selected across all folds: {total_selected:,}")
    print(f"  Average images per fold: {average_selected:.0f}")

    return fold_sampling_stats

# Perform the stratified sampling for each fold. The result lists the selected
# image stems per fold and is consumed by the copy step and the comparison plots.
fold_sampling_stats = perform_stratified_sampling_per_fold(fold_stats, TARGET_TRAIN_SIZE_PER_FOLD)

In [None]:
def create_subsampled_kfold_dataset(source_dir, target_dir, fold_sampling_stats, sampling_seed=17):
    """
    Write the subsampled k-fold CV dataset to disk.

    For every fold in ``fold_sampling_stats`` this creates
    ``fold_<i>/{train,val}/{images,labels}`` under ``target_dir``, copies the
    selected training images with their label files, copies the validation
    split unchanged, and writes a ``data.yaml`` derived from the source fold's
    ``data.yaml`` (when present) with relative paths and sampling metadata.

    Args:
        source_dir: Root of the source dataset. Not read by this function; the
            per-fold source path comes from ``stats['fold_dir']``. Kept so the
            call signature stays unchanged.
        target_dir: ``pathlib.Path`` of the dataset root to create.
        fold_sampling_stats: Mapping fold index -> dict with at least
            ``selected_images`` (iterable of image stems) and ``fold_dir``.
        sampling_seed: Seed recorded in the ``data.yaml`` metadata. Defaults
            to 17, the value this notebook passes to ``random.seed`` and
            ``np.random.seed``; pass the real seed if that changes.

    Returns:
        None. Progress and totals are printed.
    """
    from datetime import date  # function-scope: only needed for the metadata stamp

    print(f"\n📁 CREATING SUBSAMPLED K-FOLD CV DATASET")
    print("=" * 60)

    # Create target directory structure
    target_dir.mkdir(parents=True, exist_ok=True)
    print(f"📁 Created main directory at {target_dir}")

    total_copied_images = 0
    total_copied_labels = 0

    for fold_idx, stats in fold_sampling_stats.items():
        print(f"\n🔄 Processing fold_{fold_idx}...")
        print("-" * 30)

        # Create fold directory structure
        fold_target_dir = target_dir / f"fold_{fold_idx}"
        for split in ['train', 'val']:
            for subdir in ['images', 'labels']:
                (fold_target_dir / split / subdir).mkdir(parents=True, exist_ok=True)

        print(f"  📁 Created directory structure for fold_{fold_idx}")

        selected_images = stats['selected_images']
        fold_source_dir = stats['fold_dir']

        source_train_images = fold_source_dir / "train" / "images"
        source_train_labels = fold_source_dir / "train" / "labels"
        target_train_images = fold_target_dir / "train" / "images"
        target_train_labels = fold_target_dir / "train" / "labels"

        # Existing files are not removed, so a re-run with a different
        # selection would silently grow the training set. Make that visible.
        if any(target_train_images.iterdir()):
            print(f"  ⚠️  {target_train_images} is not empty; files from a previous run will be mixed in")

        # Index the source images once by stem. This avoids one directory scan
        # per image, is safe for names with glob metacharacters, and never
        # matches another stem's file (as "a.*" would match "a.b.jpg").
        # Sorting makes the choice deterministic when two files share a stem.
        image_by_stem = {}
        if source_train_images.exists():
            for path in sorted(source_train_images.iterdir()):
                if path.is_file():
                    image_by_stem.setdefault(path.stem, path)

        print(f"  📤 Copying training set ({len(selected_images)} images)...")

        fold_copied_images = 0
        fold_copied_labels = 0
        missing_images = 0

        for img_name in tqdm(selected_images, desc=f"Copying fold_{fold_idx} train files"):
            source_img = image_by_stem.get(img_name)
            if source_img is None:
                # No image: skip the label too rather than leave it orphaned
                missing_images += 1
                continue

            shutil.copy2(source_img, target_train_images / source_img.name)
            fold_copied_images += 1

            # Copy corresponding label file
            source_label = source_train_labels / f"{img_name}.txt"
            if source_label.exists():
                shutil.copy2(source_label, target_train_labels / f"{img_name}.txt")
                fold_copied_labels += 1

        print(f"  ✅ Copied {fold_copied_images} training images")
        print(f"  ✅ Copied {fold_copied_labels} training labels")
        if missing_images:
            print(f"  ⚠️  Skipped {missing_images} selected entries without an image file")

        total_copied_images += fold_copied_images
        total_copied_labels += fold_copied_labels

        # Copy validation set (unchanged)
        print(f"  📤 Copying validation set (unchanged)...")

        source_val = fold_source_dir / "val"
        target_val = fold_target_dir / "val"

        if source_val.exists():
            source_val_images = source_val / "images"
            source_val_labels = source_val / "labels"

            val_img_count = 0
            val_label_count = 0

            if source_val_images.exists():
                for img_file in source_val_images.glob("*"):
                    if not img_file.is_file():
                        continue  # copy2 cannot copy a directory
                    shutil.copy2(img_file, target_val / "images" / img_file.name)
                    val_img_count += 1

            if source_val_labels.exists():
                for label_file in source_val_labels.glob("*.txt"):
                    if not label_file.is_file():
                        continue
                    shutil.copy2(label_file, target_val / "labels" / label_file.name)
                    val_label_count += 1

            print(f"  ✅ Copied {val_img_count} validation images")
            print(f"  ✅ Copied {val_label_count} validation labels")

        # Copy and update data.yaml for this fold
        source_yaml = fold_source_dir / "data.yaml"
        target_yaml = fold_target_dir / "data.yaml"

        if source_yaml.exists():
            with open(source_yaml, 'r') as f:
                # An empty YAML document loads as None
                data_config = yaml.safe_load(f) or {}

            # Update paths to be relative to fold directory
            data_config['train'] = 'train/images'
            data_config['val'] = 'val/images'

            # Add metadata. NOTE: this is written as an ordinary mapping key
            # whose name starts with '#', not as a YAML comment.
            data_config['# Subsampling Info'] = {
                'original_dataset': str(fold_source_dir),
                'target_train_size': len(selected_images),
                'sampling_date': date.today().isoformat(),
                'sampling_seed': sampling_seed,
                'fold_index': fold_idx
            }

            with open(target_yaml, 'w') as f:
                yaml.dump(data_config, f, default_flow_style=False)

            print(f"  ✅ Created data.yaml for fold_{fold_idx}")

    # Guarded: fold_sampling_stats may be empty
    fold_count = len(fold_sampling_stats)
    average_images = total_copied_images / fold_count if fold_count else 0

    print(f"\n🎉 K-Fold CV dataset subsampling complete!")
    print(f"📁 New dataset location: {target_dir}")
    print(f"🖼️ Total training images copied: {total_copied_images:,}")
    print(f"📄 Total training labels copied: {total_copied_labels:,}")
    print(f"📊 Average images per fold: {average_images:.0f}")

# Create the subsampled k-fold CV dataset on disk under TARGET_DIR, using the
# per-fold selections computed above
create_subsampled_kfold_dataset(SOURCE_DIR, TARGET_DIR, fold_sampling_stats)

In [None]:
def validate_subsampled_kfold_dataset(target_dir, num_folds=None, class_names=None):
    """
    Validate a subsampled k-fold CV dataset and recount its training labels.

    For each ``fold_<i>`` under ``target_dir`` the four expected directories
    (``train``/``val`` x ``images``/``labels``) are checked and their file
    counts printed. The training label files are then parsed to rebuild the
    per-class annotation counts. Label files that cannot be read or parsed are
    reported and left out of the counts entirely.

    Args:
        target_dir: ``pathlib.Path`` of the subsampled dataset root.
        num_folds: Number of folds to check. Defaults to the module-level
            ``NUM_FOLDS``.
        class_names: Class names indexed by class id. Defaults to the
            module-level ``CLASS_NAMES``.

    Returns:
        Tuple ``(fold_validation_stats, total_validation_stats)``: a mapping
        fold index -> Counter of annotations per class name (folds whose
        directory is missing are omitted), and the Counter summed over folds.
    """
    if num_folds is None:
        num_folds = NUM_FOLDS
    if class_names is None:
        class_names = CLASS_NAMES

    print(f"\n✅ VALIDATING SUBSAMPLED K-FOLD CV DATASET")
    print("=" * 60)

    fold_validation_stats = {}
    total_validation_stats = Counter()

    for fold_idx in range(num_folds):
        fold_dir = target_dir / f"fold_{fold_idx}"
        print(f"\n📁 Validating fold_{fold_idx}...")

        if not fold_dir.exists():
            print(f"❌ fold_{fold_idx} directory not found!")
            continue

        # Check directory structure
        required_dirs = [
            fold_dir / "train" / "images",
            fold_dir / "train" / "labels",
            fold_dir / "val" / "images",
            fold_dir / "val" / "labels"
        ]

        for dir_path in required_dirs:
            relative_path = dir_path.relative_to(fold_dir)
            if dir_path.exists():
                file_count = len(list(dir_path.glob("*")))
                print(f"  ✅ {relative_path}: {file_count} files")
            else:
                print(f"  ❌ {relative_path}: Missing!")

        # Analyze new train set for this fold
        train_labels_dir = fold_dir / "train" / "labels"
        fold_class_counts = Counter()

        if train_labels_dir.exists():
            unreadable_files = 0

            for label_file in train_labels_dir.glob("*.txt"):
                if label_file.stat().st_size == 0:
                    continue

                # Per-file Counter: a file failing halfway adds nothing
                file_counts = Counter()
                try:
                    with open(label_file, 'r') as f:
                        for line in f:
                            parts = line.split()
                            if len(parts) < 5:
                                continue
                            class_id = int(parts[0])
                            if 0 <= class_id < len(class_names):
                                file_counts[class_names[class_id]] += 1
                except (OSError, ValueError) as e:
                    # A validation step must surface corrupt labels, not hide them
                    unreadable_files += 1
                    print(f"  ❌ Error reading {label_file}: {e}")
                    continue

                fold_class_counts.update(file_counts)

            if unreadable_files:
                print(f"  ⚠️  {unreadable_files} label files could not be parsed and were not counted")

            print(f"  📊 Training set class distribution for fold_{fold_idx}:")
            fold_total = sum(fold_class_counts.values())
            for class_name in class_names:
                count = fold_class_counts.get(class_name, 0)
                percentage = (count / fold_total * 100) if fold_total > 0 else 0
                print(f"     {class_name:8}: {count:5,} annotations ({percentage:5.1f}%)")

        fold_validation_stats[fold_idx] = fold_class_counts
        total_validation_stats.update(fold_class_counts)

    # Overall validation summary
    print(f"\n🌟 OVERALL VALIDATION SUMMARY:")
    print("-" * 40)

    overall_total = sum(total_validation_stats.values())
    print(f"📊 Total annotations across all folds: {overall_total:,}")

    if overall_total > 0:
        print(f"📈 Overall class distribution:")
        for class_name in class_names:
            count = total_validation_stats.get(class_name, 0)
            percentage = count / overall_total * 100
            print(f"   {class_name:8}: {count:5,} annotations ({percentage:5.1f}%)")

    return fold_validation_stats, total_validation_stats

# Validate the subsampled k-fold CV dataset: re-read what was written under
# TARGET_DIR and collect per-fold and overall class counts
fold_validation_stats, total_validation_stats = validate_subsampled_kfold_dataset(TARGET_DIR)

In [None]:
def create_kfold_comparison_visualizations(fold_stats, fold_sampling_stats, fold_validation_stats, total_stats):
    """
    Plot and print a comparison of the original and subsampled datasets.

    Draws a 2x2 figure (absolute class counts, class percentages, per-fold
    sampling ratio, dataset sizes), then one class-distribution panel per
    fold, and finally prints a textual summary.

    Args:
        fold_stats: Per-fold analysis of the original dataset; each value's
            ``class_counts`` is read.
        fold_sampling_stats: Per-fold sampling result; ``selected_images``,
            ``achieved_class_counts`` and ``sampling_ratio`` are read.
        fold_validation_stats: Mapping fold index -> Counter of annotations
            per class in the written dataset. Summed here over all folds.
        total_stats: Aggregated analysis of the original dataset;
            ``class_counts`` and ``total_images`` are read.

    Returns:
        Dict with ``original_props`` and ``achieved_props`` (class
        percentages in ``CLASS_NAMES`` order), ``overall_ratio`` (subsampled
        training images / original training images), ``avg_difference`` (mean
        absolute percentage-point difference) and ``fold_ratios``.

    Note:
        ``achieved_class_counts`` is incremented once per class per selected
        image by the sampling step, whereas the "Original" and "Validated"
        series are annotation counts, so absolute bar heights of the three
        series are not directly comparable.
    """
    print(f"\n📊 CREATING K-FOLD COMPARISON VISUALIZATIONS")
    print("=" * 60)

    # Aggregate data across all folds
    original_counts = [total_stats['class_counts'].get(cls, 0) for cls in CLASS_NAMES]

    # Sum achieved counts across all folds
    achieved_counts = [0] * len(CLASS_NAMES)
    for stats in fold_sampling_stats.values():
        for i, class_name in enumerate(CLASS_NAMES):
            achieved_counts[i] += stats['achieved_class_counts'].get(class_name, 0)

    # Sum validation counts across all folds from the argument itself, so the
    # function does not depend on a notebook-level global being defined.
    validation_totals = Counter()
    for fold_counts in fold_validation_stats.values():
        validation_totals.update(fold_counts)
    validation_counts = [validation_totals.get(cls, 0) for cls in CLASS_NAMES]

    # Calculate proportions
    original_total = sum(original_counts)
    achieved_total = sum(achieved_counts)
    validation_total = sum(validation_counts)

    original_props = [c/original_total*100 if original_total > 0 else 0 for c in original_counts]
    achieved_props = [c/achieved_total*100 if achieved_total > 0 else 0 for c in achieved_counts]
    validation_props = [c/validation_total*100 if validation_total > 0 else 0 for c in validation_counts]

    # Image totals. The overall ratio is image-based so that it is in the same
    # unit as the per-fold sampling ratios it is drawn next to.
    original_total_images = total_stats['total_images']
    subsampled_total_images = sum(len(stats['selected_images']) for stats in fold_sampling_stats.values())
    overall_ratio = subsampled_total_images / original_total_images if original_total_images > 0 else 0

    # Create comprehensive comparison plot
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('K-Fold CV Dataset Subsampling Comparison', fontsize=16, fontweight='bold')

    # 1. Absolute counts comparison
    x = np.arange(len(CLASS_NAMES))
    width = 0.25

    ax1.bar(x - width, original_counts, width, label='Original', color='skyblue', alpha=0.8)
    ax1.bar(x, achieved_counts, width, label='Subsampled', color='orange', alpha=0.8)
    ax1.bar(x + width, validation_counts, width, label='Validated', color='lightgreen', alpha=0.8)

    ax1.set_title('Absolute Annotation Counts (All Folds)')
    ax1.set_xlabel('Classes')
    ax1.set_ylabel('Number of Annotations')
    ax1.set_xticks(x)
    ax1.set_xticklabels(CLASS_NAMES, rotation=45)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Add value labels on bars (offset by 1% of the tallest bar of the series)
    for i, (orig, ach, val) in enumerate(zip(original_counts, achieved_counts, validation_counts)):
        ax1.text(i - width, orig + max(original_counts)*0.01, f'{orig}', ha='center', va='bottom', fontsize=8)
        ax1.text(i, ach + max(achieved_counts)*0.01, f'{ach}', ha='center', va='bottom', fontsize=8)
        ax1.text(i + width, val + max(validation_counts)*0.01, f'{val}', ha='center', va='bottom', fontsize=8)

    # 2. Percentage distribution comparison
    ax2.bar(x - width/2, original_props, width, label='Original (%)', color='skyblue', alpha=0.8)
    ax2.bar(x + width/2, achieved_props, width, label='Subsampled (%)', color='orange', alpha=0.8)

    ax2.set_title('Percentage Distribution Comparison (All Folds)')
    ax2.set_xlabel('Classes')
    ax2.set_ylabel('Percentage (%)')
    ax2.set_xticks(x)
    ax2.set_xticklabels(CLASS_NAMES, rotation=45)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Per-fold sampling visualization
    fold_indices = list(fold_sampling_stats.keys())
    fold_sampling_ratios = [fold_sampling_stats[i]['sampling_ratio'] for i in fold_indices]

    bars = ax3.bar([f'Fold {i}' for i in fold_indices], fold_sampling_ratios,
                   color=COLORS[:len(fold_indices)], alpha=0.8)
    ax3.axhline(y=overall_ratio, color='red', linestyle='--',
               label=f'Overall Ratio: {overall_ratio:.3f}')
    ax3.set_title('Sampling Ratio by Fold')
    ax3.set_xlabel('Folds')
    ax3.set_ylabel('Sampling Ratio')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Add value labels (default=0 keeps this safe when no fold was sampled)
    ratio_label_offset = max(fold_sampling_ratios, default=0) * 0.01
    for bar, ratio in zip(bars, fold_sampling_ratios):
        ax3.text(bar.get_x() + bar.get_width()/2, ratio + ratio_label_offset,
                f'{ratio:.3f}', ha='center', va='bottom', fontweight='bold')

    # 4. Dataset size comparison
    # Count validation images across all folds
    original_val_images = 0
    subsampled_val_images = 0

    for fold_idx in range(NUM_FOLDS):
        # Original validation count
        orig_fold_dir = SOURCE_DIR / f"fold_{fold_idx}" / "val" / "images"
        if orig_fold_dir.exists():
            original_val_images += len(list(orig_fold_dir.glob("*")))

        # Subsampled validation count
        sub_fold_dir = TARGET_DIR / f"fold_{fold_idx}" / "val" / "images"
        if sub_fold_dir.exists():
            subsampled_val_images += len(list(sub_fold_dir.glob("*")))

    sizes = ['Original\nTrain', 'Subsampled\nTrain', 'Original\nVal', 'Subsampled\nVal']
    counts = [original_total_images, subsampled_total_images, original_val_images, subsampled_val_images]

    colors_size = ['skyblue', 'orange', 'lightblue', 'moccasin']
    bars = ax4.bar(sizes, counts, color=colors_size, alpha=0.8)
    ax4.set_title('Dataset Size Comparison (All Folds)')
    ax4.set_ylabel('Number of Images')
    ax4.grid(True, alpha=0.3)

    # Add value labels
    for bar, count in zip(bars, counts):
        ax4.text(bar.get_x() + bar.get_width()/2, count + max(counts)*0.01,
                f'{count:,}', ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Per-fold detailed comparison. squeeze=False always returns a 2-D array
    # of axes, so a single fold is plotted like any other number of folds.
    if fold_indices:
        fig, axes_grid = plt.subplots(1, len(fold_indices), figsize=(4*len(fold_indices), 6), squeeze=False)
        axes = axes_grid[0]

        fig.suptitle('Class Distribution Comparison by Fold', fontsize=14, fontweight='bold')

        for idx, fold_idx in enumerate(fold_indices):
            # Original vs achieved for this fold
            fold_original = [fold_stats[fold_idx]['class_counts'].get(cls, 0) for cls in CLASS_NAMES]
            fold_achieved = [fold_sampling_stats[fold_idx]['achieved_class_counts'].get(cls, 0) for cls in CLASS_NAMES]

            x_fold = np.arange(len(CLASS_NAMES))
            width_fold = 0.35

            axes[idx].bar(x_fold - width_fold/2, fold_original, width_fold,
                         label='Original', color='skyblue', alpha=0.8)
            axes[idx].bar(x_fold + width_fold/2, fold_achieved, width_fold,
                         label='Subsampled', color='orange', alpha=0.8)

            axes[idx].set_title(f'Fold {fold_idx}')
            axes[idx].set_xlabel('Classes')
            axes[idx].set_xticks(x_fold)
            axes[idx].set_xticklabels([cls[:4] for cls in CLASS_NAMES], rotation=45)
            if idx == 0:
                # Shared y-label and legend only on the first panel
                axes[idx].set_ylabel('Annotations')
                axes[idx].legend()
            axes[idx].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    # Summary statistics
    print(f"\n📈 K-FOLD SUBSAMPLING SUMMARY")
    print("=" * 40)
    print(f"Original training images (all folds): {original_total_images:,}")
    print(f"Subsampled training images (all folds): {subsampled_total_images:,}")
    print(f"Overall sampling ratio: {overall_ratio:.1%}")
    print(f"Target size achieved: {subsampled_total_images/(TARGET_TRAIN_SIZE_PER_FOLD * NUM_FOLDS):.1%}")

    print(f"\n🎯 Class Distribution Preservation (Overall):")
    for i, class_name in enumerate(CLASS_NAMES):
        original_pct = original_props[i]
        achieved_pct = achieved_props[i]
        difference = abs(achieved_pct - original_pct)
        print(f"   {class_name:8}: {original_pct:5.1f}% → {achieved_pct:5.1f}% (Δ{difference:4.1f}%)")

    avg_difference = np.mean([abs(a-o) for a, o in zip(achieved_props, original_props)])
    print(f"\nAverage distribution difference: {avg_difference:.2f}%")

    print(f"\n📊 Per-fold statistics:")
    for fold_idx in fold_indices:
        fold_ratio = fold_sampling_stats[fold_idx]['sampling_ratio']
        fold_images = len(fold_sampling_stats[fold_idx]['selected_images'])
        print(f"   Fold {fold_idx}: {fold_images:3d} images (ratio: {fold_ratio:.3f})")

    return {
        'original_props': original_props,
        'achieved_props': achieved_props,
        'overall_ratio': overall_ratio,
        'avg_difference': avg_difference,
        'fold_ratios': {i: fold_sampling_stats[i]['sampling_ratio'] for i in fold_indices}
    }

# Create comparison visualizations from the analysis, sampling and validation
# results; the returned summary dict is kept in comparison_stats
comparison_stats = create_kfold_comparison_visualizations(
    fold_stats, fold_sampling_stats, fold_validation_stats, total_stats
)