# 📚 Libraries

In [None]:
import os
import random
import shutil
import matplotlib.pyplot as plt
import numpy as np

# ⚙️ Parameters

In [None]:
# Root folder of the input dataset; the split cell treats every sub-folder of
# it as one class.
SOURCE_DIR = r"C:\Users\cadur\Downloads\Urbansonic\ESC_50\0_Mels_Gen\DATA\Data_Augmentacion"
# Root folder under which the "train" and "test" sub-folders are created.
DEST_DIR = r"C:\Users\cadur\Downloads\Urbansonic\ESC_50\0_Mels_Gen\RESULT\Split"
# Fraction of each class's files copied to the training set; the remaining
# files of that class go to the test set.
TRAIN_RATIO = 0.90

# 📁 Output Directories

In [None]:
# Destination folders for the two partitions, both located under DEST_DIR.
train_dir, test_dir = (os.path.join(DEST_DIR, part) for part in ("train", "test"))

# Create them up front; exist_ok=True keeps a re-run of this cell from failing
# when the folders are already there.
for _partition_dir in (train_dir, test_dir):
    os.makedirs(_partition_dir, exist_ok=True)

# 🧪 1. Split Data into Training and Testing Sets

In [None]:
# Fixed seed for the train/test shuffle. The destination folders are reused
# between runs (exist_ok=True, never emptied), so an unseeded shuffle would
# copy a different selection on every run and the same file could end up in
# both train/ and test/. With a fixed seed and sorted listings, re-running on
# an unchanged SOURCE_DIR reproduces exactly the same split.
SPLIT_SEED = 42
_split_rng = random.Random(SPLIT_SEED)

# NOTE(review): the source folder name suggests augmented data. If several
# augmented variants of one recording exist, a per-file random split can place
# variants of the same recording in both partitions - confirm this is intended.

# os.listdir returns entries in arbitrary order, so sort before shuffling;
# otherwise the seed alone would not make the result reproducible.
for class_name in sorted(os.listdir(SOURCE_DIR)):
    class_path = os.path.join(SOURCE_DIR, class_name)
    if not os.path.isdir(class_path):
        # Only sub-folders are classes; stray files at the top level are skipped.
        continue

    files = sorted(
        f for f in os.listdir(class_path)
        if os.path.isfile(os.path.join(class_path, f))
    )
    _split_rng.shuffle(files)

    # The first TRAIN_RATIO share (rounded down) is training, the rest is test.
    n_total = len(files)
    n_train = int(n_total * TRAIN_RATIO)
    train_files = files[:n_train]
    test_files = files[n_train:]

    train_class_dir = os.path.join(train_dir, class_name)
    test_class_dir = os.path.join(test_dir, class_name)

    # Create both class folders first, then copy (copy2 also copies file
    # metadata), keeping the original file names.
    os.makedirs(train_class_dir, exist_ok=True)
    os.makedirs(test_class_dir, exist_ok=True)
    for subset_dir, subset_files in (
        (train_class_dir, train_files),
        (test_class_dir, test_files),
    ):
        for f in subset_files:
            shutil.copy2(os.path.join(class_path, f), os.path.join(subset_dir, f))

    print(f"✅ Class '{class_name}': {len(train_files)} training files, {len(test_files)} testing files.")

# 📊 2. Plot Class Distribution (Generic Function)

In [None]:
def plot_class_distribution(data_dir, title):
    """Display a bar chart with the number of files in each class folder.

    Every immediate sub-directory of ``data_dir`` is treated as a class, and
    the regular files located directly inside it are counted. Each bar is
    annotated with its count.

    Args:
        data_dir: Directory whose sub-directories are the classes.
        title: Title shown above the chart.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Collect the class folders (sub-directories only).
    class_names = []
    for entry in os.listdir(data_dir):
        if os.path.isdir(os.path.join(data_dir, entry)):
            class_names.append(entry)

    # Count the regular files directly inside each class folder.
    file_counts = []
    for name in class_names:
        folder = os.path.join(data_dir, name)
        file_counts.append(
            sum(1 for item in os.listdir(folder) if os.path.isfile(os.path.join(folder, item)))
        )

    _fig, axes = plt.subplots(figsize=(10, 6))
    # Sample the tab10 colormap at evenly spaced positions, one per class.
    palette = plt.cm.tab10(np.linspace(0, 1, len(class_names)))
    rects = axes.bar(class_names, file_counts, color=palette)

    # Write the count just above each bar, horizontally centred on it.
    for rect in rects:
        top = rect.get_height()
        centre_x = rect.get_x() + rect.get_width()/2
        axes.text(centre_x, top, f'{int(top)}', ha='center', va='bottom', fontsize=12)

    axes.set_title(title)
    axes.set_xlabel("Classes")
    axes.set_ylabel("Number of Images")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# 📈 3. Plot Distribution for Training and Testing

In [None]:
# One chart per partition: the training folder first, then the test folder.
for _partition_dir, _chart_title in (
    (train_dir, "🟩 Image Distribution per Class (TRAIN)"),
    (test_dir, "🟦 Image Distribution per Class (TEST)"),
):
    plot_class_distribution(_partition_dir, _chart_title)

# 🔁 4. Stratified 5-Fold Cross-Validation

In [None]:
# Root folder that receives one "fold_<k>" sub-folder per cross-validation fold.
CV_OUTPUT_DIR = r"C:\Users\cadur\Downloads\Urbansonic\ESC_50\0_Mels_Gen\RESULT\CV_5"
# Number of folds the training images of each class are divided into.
N_FOLDS = 5

# exist_ok=True: do not fail when the folder already exists from an earlier run.
os.makedirs(CV_OUTPUT_DIR, exist_ok=True)

# File extensions (compared case-insensitively) that count as images.
_image_suffixes = ('.png', '.jpg', '.jpeg')

# Every sub-directory of the training folder is one class.
class_folders = []
for entry in os.listdir(train_dir):
    if os.path.isdir(os.path.join(train_dir, entry)):
        class_folders.append(entry)
print("🔎 Detected classes:", class_folders)

# Map each class name to the full paths of the image files in its folder.
class_files = {
    cls: [
        os.path.join(train_dir, cls, name)
        for name in os.listdir(os.path.join(train_dir, cls))
        if name.lower().endswith(_image_suffixes)
    ]
    for cls in class_folders
}

total_images = 0
for image_paths in class_files.values():
    total_images += len(image_paths)
print(f"🧮 Total images in training set: {total_images}")

# Fixed seed for the fold shuffle. The fold folders are written with
# exist_ok=True and never emptied, so an unseeded shuffle would assign images
# differently on every run and the same image could accumulate in several
# folds. With a fixed seed and sorted inputs, re-running on an unchanged
# training set reproduces the same folds.
CV_SEED = 42
_cv_rng = np.random.default_rng(CV_SEED)

# fold_assignments[fold_index][class_name] -> list of source file paths.
fold_assignments = {i: {} for i in range(N_FOLDS)}

# Classes and files are visited in sorted order because the listing order they
# were collected in is arbitrary, and with one shared generator the visiting
# order determines the outcome.
for cls in sorted(class_files):
    files = np.array(sorted(class_files[cls]))
    # Shuffle positions rather than paths, then cut them into N_FOLDS nearly
    # equal parts (np.array_split gives the leading parts one extra element
    # when the count is not divisible). Splitting per class keeps the class
    # proportions roughly equal across folds.
    shuffled_indices = _cv_rng.permutation(len(files))
    for i, fold_indices in enumerate(np.array_split(shuffled_indices, N_FOLDS)):
        fold_assignments[i][cls] = files[fold_indices].tolist()

# Materialise each fold on disk as fold_<k>/<class>/<file> and report its size.
for i in range(N_FOLDS):
    fold_dir = os.path.join(CV_OUTPUT_DIR, f"fold_{i+1}")
    os.makedirs(fold_dir, exist_ok=True)

    fold_count = 0
    for cls in class_folders:
        cls_dir = os.path.join(fold_dir, cls)
        os.makedirs(cls_dir, exist_ok=True)

        # Copy this fold's share of the class, keeping the original file names.
        for src_path in fold_assignments[i][cls]:
            shutil.copy2(src_path, os.path.join(cls_dir, os.path.basename(src_path)))

        # The count is read back from disk, so it reflects everything present
        # in the class folder, not only the files copied just now.
        fold_count += len(os.listdir(cls_dir))

    print(f"📁 Fold {i+1} created with {fold_count} images.")

# ✅ Final Verification

In [None]:
# Re-count the entries of every class folder in every fold directly from disk
# and add them up.
_per_fold_counts = [
    sum(
        len(os.listdir(os.path.join(CV_OUTPUT_DIR, f"fold_{i+1}", cls)))
        for cls in class_folders
    )
    for i in range(N_FOLDS)
]
total_fold_images = sum(_per_fold_counts)

print(f"📊 Total images across all folds: {total_fold_images}")