In [1]:
# Split a dataset into train, validation, and test sets according to the logic below.
# NOTE: There is no logic for separating augmented files here; this is a simple split of the shuffled image/annotation pairs by ratio.
import os
import shutil
import csv
import random

def find_pairs(folder):
    """Find image/annotation pairs in a folder tree.

    Walks ``folder`` recursively. Within each directory, every image file
    (``.png``, ``.jpg`` or ``.jpeg``) is paired with the ``.txt`` file in the
    same directory that has exactly the same base name. Extensions are
    compared case-insensitively for both images and annotations, so
    ``IMG.JPG`` pairs with ``IMG.TXT`` as well as with ``IMG.txt``.

    Args:
        folder: Root directory to search.

    Returns:
        A list of ``(image_path, annotation_path)`` tuples, in the order the
        images are reported by ``os.walk``. Images without an annotation file
        are skipped.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    pairs = []
    for root, _, files in os.walk(folder):
        # Index the annotations of this directory by stem once, so each image
        # lookup is a dict access instead of a scan over the whole file list.
        annotations = {}
        for name in files:
            stem, ext = os.path.splitext(name)
            if ext.lower() != '.txt':
                continue
            # If both "x.txt" and "x.TXT" exist, prefer the lowercase
            # spelling, which is the only one that used to be matched.
            if ext == '.txt' or stem not in annotations:
                annotations[stem] = name

        for image in files:
            if not image.lower().endswith(image_exts):
                continue
            annotation = annotations.get(os.path.splitext(image)[0])
            if annotation is not None:
                pairs.append((os.path.join(root, image), os.path.join(root, annotation)))
    return pairs

def copy_pairs(pairs, dest_folder):
    """Copy image/annotation pairs into a single flat destination folder.

    Files normally keep their original base names. Because the destination
    is flat, two pairs coming from different source directories can share a
    base name; copying them as-is would silently overwrite the earlier pair.
    When that happens, the later pair is written with a numeric suffix
    (``name_1.ext``, ``name_2.ext``, ...) applied to BOTH files of the pair
    so the image and its annotation keep matching names.

    Args:
        pairs: Iterable of ``(image_path, annotation_path)`` tuples.
        dest_folder: Directory to copy into; created if it does not exist.
    """
    os.makedirs(dest_folder, exist_ok=True)

    def with_suffix(name, index):
        # Index 0 means "no collision": keep the original file name.
        if index == 0:
            return name
        stem, ext = os.path.splitext(name)
        return f"{stem}_{index}{ext}"

    # Normalised destination name -> absolute source path that claimed it.
    # Only names claimed during this call are tracked, so re-running over an
    # already populated folder still overwrites files like before.
    claimed = {}
    for pair in pairs:
        sources = [os.path.abspath(path) for path in pair]
        index = 0
        while True:
            names = [with_suffix(os.path.basename(path), index) for path in pair]
            # normcase folds case on Windows, where "A.jpg" and "a.jpg" would
            # land on the same destination file.
            keys = [os.path.normcase(name) for name in names]
            # Re-using a name for the very same source file is not a clash
            # (e.g. one annotation shared by "x.png" and "x.jpg").
            if all(claimed.get(key, src) == src for key, src in zip(keys, sources)):
                break
            index += 1

        for key, name, src, path in zip(keys, names, sources, pair):
            claimed[key] = src
            shutil.copy(path, os.path.join(dest_folder, name))

def split_dataset(source_folder, train_folder, val_folder, test_folder, log_file,
                  train_ratio=0.7, val_ratio=0.15, seed=None):
    """Split a dataset into train, validation, and test sets by random shuffle.

    Image/annotation pairs are collected from every immediate subdirectory of
    ``source_folder`` (files lying directly in ``source_folder`` are not
    scanned), shuffled, and cut into three contiguous slices: the first
    ``train_ratio`` share goes to train, the next ``val_ratio`` share to
    validation, and whatever remains to test. The pairs are copied to the
    given folders and the resulting counts are written to a CSV log.

    Args:
        source_folder: Directory whose subdirectories hold the dataset.
        train_folder: Destination for the training pairs.
        val_folder: Destination for the validation pairs.
        test_folder: Destination for the test pairs.
        log_file: Path of the CSV file that receives the split counts.
        train_ratio: Fraction of pairs assigned to train.
        val_ratio: Fraction of pairs assigned to validation.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used, as before.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1 (which would leave a negative test share).
    """
    # Validate before touching the filesystem; the small tolerance absorbs
    # floating-point noise in sums that are meant to equal exactly 1.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    print("Starting dataset split...")

    # Find all pairs in the subfolders of the source folder
    all_pairs = []
    for subfolder in os.listdir(source_folder):
        folder_path = os.path.join(source_folder, subfolder)
        if os.path.isdir(folder_path):
            all_pairs.extend(find_pairs(folder_path))

    # Directory listing order is platform dependent; sorting first makes a
    # seeded shuffle produce the same split on every run.
    all_pairs.sort()
    if seed is None:
        random.shuffle(all_pairs)
    else:
        random.Random(seed).shuffle(all_pairs)

    # Calculate split counts; test takes the remainder so nothing is dropped
    total_pairs = len(all_pairs)
    train_count = int(total_pairs * train_ratio)
    val_count = int(total_pairs * val_ratio)
    test_count = total_pairs - train_count - val_count

    # Split data into contiguous slices of the shuffled list
    val_end = train_count + val_count
    train_pairs = all_pairs[:train_count]
    val_pairs = all_pairs[train_count:val_end]
    test_pairs = all_pairs[val_end:]

    # Copy pairs to respective folders
    copy_pairs(train_pairs, train_folder)
    copy_pairs(val_pairs, val_folder)
    copy_pairs(test_pairs, test_folder)

    # Log results (newline='' is what the csv module expects for its files)
    with open(log_file, mode='w', newline='') as csvfile:
        log_writer = csv.writer(csvfile)
        log_writer.writerow(["Total Pairs", "Train Pairs", "Validation Pairs", "Test Pairs"])
        log_writer.writerow([total_pairs, train_count, val_count, test_count])

    print(f"Dataset split completed. Train: {train_count}, Validation: {val_count}, Test: {test_count}. Log saved to {log_file}.")

# Locations of the source dataset, the three output splits, and the CSV log
source_folder = "D:/FlagDetectionDatasets/ExportedDatasetsReducedML"
train_folder = "D:/FlagDetectionDatasets/ExportedDatasetsReducedML_A/train"
val_folder = "D:/FlagDetectionDatasets/ExportedDatasetsReducedML_A/val"
test_folder = "D:/FlagDetectionDatasets/ExportedDatasetsReducedML_A/test"
log_file = "split_log_simplified.csv"

# Make sure every output directory exists before any file is copied
for output_dir in (train_folder, val_folder, test_folder):
    os.makedirs(output_dir, exist_ok=True)

# Run the split with the function's default train/validation ratios
split_dataset(
    source_folder=source_folder,
    train_folder=train_folder,
    val_folder=val_folder,
    test_folder=test_folder,
    log_file=log_file,
)


Starting dataset split...
Moved 0 augmented pairs to train folder.
Finished moving data: 8461 to train, 1813 to test, 1813 to val.
Log saved to split_log_13.csv.
Total Pairs: 12087, Augmented Pairs: 0, Non-Augmented Pairs: 12087
