In [3]:
# Split a dataset into train, test, and validation sets balancing according to the logic below
# NB: All folders with "Aug" (or "Swapped") in their name are treated as augmented data and go to train only; this can result in under-representation of non-augmented data in train.
# Features:
#- Identifies augmented data and copies it to the train folder (source files are left in place).
#- Splits remaining data into train, test, and validation sets based on defined proportions.
#- Logs details of the split into a CSV file for easy verification.
#
#  Detailed outline: 
# 1.	Define split ratios: splitTrain = 0.7, splitTest = 0.15 and splitVal = 0.15 (summing to 1).
# 2.	Calculate totalPairs (total dataset pairs).
# 3.	Calculate augmentedPairs (pairs in augmented folders).
# 4.	Calculate the number of remaining pairs (nonAugmentedPairs = totalPairs - augmentedPairs).
# 5.	Calculate the targets to split the non-augmented data.
#         o	For Train this will be totalPairs * splitTrain - augmentedPairs
#         o	For Test this will be splitTest * totalPairs
#         o	For Valid this will be splitVal * totalPairs
# 6.	Create trainMoved, testMoved and valMoved to record the number moved for each set. 
# 7.	Move the augmented data to the train folder, incrementing trainMoved
# 8.	Confirm that augmented data was moved to train folder and show value of trainMoved.
# 9.	Split the remaining data into the 3 folders in a controlled way. For each sub-folder
        # o	Move 1 pair to test and increment testMoved
        # o	Move 1 pair to val and increment valMoved
        # o	Move 1 pair to train and increment trainMoved
#10.	Continue moving files to test until testMoved is equal to the test target (splitTest * totalPairs)
#11.	Continue moving files to val until valMoved is equal to the val target (splitVal * totalPairs)
#12.	Print a message when finished moving to test and when finished moving to val
#13.	Move the remaining unaugmented files to train, incrementing trainMoved
#14.	Print a message when finished moving to train
#15.	Create the CSV showing the number of pairs moved to each folder and the number of augmented folders moved to train.
#       Consider these edge cases, and also consider the viability of the resulting split and the overall balance of unaugmented data in train
#       (is it sufficient, etc.). Scenario where augmented data is equal to or greater than the allowed allocation for the train split (augmentedPairs >= totalPairs * splitTrain):
#	    All remaining non-augmented pairs will go to test and val only.
#       Scenario where the remaining non-augmented data is less than the combined target for test and val (nonAugmentedPairs < testTarget + valTarget):
#	    Adjust the allocation to ensure no excess or shortage in test and val.

import os
import shutil
import random
import csv

def find_pairs(folder):
    """Find image/annotation pairs under a folder.

    Walks ``folder`` recursively. An image (``.png``, ``.jpg`` or ``.jpeg``,
    extension matched case-insensitively) is paired with the file in the same
    directory that has the same stem and a lowercase ``.txt`` extension.

    Args:
        folder: Root directory to search.

    Returns:
        A list of ``(image_path, annotation_path)`` tuples, in the order
        ``os.walk`` reports the files. Images without a matching annotation
        file are skipped.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    pairs = []
    for root, _, files in os.walk(folder):
        # A set makes the annotation check O(1) per image instead of
        # rescanning the whole directory listing for every image.
        names = set(files)
        for image in files:
            if not image.lower().endswith(image_exts):
                continue
            annotation = os.path.splitext(image)[0] + ".txt"
            if annotation in names:
                pairs.append((os.path.join(root, image), os.path.join(root, annotation)))
    return pairs

def copy_pairs(pairs, dest_folder):
    """Copy every image and its annotation file into ``dest_folder``.

    The destination directory is created when it does not exist yet. Each
    file keeps its base name, so all copies land directly in ``dest_folder``
    regardless of which source directory they came from.

    Args:
        pairs: Iterable of ``(image_path, annotation_path)`` tuples.
        dest_folder: Directory that receives the copies.
    """
    os.makedirs(dest_folder, exist_ok=True)
    for image_path, annotation_path in pairs:
        # Image first, then its annotation.
        for src_path in (image_path, annotation_path):
            target_path = os.path.join(dest_folder, os.path.basename(src_path))
            shutil.copy(src_path, target_path)

def _proportional_quotas(sizes, target):
    """Split ``target`` across buckets in proportion to ``sizes`` (largest-remainder rounding)."""
    total = sum(sizes)
    if total == 0 or target <= 0:
        return [0] * len(sizes)
    target = min(target, total)  # cannot hand out more items than exist
    quotas = [size * target // total for size in sizes]
    # Flooring loses a few units; give them to the buckets with the largest
    # remainders. A non-zero remainder guarantees quota + 1 <= bucket size.
    by_remainder = sorted(
        range(len(sizes)),
        key=lambda i: sizes[i] * target % total,
        reverse=True,
    )
    for i in by_remainder[:target - sum(quotas)]:
        quotas[i] += 1
    return quotas


def balance_dataset_split(source_folder, train_folder, test_folder, val_folder, log_file):
    """Balance the dataset split into train, test, and validation sets.

    Every immediate sub-folder of ``source_folder`` is scanned for
    image/annotation pairs. Sub-folders whose name contains ``"Aug"`` or
    ``"Swapped"`` are treated as augmented data and are copied to train only.
    The remaining (non-augmented) sub-folders are shuffled and split so that
    test and val each receive 15% of *all* pairs; the rest goes to train.

    The test and val quotas are spread across the non-augmented sub-folders in
    proportion to each sub-folder's size, so no single sub-folder is drained
    into test or val while others contribute nothing. If there are fewer
    non-augmented pairs than the combined test and val targets, the targets
    are scaled down to share what is available and train receives augmented
    data only.

    Files are copied, not moved; the source folders are left untouched.

    Args:
        source_folder: Directory containing one sub-folder per data source.
        train_folder: Destination directory for the training set.
        test_folder: Destination directory for the test set.
        val_folder: Destination directory for the validation set.
        log_file: Path of the CSV log to write. It gets one row per sub-folder
            with the number of pairs found and the number copied to each set.

    Raises:
        OSError: If ``source_folder`` cannot be listed or a copy fails.
    """
    print("Starting...")

    splitTest = 0.15
    splitVal = 0.15
    # Train receives the remainder (nominally 0.7), including all augmented data.

    # Scan every sub-folder once, in sorted order so the log is reproducible.
    subfolder_names = sorted(
        name for name in os.listdir(source_folder)
        if os.path.isdir(os.path.join(source_folder, name))
    )
    augmented = []
    non_augmented = []
    for name in subfolder_names:
        pairs = find_pairs(os.path.join(source_folder, name))
        if "Aug" in name or "Swapped" in name:
            augmented.append((name, pairs))
        else:
            non_augmented.append((name, pairs))

    augmentedPairs = sum(len(pairs) for _, pairs in augmented)
    sizes = [len(pairs) for _, pairs in non_augmented]
    nonAugmentedPairs = sum(sizes)
    totalPairs = augmentedPairs + nonAugmentedPairs

    testTarget = int(totalPairs * splitTest)
    valTarget = int(totalPairs * splitVal)

    # Edge case: augmented data fills (or exceeds) the train allocation, so the
    # non-augmented data cannot satisfy both targets. Share it between test and
    # val by their relative ratios instead of letting test starve val.
    if nonAugmentedPairs < testTarget + valTarget:
        test_share = splitTest / (splitTest + splitVal)
        testTarget = int(nonAugmentedPairs * test_share)
        valTarget = nonAugmentedPairs - testTarget
        print(
            f"Warning: only {nonAugmentedPairs} non-augmented pairs available; "
            f"targets reduced to {testTarget} for test and {valTarget} for val."
        )

    # Per-folder quotas: test first, then val from what is left in each folder.
    test_quotas = _proportional_quotas(sizes, testTarget)
    val_quotas = _proportional_quotas(
        [size - quota for size, quota in zip(sizes, test_quotas)], valTarget
    )

    log_data = []
    trainMoved, testMoved, valMoved = 0, 0, 0

    # Augmented data goes to train only.
    for folder_name, pairs in augmented:
        copy_pairs(pairs, train_folder)
        trainMoved += len(pairs)
        log_data.append([folder_name, len(pairs), len(pairs), 0, 0])

    print(f"Moved {trainMoved} augmented pairs to train folder.")

    # Non-augmented data: random per-folder split according to the quotas.
    for (folder_name, pairs), n_test, n_val in zip(non_augmented, test_quotas, val_quotas):
        random.shuffle(pairs)
        test_pairs = pairs[:n_test]
        val_pairs = pairs[n_test:n_test + n_val]
        train_pairs = pairs[n_test + n_val:]

        copy_pairs(test_pairs, test_folder)
        copy_pairs(val_pairs, val_folder)
        copy_pairs(train_pairs, train_folder)

        testMoved += len(test_pairs)
        valMoved += len(val_pairs)
        trainMoved += len(train_pairs)

        # Log per-folder counts so each row matches its "Total Pairs" value.
        log_data.append([
            folder_name, len(pairs), len(train_pairs), len(test_pairs), len(val_pairs)
        ])

    print(f"Finished moving data: {trainMoved} to train, {testMoved} to test, {valMoved} to val.")

    # Log details to CSV
    with open(log_file, mode='w', newline='') as csvfile:
        log_writer = csv.writer(csvfile)
        log_writer.writerow([
            "Subfolder", "Total Pairs", "Train Moved", "Test Moved", "Val Moved"
        ])
        log_writer.writerows(log_data)

    print(f"Log saved to {log_file}.")

# Input dataset root, the three output split folders, and the CSV log path
source_folder = "D:/FlagDetectionDatasets/ExportedDatasetsSelected"
train_folder = "D:/FlagDetectionDatasets/ExportedDatasetsSelectedML/train"
test_folder = "D:/FlagDetectionDatasets/ExportedDatasetsSelectedML/test"
val_folder = "D:/FlagDetectionDatasets/ExportedDatasetsSelectedML/val"
log_file = "split_log_11_1_25.csv"

# Make sure every output folder exists, even if a split ends up empty
for folder in (train_folder, test_folder, val_folder):
    os.makedirs(folder, exist_ok=True)

# Run the split and write the CSV log
balance_dataset_split(
    source_folder=source_folder,
    train_folder=train_folder,
    test_folder=test_folder,
    val_folder=val_folder,
    log_file=log_file,
)


Moved 3991 augmented pairs to train folder.
Finished moving data: 6104 to train, 1308 to test, 1308 to val.
Log saved to split_log_11_1_25.csv.
