# In [4]:
# Split a dataset into train, test, and validation sets balancing according to the logic below
# NB All folders with "Aug" are NOT included in the augmentation SET
# Non-augmented data is shuffled globally before it is split.
# Only folders with "Mrg", "Swapped", "Perspective" or "OCR" in their name are included in the augmentation set.
# Features:
#- Identifies and moves augmented data to the train folder.
#- Splits remaining data into train, test, and validation sets based on defined proportions.
#- Logs details of the split into a CSV file for easy verification
#
#  Detailed outline: 
# 1.	Define split ratios: splitTrain = 0.7, splitTest = 0.15 and splitVal = 0.15.
# 2.	Calculate totalPairs (total dataset pairs) 
# 3.	Calculate augmentedPairs (pairs in augmented folders).
# 4.	Calculate the number of remaining pairs (nonAugmentedPairs = totalPairs - augmentedPairs).
# 5.	Calculate the targets to split the non-augmented data.
#         o	For Train this will be totalPairs * splitTrain - augmentedPairs
#         o	For Test this will be splitTest * totalPairs
#         o	For Valid this will be splitVal * totalPairs
# 6.	Create trainMoved, testMoved and valMoved to record the number moved for each set. 
# 7.	Move the augmented data to the train folder, incrementing trainMoved
# 8.	Confirm that augmented data was moved to train folder and show value of trainMoved.
# 9.	Shuffle the non-augmented data and distribute it to the 3 folders in a controlled way.
        # o	
        # o	
        # o	
#10.	Continue moving files to test until testMoved is equal to the test target (splitTest * totalPairs)
#11.	Continue moving files to val until valMoved is equal to the val target (splitVal * totalPairs)
#12.	Print a message when finished moving to test and when finished moving to val
#13.	Move the remaining unaugmented files to train, incrementing trainMoved
#14.	Print a message when finished moving to train
#15.	Create the CSV showing the number of pairs moved to each folder, the number of augmented folders moved to train

import os
import shutil
import random
import csv

def find_pairs(folder):
    """Find image/annotation pairs under a folder.

    Walks ``folder`` recursively. A file whose name ends in ``.png``, ``.jpg``
    or ``.jpeg`` (compared case-insensitively) is treated as an image and is
    paired with the file in the same directory named ``<image stem>.txt``.
    Images without such an annotation file are skipped.

    Args:
        folder: Root directory to search. A path that does not exist yields
            an empty result because ``os.walk`` produces nothing for it.

    Returns:
        A list of ``(image_path, annotation_path)`` tuples, in the order the
        directories and files are reported by ``os.walk``.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    pairs = []
    for root, _, files in os.walk(folder):
        # A set makes the annotation lookup O(1) per image; testing
        # membership against the file list for every image is quadratic,
        # which is noticeable in folders holding thousands of files.
        names_in_dir = set(files)
        for image in files:
            if not image.lower().endswith(image_extensions):
                continue
            annotation = os.path.splitext(image)[0] + ".txt"
            if annotation in names_in_dir:
                pairs.append((os.path.join(root, image), os.path.join(root, annotation)))
    return pairs

def copy_pairs(pairs, dest_folder):
    """Copy every image/annotation pair into ``dest_folder``.

    The destination directory is created if it does not exist. Each file is
    copied under its own base name, so a file already present in
    ``dest_folder`` with the same name is overwritten.

    Args:
        pairs: Iterable of ``(image_path, annotation_path)`` tuples.
        dest_folder: Directory that receives the copies.
    """
    os.makedirs(dest_folder, exist_ok=True)
    for pair in pairs:
        image_path, annotation_path = pair
        # Image first, then its annotation, each keeping its file name.
        for src_path in (image_path, annotation_path):
            target_path = os.path.join(dest_folder, os.path.basename(src_path))
            shutil.copy(src_path, target_path)

def balance_dataset_split(source_folder, train_folder, test_folder, val_folder, log_file):
    """Split a dataset into train, test and validation folders and log the result.

    Immediate sub-folders of ``source_folder`` whose name contains "Mrg",
    "Swapped", "Perspective" or "OCR" are treated as augmented: all of their
    pairs are copied to ``train_folder``. The pairs of every other sub-folder
    are pooled and shuffled; the first ``int(totalPairs * 0.15)`` go to
    ``test_folder``, the next ``int(totalPairs * 0.15)`` to ``val_folder`` and
    whatever remains goes to ``train_folder``. Files are copied, not moved,
    so the source tree is left untouched.

    A CSV with one row per sub-folder (columns: Subfolder, Total Pairs,
    Train Moved, Test Moved, Val Moved) plus a final ``TOTAL`` row is written
    to ``log_file``.

    NOTE(review): ``copy_pairs`` stores files under their base name, so
    identically named files coming from different sub-folders overwrite one
    another in the destination - confirm that names are unique across folders.

    Args:
        source_folder: Directory whose immediate sub-folders hold the data.
        train_folder: Destination for augmented pairs and the train share.
        test_folder: Destination for the test share.
        val_folder: Destination for the validation share.
        log_file: Path of the CSV report to write.
    """
    # Train has no explicit ratio here: it receives the augmented pairs plus
    # every non-augmented pair left over after test and val are filled.
    splitTest = 0.15
    splitVal = 0.15
    augmented_markers = ("Mrg", "Swapped", "Perspective", "OCR")

    # Classify the immediate sub-folders in a single pass.
    augmented_folders = []
    non_augmented_folders = []
    for name in os.listdir(source_folder):
        path = os.path.join(source_folder, name)
        if not os.path.isdir(path):
            continue
        if any(marker in name for marker in augmented_markers):
            augmented_folders.append(path)
        else:
            non_augmented_folders.append(path)

    log_data = []
    totalPairs = 0
    augmentedPairs = 0
    trainMoved, testMoved, valMoved = 0, 0, 0

    # Augmented folders go to train in full; count them in trainMoved so the
    # TOTAL row agrees with the sum of the per-folder rows.
    for folder in augmented_folders:
        pairs = find_pairs(folder)
        copy_pairs(pairs, train_folder)
        totalPairs += len(pairs)
        augmentedPairs += len(pairs)
        trainMoved += len(pairs)
        log_data.append([os.path.basename(folder), len(pairs), len(pairs), 0, 0])

    # Keep each non-augmented pair together with the log row of the folder it
    # came from, so per-folder counts can be updated exactly after shuffling
    # instead of being guessed from a substring match on the path.
    tagged_pairs = []
    for folder in non_augmented_folders:
        pairs = find_pairs(folder)
        totalPairs += len(pairs)
        row = [os.path.basename(folder), len(pairs), 0, 0, 0]
        log_data.append(row)
        tagged_pairs.extend((row, pair) for pair in pairs)

    nonAugmentedPairs = totalPairs - augmentedPairs
    testTarget = int(totalPairs * splitTest)
    valTarget = int(totalPairs * splitVal)

    # Shuffle non-augmented pairs globally
    random.shuffle(tagged_pairs)

    # Fill test first, then val; the remainder goes to train. Slices past the
    # end of the list are simply empty when there are too few pairs.
    val_end = testTarget + valTarget
    test_chunk = tagged_pairs[:testTarget]
    val_chunk = tagged_pairs[testTarget:val_end]
    train_chunk = tagged_pairs[val_end:]

    # Column index in a log row: 2 = train, 3 = test, 4 = val.
    for dest_folder, column, chunk in (
        (test_folder, 3, test_chunk),
        (val_folder, 4, val_chunk),
        (train_folder, 2, train_chunk),
    ):
        copy_pairs([pair for _, pair in chunk], dest_folder)
        for row, _ in chunk:
            row[column] += 1

    testMoved += len(test_chunk)
    valMoved += len(val_chunk)
    trainMoved += len(train_chunk)

    # Add summary to log
    log_data.append(["TOTAL", totalPairs, trainMoved, testMoved, valMoved])

    print(f"Finished moving data: {trainMoved} to train, {testMoved} to test, {valMoved} to val.")

    # Save log to CSV
    with open(log_file, mode='w', newline='') as csvfile:
        log_writer = csv.writer(csvfile)
        log_writer.writerow(["Subfolder", "Total Pairs", "Train Moved", "Test Moved", "Val Moved"])
        log_writer.writerows(log_data)

    print(f"Log saved to {log_file}.")
    print(f"Total Pairs: {totalPairs}, Augmented Pairs: {augmentedPairs}, Non-Augmented Pairs: {nonAugmentedPairs}")


# Input dataset root and the three output locations for the shuffled split.
source_folder = "D:/FlagDetectionDatasets/ExportedDatasetsSelectedML"
train_folder = "D:/FlagDetectionDatasets/ExportedDatasetsSelectedMLSHUFFLE/train"
test_folder = "D:/FlagDetectionDatasets/ExportedDatasetsSelectedMLSHUFFLE/test"
val_folder = "D:/FlagDetectionDatasets/ExportedDatasetsSelectedMLSHUFFLE/val"
# CSV report path; relative, so it is written to the current working directory.
log_file = "split_log_shuffle.csv"

# Make sure every output directory exists before any file is copied.
for folder in (train_folder, test_folder, val_folder):
    os.makedirs(folder, exist_ok=True)

# Run the split and write the CSV report.
balance_dataset_split(
    source_folder=source_folder,
    train_folder=train_folder,
    test_folder=test_folder,
    val_folder=val_folder,
    log_file=log_file,
)


# Starting dataset split...
#
#
# KeyboardInterrupt: