# This is a train test splitter

The intent of this code is to take each classification/image type combination and split the images into train-validate (80%) and test (20%) data.

INPUTS

All of the data is in each channel based folder.   e.g. _or contains all the original (3-channel or RGB images)
And Classification based sub-folder.   e.g. country contains 'country' classifier folders etc.

These are called root folders and dst_folders respectively.


PROCESSING

['country', 'exact_piece', 'force', 'piece'] classifier folders in each of the root folders are placed into  _test & _train subfolders, with images spread between the,

the 80/20 train test split is contained in def_split data.  Assuming a 80%(train-validate) 20%(test) split

This code runs over the 12 datasets produced. (4 for each of greyscale (1 channel), original (3-channel) and depth-map added (4-channel)  

In part 2, the irrelevant combinations are removed and an excel file generated.


LOCALISATION

Running on these datasets is hardcoded. Re-running requires that the folding folders are adjusted
        r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy',
        r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\07.DataSets_or',
        r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\08.DataSets_fc'
        
These 3 folders can be seen in root_folders in part 1, dst_folders in part 2.

In addition, there is an excel filepath that requires repointing in part 


# Part 1

Split the data and generate all the subfolders

In [1]:
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split

def create_folder_structure(base_path, subfolders, labels):
    for subfolder in subfolders:
        train_folder = os.path.join(base_path, subfolder + '_train')
        test_folder = os.path.join(base_path, subfolder + '_test')
        os.makedirs(train_folder, exist_ok=True)
        os.makedirs(test_folder, exist_ok=True)
        
        for label in labels:
            os.makedirs(os.path.join(train_folder, label), exist_ok=True)
            os.makedirs(os.path.join(test_folder, label), exist_ok=True)

def get_files_from_folder(folder):
    files = []
    labels = []
    for root, _, filenames in os.walk(folder):
        for filename in filenames:
            if filename.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
                files.append(os.path.join(root, filename))
                labels.append(os.path.basename(root))  # Assuming the folder name is the label
    return np.array(files), np.array(labels)

def split_data(files, labels, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(files, labels, test_size=test_size, random_state=random_state, stratify=labels)
    return X_train, X_test, y_train, y_test

def distribute_files(files, labels, destination_folder):
    for file, label in zip(files, labels):
        label_folder = os.path.join(destination_folder, label)
        shutil.copy(file, label_folder)
    print(f"Copied {len(files)} files to {destination_folder}")

def process_main_folder(main_folder):
    subfolders = ['country', 'exact_piece', 'force', 'piece']
    
    # Collect unique labels across all subfolders
    all_labels = set()
    for subfolder in subfolders:
        full_path = os.path.join(main_folder, subfolder)
        if not os.path.exists(full_path):
            continue
        _, labels = get_files_from_folder(full_path)
        all_labels.update(labels)
    
    create_folder_structure(main_folder, subfolders, all_labels)
    
    for subfolder in subfolders:
        full_path = os.path.join(main_folder, subfolder)
        if not os.path.exists(full_path):
            continue
        files, labels = get_files_from_folder(full_path)
        
        # Split the data into train and test sets
        X_train, X_test, y_train, y_test = split_data(files, labels)
        
        # Distribute the files into train and test folders
        distribute_files(X_train, y_train, os.path.join(main_folder, subfolder + '_train'))
        distribute_files(X_test, y_test, os.path.join(main_folder, subfolder + '_test'))

def main(base_folders):
    for base_folder in base_folders:
        process_main_folder(base_folder)

if __name__ == "__main__":
    base_folders = [
        r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy',
        r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\07.DataSets_or',
        r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\08.DataSets_fc'
    ]
    main(base_folders)


Copied 1224 files to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy\country_train
Copied 306 files to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy\country_test
Copied 1224 files to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy\exact_piece_train
Copied 306 files to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy\exact_piece_test
Copied 1224 files to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy\force_train
Copied 306 files to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy\force_test
Copied 1224 files to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy\piece_train
Copied 306 files to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy\piece_test
Copied 1224 files to C:\Users\ReCas\OneDrive\Documents\2

# Part 2 : Remove extra folders created (to prevent classification issues)

Unfortunately, the above script generated class files for all 4 types in each _train _test folder.  So make sure only relevant folders exist

LOCALISATION

The root folders are called dst_folders below and will require amendment to your datasources.  These are 'dst_folders'.

Also the 'excel_save_path' folder needs re-definition


In [6]:
import os
import logging
import time
import pandas as pd
from datetime import datetime

# Configure logging
logging.basicConfig(filename='folder_cleanup.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define the destination directories
dst_folders = {
    '06': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\06.DataSets_gy',
    '07': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\07.DataSets_or',
    '08': r'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\08.DataSets_fc'
}

# Define classes for country, force, piece, and exact piece
_classes = {
    'country': ['100RUS', '200GER', '300UK', '400JAP', '500USA'],
    'force': ['AIR', 'LND', 'SEA'],
    'piece': ['01INF', '02TNK', '03FGT', '04BMB', '05DES', '06TRS', '07SUB', '08BAT', '09CAR'],
    'exact_piece': [
        '101_100RUS_01INF', '102_100RUS_02TNK', '103_100RUS_03FGT', '104_100RUS_04BMB', '105_100RUS_05DES',
        '106_100RUS_06TRS', '107_100RUS_07SUB', '108_100RUS_08BAT', '109_100RUS_09CAR', '201_200GER_01INF',
        '202_200GER_02TNK', '203_200GER_03FGT', '204_200GER_04BMB', '205_200GER_05DES', '206_200GER_06TRS',
        '207_200GER_07SUB', '208_200GER_08BAT', '209_200GER_09CAR', '301_300UK_01INF', '302_300UK_02TNK',
        '303_300UK_03FGT', '304_300UK_04BMB', '305_300UK_05DES', '306_300UK_06TRS', '307_300UK_07SUB',
        '308_300UK_08BAT', '309_300UK_09CAR', '401_400JAP_01INF', '402_400JAP_02TNK', '403_400JAP_03FGT',
        '404_400JAP_04BMB', '405_400JAP_05DES', '406_400JAP_06TRS', '407_400JAP_07SUB', '408_400JAP_08BAT',
        '409_400JAP_09CAR', '501_500USA_01INF', '502_500USA_02TNK', '503_500USA_03FGT', '504_500USA_04BMB',
        '505_500USA_05DES', '506_500USA_06TRS', '507_500USA_07SUB', '508_500USA_08BAT', '509_500USA_09CAR'
    ]
}

# Define the path to save the Excel summary file with a timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
excel_save_path = fr'C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\00.Imported_All_Files\202407__\FileList\folder_summary_{timestamp}.xlsx'

# Function to delete irrelevant subfolders only in _test and _train directories
def delete_irrelevant_folders():
    for key, dst_folder in dst_folders.items():
        for subfolder in ['country_test', 'country_train', 'force_test', 'force_train', 'piece_test', 'piece_train', 'exact_piece_test', 'exact_piece_train']:
            path = os.path.join(dst_folder, subfolder)
            for root, dirs, files in os.walk(path):
                for dir_name in dirs:
                    if (subfolder.startswith('country') and dir_name not in _classes['country']) or \
                       (subfolder.startswith('force') and dir_name not in _classes['force']) or \
                       (subfolder.startswith('piece') and dir_name not in _classes['piece']) or \
                       (subfolder.startswith('exact_piece') and dir_name not in _classes['exact_piece']):
                        full_path = os.path.join(root, dir_name)
                        # Ensure the folder is empty before attempting to delete it
                        if not os.listdir(full_path):  # Check if the directory is empty
                            try:
                                os.rmdir(full_path)
                                logging.info(f"Deleted empty folder: {full_path}")
                            except Exception as e:
                                logging.error(f"Error deleting folder {full_path}: {e}")
                        else:
                            logging.warning(f"Folder {full_path} not empty, skipping deletion.")

# Function to summarize the number of files in each folder and save to an Excel file
def summarize_files():
    summary_data = []
    for key, dst_folder in dst_folders.items():
        for subfolder in ['country', 'country_test', 'country_train', 'force', 'force_test', 'force_train', 'piece', 'piece_test', 'piece_train', 'exact_piece', 'exact_piece_test', 'exact_piece_train']:
            path = os.path.join(dst_folder, subfolder)
            if os.path.exists(path):
                for root, dirs, files in os.walk(path):
                    for dir_name in dirs:
                        full_path = os.path.join(root, dir_name)
                        file_count = len([name for name in os.listdir(full_path) if os.path.isfile(os.path.join(full_path, name))])
                        logging.info(f"Folder: {full_path} - {file_count} files")
                        summary_data.append({'Folder': full_path, 'File Count': file_count})

    # Create a DataFrame and save to Excel
    df = pd.DataFrame(summary_data)
    os.makedirs(os.path.dirname(excel_save_path), exist_ok=True)
    df.to_excel(excel_save_path, index=False)
    print(f"Summary saved to {excel_save_path}")

if __name__ == "__main__":
    start_time = time.time()
    try:
        delete_irrelevant_folders()
        logging.info("Folder cleanup completed successfully.")
        summarize_files()
        logging.info("File summarization completed successfully.")
    except Exception as e:
        logging.critical(f"An unexpected error occurred: {e}")
    end_time = time.time()
    elapsed_time = end_time - start_time
    logging.info(f"Script executed in {elapsed_time:.2f} seconds.")
    print(f"Script executed in {elapsed_time:.2f} seconds.")  # Also print the elapsed time for immediate feedback


Summary saved to C:\Users\ReCas\OneDrive\Documents\2024_AIMachineLearning\99_Projects\00.Imported_All_Files\202407__\FileList\folder_summary_20240731_151027.xlsx
Script executed in 2.05 seconds.
