In [None]:
# Step 4a
# Rename files to prevent conflicts 
# Convert the folder structure from the CVAT Yolo1.1. export
# For each sub-folder in input_data:
#  - Store the folder name as job_name.
#  - Delete the specified files (obj.data, train.txt, and obj.names) from the root of the job folder.
#  - Rename files within the folder obj_train_data by replacing the string "frame" with the job_name.
#  - Move the files up one level (from obj_train_data to the parent folder).
#  - Delete the now-empty obj_train_data folder.
# Split the data 
#  - Check if images_wip and labels_wip exist in the flags_data directory. 
#  - If not, create them along with their sub-folders (train, test, valid).
#  - Set up the structure for YOLO compatibility: ensure images_wip and labels_wip exist, with train, test, and valid sub-folders inside both.
#  - Shuffle the image files and then split them into train (70%), test (15%), and valid (15%) sets.
#  - Handle corresponding text files: for each image in the sets, find the corresponding .txt annotation file and
#    move the text files along with the images to their respective folders.
# Move image and text file:
# Move image and corresponding text files from their current job folder to the appropriate train, test, or valid folder.

import os
import shutil
import random
from pathlib import Path

def process_data(input_folder):
    """Flatten every job folder found directly under *input_folder*.

    For each sub-directory (a "job") of *input_folder*:

    * ``obj.data``, ``train.txt`` and ``obj.names`` are removed from the job
      root when present;
    * inside ``obj_train_data`` every entry whose name contains ``frame`` is
      renamed, with each occurrence of ``frame`` replaced by the job folder
      name;
    * every entry of ``obj_train_data`` is then copied into the job root.

    Entries of *input_folder* that are not directories are ignored, as are
    jobs without an ``obj_train_data`` folder (their metadata files are still
    removed). ``obj_train_data`` itself is left in place because the files
    are copied rather than moved.

    Args:
        input_folder: Directory whose immediate sub-directories are the jobs.

    Returns:
        None.
    """
    export_metadata = ('obj.data', 'train.txt', 'obj.names')

    for job_name in os.listdir(input_folder):
        job_dir = os.path.join(input_folder, job_name)
        if not os.path.isdir(job_dir):
            continue

        # Drop the export metadata files sitting in the job root.
        for meta_name in export_metadata:
            meta_path = os.path.join(job_dir, meta_name)
            if os.path.exists(meta_path):
                os.remove(meta_path)

        frames_dir = os.path.join(job_dir, 'obj_train_data')
        if not os.path.exists(frames_dir):
            continue

        # First pass: swap 'frame' for the job name so names differ per job.
        for old_name in os.listdir(frames_dir):
            if 'frame' not in old_name:
                continue
            renamed = old_name.replace('frame', job_name)
            os.rename(os.path.join(frames_dir, old_name),
                      os.path.join(frames_dir, renamed))

        # Second pass: copy (not move) everything up into the job root.
        # TODO: switch to a move and remove 'obj_train_data' once confirmed working.
        for entry in os.listdir(frames_dir):
            shutil.copy(os.path.join(frames_dir, entry), job_dir)

## Split the data 
def _copy_images_with_labels(image_names, src_folder, image_dest, label_dest, label_ext='.txt'):
    """Copy each image to *image_dest* and its same-named label file to *label_dest*.

    The label file is looked up in *src_folder* as the image's base name plus
    *label_ext*. Images without such a file are still copied; only the label
    copy is skipped.
    """
    for image_name in image_names:
        shutil.copy(os.path.join(src_folder, image_name),
                    os.path.join(image_dest, image_name))

        label_name = os.path.splitext(image_name)[0] + label_ext
        label_src = os.path.join(src_folder, label_name)
        if os.path.exists(label_src):
            shutil.copy(label_src, os.path.join(label_dest, label_name))


def split_data(input_folder, output_folder):
    """Copy the images and label files of every job into the ``test`` split.

    Creates ``<output_folder>/images/test`` and ``<output_folder>/labels/test``
    if they do not exist. For each sub-directory (job) of *input_folder*,
    every file ending in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) is
    copied to ``images/test``, and the ``.txt`` file with the same base name,
    when present, is copied to ``labels/test``.

    No train/valid split is made: all images go to ``test`` because this
    script targets images that were excluded for testing at an earlier stage.
    Files are copied, not moved, so the job folders are left untouched.

    Args:
        input_folder: Directory whose immediate sub-directories are the jobs.
        output_folder: Root of the output tree; created if missing.

    Returns:
        None.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    # Build both destinations explicitly. Deriving the label folder by
    # replacing 'images' in the image path also rewrote any 'images' that
    # appeared in output_folder itself, pointing at a non-existent directory.
    images_test = os.path.join(output_folder, 'images', 'test')
    labels_test = os.path.join(output_folder, 'labels', 'test')
    os.makedirs(images_test, exist_ok=True)
    os.makedirs(labels_test, exist_ok=True)

    for job_folder_name in os.listdir(input_folder):
        job_folder_path = os.path.join(input_folder, job_folder_name)
        if not os.path.isdir(job_folder_path):
            continue

        image_files = [
            f for f in os.listdir(job_folder_path)
            if f.lower().endswith(image_extensions)
        ]
        # Shuffle within the job: each job holds the frames of one video in
        # sequence, so adjacent frames are more similar. With a single
        # destination split this does not change which files are copied.
        random.shuffle(image_files)

        _copy_images_with_labels(image_files, job_folder_path, images_test, labels_test, '.txt')

# Driver: convert the folder structure from the CVAT Yolo1.1 export, then
# copy the result into the YOLO images/labels layout.

# Alternative locations, kept commented out:
#input_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsNotSplit'
#input_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsExtractedStage2'
#input_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsNotSplit/test_export'
#output_folder = 'D:/FlagDetectionDatasets/dataset'  # Path to output folder for data
#output_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsNotSplit/test_export_out'
input_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsExtractedForTestSet'
output_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsExtractedForTestSetToYolo'

# Path to the extracted and partially organised data; same folder that
# process_data flattens in place.
data_to_split = input_folder

process_data(input_folder)

# Each job is handled individually so as to keep the correct mix of classes /
# keep integrity.
split_data(data_to_split, output_folder)