In [None]:
class Args:
    """Plain holder for the dataset-split settings.

    The folder attributes start out as empty strings; calling
    ``set_defaults`` replaces every folder path that is still empty with
    its default location.
    """

    # Folder paths are deliberately empty until set_defaults() fills them in.
    data_folder = ""
    train_folder = ""
    test_folder = ""
    # Share of images used for training (0.8 -> 80% training, 20% testing).
    split_ratio = 0.8

    def set_defaults(self):
        """Assign the default path to each folder attribute that is still empty."""
        fallback_paths = (
            ("data_folder", "Tensorflow/workspace/images/collectedimages"),
            ("train_folder", "Tensorflow/workspace/images/train"),
            ("test_folder", "Tensorflow/workspace/images/test"),
        )
        for attribute, fallback in fallback_paths:
            # Only overwrite values that are empty/falsy; explicit paths are kept.
            if not getattr(self, attribute):
                setattr(self, attribute, fallback)

# Build the settings object for this cell; it starts from the class-level
# values declared on Args (empty folder paths, split_ratio of 0.8).
args = Args()
args.set_defaults()  # Replace every folder path that is still empty with its default

In [None]:
import os
import shutil
import random
import argparse
import logging
from tqdm import tqdm

def parse_arguments():
    """Parse the command-line options that control the dataset split.

    Returns:
        argparse.Namespace: holds ``data_folder``, ``train_folder`` and
        ``test_folder`` (strings) plus ``split_ratio`` (float). Each option
        falls back to its default when not given on the command line.
    """
    images_root = os.path.join('Tensorflow', 'workspace', 'images')

    parser = argparse.ArgumentParser(description='Split image dataset into training and testing sets.')
    parser.add_argument('--data_folder', type=str, default=os.path.join(images_root, 'collectedimages'),
                        help='Path to the folder containing the image dataset')
    parser.add_argument('--train_folder', type=str, default=os.path.join(images_root, 'train'),
                        help='Path to the output train folder')
    parser.add_argument('--test_folder', type=str, default=os.path.join(images_root, 'test'),
                        help='Path to the output test folder')
    # argparse applies %-formatting to help strings, so a literal percent sign
    # must be written as "%%"; a bare "%" makes the help output fail.
    parser.add_argument('--split_ratio', type=float, default=0.8,
                        help='Ratio of images to be used for training (e.g., 0.8 means 80%% for training and 20%% for testing)')
    return parser.parse_args()

def get_images_by_subfolder(data_folder):
    """Collect the ``.jpg`` files found in each immediate subfolder of *data_folder*.

    Args:
        data_folder: Path of the directory whose subfolders hold the images.

    Returns:
        dict mapping each subfolder name to a list of image paths
        (``data_folder/subfolder/file.jpg``), sorted by file name. A subfolder
        without any ``.jpg`` file maps to an empty list.

    Raises:
        OSError: if *data_folder* cannot be listed (e.g. it does not exist).
    """
    images_by_subfolder = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # reproducible between runs.
    for folder in sorted(os.listdir(data_folder)):
        folder_path = os.path.join(data_folder, folder)
        # Plain files sitting next to the subfolders cannot be listed; skip them.
        if not os.path.isdir(folder_path):
            continue
        images_by_subfolder[folder] = [
            os.path.join(folder_path, file_name)
            for file_name in sorted(os.listdir(folder_path))
            if file_name.endswith(".jpg")
        ]
    return images_by_subfolder

def read_tracking_file(tracking_file_path):
    """Load the per-folder processed-image counts from the tracking file.

    Each non-blank line is expected to look like ``<folder>,<count>``.

    Args:
        tracking_file_path: Path of the tracking file; it may not exist yet.

    Returns:
        dict mapping folder name to its integer count. Empty when the file
        does not exist.

    Raises:
        ValueError: if a non-blank line has no comma or a non-integer count.
    """
    processed_counts = {}
    if not os.path.exists(tracking_file_path):
        return processed_counts

    with open(tracking_file_path, 'r') as f:
        for line in f:
            entry = line.strip()
            # Tolerate blank lines (e.g. a trailing newline added by an editor).
            if not entry:
                continue
            # Split on the LAST comma so folder names containing commas survive.
            folder, count = entry.rsplit(',', 1)
            processed_counts[folder] = int(count)
    return processed_counts

def update_tracking_file(tracking_file_path, processed_counts):
    """Overwrite the tracking file with one ``<folder>,<count>`` line per entry.

    Args:
        tracking_file_path: Path of the file to (re)write.
        processed_counts: Mapping of folder name to processed-image count.
    """
    with open(tracking_file_path, 'w') as tracking_file:
        # Opening in 'w' mode truncates, so the file reflects only this mapping.
        tracking_file.writelines(
            f"{name},{processed}\n" for name, processed in processed_counts.items()
        )
def _copy_images_with_annotations(image_paths, destination_folder, description):
    """Copy each image and its same-named ``.xml`` file into *destination_folder*."""
    for image_path in tqdm(image_paths, desc=description):
        source_folder, image_name = os.path.split(image_path)
        xml_name = os.path.splitext(image_name)[0] + ".xml"

        shutil.copy(image_path, os.path.join(destination_folder, image_name))
        shutil.copy(os.path.join(source_folder, xml_name), os.path.join(destination_folder, xml_name))


def split_and_copy_images(train_folder, test_folder, images_by_subfolder, split_ratio, tracking_file_path):
    """Split not-yet-processed images into train/test sets and copy them with their XML files.

    For every folder, the images beyond the count recorded in the tracking
    file are shuffled, the first ``int(split_ratio * n)`` go to *train_folder*
    and the rest to *test_folder*. Each image is copied together with the
    ``.xml`` file of the same base name from the same source directory.

    Args:
        train_folder: Destination directory for training files (created if missing).
        test_folder: Destination directory for testing files (created if missing).
        images_by_subfolder: Mapping of folder name to a list of image paths.
            Because already-processed images are skipped by position, the list
            order must be the same from run to run.
        split_ratio: Fraction of new images assigned to training, within [0, 1].
        tracking_file_path: File that stores how many images per folder were
            already processed.

    Raises:
        ValueError: if *split_ratio* is outside [0, 1].
        OSError: if an image or its ``.xml`` file cannot be copied.
    """
    # A ratio outside [0, 1] would turn into a negative or oversized slice index
    # and silently produce a wrong split, so reject it before touching the disk.
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    processed_counts = read_tracking_file(tracking_file_path)

    for folder, images in images_by_subfolder.items():
        already_processed = processed_counts.setdefault(folder, 0)

        # Only the images past the recorded count are new for this run.
        new_images = images[already_processed:]
        random.shuffle(new_images)

        train_count = int(split_ratio * len(new_images))

        _copy_images_with_annotations(
            new_images[:train_count], train_folder, f"Copying train images from {folder}")
        _copy_images_with_annotations(
            new_images[train_count:], test_folder, f"Copying test images from {folder}")

        processed_counts[folder] += len(new_images)
        # Persist after every folder so a failure in a later folder does not
        # discard the progress already made.
        update_tracking_file(tracking_file_path, processed_counts)

    # Also covers the case where there were no folders to iterate over.
    update_tracking_file(tracking_file_path, processed_counts)
    
def main():
    """Entry point: parse the options, collect the images and split them into train/test folders.

    Logs an error and returns early when the data folder is not an existing
    directory or when none of its subfolders contains an image. Progress
    across runs is recorded in ``image_split_tracking.txt`` in the current
    working directory.
    """
    logging.basicConfig(level=logging.INFO)

    args = parse_arguments()

    # isdir rather than exists: a regular file at this path cannot be listed
    # for subfolders either.
    if not os.path.isdir(args.data_folder):
        logging.error(f"Data folder '{args.data_folder}' does not exist or is not a directory.")
        return

    images_by_subfolder = get_images_by_subfolder(args.data_folder)

    # The mapping is non-empty as soon as one subfolder exists, so check the
    # image lists themselves to detect "no images".
    if not any(images_by_subfolder.values()):
        logging.error("No images found in the data folder.")
        return

    tracking_file_path = "image_split_tracking.txt"
    split_and_copy_images(args.train_folder, args.test_folder, images_by_subfolder, args.split_ratio, tracking_file_path)
    logging.info("Successfully split and copied images and XML files to train and test folders.")

# Run the split only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
