## Usage Guide

This script is designed to split a dataset of images and their corresponding XML annotation files into training and testing sets. The dataset is expected to be organized into subfolders within a main data folder. If you used the highlight feature before this step, please use part 2 of the 2.1 version instead.

### Prerequisites

Make sure you have Python 3.x installed on your system.

### How to use the script

1. Save the provided script as a `.py` file, for example `split_images.py`.

2. Open a terminal or command prompt and navigate to the directory where the script is saved.

3. Run the script using the following command:

   ```
   python split_images.py
   ```

This will use the default data folder, train folder, test folder, and split ratio (80% for training and 20% for testing). The default paths are as follows:

- Data folder: `Tensorflow/workspace/images/collectedimages`
- Train folder: `Tensorflow/workspace/images/train`
- Test folder: `Tensorflow/workspace/images/test`

4. To customize the input and output folder paths and the split ratio, use the following command-line arguments:

- `--data_folder`: Path to the folder containing the image dataset
- `--train_folder`: Path to the output train folder
- `--test_folder`: Path to the output test folder
- `--split_ratio`: Ratio of images to be used for training (e.g., 0.8 means 80% for training and 20% for testing)

Example command with custom arguments:



```
python split_images.py --data_folder "path/to/data" --train_folder "path/to/train" --test_folder "path/to/test" --split_ratio 0.9
```


5. The script will split the images and XML files from each subfolder into the train and test folders, maintaining the specified split ratio. Progress will be logged in the terminal or command prompt.

## Notes

- Make sure the input data folder contains subfolders with images and their corresponding XML files.
- The script will create the train and test folders if they don't already exist.
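- This script does not account for data that has already been split: if you have split the dataset before and then add new images to the `collectedimages` folder, running it again reshuffles and copies every image, so the same image may end up in both the train and test folders.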

## If you want to use this notebook instead 

Modify the first code cell to set the following values:
    data_folder, train_folder, test_folder, split_ratio

In [None]:
class Args:
    """Notebook stand-in for the command-line arguments.

    Edit the class attributes below to point at your own folders. Any folder
    path left as an empty string is replaced by the project default when
    ``set_defaults`` is called.
    """

    data_folder = ""  # Folder holding one subfolder of images per label
    train_folder = ""  # Output folder for the training split
    test_folder = ""  # Output folder for the testing split
    split_ratio = 0.8  # Fraction of images used for training (0.8 = 80% train / 20% test)

    def set_defaults(self):
        """Fill in the default path for every folder setting that is still empty."""
        fallback_paths = {
            "data_folder": "Tensorflow/workspace/images/collectedimages",
            "train_folder": "Tensorflow/workspace/images/train",
            "test_folder": "Tensorflow/workspace/images/test",
        }
        for setting, fallback in fallback_paths.items():
            # Only overwrite values the user left blank
            if not getattr(self, setting):
                setattr(self, setting, fallback)

# Create an instance of the Args class to store the argument values.
# main() reads this module-level `args` object for its folder paths and split ratio.
args = Args()
args.set_defaults()  # Fill in the default path for any folder left as an empty string


In [None]:
import os
import shutil
import random
import argparse
import logging
from tqdm import tqdm

def parse_arguments(argv=None):
    """Parse the command-line options that control the dataset split.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Leave as ``None`` for normal script use; pass an
            explicit list (e.g. ``[]``) when calling from a notebook or a
            test, where ``sys.argv`` holds unrelated arguments.

    Returns:
        An ``argparse.Namespace`` with ``data_folder``, ``train_folder`` and
        ``test_folder`` (str) and ``split_ratio`` (float).
    """
    images_root = os.path.join('Tensorflow', 'workspace', 'images')

    parser = argparse.ArgumentParser(description='Split image dataset into training and testing sets.')
    parser.add_argument('--data_folder', type=str, default=os.path.join(images_root, 'collectedimages'),
                        help='Path to the folder containing the image dataset')
    parser.add_argument('--train_folder', type=str, default=os.path.join(images_root, 'train'),
                        help='Path to the output train folder')
    parser.add_argument('--test_folder', type=str, default=os.path.join(images_root, 'test'),
                        help='Path to the output test folder')
    # argparse %-formats help strings, so a literal percent sign must be
    # written as '%%'; a bare '%' makes `--help` crash instead of printing.
    parser.add_argument('--split_ratio', type=float, default=0.8,
                        help='Ratio of images to be used for training (e.g., 0.8 means 80%% for training and 20%% for testing)')
    return parser.parse_args(argv)

def get_images_by_subfolder(data_folder, extensions=(".jpg",)):
    """Collect the image files found in each subfolder of ``data_folder``.

    Args:
        data_folder: Path to the folder whose immediate subfolders hold the
            images.
        extensions: File extension, or iterable of extensions, treated as
            images. Matching is case-insensitive, so ``.JPG`` is found too.

    Returns:
        A dict mapping each subfolder name to a sorted list of image paths
        (``data_folder/subfolder/file``). Subfolders without any matching
        file map to an empty list.

    Raises:
        OSError: If ``data_folder`` cannot be listed.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    images_by_subfolder = {}
    # Sorted listings make the result independent of filesystem order
    for folder in sorted(os.listdir(data_folder)):
        folder_path = os.path.join(data_folder, folder)
        # Stray files next to the subfolders (e.g. .DS_Store, a README) are
        # not datasets; listing them would raise NotADirectoryError.
        if not os.path.isdir(folder_path):
            continue
        images_by_subfolder[folder] = [
            os.path.join(folder_path, name)
            for name in sorted(os.listdir(folder_path))
            if name.lower().endswith(suffixes)
            and os.path.isfile(os.path.join(folder_path, name))
        ]
    return images_by_subfolder

def _annotation_path(image_path):
    """Return the path of the XML annotation expected next to ``image_path``."""
    return os.path.splitext(image_path)[0] + ".xml"


def _copy_image_pairs(image_paths, destination, description):
    """Copy each image and its same-named XML annotation into ``destination``."""
    for image_path in tqdm(image_paths, desc=description):
        xml_path = _annotation_path(image_path)
        shutil.copy(image_path, os.path.join(destination, os.path.basename(image_path)))
        shutil.copy(xml_path, os.path.join(destination, os.path.basename(xml_path)))


def split_and_copy_images(train_folder, test_folder, images_by_subfolder, split_ratio):
    """Randomly split each subfolder's images and copy them with their XML files.

    The split is done per subfolder: the first ``int(split_ratio * n)`` of
    the shuffled images go to ``train_folder`` and the rest to
    ``test_folder``. Both output folders are flat, so files that share a
    name across subfolders overwrite each other.

    Args:
        train_folder: Output folder for the training split (created if missing).
        test_folder: Output folder for the testing split (created if missing).
        images_by_subfolder: Dict mapping a subfolder name to a list of image
            paths. The lists are not modified.
        split_ratio: Fraction of each subfolder's images used for training;
            must be between 0 and 1 inclusive.

    Raises:
        ValueError: If ``split_ratio`` is outside ``[0, 1]``.
    """
    # Reject a bad ratio before touching the filesystem
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    for folder, images in images_by_subfolder.items():
        # Drop unlabeled images up front so a missing XML neither aborts the
        # run half-way nor leaves an image without its annotation.
        labeled_images = []
        for image_path in images:
            if os.path.isfile(_annotation_path(image_path)):
                labeled_images.append(image_path)
            else:
                logging.warning("Skipping '%s': annotation file '%s' not found.",
                                image_path, _annotation_path(image_path))

        # labeled_images is a new list, so shuffling leaves the caller's list untouched
        random.shuffle(labeled_images)
        split_index = int(split_ratio * len(labeled_images))

        _copy_image_pairs(labeled_images[:split_index], train_folder,
                          f"Copying train images from {folder}")
        _copy_image_pairs(labeled_images[split_index:], test_folder,
                          f"Copying test images from {folder}")

def main():
    """Split the dataset described by the module-level ``args`` object.

    Reads ``data_folder``, ``train_folder``, ``test_folder`` and
    ``split_ratio`` from ``args``, logs an error and returns early when the
    data folder is unusable or holds no images, and otherwise copies the
    images and XML files into the train and test folders.
    """
    logging.basicConfig(level=logging.INFO)

    # isdir (not exists) so that a path pointing at a regular file is also
    # rejected here instead of failing later when it is listed.
    if not os.path.isdir(args.data_folder):
        logging.error(f"Data folder '{args.data_folder}' does not exist or is not a directory.")
        return

    images_by_subfolder = get_images_by_subfolder(args.data_folder)

    # The dict can have an entry for a subfolder that holds no images, so
    # test the image lists rather than the dict itself.
    if not any(images_by_subfolder.values()):
        logging.error("No images found in the data folder.")
        return

    split_and_copy_images(args.train_folder, args.test_folder, images_by_subfolder, args.split_ratio)
    logging.info("Successfully split and copied images and XML files to train and test folders.")

# Run the split as soon as this cell (or script) is executed.
main()