In [1]:
import os
import random
import shutil

# Root directory of the AIZOO dataset; the split sub-folders are created under it.
data_dir = 'data/aizoo/'
# Text file listing, one name per line, the images reserved as the held-out set.
aizoo_val_files = "./data/aizoo-val-items.txt"

# Fix the seed of the global random generator so the shuffle below is repeatable.
seed_value = 42
random.seed(seed_value)

In [2]:
# Read the file names (one per line) from the VALIDATION file.
with open(aizoo_val_files, 'r') as val_list:
    # Strip surrounding whitespace and drop blank lines (e.g. a trailing empty
    # line): an empty name can never match an image and would only make the
    # length check on the matched files fail.
    existing_files = [name for name in (line.strip() for line in val_list) if name]

In [3]:
# Get the extension-less names of all regular files in the images folder.
# The path is built once with os.path.join so it does not depend on whether
# data_dir ends with a slash.
source_images_dir = os.path.join(data_dir, "data", "images")
# os.listdir returns entries in arbitrary order; sort so that every run starts
# from the same list.
folder_files = sorted(
    os.path.splitext(filename)[0]
    for filename in os.listdir(source_images_dir)
    if os.path.isfile(os.path.join(source_images_dir, filename))
)
# Two images that differ only by extension would collapse to the same name.
assert len(folder_files) == len(set(folder_files)), \
    "duplicate image names after stripping the extension"

In [4]:
# Select the images that belong to the held-out (test) split: an image is held
# out when the part of its name before the first "_jpg" equals one of the names
# read from the validation list.
# A set makes each membership check O(1) instead of scanning the whole list for
# every image.
held_out_names = set(existing_files)
test_files = [
    image_name
    for image_name in folder_files
    if image_name.split("_jpg")[0] in held_out_names
]

# Every listed name must be accounted for, and no image may be selected twice.
assert len(test_files) == len(existing_files), \
    "number of matched images differs from the number of names in the validation list"
assert len(set(test_files)) == len(test_files), \
    "an image was selected for the test split more than once"

In [5]:
# Everything that is not held out for testing is available for train/validation.
# Sort the result: the iteration order of a set of strings is not stable between
# interpreter sessions (string hashing is randomised), so without sorting the
# seeded shuffle that follows would start from a different order on every run
# and the split would not be reproducible.
train_val_files = sorted(set(folder_files) - set(test_files))
# Last expression of the cell, so the notebook displays the count.
len(train_val_files)

In [6]:
# Fraction of the non-test images that goes to the validation split.
validation_percent = 0.1

# Shuffle a copy with a dedicated, seeded generator instead of shuffling
# train_val_files in place with the global one: the input list is left
# untouched and re-running this cell produces the same split every time.
shuffled_files = list(train_val_files)
random.Random(seed_value).shuffle(shuffled_files)

# Calculate the number of images for each split
num_images = len(shuffled_files)
print("# of images:", num_images)

# int() truncates, so the validation split is rounded down (possibly to 0).
num_validation = int(num_images * validation_percent)
num_train = num_images - num_validation

# Split the image files into train/validation sets: the first num_train
# shuffled names are for training, the remainder for validation.
train_files = shuffled_files[:num_train]
print("Train images:", len(train_files))
validation_files = shuffled_files[num_train:]
print("Val images:", len(validation_files))

# The two splits must cover every image exactly once.
assert len(train_files) + len(validation_files) == num_images

In [7]:
# Make sure the dataset root and one sub-directory per split exist;
# directories that are already present are left as they are.
os.makedirs(data_dir, exist_ok=True)
for split_name in ('train', 'valid', 'test'):
    os.makedirs(os.path.join(data_dir, split_name), exist_ok=True)

In [14]:
def move_partition(partition, files, root_dir=None):
    """Copy the images and labels of one split into ``<root>/<partition>/``.

    Despite the name, files are copied with ``shutil.copy`` rather than moved,
    so the source ``data/images`` and ``data/labels`` folders keep their
    contents.

    Args:
        partition: Name of the split sub-directory to fill, e.g. ``"train"``.
        files: Iterable of base file names without extension. For each name
            the image ``<name>.jpg`` and the label ``<name>.txt`` are copied.
        root_dir: Dataset root that contains ``data/images`` and
            ``data/labels`` and under which ``<partition>/images`` and
            ``<partition>/labels`` are created. Defaults to the notebook-level
            ``data_dir``.

    Raises:
        FileNotFoundError: Propagated from ``shutil.copy`` when an image or a
            label file is missing in the source folders.
    """
    # Resolve the default lazily so the global is only needed when no root is given.
    root = data_dir if root_dir is None else root_dir

    # os.path.join works whether or not the root ends with a path separator.
    images_dir = os.path.join(root, "data", "images")
    labels_dir = os.path.join(root, "data", "labels")

    partition_images_dir = os.path.join(root, partition, "images")
    partition_labels_dir = os.path.join(root, partition, "labels")

    # The destinations do not depend on the file, so create them once up front
    # (an empty partition therefore still gets its directory layout).
    os.makedirs(partition_images_dir, exist_ok=True)
    os.makedirs(partition_labels_dir, exist_ok=True)

    for file_name in files:
        file = file_name + ".jpg"
        label = file_name + ".txt"

        # Copy Image
        shutil.copy(os.path.join(images_dir, file), os.path.join(partition_images_dir, file))
        # Copy label
        shutil.copy(os.path.join(labels_dir, label), os.path.join(partition_labels_dir, label))

In [16]:
# Populate each split folder in turn: validation first, then train, then test.
partition_plan = (
    ("valid", validation_files),
    ("train", train_files),
    ("test", test_files),
)
for partition_name, partition_files in partition_plan:
    move_partition(partition_name, partition_files)
print('Data split completed successfully.')

Data split completed successfully.
