### Libraries

In [1]:
import os
import random
import shutil

### Data Splits

In [2]:
# Split sizes, written as fractions in [0, 1] rather than 0-100 percentages.
# The validation share is applied to all common train files; the test share is
# applied to the files that are left after the validation draw.
valid_percentage, test_percentage = 0.2, 0.1

### Get the list of files in the train split

In [3]:
# Locate the existing train split and its three sub-folders on the external disk
external_disk_path = "/media/tiagociiic/easystore"
split = 'train'
dataset_dir = os.path.join(external_disk_path, "RORD")
train_dir = os.path.join(dataset_dir, split)
img_dir, gt_dir, mask_dir = (
    os.path.join(train_dir, leaf) for leaf in ('img', 'gt', 'mask')
)

# File names per folder, filtered by the extension expected in each one
img_files = [name for name in os.listdir(img_dir) if name.endswith('.jpg')]
gt_files = [name for name in os.listdir(gt_dir) if name.endswith('.jpg')]
mask_files = [name for name in os.listdir(mask_dir) if name.endswith('.png')]

# Base names (extension stripped) that are present in all three folders
common_files = set.intersection(
    *(
        {os.path.splitext(name)[0] for name in names}
        for names in (img_files, gt_files, mask_files)
    )
)


### Create the new directories

In [4]:
# Roots of the two new splits, created next to the train split
valid_dir, test_dir = (
    os.path.join(dataset_dir, split_name) for split_name in ('valid', 'test')
)
sub_dirs = ['img', 'gt', 'mask']

# Mirror the img/gt/mask layout under both roots; exist_ok=True keeps this
# cell re-runnable when the folders are already there.
for sub_dir in sub_dirs:
    for split_root in (valid_dir, test_dir):
        os.makedirs(os.path.join(split_root, sub_dir), exist_ok=True)

### Take 20 % of the common files for validation

In [None]:
# Draw the validation subset from the train files that have an image, a ground
# truth and a mask.
#
# The draw uses a dedicated, seeded generator over the *sorted* stems. A set of
# strings has no stable iteration order between kernel sessions, so without the
# sort even a fixed seed would select different files. A deterministic draw also
# makes this cell safe to re-run: it copies the same files again instead of
# adding a second random batch to the validation folders.
valid_seed = 42
num_files = int(len(common_files) * valid_percentage)
files_to_take = random.Random(valid_seed).sample(sorted(common_files), num_files)

# Copy the files to the validation directories
# NOTE(review): this copies, so the originals stay in the train folders --
# confirm that training excludes them, otherwise validation overlaps with train.
for file in files_to_take:
    for source_dir, kind, extension in (
        (img_dir, 'img', '.jpg'),
        (gt_dir, 'gt', '.jpg'),
        (mask_dir, 'mask', '.png'),
    ):
        shutil.copy(
            os.path.join(source_dir, file + extension),
            os.path.join(valid_dir, kind),
        )

print(f"Copied {num_files} files to the validation split")

### Take 10 % of the remaining common files for testing

In [None]:
# Take a percentage of the remaining common files for testing.
#
# `files_to_take` is the validation selection made by the validation cell; it is
# only read here. The test selection is stored under its own names so that
# re-running this cell keeps excluding the validation files. Overwriting
# `files_to_take` would make a second run exclude the previous test draw
# instead, letting validation files leak into the test split.
#
# The remaining stems are sorted and sampled with a seeded generator so the
# same test files are selected on every run (set order is not stable between
# kernel sessions).
test_seed = 42
remaining_files = sorted(common_files - set(files_to_take))
num_test_files = int(len(remaining_files) * test_percentage)
test_files = random.Random(test_seed).sample(remaining_files, num_test_files)

# Copy the files to the test directories
# NOTE(review): this copies, so the originals stay in the train folders --
# confirm that training excludes them, otherwise test overlaps with train.
for file in test_files:
    for source_dir, kind, extension in (
        (img_dir, 'img', '.jpg'),
        (gt_dir, 'gt', '.jpg'),
        (mask_dir, 'mask', '.png'),
    ):
        shutil.copy(
            os.path.join(source_dir, file + extension),
            os.path.join(test_dir, kind),
        )

print(f"Copied {num_test_files} files to the test split")