In [52]:
!pip install scikit-learn



In [53]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split

In [54]:
# Input dataset directory (must contain 'images/' and 'labels/' subfolders)
input_dir = "../../../Testing/Input/domino dataset"

# Output folder (the result goes into a subfolder named after the input folder)
output_base = "../../../Testing/Output"

# Split ratios for the train / validation / test subsets (must total 1.0)
train_ratio = 0.7
val_ratio   = 0.15
test_ratio  = 0.15

# Fail fast on a bad configuration. A wrong total would otherwise go unnoticed
# and silently produce subset sizes that differ from the configured ratios.
# A small tolerance absorbs floating-point rounding in the sum.
if min(train_ratio, val_ratio, test_ratio) < 0:
    raise ValueError(
        f"Split ratios must be non-negative, got "
        f"train={train_ratio}, val={val_ratio}, test={test_ratio}"
    )
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError(
        f"Split ratios must total 1.0, got {train_ratio + val_ratio + test_ratio}"
    )

# File extensions (lower-case, with leading dot) treated as dataset images
img_exts = ['.jpg', '.png', '.jpeg']

In [55]:
image_dir = os.path.join(input_dir, 'images')
label_dir = os.path.join(input_dir, 'labels')

# Validate every required directory before doing any work.
# Each exception carries ONE formatted message: FileNotFoundError is an OSError,
# and OSError interprets two positional arguments as (errno, strerror), which
# would render the text as "[Errno <message>] <path>".
if not os.path.isdir(image_dir):
    raise FileNotFoundError(f"Image directory not found: {image_dir}")
if not os.path.isdir(label_dir):
    raise FileNotFoundError(f"Label directory not found: {label_dir}")

if not os.path.isdir(output_base):
    raise FileNotFoundError(f"Output base directory not found: {output_base}")

# Images are matched by extension (case-insensitive); labels are the .txt files.
image_files = [f for f in os.listdir(image_dir) if os.path.splitext(f)[1].lower() in img_exts]
label_files = [f for f in os.listdir(label_dir) if f.lower().endswith('.txt')]

print("Found", len(image_files), "image files in:", image_dir)
print("Found", len(label_files), "label files in:", label_dir)
print("Output base directory exists:", output_base)


Found 57 image files in: ../../../Testing/Input/domino dataset\images
Found 57 label files in: ../../../Testing/Input/domino dataset\labels
Output base directory exists: ../../../Testing/Output


In [56]:
# The output folder is named after the input dataset folder. normpath strips a
# trailing separator first, which would otherwise make basename return ''.
dataset_name = os.path.basename(os.path.normpath(input_dir))
output_dir = os.path.join(output_base, dataset_name)

# Source folders holding the images and their label files.
images_dir, labels_dir = (
    os.path.join(input_dir, leaf) for leaf in ("images", "labels")
)

# Build <output_dir>/<subset>/{images,labels} for every subset; exist_ok makes
# re-running this cell harmless.
subsets = ["train", "val", "test"]
for subset in subsets:
    for subfolder in ("images", "labels"):
        os.makedirs(os.path.join(output_dir, subset, subfolder), exist_ok=True)


In [57]:
# os.listdir returns entries in arbitrary, platform-dependent order. Sorting
# gives the seeded split a stable input, so the same files land in the same
# subset on every machine and every run.
all_images = sorted(
    entry for entry in os.listdir(images_dir)
    if os.path.splitext(entry)[1].lower() in img_exts
)

if not all_images:
    # One formatted message; passing two arguments would print them as a tuple.
    raise ValueError(f"No image files found in: {images_dir}")

print("Total images found:",len(all_images))

Total images found: 57


In [58]:
# First cut: set aside everything that is not training data.
holdout_fraction = 1 - train_ratio
train_imgs, temp_imgs = train_test_split(
    all_images, test_size=holdout_fraction, random_state=42
)

# Second cut: divide the hold-out pool between validation and test. val_size is
# the validation share of that pool, so its complement is the test share.
val_size = val_ratio / (val_ratio + test_ratio)
val_imgs, test_imgs = train_test_split(
    temp_imgs, test_size=1 - val_size, random_state=42
)

# Subset name -> list of image file names, in train / val / test order.
splits = dict(zip(("train", "val", "test"), (train_imgs, val_imgs, test_imgs)))

for split in splits:
    files = splits[split]
    print(f"{split}: {len(files)} images")

train: 39 images
val: 9 images
test: 9 images


In [59]:
def copy_files(file_list, subset):
    """Copy images and their matching label files into one subset folder.

    Reads from the module-level ``images_dir`` and ``labels_dir`` and writes to
    ``<output_dir>/<subset>/images`` and ``<output_dir>/<subset>/labels``.

    The label for an image is the file in ``labels_dir`` with the same stem and
    a ``.txt`` extension. An image whose label is missing is still copied, and a
    warning is printed for it.

    Args:
        file_list: Image file names (not paths) located in ``images_dir``.
        subset: Name of the destination subset folder, e.g. ``"train"``.

    Returns:
        None.
    """
    dst_img_dir = os.path.join(output_dir, subset, "images")
    dst_lbl_dir = os.path.join(output_dir, subset, "labels")

    # Create the destinations here so the function works for any subset name
    # and does not depend on the folders having been prepared beforehand.
    os.makedirs(dst_img_dir, exist_ok=True)
    os.makedirs(dst_lbl_dir, exist_ok=True)

    for fname in file_list:
        # copy2 preserves file metadata (e.g. timestamps) along with the content.
        shutil.copy2(os.path.join(images_dir, fname), os.path.join(dst_img_dir, fname))

        label_file = os.path.splitext(fname)[0] + ".txt"
        src_lbl = os.path.join(labels_dir, label_file)

        if os.path.exists(src_lbl):
            shutil.copy2(src_lbl, os.path.join(dst_lbl_dir, label_file))
        else:
            print(f"Warning: Label not found for {fname}")

# Write every split to disk, one subset at a time.
for subset in splits:
    file_list = splits[subset]
    copy_files(file_list, subset)

print("Dataset split and copied successfully!")
print("Output saved in:", output_dir)

Dataset split and copied successfully!
Output saved in: ../../../Testing/Output\domino dataset
