In [2]:
import os
import random
import shutil
from pathlib import Path

# Input folders (original full dataset)
images_path = Path("PreparedData/images")
labels_path = Path("PreparedData/labels")

# Output root
output_dir = Path("dataset")
splits = ["train", "val", "test"]
ratios = [0.8, 0.1, 0.1]  # 80% train, 10% val, 10% test

# Image extensions to pick up. They are compared against the lower-cased
# suffix so that "photo.JPG" is handled like "photo.jpg".
IMAGE_SUFFIXES = (".jpg", ".png")

# Fixed seed: re-running this cell must reproduce the same assignment,
# otherwise an image could land in "train" on one run and "test" on the next.
SPLIT_SEED = 42


def _discard(path):
    """Delete *path* if it exists; a missing file is not an error."""
    try:
        path.unlink()
    except FileNotFoundError:
        pass


def split_dataset(images_dir, labels_dir, out_dir, ratios=(0.8, 0.1, 0.1), seed=SPLIT_SEED):
    """Copy labelled images into train/val/test folders under *out_dir*.

    Only images that have a label file with the same stem and a ``.txt``
    extension in *labels_dir* are used. The candidates are sorted before a
    seeded shuffle, so the same inputs and seed always give the same split.

    Args:
        images_dir: Folder containing the source images.
        labels_dir: Folder containing one ``<stem>.txt`` label per image.
        out_dir: Output root; ``images/<split>`` and ``labels/<split>`` are
            created below it.
        ratios: Fractions for train and val (index 0 and 1). Everything left
            over after those two slices goes to test.
        seed: Seed for the private random generator used for shuffling.

    Returns:
        A dict mapping "train", "val" and "test" to the list of source image
        paths assigned to that split (in shuffled order).
    """
    images_dir = Path(images_dir)
    labels_dir = Path(labels_dir)
    out_dir = Path(out_dir)

    # Sorting removes the dependency on directory listing order, which
    # differs between filesystems. A missing input folder yields no files.
    candidates = []
    if images_dir.is_dir():
        candidates = sorted(
            img
            for img in images_dir.iterdir()
            if img.is_file()
            and img.suffix.lower() in IMAGE_SUFFIXES
            and (labels_dir / (img.stem + ".txt")).exists()
        )

    # A private generator keeps the shuffle reproducible without touching the
    # state of the global `random` module.
    random.Random(seed).shuffle(candidates)

    count = len(candidates)
    first_cut = int(ratios[0] * count)
    second_cut = first_cut + int(ratios[1] * count)
    assignment = {
        "train": candidates[:first_cut],
        "val": candidates[first_cut:second_cut],
        "test": candidates[second_cut:],
    }

    # Create destination folders
    for split in assignment:
        (out_dir / "images" / split).mkdir(parents=True, exist_ok=True)
        (out_dir / "labels" / split).mkdir(parents=True, exist_ok=True)

    # Pass 1: drop copies left behind by an earlier run in a *different*
    # split, so that no image ends up in two splits at once. This is done
    # for all files before anything is copied because two images may share
    # one label stem (e.g. a.jpg and a.png both use a.txt).
    for split, files in assignment.items():
        for other in assignment:
            if other == split:
                continue
            for img_file in files:
                _discard(out_dir / "images" / other / img_file.name)
                _discard(out_dir / "labels" / other / (img_file.stem + ".txt"))

    # Pass 2: copy every image together with its label.
    for split, files in assignment.items():
        for img_file in files:
            label_file = labels_dir / (img_file.stem + ".txt")

            # The label was present when the folder was listed; re-check in
            # case it disappeared in the meantime.
            if not label_file.exists():
                continue  # skip if label is missing

            shutil.copy(img_file, out_dir / "images" / split / img_file.name)
            shutil.copy(label_file, out_dir / "labels" / split / label_file.name)

    return assignment


# Split the dataset and copy the files
splits_data = split_dataset(images_path, labels_path, output_dir, ratios, seed=SPLIT_SEED)

# Kept for any later code that reads these names: the shuffled file list and
# the two slice boundaries.
image_files = [img for split in splits for img in splits_data[split]]
total = len(image_files)
train_end = len(splits_data["train"])
val_end = train_end + len(splits_data["val"])

print("✅ Dataset split complete:")
for split in splits:
    count = len(os.listdir(output_dir / f"images/{split}"))
    print(f"  {split}: {count} images")


✅ Dataset split complete:
  train: 4619 images
  val: 577 images
  test: 578 images
