In [5]:
import os
import shutil
import random
from tqdm import tqdm

In [10]:
# Fractions of the dataset assigned to the train / validation / test subsets
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

In [8]:
# Fix the state of Python's global random generator so that shuffles
# performed through the `random` module start from a known state
random.seed(a=42)

In [9]:

# Input folders: images and their label files
input_images_dir = './images/'
input_labels_dir = './labels/'

# Output root; every subset gets its own images/ and labels/ sub-folder
output_base_dir = './split_dataset/'

train_images_dir, train_labels_dir = (
    os.path.join(output_base_dir, 'train', kind) for kind in ('images', 'labels')
)
val_images_dir, val_labels_dir = (
    os.path.join(output_base_dir, 'val', kind) for kind in ('images', 'labels')
)
test_images_dir, test_labels_dir = (
    os.path.join(output_base_dir, 'test', kind) for kind in ('images', 'labels')
)

# Make sure every output folder exists (no error if it is already there)
for out_dir in (
    train_images_dir, train_labels_dir,
    val_images_dir, val_labels_dir,
    test_images_dir, test_labels_dir,
):
    os.makedirs(out_dir, exist_ok=True)


In [11]:
# Get list of all images.
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# before shuffling; otherwise the same seed could yield different splits on
# different machines or file systems. Extensions are matched
# case-insensitively so files such as 'IMG_01.JPG' are not silently left out.
image_files = sorted(
    f for f in os.listdir(input_images_dir)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Shuffle the dataset with a dedicated generator seeded with the notebook's
# seed value (42). Unlike the global generator, its state does not depend on
# how many random numbers other cells have consumed, so re-running this cell
# always reproduces the same order.
random.Random(42).shuffle(image_files)

# Calculate split indices; int() truncates, so any remainder left over from
# rounding ends up in the test subset
total_images = len(image_files)
train_end = int(train_ratio * total_images)
val_end = train_end + int(val_ratio * total_images)

# Split the dataset
train_files = image_files[:train_end]
val_files = image_files[train_end:val_end]
test_files = image_files[val_end:]

# The three subsets must partition the listing exactly
assert len(train_files) + len(val_files) + len(test_files) == total_images

In [12]:

# Get list of all images.
# This cell computes the train/val/test split from scratch. It is written to
# be idempotent: the directory listing is sorted (os.listdir() order is
# arbitrary) and the shuffle uses its own seeded generator, so running the
# cell any number of times, in any order, gives the same split.
# Extensions are matched case-insensitively so files such as 'IMG_01.JPG'
# are not silently left out.
image_files = sorted(
    f for f in os.listdir(input_images_dir)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Shuffle the dataset with a dedicated generator seeded with the notebook's
# seed value (42) instead of the global one, whose state changes every time
# any cell draws random numbers.
random.Random(42).shuffle(image_files)

# Calculate split indices; int() truncates, so any remainder left over from
# rounding ends up in the test subset
total_images = len(image_files)
train_end = int(train_ratio * total_images)
val_end = train_end + int(val_ratio * total_images)

# Split the dataset
train_files = image_files[:train_end]
val_files = image_files[train_end:val_end]
test_files = image_files[val_end:]

# The three subsets must partition the listing exactly
assert len(train_files) + len(val_files) + len(test_files) == total_images

In [13]:

def copy_files(file_list, source_image_dir, source_label_dir, dest_image_dir, dest_label_dir):
    """Copy images and their matching label files into destination folders.

    Each image named in ``file_list`` is copied from ``source_image_dir`` to
    ``dest_image_dir``. Its label is looked up in ``source_label_dir`` under
    the image's base name with a ``.txt`` extension and, if that file exists,
    copied to ``dest_label_dir``. An image without a label is still copied;
    a warning is emitted for it.

    Args:
        file_list: Image file names (not paths) to copy.
        source_image_dir: Directory containing the images.
        source_label_dir: Directory containing the ``.txt`` label files.
        dest_image_dir: Existing directory that receives the images.
        dest_label_dir: Existing directory that receives the label files.
    """
    for file_name in tqdm(file_list, desc=f'Copying to {dest_image_dir}'):
        # Copy image file (copy2 also preserves file metadata such as timestamps)
        src_image_path = os.path.join(source_image_dir, file_name)
        dst_image_path = os.path.join(dest_image_dir, file_name)
        shutil.copy2(src_image_path, dst_image_path)

        # Copy corresponding label file: same base name, '.txt' extension
        label_name = os.path.splitext(file_name)[0] + '.txt'
        src_label_path = os.path.join(source_label_dir, label_name)
        if os.path.exists(src_label_path):
            dst_label_path = os.path.join(dest_label_dir, label_name)
            shutil.copy2(src_label_path, dst_label_path)
        else:
            # tqdm.write() prints the message without breaking the progress
            # bar, which a bare print() inside the loop would do.
            tqdm.write(f"Warning: Label file {label_name} not found for image {file_name}.")

# Populate each subset's output folders from the shared input folders,
# in train -> val -> test order
split_plan = (
    (train_files, train_images_dir, train_labels_dir),
    (val_files, val_images_dir, val_labels_dir),
    (test_files, test_images_dir, test_labels_dir),
)
for subset_files, subset_images_dir, subset_labels_dir in split_plan:
    copy_files(subset_files, input_images_dir, input_labels_dir, subset_images_dir, subset_labels_dir)

print("Dataset split completed.")

Copying to ./split_dataset/train/images: 100%|██████████| 36470/36470 [00:29<00:00, 1228.92it/s]
Copying to ./split_dataset/val/images: 100%|██████████| 7815/7815 [00:06<00:00, 1123.69it/s]
Copying to ./split_dataset/test/images: 100%|██████████| 7816/7816 [00:07<00:00, 1035.65it/s]

Dataset split completed.



