In [1]:
from pathlib import Path

In [15]:
# Dataset roots, relative to the notebook's working directory:
# the source images and the folder that receives the split copies.
original_data_path, data_path = Path("original_data"), Path("data")

In [13]:
# Number of entries (files and directories alike) found recursively under the source folder.
sum(1 for _ in original_data_path.rglob("*"))

16022

In [16]:
# Number of entries (files and directories alike) found recursively under the split folder.
sum(1 for _ in data_path.rglob("*"))

16045

In [9]:
import os
import shutil
import numpy as np

def split_data(source_folder, destination_folder, train_ratio=0.7, valid_ratio=0.15, test_ratio=0.15, seed=None):
    """Copy a folder-per-category dataset into train/valid/test subsets.

    Every sub-directory of ``source_folder`` is treated as one category. Its
    files are shuffled, cut into three consecutive slices, and copied (with
    metadata, via ``shutil.copy2``) to
    ``destination_folder/<train|valid|test>/<category>/``. The source files
    are left untouched.

    Args:
        source_folder: Directory containing one sub-directory per category.
        destination_folder: Root directory for the split; created on demand.
        train_ratio: Fraction of each category copied to ``train``.
        valid_ratio: Fraction of each category copied to ``valid``.
        test_ratio: Fraction of each category copied to ``test``. Only used
            for the sum check; ``test`` receives whatever remains after the
            ``train`` and ``valid`` slices have been taken.
        seed: Optional seed for a dedicated random generator, making the split
            reproducible. With ``None`` (the default) the global
            ``np.random`` state is used.

    Raises:
        AssertionError: If the three ratios do not sum to 1 (within
            floating-point tolerance).

    Note:
        Existing files in ``destination_folder`` are never removed, so calling
        this twice with different shuffles mixes two splits in one tree.
    """
    # Only directories are categories; stray top-level files would otherwise
    # make os.listdir(category_path) fail. Sorting removes the dependence on
    # the arbitrary order os.listdir returns, so a fixed seed is reproducible.
    categories = sorted(
        entry for entry in os.listdir(source_folder)
        if os.path.isdir(os.path.join(source_folder, entry))
    )

    # Compare with a tolerance: sums such as 0.6 + 0.3 + 0.1 are not
    # guaranteed to equal 1.0 exactly in binary floating point.
    assert np.isclose(train_ratio + valid_ratio + test_ratio, 1.0), "Ratios must sum up to 1"

    # Both the np.random module and a Generator expose an in-place shuffle().
    rng = np.random if seed is None else np.random.default_rng(seed)

    for category in categories:
        category_path = os.path.join(source_folder, category)

        # Keep regular files only; shutil.copy2 cannot copy a nested directory.
        images = sorted(
            name for name in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, name))
        )

        # Shuffle images for randomness
        rng.shuffle(images)

        train_end = int(len(images) * train_ratio)
        valid_end = train_end + int(len(images) * valid_ratio)

        # Three consecutive, non-overlapping slices of the shuffled list.
        subsets = {
            'train': images[:train_end],
            'valid': images[train_end:valid_end],
            'test': images[valid_end:],
        }

        for folder_name, selected_images in subsets.items():
            dest_path = os.path.join(destination_folder, folder_name, category)
            os.makedirs(dest_path, exist_ok=True)

            # Copy each image
            for img in selected_images:
                shutil.copy2(os.path.join(category_path, img), dest_path)

In [11]:
source_folder = 'original_data'
destination_folder = 'data'

# split_data shuffles on every call and never clears its destination, so
# running this cell a second time would copy a different split on top of the
# first one and leak images between train/valid/test. Only split into an
# empty (or not yet created) destination.
if os.path.isdir(destination_folder) and os.listdir(destination_folder):
    print(f"'{destination_folder}' already contains files; skipping the split. "
          "Delete the folder to regenerate it.")
else:
    split_data(source_folder, destination_folder)
