In [1]:
import os
import shutil
import random
from glob import glob

In [2]:
def create_dir(path):
    """Ensure that the directory ``path`` exists, creating parents as needed.

    The call is idempotent: it does nothing when the directory is already
    present.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory. Failing
            here is deliberate; silently returning would let a later
            ``shutil.copy(src, path)`` overwrite that file instead of
            populating a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, so there is
    # no window between "check" and "create" in which another process can
    # create the directory and make os.makedirs() raise.
    os.makedirs(path, exist_ok=True)

In [4]:
def _split_index(total, fraction):
    """Return ``floor(total * fraction)``, tolerating binary float error.

    Sums such as ``0.7 + 0.2`` evaluate to ``0.8999999999999999``, so a product
    like ``30 * (0.7 + 0.2)`` can land just below the intended integer and a
    bare ``int()`` would truncate it one too low.
    """
    # The nudge is far larger than double-precision rounding error at realistic
    # dataset sizes, yet far too small to push a genuinely fractional product
    # over the next integer.
    return int(total * fraction + 1e-9)


def _copy_files(files, dest_dir):
    """Copy every path in ``files`` into ``dest_dir``, creating it on first use."""
    if not files:
        # Nothing to copy: do not leave an empty class directory behind.
        return
    create_dir(dest_dir)
    for file in files:
        shutil.copy(file, dest_dir)


def split_dataset(input_dir, output_train, output_test, output_val, train_split=0.7, test_split=0.2, val_split=0.1, seed=None):
    """Copy a class-per-folder dataset into train/test/val directory trees.

    Every immediate subdirectory of ``input_dir`` is treated as one class. The
    files directly inside it that match ``*.*`` are shuffled and copied (the
    originals are left in place) into ``<output_*>/<class name>/``.

    Args:
        input_dir: Directory containing one subdirectory per class.
        output_train: Root directory receiving the training portion.
        output_test: Root directory receiving the test portion.
        output_val: Root directory receiving the validation portion.
        train_split: Fraction of each class assigned to training.
        test_split: Fraction of each class assigned to testing.
        val_split: Expected validation fraction. The validation set always
            receives whatever remains after the train and test portions, so
            this value is only checked for consistency; a warning is issued
            when it disagrees with ``1 - train_split - test_split``.
        seed: Optional seed for the shuffle. With a seed, repeated runs over
            the same input produce the same split. ``None`` (the default)
            shuffles with the global ``random`` state.

    Raises:
        ValueError: If ``train_split`` or ``test_split`` is negative, or if
            together they exceed 1.
    """
    import warnings

    tolerance = 1e-9
    # Validate before touching the filesystem so a bad call leaves no output.
    if train_split < 0 or test_split < 0 or train_split + test_split > 1 + tolerance:
        raise ValueError(
            "train_split and test_split must be non-negative and sum to at most 1 "
            "(got train_split=%r, test_split=%r)" % (train_split, test_split)
        )
    remainder = 1 - train_split - test_split
    if abs(remainder - val_split) > tolerance:
        warnings.warn(
            "val_split=%r is not used to size the validation set; it receives the "
            "remaining fraction %.6g" % (val_split, remainder),
            stacklevel=2,
        )

    # A private generator keeps a seeded run from disturbing the global RNG.
    rng = random.Random(seed) if seed is not None else random

    create_dir(output_train)
    create_dir(output_test)
    create_dir(output_val)

    # Sorted so that a seeded shuffle does not depend on the OS listing order.
    class_dirs = sorted(
        d for d in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, d))
    )

    for class_dir in class_dirs:
        class_path = os.path.join(input_dir, class_dir)

        # '*.*' also matches subdirectories whose name contains a dot; keep
        # regular files only, because shutil.copy cannot copy a directory.
        image_paths = sorted(
            p for p in glob(os.path.join(class_path, '*.*')) if os.path.isfile(p)
        )
        rng.shuffle(image_paths)

        total_images = len(image_paths)
        train_idx = _split_index(total_images, train_split)
        test_idx = _split_index(total_images, train_split + test_split)

        portions = (
            (image_paths[:train_idx], output_train),
            (image_paths[train_idx:test_idx], output_test),
            (image_paths[test_idx:], output_val),
        )
        for files, target_dir in portions:
            _copy_files(files, os.path.join(target_dir, class_dir))

In [5]:
# All dataset locations live under one preprocessing workspace; the shared
# prefixes are spelled out once and the individual paths are derived from them.
preprocessing_root = 'C:\\Bangkit\\ML\\code\\preprocessing'
split_root = preprocessing_root + '\\split\\new-dataset-split'

# Source: the augmented dataset, one subfolder per class.
input_directory = preprocessing_root + '\\augmentation\\new-dataset-augmentation'

# Destinations for the three partitions.
output_train_directory = split_root + '\\train'
output_test_directory = split_root + '\\test'
output_val_directory = split_root + '\\val'

In [6]:
# Build the train/test/val folder trees from the augmented dataset, relying on
# split_dataset's default split fractions.
split_dataset(
    input_dir=input_directory,
    output_train=output_train_directory,
    output_test=output_test_directory,
    output_val=output_val_directory,
)