In [1]:
import os
import shutil
import random
from collections import defaultdict

# Config
BASE_DIR = './relabeled/'
CLASSES = ['Belum', 'Sudah', 'Terlalu']
SPLITS = ['train', 'test']
TARGET_RATIO = 0.8  # 80% train, 20% test
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')
RANDOM_SEED = None  # set to an int to make the shuffle reproducible


def collect_image_paths(base_dir, classes, splits):
    """Return image paths found under ``base_dir/<split>/<cls>``.

    The result is a nested mapping ``result[cls][split] -> list of paths``.
    Directories that do not exist are skipped, so the corresponding list is
    simply empty. File names are sorted because ``os.listdir`` returns them
    in arbitrary order, which would defeat a seeded shuffle.
    """
    found = defaultdict(lambda: defaultdict(list))
    for split in splits:
        for cls in classes:
            class_dir = os.path.join(base_dir, split, cls)
            if not os.path.isdir(class_dir):
                continue
            found[cls][split] = [
                os.path.join(class_dir, fname)
                for fname in sorted(os.listdir(class_dir))
                if fname.lower().endswith(IMAGE_EXTENSIONS)
            ]
    return found


def unique_destination(directory, filename):
    """Return a path in ``directory`` for ``filename`` that does not exist yet.

    When ``filename`` is already taken, a numeric suffix is inserted before
    the extension (``img.jpg`` -> ``img_1.jpg``, ``img_2.jpg``, ...).
    """
    candidate = os.path.join(directory, filename)
    if not os.path.exists(candidate):
        return candidate
    stem, ext = os.path.splitext(filename)
    counter = 1
    while True:
        candidate = os.path.join(directory, f"{stem}_{counter}{ext}")
        if not os.path.exists(candidate):
            return candidate
        counter += 1


def rebalance_class(base_dir, cls, pool, keep_total, train_count, rng=None):
    """Shrink one class to ``keep_total`` images and re-split it on disk.

    Args:
        base_dir: Dataset root containing ``train/`` and ``test/`` folders.
        cls: Class (sub-folder) name.
        pool: Paths of every image currently belonging to the class.
        keep_total: Number of images to keep; the rest of ``pool`` is deleted.
        train_count: How many of the kept images go to ``train``; the
            remainder goes to ``test``.
        rng: Optional ``random.Random`` instance; the ``random`` module's
            global generator is used when omitted.

    Returns:
        ``(train_paths, test_paths)`` with the final location of every kept
        image.

    Only files listed in ``pool`` are ever deleted or moved; anything else
    in the class folders is left untouched.
    """
    shuffled = list(pool)
    (rng if rng is not None else random).shuffle(shuffled)

    kept = shuffled[:keep_total]
    # Delete the surplus first so that the names it occupied become
    # available to files that are about to be moved.
    for surplus_path in shuffled[keep_total:]:
        os.remove(surplus_path)

    final_paths = {}
    assignments = (('train', kept[:train_count]), ('test', kept[train_count:]))
    for split, members in assignments:
        dest_dir = os.path.join(base_dir, split, cls)
        os.makedirs(dest_dir, exist_ok=True)
        placed = []
        for src in members:
            wanted = os.path.join(dest_dir, os.path.basename(src))
            if os.path.normpath(src) == os.path.normpath(wanted):
                # Already in the right folder.
                placed.append(src)
                continue
            # train/ and test/ may hold different images with the same file
            # name; moving onto an existing file would silently overwrite it.
            dest = unique_destination(dest_dir, os.path.basename(src))
            shutil.move(src, dest)
            placed.append(dest)
        final_paths[split] = placed
    return final_paths['train'], final_paths['test']


# Collect all image paths by class and split
image_paths = collect_image_paths(BASE_DIR, CLASSES, SPLITS)

# Calculate total images per class
class_totals = {
    cls: len(image_paths[cls]['train']) + len(image_paths[cls]['test'])
    for cls in CLASSES
}
min_total = min(class_totals.values(), default=0)

# Target number per class based on ratio
target_train = int(min_total * TARGET_RATIO)
target_test = min_total - target_train

if min_total == 0:
    # Balancing down to zero would delete every image of every class, which
    # is almost certainly a wrong path or a misspelled class name.
    empty_classes = [cls for cls in CLASSES if class_totals[cls] == 0]
    print(f"Nothing to balance: no images found for {empty_classes} under {BASE_DIR!r}. No files were changed.")
else:
    print(f"Balancing to: {min_total} total images per class ({target_train} train / {target_test} test)\n")

    shuffle_rng = random.Random(RANDOM_SEED) if RANDOM_SEED is not None else None

    # Stratify each class
    for cls in CLASSES:
        all_images = image_paths[cls]['train'] + image_paths[cls]['test']
        rebalance_class(BASE_DIR, cls, all_images, min_total, target_train, rng=shuffle_rng)

    print("✅ Dataset balanced successfully!")


Balancing to: 2756 total images per class (2204 train / 552 test)

✅ Dataset balanced successfully!
