In [2]:
import os
import random
import cv2
import pandas as pd
from collections import defaultdict, Counter
from albumentations import Compose, Rotate, RandomScale
from tqdm import tqdm

# Transformasi augmentasi
transform = Compose([
    Rotate(limit=25, p=1, border_mode=cv2.BORDER_REFLECT_101),
    RandomScale(scale_limit=0.2, p=1, border_mode=cv2.BORDER_REFLECT_101)
])

random.seed(42)

def augment_train_balanced(
    attention_csv_path,
    train_folder,
    output_folder,
    total_augmented_images=80
):
    os.makedirs(output_folder, exist_ok=True)

    # Load CSV
    df = pd.read_csv(attention_csv_path, header=None, names=["filename", "attention_level"])
    df['attention_level'] = df['attention_level'].astype(int)

    # Filter hanya file yang ada di folder train
    train_filenames = set(os.listdir(train_folder))
    df_train = df[df['filename'].isin(train_filenames)]

    # Hitung distribusi awal
    attention_counts = df_train['attention_level'].value_counts().to_dict()
    print("📊 Distribusi awal di train:", attention_counts)

    # Buat mapping level -> list gambar
    attention_to_files = defaultdict(list)
    for _, row in df_train.iterrows():
        attention_to_files[row['attention_level']].append(row['filename'])

    # Simpan data augmentasi ke list
    augmented_rows = []
    current_counts = Counter(attention_counts)

    pbar = tqdm(total=total_augmented_images)
    aug_count = 0

    while aug_count < total_augmented_images:
        # Cari level dengan jumlah data terkini paling sedikit
        level = min(current_counts.items(), key=lambda x: x[1])[0]
        candidates = attention_to_files[level]

        if not candidates:
            continue

        img_name = random.choice(candidates)
        img_path = os.path.join(train_folder, img_name)
        image = cv2.imread(img_path)

        if image is None:
            print(f"⚠️ Gagal baca: {img_path}")
            continue

        augmented = transform(image=image)
        aug_img = augmented["image"]

        # Simpan hasil augmentasi
        aug_name = f"{os.path.splitext(img_name)[0]}_aug_{aug_count}.png"
        output_path = os.path.join(output_folder, aug_name)
        cv2.imwrite(output_path, aug_img)

        augmented_rows.append({"filename": aug_name, "attention_level": level})
        current_counts[level] += 1

        aug_count += 1
        pbar.update(1)

    pbar.close()

    # Gabungkan data asli dan augmentasi
    df_aug = pd.DataFrame(augmented_rows)
    df_combined = pd.concat([df_train[["filename", "attention_level"]], df_aug], ignore_index=True)

    # Distribusi akhir
    final_counts = df_combined['attention_level'].value_counts().to_dict()
    print("✅ Selesai augmentasi.")
    print("📊 Distribusi setelah augmentasi:", final_counts)

# Contoh penggunaan
augment_train_balanced(
    attention_csv_path='./attention.csv',
    train_folder='./train',
    output_folder='./train_aug',
    total_augmented_images=200
)


  RandomScale(scale_limit=0.2, p=1, border_mode=cv2.BORDER_REFLECT_101)


📊 Distribusi awal di train: {4: 88, 2: 46, 1: 45, 3: 37, 0: 24}


100%|██████████| 200/200 [00:03<00:00, 65.35it/s]

✅ Selesai augmentasi.
📊 Distribusi setelah augmentasi: {1: 88, 2: 88, 3: 88, 4: 88, 0: 88}



