In [None]:
import os
from glob import glob

# --- Location of the original (unsplit) dataset ---
ORIG_DIR   = '/content/drive/MyDrive/Dataset Skripsi (15 Kelas Kran)'
ALLOWED_EXT = ('.png', '.jpg', '.jpeg', '.webp', '.avif', '.jfif')

# --- Count the images inside every class folder ---
# Each sub-directory of ORIG_DIR is treated as one class; plain files at the
# top level are ignored. Only entries whose (lower-cased) extension appears in
# ALLOWED_EXT are counted.
class_counts = {}

for class_name in sorted(os.listdir(ORIG_DIR)):
    class_src = os.path.join(ORIG_DIR, class_name)
    if os.path.isdir(class_src):
        class_counts[class_name] = sum(
            1
            for entry in glob(os.path.join(class_src, '*'))
            if os.path.splitext(entry)[1].lower() in ALLOWED_EXT
        )

total_images = sum(class_counts.values())

# --- Report the per-class counts and the grand total ---
print("Jumlah data per kelas (dataset awal):")
for cls, count in class_counts.items():
    print(f"{cls:20s} : {count}")
print(f"\nTotal semua gambar: {total_images}")


Jumlah data per kelas (dataset awal):
A 801 T              : 108
BL                   : 105
BM                   : 112
CLS 02               : 101
JF 03 TA             : 109
JF 08 ST             : 173
K 406 CTG            : 141
K 407 MH             : 201
K 409 GWC            : 107
V 688 CA             : 227
V 697 GKU            : 143
V TUL                : 172
Y 316 FA             : 123
Y 321 C              : 102
Y 327 GKU            : 122

Total semua gambar: 2046


In [None]:
!pip -q install pillow-avif-plugin

# --- Imports ---
import os
import random
import shutil
from glob import glob
from tqdm import tqdm
from PIL import Image
import pillow_avif

# --- Main Parameters ---
# Source dataset: every sub-folder is read as one class.
ORIG_DIR = '/content/drive/MyDrive/Dataset Skripsi (15 Kelas Kran)'
# Output directory that receives the train/val/test split.
SPLIT_DIR = '/content/drive/MyDrive/Dataset_Skripsi_Split_Kran_15'
# Fraction of each class assigned to the train / validation / test split.
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15
# File extensions accepted as images (compared against the lower-cased suffix).
ALLOWED_EXT = ('.png', '.jpg', '.jpeg', '.webp', '.avif', '.jfif')

# --- Utility: Save as JPG (RGB) ---
def save_as_jpg(src_path, dst_path, quality=95):
    """Convert one image file to an RGB JPEG.

    The EXIF orientation tag (if any) is applied to the pixels before saving,
    because the saved JPEG carries no EXIF data and would otherwise appear
    rotated. Errors are reported on stdout instead of being raised so that a
    single unreadable file does not abort a batch conversion.

    Args:
        src_path: Path of the image to read (any format Pillow can open).
        dst_path: Path of the JPEG file to write.
        quality: JPEG quality passed to Pillow's encoder (default 95).

    Returns:
        True if the JPEG was written, False if the file was skipped.
    """
    try:
        # Imported here so the helper is self-contained; in the notebook the
        # cell-level `from PIL import Image` has already loaded the package.
        from PIL import Image, ImageOps

        with Image.open(src_path) as src_im:
            # Bake the EXIF rotation into the pixels, then drop alpha/palette.
            rgb_im = ImageOps.exif_transpose(src_im).convert('RGB')
            # subsampling=0 keeps full chroma resolution (4:4:4).
            rgb_im.save(dst_path, 'JPEG', quality=quality, optimize=True, subsampling=0)
        return True
    except Exception as e:
        print(f"Skipping {src_path} due to error: {e}")
        # A failed save may leave a truncated file behind; remove it so the
        # dataset never contains a corrupt JPEG.
        try:
            if os.path.isfile(dst_path):
                os.remove(dst_path)
        except OSError:
            pass
        return False

# --- 1) Splitting and Standardizing to JPG for train/val/test ---
# For every class folder in ORIG_DIR: shuffle its images with a fixed seed,
# cut the list into train/val/test, and write each image as a JPEG into
# SPLIT_DIR/<split>/<class_name>/.
print("Starting dataset splitting and standardization...")
random.seed(42)  # for reproducible splits

# Clean up existing split directory to ensure a fresh start
# (destructive: everything under SPLIT_DIR is deleted).
if os.path.exists(SPLIT_DIR):
    shutil.rmtree(SPLIT_DIR)
    print(f"Removed existing directory: {SPLIT_DIR}")

# Create the main directory structure
os.makedirs(SPLIT_DIR, exist_ok=True)
for split in ['train', 'val', 'test']:
    os.makedirs(os.path.join(SPLIT_DIR, split), exist_ok=True)

class_names = sorted([d for d in os.listdir(ORIG_DIR) if os.path.isdir(os.path.join(ORIG_DIR, d))])
print(f"Found {len(class_names)} classes: {class_names}")

for class_name in class_names:
    class_src = os.path.join(ORIG_DIR, class_name)

    # Create class subfolders in each split
    for split in ['train', 'val', 'test']:
        os.makedirs(os.path.join(SPLIT_DIR, split, class_name), exist_ok=True)

    # Get all valid image files.
    # glob() returns paths in arbitrary (filesystem-dependent) order, so the
    # list is sorted first; otherwise the seeded shuffle below would still
    # produce a different split from run to run.
    images = sorted(
        p for p in glob(os.path.join(class_src, '*'))
        if os.path.splitext(p)[1].lower() in ALLOWED_EXT
    )
    random.shuffle(images)

    # Calculate split points. Train and val sizes are rounded down; the test
    # split receives whatever remains (TEST_RATIO itself is not used here).
    total = len(images)
    train_end = int(total * TRAIN_RATIO)
    val_end = train_end + int(total * VAL_RATIO)

    split_map = {
        'train': images[:train_end],
        'val': images[train_end:val_end],
        'test': images[val_end:]
    }

    # Copy and convert images to the corresponding split folder.
    # Output names restart at 00000 within each split folder.
    for split, files in split_map.items():
        for idx, src_path in enumerate(tqdm(files, desc=f"[{class_name}] -> {split}", leave=False)):
            dst_filename = f"{class_name}_{idx:05d}.jpg"
            dst_path = os.path.join(SPLIT_DIR, split, class_name, dst_filename)
            save_as_jpg(src_path, dst_path, quality=95)

print("\n✅ Part 1 Complete: Dataset has been split and standardized into JPG format.")
print(f"Results saved in: {SPLIT_DIR}")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/4.2 MB[0m [31m12.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m4.1/4.2 MB[0m [31m59.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hStarting dataset splitting and standardization...
Found 15 classes: ['A 801 T', 'BL', 'BM', 'CLS 02', 'JF 03 TA', 'JF 08 ST', 'K 406 CTG', 'K 407 MH', 'K 409 GWC', 'V 688 CA', 'V 697 GKU', 'V TUL', 'Y 316 FA', 'Y 321 C', 'Y 327 GKU']


                                                                    


✅ Part 1 Complete: Dataset has been split and standardized into JPG format.
Results saved in: /content/drive/MyDrive/Dataset_Skripsi_Split_Kran_15




In [None]:
# --- Imports ---
import os
import random
import shutil
from glob import glob
from tqdm import tqdm
from PIL import Image
import numpy as np
import albumentations as A

# --- Main Parameters ---
SPLIT_DIR   = '/content/drive/MyDrive/Dataset_Skripsi_Split_Kran_15'       # Input
AUGMENT_DIR = '/content/drive/MyDrive/Dataset_Skripsi_Augmented_200_Kran_15' # Output
TARGET_TRAIN_PER_CLASS = 200 # Target minimum images per class in the train set
SEED = 42 # Fixed seed so the augmented set can be regenerated

# --- Copying and Augmenting the Training Set ---
# The split dataset is copied verbatim to AUGMENT_DIR; then every class in
# AUGMENT_DIR/train that has fewer than TARGET_TRAIN_PER_CLASS JPEGs is topped
# up with augmented copies of its original images. val/ and test/ are only
# copied, never augmented.
print("Starting data augmentation process...")

# Seed the RNGs used for picking source images and drawing transform
# parameters.
# NOTE(review): recent albumentations releases use their own RNG and take a
# `seed` argument on A.Compose instead - confirm against the installed version.
random.seed(SEED)
np.random.seed(SEED)

# Clean up existing augmented directory to ensure a fresh start
# (destructive: everything under AUGMENT_DIR is deleted).
if os.path.exists(AUGMENT_DIR):
    shutil.rmtree(AUGMENT_DIR)
    print(f"Removed existing directory: {AUGMENT_DIR}")

# Copy the entire split dataset to the new augmented directory
print(f"Copying files from {SPLIT_DIR} to {AUGMENT_DIR}...")
shutil.copytree(SPLIT_DIR, AUGMENT_DIR)
print("Copy complete.")

# Augmentation pipeline: each transform is applied independently with p=0.5.
augment = A.Compose([
    A.RandomBrightnessContrast(p=0.5),
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=360, p=0.5),
    # Disabled transforms, kept for reference:
    # A.RandomScale(scale_limit=0.2, p=0.5),
    # A.GaussNoise(p=0.25),
    # A.MotionBlur(p=0.25),
    # A.Resize(224, 224)
])

train_root = os.path.join(AUGMENT_DIR, 'train')
class_names = sorted([d for d in os.listdir(train_root) if os.path.isdir(os.path.join(train_root, d))])
print(f"Found {len(class_names)} classes in the training set.")

for class_name in class_names:
    class_dir = os.path.join(train_root, class_name)

    # Get the current list of JPGs in the train folder for this class.
    # The list is taken once, before any augmented file is written, so only
    # original images are ever used as augmentation sources.
    current_jpgs = sorted(glob(os.path.join(class_dir, '*.jpg')))
    current_count = len(current_jpgs)
    needed = max(0, TARGET_TRAIN_PER_CLASS - current_count)

    print(f"[{class_name}]: {current_count} images exist. Target is {TARGET_TRAIN_PER_CLASS}. Needing {needed} more.")
    if needed == 0:
        continue
    if not current_jpgs:
        # random.choice() raises IndexError on an empty list; skip the class
        # instead of aborting the whole run.
        print(f"[{class_name}]: no source images available to augment; skipping.")
        continue

    # Augment images until the target is reached
    for i in tqdm(range(needed), desc=f"Augmenting {class_name}", leave=False):
        # Choose a random image from the original set to augment
        src_path = random.choice(current_jpgs)

        try:
            # Open image and convert to numpy array; the context manager
            # closes the file handle as soon as the pixels are copied.
            with Image.open(src_path) as src_im:
                image = np.array(src_im.convert('RGB'))

            # Apply augmentation
            augmented = augment(image=image)['image']

            # Ensure the data type is correct for saving
            if augmented.dtype != np.uint8:
                augmented = np.clip(augmented, 0, 255).astype(np.uint8)

            # Save the new augmented image with a unique name
            save_path = os.path.join(class_dir, f"aug_{i:06d}.jpg")
            Image.fromarray(augmented).save(save_path, 'JPEG', quality=95, optimize=True)

        except Exception as e:
            # Best effort: a failed augmentation is reported and skipped, so
            # the class may end up slightly below the target.
            print(f"Could not augment {src_path}: {e}")
            continue

print("\n✅ Part 2 Complete: Augmentation of the training set is finished.")
print(f"Final dataset ready for training in: {AUGMENT_DIR}")

Starting data augmentation process...
Copying files from /content/drive/MyDrive/Dataset_Skripsi_Split_Kran_15 to /content/drive/MyDrive/Dataset_Skripsi_Augmented_200_Kran_15_Ver2...
Copy complete.
Found 15 classes in the training set.
[A 801 T]: 75 images exist. Target is 200. Needing 125 more.




[BL]: 73 images exist. Target is 200. Needing 127 more.




[BM]: 78 images exist. Target is 200. Needing 122 more.




[CLS 02]: 70 images exist. Target is 200. Needing 130 more.




[JF 03 TA]: 76 images exist. Target is 200. Needing 124 more.




[JF 08 ST]: 121 images exist. Target is 200. Needing 79 more.




[K 406 CTG]: 98 images exist. Target is 200. Needing 102 more.




[K 407 MH]: 140 images exist. Target is 200. Needing 60 more.




[K 409 GWC]: 74 images exist. Target is 200. Needing 126 more.




[V 688 CA]: 158 images exist. Target is 200. Needing 42 more.




[V 697 GKU]: 100 images exist. Target is 200. Needing 100 more.




[V TUL]: 120 images exist. Target is 200. Needing 80 more.




[Y 316 FA]: 86 images exist. Target is 200. Needing 114 more.




[Y 321 C]: 71 images exist. Target is 200. Needing 129 more.




[Y 327 GKU]: 85 images exist. Target is 200. Needing 115 more.


                                                                       


✅ Part 2 Complete: Augmentation of the training set is finished.
Final dataset ready for training in: /content/drive/MyDrive/Dataset_Skripsi_Augmented_200_Kran_15_Ver2


