# Data hasil Koleksi dan Scraping

In [1]:
import os
import random
import shutil
import cv2
import splitfolders
import pandas as pd
from PIL import Image
from pillow_heif import register_heif_opener
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

##### === Path Configuration ===

In [2]:
# Root of the working dataset.
# NOTE(review): absolute, machine-specific Windows path — anyone else running
# this notebook has to edit this line before any cell below will work.
base_folder = r"D:\1 Main File\Project File\Capstone Bangkit\MakeDataset6"
# Raw collected/scraped images; processed below as one sub-folder per class.
input_folder = os.path.join(base_folder, 'raw_data')
# Destination of the train/val/test split.
dataset_folder = os.path.join(base_folder, 'dataset')
train_path = os.path.join(dataset_folder, 'train')
# Temporary output folder for augmented images; note that it lives INSIDE train_path.
augmen_path = os.path.join(train_path, 'Augmen')
# Size tuple passed to cv2.resize and to Keras' target_size throughout the notebook.
final_size = (640, 640)

##### === Resize dan Rename ===

In [3]:
def _first_free_path(folder, name_for, start):
    """Return (index, path) for the first index >= start whose file name is unused in folder."""
    idx = start
    candidate = os.path.join(folder, name_for(idx))
    while os.path.exists(candidate):
        idx += 1
        candidate = os.path.join(folder, name_for(idx))
    return idx, candidate


def resize_and_rename(input_folder, size, new_name_suffix):
    """Resize every image in each class sub-folder in place and rename it.

    Each ``.jpg``/``.jpeg``/``.png`` file found directly inside
    ``input_folder/<class>/`` is resized to ``size``, written as
    ``<class><new_name_suffix><n>.jpg`` and the original file is deleted.

    The work is done in two phases (write to a staging name, then rename) so
    that a generated name can never overwrite, or be deleted together with, a
    source file that happens to carry the same name already. This makes the
    function safe to run more than once on the same folder.

    Args:
        input_folder: Folder whose immediate sub-folders are class folders.
        size: Target size tuple handed to ``cv2.resize``.
        new_name_suffix: Text placed between the class name and the counter.

    Returns:
        None. Files that OpenCV cannot read are reported and left untouched.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    print("Processing image...")
    for class_folder in os.listdir(input_folder):
        class_folder_path = os.path.join(input_folder, class_folder)
        if not os.path.isdir(class_folder_path):
            continue

        # Phase 1: resize into staging files and drop the originals.
        staged_paths = []
        stage_idx = 0
        # Sorted snapshot: deterministic numbering, and staging files created
        # below are never picked up by this loop.
        for file in sorted(os.listdir(class_folder_path)):
            if not file.lower().endswith(image_exts):
                continue
            img_path = os.path.join(class_folder_path, file)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable/corrupt file with None.
                print(f"Skipping unreadable image: {img_path}")
                continue

            stage_idx, staged_path = _first_free_path(
                class_folder_path, lambda n: f"__resizing_{n}.jpg", stage_idx
            )
            if not cv2.imwrite(staged_path, cv2.resize(img, size)):
                print(f"Failed to write resized image for: {img_path}")
                if os.path.exists(staged_path):
                    os.remove(staged_path)
                continue

            # Only delete the source once the resized copy is safely on disk.
            os.remove(img_path)
            staged_paths.append(staged_path)

        # Phase 2: give the staged files their final names, skipping any name
        # still occupied (e.g. by a file that was skipped above).
        next_idx = 1
        for staged_path in staged_paths:
            next_idx, final_path = _first_free_path(
                class_folder_path,
                lambda n: f"{class_folder}{new_name_suffix}{n}.jpg",
                next_idx,
            )
            os.replace(staged_path, final_path)
            next_idx += 1
    print(f"All image has been processed and saved in {input_folder}.")

In [4]:
# Resize the raw images to final_size inside each class folder of input_folder.
# The function deletes the original files, so keep a backup of raw_data.
resize_and_rename(input_folder, final_size, new_name_suffix="_")

Processing image...
All image has been processed and saved in D:\1 Main File\Project File\Capstone Bangkit\MakeDataset6\raw_data.


##### === Split Dataset ===

In [5]:
def split_dataset(input_folder, dataset_folder, ratio):
    """Split a class-per-folder image directory with ``splitfolders.ratio``.

    Args:
        input_folder: Source directory handed to ``splitfolders.ratio``.
        dataset_folder: Output directory for the split.
        ratio: Ratio tuple forwarded unchanged to ``splitfolders.ratio``.

    The seed is fixed at 42 and no prefix grouping is requested.
    """
    split_options = {
        "output": dataset_folder,
        "seed": 42,
        "ratio": ratio,
        "group_prefix": None,
    }
    splitfolders.ratio(input_folder, **split_options)
    print(f"Dataset splited to train, val, and test in {dataset_folder}.")

In [9]:
# NOTE(review): the ratio tuple is presumably (train, val, test); if so only 40%
# of the raw data lands in train and 50% in val — confirm this is intentional.
split_dataset(input_folder, dataset_folder, (0.4, 0.5, 0.1))

Copying files: 699 files [00:00, 1134.85 files/s]

Dataset splited to train, val, and test in D:\1 Main File\Project File\Capstone Bangkit\MakeDataset6\dataset.





##### === Augmentasi Data ===

In [10]:
def augment_data(train_path, augmen_path, size, augmentation_count=30):
    """Write ``augmentation_count`` augmented variants of every training image.

    Images are read from ``train_path/<class>/`` WITHOUT augmentation and each
    one is then passed ``augmentation_count`` times through
    ``datagen.random_transform``, so every saved file is exactly one random
    transform away from its source. (Drawing the images from the augmenting
    generator itself would transform them once on load and a second time
    here.)

    Args:
        train_path: Directory containing one sub-folder per class.
        augmen_path: Output root; files go to ``augmen_path/<class>/``.
        size: Target size passed to ``flow_from_directory``.
        augmentation_count: Number of variants saved per source image.

    Returns:
        None. Output files are named ``<class>_aug_<i>_<k>.jpg``.
    """
    datagen = ImageDataGenerator(
        rotation_range=15,
        zoom_range=0.1,
        width_shift_range=0.1,
        height_shift_range=0.1,
        brightness_range=[0.8, 1.2],
        vertical_flip=True,
        horizontal_flip=True,
        fill_mode='constant',
        cval=0
    )

    # The output folder may live inside train_path; list the classes explicitly
    # so a leftover output folder from a previous run is not read as a class.
    augmen_abs = os.path.abspath(augmen_path)
    class_dirs = sorted(
        entry for entry in os.listdir(train_path)
        if os.path.isdir(os.path.join(train_path, entry))
        and os.path.abspath(os.path.join(train_path, entry)) != augmen_abs
    )

    # Plain generator: loads and resizes only, applies no random transform.
    loader = ImageDataGenerator()
    train_generator = loader.flow_from_directory(
        train_path,
        target_size=size,
        classes=class_dirs,
        batch_size=1,
        class_mode='categorical',
        shuffle=False
    )

    os.makedirs(augmen_path, exist_ok=True)
    class_names = train_generator.class_indices
    for class_name in class_names:
        os.makedirs(os.path.join(augmen_path, class_name), exist_ok=True)

    # Invert the name -> index mapping once instead of scanning it per image.
    index_to_class = {index: name for name, index in class_names.items()}

    total_images = train_generator.samples
    for i in range(total_images):
        images, labels = next(train_generator)
        class_name = index_to_class[int(labels[0].argmax())]
        for aug_num in range(augmentation_count):
            augmented_image = datagen.random_transform(images[0])
            save_path = os.path.join(augmen_path, class_name, f"{class_name}_aug_{i}_{aug_num}.jpg")
            tf.keras.preprocessing.image.save_img(save_path, augmented_image)
    print(f"Augmented data saved in {augmen_path}.")

In [11]:
# Uses the default augmentation_count=30, i.e. 30 augmented files per training image.
augment_data(train_path, augmen_path, final_size)

Found 278 images belonging to 2 classes.


Augmented data saved in D:\1 Main File\Project File\Capstone Bangkit\MakeDataset6\dataset\train\Augmen.


##### === Transfer Augmented Data ===

In [12]:
def move_images_to_train(train_path, augmen_path):
    """Move every entry of ``augmen_path/<class>/`` into ``train_path/<class>/``.

    Destination class folders are created when missing. Each source class
    folder is removed once it is empty, and ``augmen_path`` itself is removed
    once nothing is left inside it.
    """
    for label in os.listdir(augmen_path):
        source_dir = os.path.join(augmen_path, label)
        target_dir = os.path.join(train_path, label)
        os.makedirs(target_dir, exist_ok=True)

        for entry in os.listdir(source_dir):
            source_file = os.path.join(source_dir, entry)
            target_file = os.path.join(target_dir, entry)
            shutil.move(source_file, target_file)

        # Drop the class folder only when the move left nothing behind.
        if len(os.listdir(source_dir)) == 0:
            os.rmdir(source_dir)

    if len(os.listdir(augmen_path)) == 0:
        os.rmdir(augmen_path)
    print("Augmented data transfered to train folder")

In [13]:
# Merge the augmented files into the real class folders of train and remove the Augmen folder.
move_images_to_train(train_path, augmen_path)

Augmented data transfered to train folder


# Data from Roboflow

##### === Path Dataset and Folder Output ===

In [None]:

# Root of the Roboflow export; preprocess_rfdataset looks for train/val/test sub-folders here.
# NOTE(review): absolute, machine-specific path.
roboflow_folder = r"D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\AllData\RFdataset"

##### === Preprocessing Roboflow Dataset ===

In [None]:
def preprocess_rfdataset(input_folder, output_folder, size):
    """Sort an annotated export into resized, class-per-folder images.

    For each of the ``train``/``val``/``test`` sub-folders of ``input_folder``
    the ``_annotations.csv`` file is read, and every distinct
    (``filename``, ``class``) pair is resized to ``size`` and written to
    ``output_folder/<split>/<class>/<class>_RF_<n>.jpg``. Source images are
    left in place; the annotation file of a processed split is removed, as the
    completion message states.

    Previously this function called ``resize_and_rename`` on the whole
    ``input_folder`` for every CSV row, which rewrote and deleted the source
    images in place and never produced anything in ``output_folder``.

    Args:
        input_folder: Export root containing the split sub-folders.
        output_folder: Dataset root that receives ``<split>/<class>/`` folders.
        size: Target size tuple handed to ``cv2.resize``.

    Returns:
        None.

    NOTE(review): a split folder without ``_annotations.csv`` is skipped; if the
    export names its validation split differently (e.g. ``valid``), it will be
    skipped too — confirm the folder names.
    """
    for split in ('train', 'val', 'test'):
        split_path = os.path.join(input_folder, split)
        output_path = os.path.join(output_folder, split)
        csv_path = os.path.join(split_path, "_annotations.csv")

        if not os.path.exists(csv_path):
            print(f"No annotation file for {split}. Skipping.")
            continue

        os.makedirs(output_path, exist_ok=True)
        df = pd.read_csv(csv_path)

        # One CSV row per annotation: the same image can appear several times,
        # so collapse to distinct (file, class) pairs before copying.
        pairs = df[['filename', 'class']].drop_duplicates()

        for idx, (img_name, category) in enumerate(pairs.itertuples(index=False, name=None), start=1):
            category = str(category)
            source_img_path = os.path.join(split_path, img_name)

            img = cv2.imread(source_img_path) if os.path.exists(source_img_path) else None
            if img is None:
                print(f"Skipping missing or unreadable image: {source_img_path}")
                continue

            category_path = os.path.join(output_path, category)
            os.makedirs(category_path, exist_ok=True)

            new_name = f"{category}_RF_{idx}.jpg"
            dest_img_path = os.path.join(category_path, new_name)
            if not cv2.imwrite(dest_img_path, cv2.resize(img, size)):
                print(f"Failed to write: {dest_img_path}")

        os.remove(csv_path)
        print(f"Processed {split}. Annotations removed.")

In [None]:
# Run the Roboflow preprocessing for the train/val/test splits found under roboflow_folder.
preprocess_rfdataset(roboflow_folder, dataset_folder, final_size)

# Data From Kaggle

##### === Konfigurasi Path ===

In [None]:
# Root of the Kaggle images; resized and split below.
# NOTE(review): absolute, machine-specific path.
kaggle_folder = r"D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\AllData\KaggleDataset"

##### === Preprocessing: Resize, Rename, dan Hapus Data Asli ===

In [None]:
# Resize in place and rename to <class>_k_<n>.jpg; the original files are deleted.
resize_and_rename(kaggle_folder, final_size, new_name_suffix="_k_")

##### === Membagi Dataset ===

In [None]:
# Third ratio entry is 0. NOTE(review): presumably (train, val, test), so nothing
# from Kaggle goes to test — confirm.
split_dataset(kaggle_folder, dataset_folder, (0.7, 0.3, 0))

# Additional Data

##### === Convert from HEIC to JPG ===

In [None]:
# Root of the additional images (HEIC conversion, resize, then used for balancing).
# NOTE(review): absolute, machine-specific path.
datasetcina_folder = r"D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\AllData\DatasetCina"

In [None]:
# Register the pillow_heif plugin so that PIL's Image.open can read HEIC files.
register_heif_opener()

In [None]:
def convert_heic_to_jpg(datasetcina_folder):
    """Convert every ``.heic`` file under ``datasetcina_folder`` to JPEG.

    The folder is walked recursively, so files directly inside it and files
    inside class sub-folders are both converted. Each ``name.heic`` becomes
    ``name.jpg`` next to it and the HEIC file is deleted after a successful
    save. If ``name.jpg`` already exists the HEIC file is left untouched.

    Args:
        datasetcina_folder: Root folder to scan.

    Returns:
        None. Per-file failures are printed and do not stop the run.
    """
    for folder, _subfolders, file_names in os.walk(datasetcina_folder):
        for file_name in file_names:
            if not file_name.lower().endswith(".heic"):
                continue

            file_path = os.path.join(folder, file_name)
            output_file_name = f"{os.path.splitext(file_name)[0]}.jpg"
            output_file_path = os.path.join(folder, output_file_name)

            if os.path.exists(output_file_path):
                print(f"JPG file exsist, skip: {output_file_name}")
                continue

            try:
                # The context manager closes the source file before os.remove;
                # an open handle makes the delete fail on Windows.
                with Image.open(file_path) as img:
                    # JPEG cannot store alpha/palette modes, so normalise to RGB.
                    img.convert("RGB").save(output_file_path, "JPEG")

                os.remove(file_path)

                print(f"Convertion successfull: {file_name} -> {output_file_name}")
            except Exception as e:
                # Best effort: report and continue with the next file.
                print(f"Error processing {file_name}: {e}")
                # The JPG did not exist before this attempt; remove a partial
                # one so a later run does not skip this file.
                if os.path.exists(file_path) and os.path.exists(output_file_path):
                    os.remove(output_file_path)

In [None]:
# Convert the HEIC photos of the additional dataset to JPG before resizing.
convert_heic_to_jpg(datasetcina_folder)

In [None]:
# Only images inside the class sub-folders of datasetcina_folder are processed;
# they are renamed to <class>_c_<n>.jpg.
resize_and_rename(datasetcina_folder, final_size, new_name_suffix="_c_")

# Data Balancing

##### === Data Count ===

In [None]:
# Count the number of images per class
def count_images_in_class(dataset_path):
    """Count image files per class folder for the train, val and test splits.

    Args:
        dataset_path: Directory expected to contain ``train``, ``val`` and
            ``test`` sub-folders, each holding one folder per class.

    Returns:
        ``{'train': {...}, 'val': {...}, 'test': {...}}`` mapping class folder
        names to the number of ``.jpg``/``.jpeg``/``.png`` files (extension
        match is case-insensitive). A missing split is reported with a message
        and keeps an empty dict.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    split_names = ('train', 'val', 'test')
    counts = {name: {} for name in split_names}

    for split in split_names:
        split_dir = os.path.join(dataset_path, split)
        if not os.path.exists(split_dir):
            print(f"Folder '{split}' not found.")
            continue

        for entry in os.listdir(split_dir):
            entry_path = os.path.join(split_dir, entry)
            if not os.path.isdir(entry_path):
                continue
            counts[split][entry] = sum(
                1 for name in os.listdir(entry_path) if name.lower().endswith(image_exts)
            )

    return counts


In [None]:
# Per-split, per-class image counts before balancing.
dataset_class_counts = count_images_in_class(dataset_folder)

In [None]:
# Show the per-class counts of every split.
# NOTE(review): this exact loop is repeated three times in the notebook; a small
# helper function would remove the duplication.
print("Number of image each class for each split:")
for split in ['train', 'val', 'test']:
    print(f"\nNumber of picture in folder {split}:")
    print(dataset_class_counts[split])

##### === Balancing val ===

In [None]:
def balance_val_dataset(dataset_path, datasetcina_folder):
    """Top up under-represented val classes, then move the rest into train.

    Step 1: for every class in ``dataset_path/val`` that has fewer images than
    the largest val class, up to the missing number of images is moved from
    ``datasetcina_folder/<class>/`` into ``val/<class>/``.
    Step 2: every image still left in any class folder of
    ``datasetcina_folder`` is moved into ``train/<class>/``.

    Classes without a matching source folder, and stray non-directory entries
    in ``datasetcina_folder``, are skipped instead of raising.

    Args:
        dataset_path: Dataset root containing ``train`` and ``val``.
        datasetcina_folder: Root of the extra images, one sub-folder per class.

    Returns:
        None.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    def _image_files(folder):
        # Sorted so the choice of files moved to val is reproducible.
        return sorted(f for f in os.listdir(folder) if f.lower().endswith(image_exts))

    val_counts = count_images_in_class(dataset_path)['val']

    val_folder = os.path.join(dataset_path, 'val')
    train_folder = os.path.join(dataset_path, 'train')

    # max() on an empty mapping raises, so only balance when val has classes.
    if val_counts:
        max_count = max(val_counts.values())

        for class_name, count in val_counts.items():
            difference = max_count - count
            if difference <= 0:
                continue

            source_class_folder = os.path.join(datasetcina_folder, class_name)
            if not os.path.isdir(source_class_folder):
                # No extra data available for this class.
                continue

            target_class_folder_val = os.path.join(val_folder, class_name)
            os.makedirs(target_class_folder_val, exist_ok=True)

            for file_name in _image_files(source_class_folder)[:difference]:
                shutil.move(os.path.join(source_class_folder, file_name), os.path.join(target_class_folder_val, file_name))

    for class_name in os.listdir(datasetcina_folder):
        source_class_folder = os.path.join(datasetcina_folder, class_name)
        if not os.path.isdir(source_class_folder):
            # Loose files at the top level are not class folders.
            continue

        target_class_folder_train = os.path.join(train_folder, class_name)
        os.makedirs(target_class_folder_train, exist_ok=True)

        for file_name in _image_files(source_class_folder):
            shutil.move(os.path.join(source_class_folder, file_name), os.path.join(target_class_folder_train, file_name))

In [None]:
# Top up val from the additional dataset, then move what is left of it into train.
balance_val_dataset(dataset_folder, datasetcina_folder)

In [None]:
# Recount after balancing val.
dataset_class_counts = count_images_in_class(dataset_folder)

In [None]:
# Show the per-class counts of every split after balancing val.
print("Number of image each class for each split:")
for split in ['train', 'val', 'test']:
    print(f"\nNumber of picture in folder {split}:")
    print(dataset_class_counts[split])

In [None]:
def augment_and_balance_data(train_path, variations_per_image, size=(640, 640)):
    """Augment the smaller training classes towards the size of the largest one.

    For every class folder in ``train_path`` with fewer images than the
    largest class, up to ``variations_per_image`` augmented copies of each
    existing image are written into the same folder until the class reaches
    the target count or its images are exhausted.

    Args:
        train_path: Directory containing one sub-folder per class.
        variations_per_image: Maximum augmented copies made from one image.
        size: Size tuple the source image is resized to before augmentation
            (defaults to the previously hard-coded ``(640, 640)``).

    Returns:
        None. The final message of each class reports the number of images
        actually present afterwards, which can be below the target when
        ``variations_per_image`` is too small to close the gap.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    datagen = ImageDataGenerator(
        rotation_range=27,
        vertical_flip=True,
        horizontal_flip=True,
        fill_mode='constant',
        cval=0
    )

    # Snapshot the images of each class up front so freshly written augmented
    # files are never used as augmentation sources.
    class_files = {}
    for class_name in os.listdir(train_path):
        class_folder = os.path.join(train_path, class_name)
        if os.path.isdir(class_folder):
            class_files[class_name] = [f for f in os.listdir(class_folder) if f.lower().endswith(image_exts)]

    if not class_files:
        print(f"No class folders found in {train_path}; nothing to balance.")
        return

    target_class_count = max(len(files) for files in class_files.values())

    for class_name, class_images in class_files.items():
        class_image_count = len(class_images)
        if class_image_count >= target_class_count:
            continue

        num_images_to_generate = target_class_count - class_image_count
        print(f"Augmentation for class {class_name}: {num_images_to_generate} image")

        class_folder = os.path.join(train_path, class_name)
        generated_count = 0

        for image_file in class_images:
            if generated_count >= num_images_to_generate:
                break

            image_path = os.path.join(class_folder, image_file)
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread signals an unreadable file with None.
                print(f"Skipping unreadable image: {image_path}")
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, size)
            batch = img.reshape((1,) + img.shape)

            stem = os.path.splitext(image_file)[0]
            for variation in range(variations_per_image):
                if generated_count >= num_images_to_generate:
                    break
                # Drawing one batch makes Keras write one file. The prefix is
                # unique per source image and variation, so two outputs cannot
                # overwrite each other through the random suffix Keras appends.
                next(datagen.flow(
                    batch,
                    batch_size=1,
                    save_to_dir=class_folder,
                    save_prefix=f'aug_{stem}_{variation}',
                    save_format='jpg'
                ))
                generated_count += 1

        print(f"Augmentation for {class_name} Finish, total image: {class_image_count + generated_count}")

    print("Data balanced!")

In [None]:
# At most 2 augmented copies are made per source image.
augment_and_balance_data(train_path, 2)

In [None]:
# Recount after augmenting the training classes.
dataset_class_counts = count_images_in_class(dataset_folder)

In [None]:
# Show the per-class counts of every split after augmentation.
print("Number of image each class for each split:")
for split in ['train', 'val', 'test']:
    print(f"\nNumber of picture in folder {split}:")
    print(dataset_class_counts[split])

##### === Data balancing more classes ===

In [14]:
def distribute_data(dataset_path, max_train=3588, max_val=813):
    """Cap the per-class size of train and val by cascading the excess.

    For each class folder in ``dataset_path/train``: images beyond
    ``max_train`` are moved to ``val/<class>/``; then images beyond
    ``max_val`` in ``val/<class>/`` are moved to ``test/<class>/``.

    The val folder of the class is now resolved for every class. Previously it
    was only set when train exceeded ``max_train``, so a class under the cap
    either raised ``NameError`` or reused the previous class's val folder and
    moved that class's images into the wrong test folder.

    Args:
        dataset_path: Dataset root containing ``train`` (and usually ``val``).
        max_train: Maximum number of images kept per class in train.
        max_val: Maximum number of images kept per class in val.

    Returns:
        None.

    NOTE(review): if train already contains augmented copies, moving them to
    val/test puts near-duplicates of training images into the evaluation
    splits — confirm this is intended.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    train_folder_path = os.path.join(dataset_path, 'train')
    for class_name in os.listdir(train_folder_path):
        class_folder_path = os.path.join(train_folder_path, class_name)
        if not os.path.isdir(class_folder_path):
            continue

        # Always derived from the current class, independent of the train check.
        val_class_folder_path = os.path.join(dataset_path, 'val', class_name)

        train_images = [f for f in os.listdir(class_folder_path) if f.lower().endswith(image_exts)]
        train_count = len(train_images)

        if train_count > max_train:
            excess_count = train_count - max_train
            print(f"Memindahkan {excess_count} gambar dari '{class_name}' di train ke val.")

            os.makedirs(val_class_folder_path, exist_ok=True)

            for image in train_images[:excess_count]:
                shutil.move(os.path.join(class_folder_path, image), os.path.join(val_class_folder_path, image))

        # A class may have no val folder at all; treat that as zero images.
        if os.path.isdir(val_class_folder_path):
            val_images = [f for f in os.listdir(val_class_folder_path) if f.lower().endswith(image_exts)]
        else:
            val_images = []
        val_count = len(val_images)

        if val_count > max_val:
            excess_count = val_count - max_val
            print(f"Memindahkan {excess_count} gambar dari '{class_name}' di val ke test.")

            test_class_folder_path = os.path.join(dataset_path, 'test', class_name)
            os.makedirs(test_class_folder_path, exist_ok=True)

            for image in val_images[:excess_count]:
                shutil.move(os.path.join(val_class_folder_path, image), os.path.join(test_class_folder_path, image))

    print("Distribusi data selesai.")

In [15]:
# Uses the default caps max_train=3588 and max_val=813 per class.
distribute_data(dataset_folder)

Memindahkan 752 gambar dari 'PP' di train ke val.
Memindahkan 115 gambar dari 'PP' di val ke test.
Memindahkan 690 gambar dari 'PS' di train ke val.
Memindahkan 50 gambar dari 'PS' di val ke test.
Distribusi data selesai.


##### === Shuffle dataset ===

In [None]:
def shuffle_dataset(dataset_path):
    """Move the images of every class folder through a ``temp`` folder and back.

    For each existing split (``train``, ``val``, ``test``) and each class
    folder in it, the image files are listed, the list is shuffled, the files
    are moved into ``<class>/temp`` in that order and then moved back, after
    which ``temp`` is removed. A missing split is reported with a message.

    NOTE(review): file names and locations are unchanged when this returns, so
    whether any observable order changes depends on the filesystem — confirm
    this step is needed.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    for split in ('train', 'val', 'test'):
        split_dir = os.path.join(dataset_path, split)
        if not os.path.exists(split_dir):
            print(f"Folder '{split}' not found at dataset.")
            continue

        for class_name in os.listdir(split_dir):
            class_dir = os.path.join(split_dir, class_name)
            if not os.path.isdir(class_dir):
                continue

            picked = []
            for entry in os.listdir(class_dir):
                if entry.lower().endswith(image_exts):
                    picked.append(entry)
            random.shuffle(picked)

            staging_dir = os.path.join(class_dir, 'temp')
            os.makedirs(staging_dir, exist_ok=True)

            # Out, in shuffled order ...
            for entry in picked:
                shutil.move(os.path.join(class_dir, entry), os.path.join(staging_dir, entry))

            # ... and back, in whatever order the staging folder lists them.
            for entry in os.listdir(staging_dir):
                shutil.move(os.path.join(staging_dir, entry), os.path.join(class_dir, entry))

            os.rmdir(staging_dir)

            print(f"Dataset in '{split}' has been randomized for class '{class_name}'.")

In [None]:
# NOTE(review): no random seed is set before this call, so the shuffle order differs between runs.
shuffle_dataset(dataset_folder)