# Data hasil Koleksi dan Scraping

In [1]:
import os
import random
import shutil
import cv2
import splitfolders
import pandas as pd
from PIL import Image
from pillow_heif import register_heif_opener
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

##### === Path Configuration ===

In [2]:
# Root folder that holds every dataset used by this notebook.
base_folder = r"D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\AllData"

# Uniform (width, height) every image is resized to.
final_size = (640, 640)

# Raw collected/scraped images, one subfolder per class.
input_folder = os.path.join(base_folder, 'raw_data')

# Output of the train/val/test split.
dataset_folder = os.path.join(base_folder, 'Dataset')

# Training split, and the scratch folder for augmented images beneath it.
train_path = os.path.join(dataset_folder, 'train')
augmen_path = os.path.join(train_path, 'Augmen')

##### === Resize dan Rename ===

In [None]:
def resize_and_rename(input_folder, size, new_name_suffix):
    """Resize every image in each class subfolder and rename it in place.

    Each ``.jpg``/``.jpeg``/``.png`` file found directly inside a subfolder of
    ``input_folder`` is resized to ``size`` and stored as
    ``{class_folder}{new_name_suffix}{idx}.jpg``, where ``idx`` is the 1-based
    position of the file in the folder listing. The original file is removed
    once its resized copy has been written.

    Args:
        input_folder: Folder whose immediate subfolders are class folders.
        size: Target ``(width, height)`` passed to ``cv2.resize``.
        new_name_suffix: Text placed between the class name and the index.

    Files that cannot be read or written are reported and left untouched.
    """
    print("Memproses gambar...")
    image_exts = ('.jpg', '.jpeg', '.png')

    for class_folder in os.listdir(input_folder):
        class_folder_path = os.path.join(input_folder, class_folder)
        if not os.path.isdir(class_folder_path):
            continue

        # (staging_path, final_path) pairs. Outputs are first written under a
        # staging name so a new file can never overwrite a source image that
        # has not been read yet, and so a file whose new name equals its old
        # name is not deleted right after being written.
        staged = []

        # os.listdir returns a snapshot, so staging files created below are
        # not picked up by this loop.
        for idx, file in enumerate(os.listdir(class_folder_path), start=1):
            if not file.lower().endswith(image_exts):
                continue

            img_path = os.path.join(class_folder_path, file)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                print(f"Gagal membaca gambar, dilewati: {img_path}")
                continue

            resized_img = cv2.resize(img, size)
            new_name = f"{class_folder}{new_name_suffix}{idx}.jpg"
            staging_path = os.path.join(class_folder_path, f"__staging__{new_name}")

            if not cv2.imwrite(staging_path, resized_img):
                # Keep the original when the resized copy could not be saved.
                print(f"Gagal menyimpan gambar, dilewati: {img_path}")
                continue

            os.remove(img_path)
            staged.append((staging_path, os.path.join(class_folder_path, new_name)))

        # All sources are gone now, so the final names are free to use.
        for staging_path, final_path in staged:
            os.replace(staging_path, final_path)

    print(f"Semua gambar telah diproses dan disimpan ke {input_folder}.")


In [17]:
resize_and_rename(input_folder, final_size, new_name_suffix="_")

Memproses gambar...
Semua gambar telah diproses dan disimpan ke D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\DataDummy\raw_data.


##### === Split Dataset ===

In [18]:
def split_dataset(input_folder, dataset_folder, ratio):
    """Split a class-per-folder dataset into train/val/test with splitfolders.

    Args:
        input_folder: Folder containing one subfolder per class.
        dataset_folder: Destination folder for the split output.
        ratio: Tuple of fractions forwarded unchanged to ``splitfolders.ratio``
            (the notebook calls this with ``(0.7, 0.3, 0)``).
    """
    # Fixed seed so the same files land in the same split on every run.
    split_options = {
        'output': dataset_folder,
        'seed': 42,
        'ratio': ratio,
        'group_prefix': None,
    }
    splitfolders.ratio(input_folder, **split_options)
    print(f"Dataset berhasil dibagi ke dalam train, valid, dan test di {dataset_folder}.")

In [19]:
split_dataset(input_folder, dataset_folder, (0.7, 0.3, 0))

Copying files: 650 files [00:03, 165.33 files/s]


Dataset berhasil dibagi ke dalam train, valid, dan test di D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\DataDummy\Dataset.


##### === Augmentasi Data ===

In [20]:
def augment_data(train_path, augmen_path, size, augmentation_count=5):
    """Write randomly augmented copies of every training image.

    For each image under ``train_path`` this saves ``augmentation_count``
    augmented variants to ``augmen_path/<class>/<class>_aug_<i>_<k>.jpg``.

    Args:
        train_path: Training folder with one subfolder per class.
        augmen_path: Folder that receives the augmented images. It may live
            inside ``train_path``; it is then excluded from the class list.
        size: Target size handed to ``flow_from_directory``.
        augmentation_count: Number of variants written per source image.
    """
    datagen = ImageDataGenerator(
        rotation_range=40,
        vertical_flip=True,
        horizontal_flip=True,
        fill_mode='constant',
        cval=0
    )
    # Images are loaded through a generator with no augmentation so each saved
    # variant receives exactly one random transform (the one applied below),
    # keeping rotations within the configured rotation_range.
    loader = ImageDataGenerator()

    # List the class folders explicitly. Without this, an augmentation folder
    # left inside train_path by an earlier run would be treated as a class.
    augmen_key = os.path.normcase(os.path.abspath(augmen_path))
    class_dirs = sorted(
        entry for entry in os.listdir(train_path)
        if os.path.isdir(os.path.join(train_path, entry))
        and os.path.normcase(os.path.abspath(os.path.join(train_path, entry))) != augmen_key
    )

    train_generator = loader.flow_from_directory(
        train_path,
        target_size=size,
        classes=class_dirs,
        batch_size=1,
        class_mode='categorical',
        shuffle=False
    )
    os.makedirs(augmen_path, exist_ok=True)
    class_names = train_generator.class_indices
    for class_name in class_names:
        os.makedirs(os.path.join(augmen_path, class_name), exist_ok=True)

    # Invert the name -> index mapping once instead of searching it per image.
    index_to_class = {index: name for name, index in class_names.items()}

    total_images = train_generator.samples
    for i in range(total_images):
        images, labels = next(train_generator)
        class_name = index_to_class[int(labels[0].argmax())]
        for aug_num in range(augmentation_count):
            augmented_image = datagen.random_transform(images[0])
            save_path = os.path.join(augmen_path, class_name, f"{class_name}_aug_{i}_{aug_num}.jpg")
            tf.keras.preprocessing.image.save_img(save_path, augmented_image)
    print(f"Hasil augmentasi disimpan di {augmen_path}.")

In [21]:
augment_data(train_path, augmen_path, final_size)

Found 454 images belonging to 2 classes.
Hasil augmentasi disimpan di D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\DataDummy\Dataset\train\Augmen.


##### === Transfer Augmented Data ===

In [22]:
def move_images_to_train(train_path, augmen_path):
    """Move augmented images into the matching class folders of the train set.

    Every file in ``augmen_path/<class>`` is moved to ``train_path/<class>``.
    Class folders that end up empty are removed, and ``augmen_path`` itself is
    removed once it is empty.

    Args:
        train_path: Training folder with one subfolder per class.
        augmen_path: Folder holding augmented images grouped by class.
    """
    # A previous successful run removes augmen_path; treat that as "nothing to
    # do" so the cell can be re-run safely.
    if not os.path.isdir(augmen_path):
        print(f"Folder augmentasi tidak ditemukan: {augmen_path}")
        return

    for class_name in os.listdir(augmen_path):
        augmen_class_path = os.path.join(augmen_path, class_name)
        # Ignore stray files: only directories are class folders.
        if not os.path.isdir(augmen_class_path):
            continue

        train_class_path = os.path.join(train_path, class_name)
        os.makedirs(train_class_path, exist_ok=True)
        for filename in os.listdir(augmen_class_path):
            shutil.move(os.path.join(augmen_class_path, filename), os.path.join(train_class_path, filename))
        if not os.listdir(augmen_class_path):
            os.rmdir(augmen_class_path)

    if not os.listdir(augmen_path):
        os.rmdir(augmen_path)
    print("Augmented data berhasil dipindahkan ke folder train.")

In [23]:
move_images_to_train(train_path, augmen_path)

Augmented data berhasil dipindahkan ke folder train.


# Data from Roboflow

##### === Path Dataset and Folder Output ===

In [None]:

roboflow_folder = r"D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\AllData\RFdataset"

##### === Preprocessing Roboflow Dataset ===

In [None]:
def preprocess_rfdataset(input_folder, output_folder, size):
    """Sort a Roboflow export into class folders using its annotation CSVs.

    For each split (``train``, ``val``, ``test``) that has an
    ``_annotations.csv`` file, every annotated image is resized to ``size``
    and written to ``output_folder/<split>/<class>/<class>_RF_<row>.jpg``,
    where ``<row>`` is the 1-based CSV row number. The source images are left
    in place; the annotation file is deleted after the split is processed.

    Args:
        input_folder: Roboflow export folder containing the split folders.
        output_folder: Dataset folder that receives the class-sorted images.
        size: Target ``(width, height)`` passed to ``cv2.resize``.
    """
    split_folders = ['train', 'val', 'test']
    for split in split_folders:
        split_path = os.path.join(input_folder, split)
        output_path = os.path.join(output_folder, split)
        csv_path = os.path.join(split_path, "_annotations.csv")

        if not os.path.exists(csv_path):
            print(f"No annotation file for {split}. Skipping.")
            continue

        os.makedirs(output_path, exist_ok=True)
        df = pd.read_csv(csv_path)

        # An image may appear on several rows with the same class; copy each
        # (filename, class) pair only once.
        seen = set()

        for idx, row in df.iterrows():
            img_name = row['filename']
            category = str(row['class'])
            if (img_name, category) in seen:
                continue
            seen.add((img_name, category))

            source_img_path = os.path.join(split_path, img_name)
            if not os.path.exists(source_img_path):
                continue

            img = cv2.imread(source_img_path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                print(f"Could not read {source_img_path}. Skipping.")
                continue

            # Create the class folder only when there is something to store,
            # so no empty class folders appear in the dataset.
            category_path = os.path.join(output_path, category)
            os.makedirs(category_path, exist_ok=True)

            new_name = f"{category}_RF_{idx + 1}.jpg"
            dest_img_path = os.path.join(category_path, new_name)
            cv2.imwrite(dest_img_path, cv2.resize(img, size))

        os.remove(csv_path)
        print(f"Processed {split}. Annotations removed.")

In [27]:
# Run the Roboflow preprocessing, writing results into the shared dataset folder.
preprocess_rfdataset(roboflow_folder, dataset_folder, final_size)

No annotation file for train. Skipping.
No annotation file for val. Skipping.
No annotation file for test. Skipping.


# Data From Kaggle

##### === Konfigurasi Path ===

In [None]:
kaggle_folder = r"D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\AllData\KaggleDataset"
subfolders = ["PET", "HDPE"]  # Subfolders inside the dataset. NOTE(review): not referenced by any cell visible here - confirm it is still needed.

##### === Preprocessing: Resize, Rename, dan Hapus Data Asli ===

In [None]:
resize_and_rename(kaggle_folder, final_size, new_name_suffix="_k_")

##### === Membagi Dataset ===

In [None]:
split_dataset(kaggle_folder, dataset_folder, (0.7, 0.3, 0))

# Additional Data

##### === Convert from HEIC to JPG ===

In [None]:
# Folder used as both input and output for the additional dataset.
datasetcina_folder = r"D:\1 Main File\Project File\Capstone Bangkit\MyOwnTry\AllData\DatasetCina"

In [None]:
register_heif_opener()

In [None]:
def convert_heic_to_jpg(datasetcina_folder):
    """Convert every HEIC image under a folder to JPEG, in place.

    The folder is searched recursively, so images stored in class subfolders
    are converted as well as images at the top level. Each ``name.heic``
    becomes ``name.jpg`` in the same directory and the HEIC file is removed.
    A HEIC file whose JPEG already exists is left untouched.

    Args:
        datasetcina_folder: Root folder to scan for ``.heic`` files.

    Raises:
        OSError: If ``datasetcina_folder`` (or a folder below it) cannot be
            listed, e.g. because it does not exist.
    """
    def _fail(error):
        # os.walk ignores listing errors by default; surface them instead.
        raise error

    for current_dir, _subdirs, file_names in os.walk(datasetcina_folder, onerror=_fail):
        for file_name in file_names:
            # Only HEIC files are converted.
            if not file_name.lower().endswith(".heic"):
                continue

            file_path = os.path.join(current_dir, file_name)
            output_file_name = f"{os.path.splitext(file_name)[0]}.jpg"
            output_file_path = os.path.join(current_dir, output_file_name)

            if os.path.exists(output_file_path):
                print(f"File JPG sudah ada, melewati: {output_file_name}")
                continue

            try:
                # The context manager closes the source file before it is
                # deleted below.
                with Image.open(file_path) as img:
                    # JPEG cannot store alpha or palette modes.
                    rgb_img = img if img.mode == "RGB" else img.convert("RGB")
                    rgb_img.save(output_file_path, "JPEG")

                os.remove(file_path)
                print(f"Berhasil mengonversi: {file_name} -> {output_file_name}")
            except Exception as e:
                print(f"Error memproses {file_name}: {e}")

In [None]:
convert_heic_to_jpg(datasetcina_folder)

In [None]:
resize_and_rename(datasetcina_folder, final_size, new_name_suffix="_c_")

# Data Balancing

##### === Data Count ===

In [8]:
def count_images_in_class(dataset_path):
    """Count the images per class in the train, val and test splits.

    Args:
        dataset_path: Folder expected to contain ``train``, ``val`` and
            ``test`` subfolders, each with one subfolder per class.

    Returns:
        A dict ``{'train': {...}, 'val': {...}, 'test': {...}}`` mapping each
        class folder name to its number of ``.jpg``/``.jpeg``/``.png`` files.
        A split whose folder is missing is reported and left empty.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    split_names = ('train', 'val', 'test')
    counts = {name: {} for name in split_names}

    for split in split_names:
        split_dir = os.path.join(dataset_path, split)
        if not os.path.exists(split_dir):
            print(f"Folder '{split}' tidak ditemukan di dataset.")
            continue

        for entry in os.listdir(split_dir):
            entry_path = os.path.join(split_dir, entry)
            # Only directories are class folders.
            if not os.path.isdir(entry_path):
                continue
            counts[split][entry] = sum(
                1 for name in os.listdir(entry_path)
                if name.lower().endswith(image_exts)
            )

    return counts


In [9]:
dataset_class_counts = count_images_in_class(dataset_folder)

In [10]:
print("Jumlah gambar per kelas untuk masing-masing split:")
for split in ['train', 'val', 'test']:
    print(f"\nJumlah gambar di folder {split}:")
    print(dataset_class_counts[split])

Jumlah gambar per kelas untuk masing-masing split:

Jumlah gambar di folder train:
{'HDPE': 3588, 'PET': 3588}

Jumlah gambar di folder val:
{'HDPE': 813, 'PET': 813}

Jumlah gambar di folder test:
{'HDPE': 75, 'PET': 98}


##### === Balancing val ===

In [None]:
def balance_val_dataset(dataset_path, datasetcina_folder):
    """Top up the smaller validation classes, then send the rest to train.

    Classes in ``dataset_path/val`` that have fewer images than the largest
    validation class receive images moved from the matching class folder in
    ``datasetcina_folder``. Every image still left in ``datasetcina_folder``
    afterwards is moved to the matching class folder in ``dataset_path/train``.

    Args:
        dataset_path: Dataset folder containing the ``train`` and ``val`` splits.
        datasetcina_folder: Extra dataset with one subfolder per class.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    # Image counts per class in the validation split.
    val_counts = count_images_in_class(dataset_path)['val']

    val_folder = os.path.join(dataset_path, 'val')
    train_folder = os.path.join(dataset_path, 'train')

    # With no validation classes there is nothing to balance (and max() would
    # raise); the remaining images are still moved to train below.
    if val_counts:
        max_count = max(val_counts.values())

        for class_name, count in val_counts.items():
            difference = max_count - count
            if difference <= 0:
                continue

            source_class_folder = os.path.join(datasetcina_folder, class_name)
            if not os.path.isdir(source_class_folder):
                print(f"Folder sumber untuk kelas '{class_name}' tidak ditemukan, dilewati.")
                continue

            target_class_folder_val = os.path.join(val_folder, class_name)
            os.makedirs(target_class_folder_val, exist_ok=True)

            # Sorted so the same files are picked on every run.
            files_to_move = sorted(
                f for f in os.listdir(source_class_folder)
                if f.lower().endswith(image_exts)
            )[:difference]
            for file_name in files_to_move:
                shutil.move(os.path.join(source_class_folder, file_name), os.path.join(target_class_folder_val, file_name))

    # Move whatever is left in the extra dataset to train.
    for class_name in os.listdir(datasetcina_folder):
        source_class_folder = os.path.join(datasetcina_folder, class_name)
        # Ignore stray files: only directories are class folders.
        if not os.path.isdir(source_class_folder):
            continue

        target_class_folder_train = os.path.join(train_folder, class_name)
        os.makedirs(target_class_folder_train, exist_ok=True)

        files_to_move = [f for f in os.listdir(source_class_folder) if f.lower().endswith(image_exts)]
        for file_name in files_to_move:
            shutil.move(os.path.join(source_class_folder, file_name), os.path.join(target_class_folder_train, file_name))

In [None]:
balance_val_dataset(dataset_folder, datasetcina_folder)

In [None]:
dataset_class_counts = count_images_in_class(dataset_folder)

In [None]:
print("Jumlah gambar per kelas untuk masing-masing split:")
for split in ['train', 'val', 'test']:
    print(f"\nJumlah gambar di folder {split}:")
    print(dataset_class_counts[split])

In [None]:
def augment_and_balance_data(train_path, variations_per_image, size=(640, 640)):
    """Grow the smaller training classes with augmented images.

    Every class folder under ``train_path`` with fewer images than the largest
    class receives augmented copies of its own images, saved into the same
    folder, until it matches the largest class or every source image has
    produced ``variations_per_image`` variants, whichever comes first.

    Args:
        train_path: Training folder with one subfolder per class.
        variations_per_image: Maximum augmented variants per source image.
        size: ``(width, height)`` each source image is resized to before
            augmentation. Defaults to ``(640, 640)``.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    datagen = ImageDataGenerator(
        rotation_range=27,
        vertical_flip=True,
        horizontal_flip=True,
        fill_mode='constant',
        cval=0
    )

    # Count the images in every class folder.
    class_counts = {}
    for class_name in os.listdir(train_path):
        class_folder = os.path.join(train_path, class_name)
        if os.path.isdir(class_folder):
            class_counts[class_name] = sum(
                1 for f in os.listdir(class_folder) if f.lower().endswith(image_exts)
            )

    if not class_counts:
        print(f"Tidak ada folder kelas di {train_path}.")
        return

    # The largest class sets the target for all the others.
    target_class_count = max(class_counts.values())

    for class_name, class_image_count in class_counts.items():
        if class_image_count >= target_class_count:
            continue

        num_images_to_generate = target_class_count - class_image_count
        print(f"Augmentasi untuk kelas {class_name}: {num_images_to_generate} gambar")

        class_folder = os.path.join(train_path, class_name)
        class_images = [f for f in os.listdir(class_folder) if f.lower().endswith(image_exts)]
        generated_count = 0

        for image_file in class_images:
            if generated_count >= num_images_to_generate:
                break

            image_path = os.path.join(class_folder, image_file)
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                print(f"Gagal membaca gambar, dilewati: {image_path}")
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, size).astype('float32')

            stem = os.path.splitext(image_file)[0]
            for variation in range(variations_per_image):
                if generated_count >= num_images_to_generate:
                    break

                # Files are saved under explicit, unused names. Letting
                # flow(save_to_dir=...) choose names relies on a short random
                # suffix, so two outputs can share a name and overwrite each
                # other while still being counted as generated.
                serial = variation
                save_path = os.path.join(class_folder, f"aug_{stem}_{serial}.jpg")
                while os.path.exists(save_path):
                    serial += 1
                    save_path = os.path.join(class_folder, f"aug_{stem}_{serial}.jpg")

                augmented = datagen.random_transform(img)
                tf.keras.preprocessing.image.save_img(save_path, augmented)
                generated_count += 1

        # Report the count actually reached, which can be below the target
        # when there are too few source images.
        print(f"Augmentasi untuk {class_name} selesai, total gambar: {class_image_count + generated_count}")

    print("Augmentasi selesai untuk semua kelas.")

In [None]:
augment_and_balance_data(train_path, 2)

In [7]:
dataset_class_counts = count_images_in_class(dataset_folder)

NameError: name 'count_images_in_class' is not defined

In [None]:
print("Jumlah gambar per kelas untuk masing-masing split:")
for split in ['train', 'val', 'test']:
    print(f"\nJumlah gambar di folder {split}:")
    print(dataset_class_counts[split])

##### === Shuffle dataset ===

In [5]:
def shuffle_dataset(dataset_path):
    """Round-trip the images of every class folder through a temp folder.

    For each class folder in the ``train``, ``val`` and ``test`` splits, the
    image files are moved in random order into a ``temp`` subfolder, moved
    back, and the ``temp`` folder is removed.

    NOTE(review): files return to the same folder under their original names,
    so this does not appear to change what ``os.listdir`` reports - confirm
    the intended effect.

    Args:
        dataset_path: Dataset folder containing the train, val and test
            subfolders.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    for split in ('train', 'val', 'test'):
        split_dir = os.path.join(dataset_path, split)
        if not os.path.exists(split_dir):
            print(f"Folder '{split}' tidak ditemukan di dataset.")
            continue

        for class_name in os.listdir(split_dir):
            class_dir = os.path.join(split_dir, class_name)
            if not os.path.isdir(class_dir):
                continue

            # Image files of this class, in random order.
            shuffled_names = [
                name for name in os.listdir(class_dir)
                if name.lower().endswith(image_exts)
            ]
            random.shuffle(shuffled_names)

            # Park the images in a temporary folder ...
            holding_dir = os.path.join(class_dir, 'temp')
            os.makedirs(holding_dir, exist_ok=True)
            for name in shuffled_names:
                shutil.move(os.path.join(class_dir, name), os.path.join(holding_dir, name))

            # ... then bring everything in it back and drop the folder.
            for name in os.listdir(holding_dir):
                shutil.move(os.path.join(holding_dir, name), os.path.join(class_dir, name))
            os.rmdir(holding_dir)

            print(f"Dataset di folder '{split}' telah diacak untuk kelas '{class_name}'.")

In [6]:
shuffle_dataset(dataset_folder)

Dataset di folder 'train' telah diacak untuk kelas 'HDPE'.
Dataset di folder 'train' telah diacak untuk kelas 'PET'.
Dataset di folder 'val' telah diacak untuk kelas 'HDPE'.
Dataset di folder 'val' telah diacak untuk kelas 'PET'.
Dataset di folder 'test' telah diacak untuk kelas 'HDPE'.
Dataset di folder 'test' telah diacak untuk kelas 'PET'.
