In [None]:
# libraries
import os
import shutil
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from PIL import UnidentifiedImageError

### Preprocessing

In [None]:
# dataset paths
# Root folder of the source dataset; later cells join it with a class name to
# reach that class's image folder.
original_dataset_path = '/kaggle/input/plant-disease-classification-merged-dataset'
# Root under which the train/val/test split directories are created.
output_path = '/kaggle/working/filtered_dataset'
# Root under which offline-augmented images are written, one sub-folder per class.
output_augmented_path = '/kaggle/working/augmented_data'
# Created up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_augmented_path, exist_ok=True)

# Classes selected for classification. Each entry is a class folder name of the
# form "<Crop>__<condition>" and is joined onto the dataset root in later cells.
selected_crops = [
    # Apple
    "Apple__black_rot",
    "Apple__healthy",
    "Apple__rust",
    "Apple__scab",
    # Cassava
    "Cassava__bacterial_blight",
    "Cassava__brown_streak_disease",
    "Cassava__green_mottle",
    "Cassava__healthy",
    "Cassava__mosaic_disease",
    # Grape
    "Grape__black_measles",
    "Grape__black_rot",
    "Grape__healthy",
    "Grape__leaf_blight_(isariopsis_leaf_spot)",
    # Potato
    "Potato__early_blight",
    "Potato__healthy",
    "Potato__late_blight",
    # Rice
    "Rice__brown_spot",
    "Rice__healthy",
    "Rice__hispa",
    "Rice__leaf_blast",
    "Rice__neck_blast",
    # Sugarcane
    "Sugarcane__bacterial_blight",
    "Sugarcane__healthy",
    "Sugarcane__red_rot",
    "Sugarcane__red_stripe",
    "Sugarcane__rust",
    # Tea
    "Tea__algal_leaf",
    "Tea__anthracnose",
    "Tea__bird_eye_spot",
    "Tea__brown_blight",
    "Tea__healthy",
    "Tea__red_leaf_spot",
    # Wheat
    "Wheat__brown_rust",
    "Wheat__healthy",
    "Wheat__septoria",
    "Wheat__yellow_rust",
    # Tomato
    "Tomato__bacterial_spot",
    "Tomato__early_blight",
    "Tomato__healthy",
    "Tomato__late_blight",
    "Tomato__leaf_mold",
    "Tomato__mosaic_virus",
    "Tomato__septoria_leaf_spot",
    "Tomato__spider_mites_(two_spotted_spider_mite)",
    "Tomato__target_spot",
    "Tomato__yellow_leaf_curl_virus",
]

# Maps each English class folder name to its Bahasa Indonesia label.
# The split step uses the Indonesian label as the class directory name.
label_mapping = {
    # Apple
    "Apple__black_rot": "apel_busuk_hitam",
    "Apple__healthy": "apel_sehat",
    "Apple__rust": "apel_karat",
    "Apple__scab": "apel_keropos",
    # Cassava
    "Cassava__bacterial_blight": "singkong_hawar_bakteri",
    "Cassava__brown_streak_disease": "singkong_virus_garis_coklat",
    "Cassava__green_mottle": "singkong_bintik_hijau",
    "Cassava__healthy": "singkong_sehat",
    "Cassava__mosaic_disease": "singkong_penyakit_mosaik",
    # Grape
    "Grape__black_measles": "anggur_campak_hitam",
    "Grape__black_rot": "anggur_busuk_hitam",
    "Grape__healthy": "anggur_sehat",
    "Grape__leaf_blight_(isariopsis_leaf_spot)": "anggur_hawar_daun",
    # Potato
    "Potato__early_blight": "kentang_hawar_awal",
    "Potato__healthy": "kentang_sehat",
    "Potato__late_blight": "kentang_hawar_akhir",
    # Rice
    "Rice__brown_spot": "padi_bintik_coklat",
    "Rice__healthy": "padi_sehat",
    "Rice__hispa": "padi_hispa",
    "Rice__leaf_blast": "padi_blas_daun",
    "Rice__neck_blast": "padi_blas_leher",
    # Sugarcane
    "Sugarcane__bacterial_blight": "tebu_hawar_bakteri",
    "Sugarcane__healthy": "tebu_sehat",
    "Sugarcane__red_rot": "tebu_busuk_merah",
    "Sugarcane__red_stripe": "tebu_garis_merah",
    "Sugarcane__rust": "tebu_karat_tebu",
    # Tea
    "Tea__algal_leaf": "teh_bercak_daun_algal",
    "Tea__anthracnose": "teh_antraknosa",
    "Tea__bird_eye_spot": "teh_bercak_mata_burung",
    "Tea__brown_blight": "teh_bercak_coklat",
    "Tea__healthy": "teh_sehat",
    "Tea__red_leaf_spot": "teh_bercak_daun_merah",
    # Wheat
    "Wheat__brown_rust": "gandum_karat_daun",
    "Wheat__healthy": "gandum_sehat",
    "Wheat__septoria": "gandum_bercak_septoria",
    "Wheat__yellow_rust": "gandum_karat_garis_kuning",
    # Tomato
    "Tomato__bacterial_spot": "tomat_bercak_bakteri",
    "Tomato__early_blight": "tomat_pembusukan_daun_muda",
    "Tomato__healthy": "tomat_sehat",
    "Tomato__late_blight": "tomat_busuk_daun",
    "Tomato__leaf_mold": "tomat_bercak_daun_oleh_jamur",
    "Tomato__mosaic_virus": "tomat_virus_mosaik",
    "Tomato__septoria_leaf_spot": "tomat_bercak_daun_septoria",
    "Tomato__spider_mites_(two_spotted_spider_mite)": "tomat_tungau_laba_laba",
    "Tomato__target_spot": "tomat_bintik_target",
    "Tomato__yellow_leaf_curl_virus": "tomat_virus_daun_kuning_keriting",
}

#### Data Augmentation

In [None]:
# Per-class image target: classes with fewer valid images than this are topped
# up with augmented copies by the loop further down in this cell.
max_images = 2000

# Random transform settings for the offline augmentation pass.
augmentation_settings = {
    "rotation_range": 30,
    "width_shift_range": 0.2,
    "height_shift_range": 0.2,
    "shear_range": 0.2,
    "zoom_range": 0.2,
    "horizontal_flip": True,
    "fill_mode": 'nearest',
}
augmentor = ImageDataGenerator(**augmentation_settings)

# File extensions accepted as images; matching is case-insensitive.
valid_formats = {".jpg", ".jpeg", ".png"}


def is_valid_format(file_name: str) -> bool:
    """Return True when ``file_name`` ends with one of ``valid_formats``.

    The name is lower-cased first, so ``"LEAF.JPG"`` is accepted.
    """
    lowered_name = file_name.lower()
    # str.endswith accepts a tuple of suffixes and returns True if any matches.
    return lowered_name.endswith(tuple(valid_formats))

# Offline augmentation: for every selected class, keep only the images that can
# be opened and, when the class has fewer than `max_images` of them, write
# randomly transformed 224x224 copies into `output_augmented_path/<class>` until
# the shortfall is covered.
# NOTE(review): in the cells visible here, the split step reads from
# `original_dataset_path` only, so the files written below are not consumed by
# training -- confirm whether that is intended.
for crop in selected_crops:
    crop_path = os.path.join(original_dataset_path, crop)
    # Same guard as the splitting step: report a missing class folder and move
    # on instead of letting os.listdir raise.
    if not os.path.isdir(crop_path):
        print(f"Warning: {crop_path} does not exist.")
        continue

    output_crop_path = os.path.join(output_augmented_path, crop)
    os.makedirs(output_crop_path, exist_ok=True)

    # Sorted so the images picked for augmentation do not depend on the
    # arbitrary order returned by the filesystem.
    images = sorted(os.listdir(crop_path))
    valid_images = []

    # validate images
    for img_name in images:
        img_path = os.path.join(crop_path, img_name)
        if not is_valid_format(img_name):
            print(f"Skipping invalid file: {img_name}")
            continue
        try:
            load_img(img_path)
        except OSError:
            # UnidentifiedImageError is an OSError subclass; catching OSError
            # also covers entries that cannot be read at all (e.g. a directory
            # whose name ends in ".jpg").
            print(f"Skipping corrupted image: {img_name}")
        else:
            valid_images.append(img_name)

    print(f"{len(valid_images)} valid images found for {crop}.")

    # augmentation
    augment_count = max_images - len(valid_images)
    if not valid_images:
        # Nothing to cycle through: the modulo below would divide by zero.
        print(f"No valid images to augment for {crop}.")
    elif augment_count > 0:
        written = 0
        for i in range(augment_count):
            # Cycle through the valid images until the shortfall is covered.
            img_name = valid_images[i % len(valid_images)]
            img_path = os.path.join(crop_path, img_name)
            try:
                img = load_img(img_path, target_size=(224, 224))
            except OSError:
                # Resizing may still fail for a file that passed the open-only
                # check above (e.g. truncated data); skip it rather than abort.
                print(f"Skipping corrupted image: {img_name}")
                continue
            # Add the leading batch axis expected by flow(): (1, H, W, C).
            img_array = np.expand_dims(img_to_array(img), axis=0)
            # Pulling one batch makes flow() write one augmented file. Keras
            # builds the file name from the prefix, the sample index (always 0
            # for a single-image array) and a random number, so a shared prefix
            # lets two outputs collide and overwrite each other; embedding `i`
            # keeps every name distinct within a run.
            next(iter(augmentor.flow(
                img_array,
                batch_size=1,
                save_to_dir=output_crop_path,
                save_prefix=f"{crop}_aug_{i}",
                save_format="jpg",
            )))
            written += 1
        print(f"Augmented {written} images for {crop}.")
    else:
        print(f"No augmentation needed for {crop}.")

    print(f"Augmentation completed for {crop}.")

### Dataset Splitting

In [None]:
# splitting ratios: fraction of each class sent to train / val / test
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Dedicated, seeded RNG so the same images land in the same split on every run
# without touching the global `random` state.
split_seed = 42
split_rng = random.Random(split_seed)

# Recreate the split directories from scratch. Without this, files copied by an
# earlier run stay in place and a later run with a different assignment can
# leave the same image in both train and val/test.
for split in ['train', 'val', 'test']:
    split_dir = os.path.join(output_path, split)
    shutil.rmtree(split_dir, ignore_errors=True)
    os.makedirs(split_dir, exist_ok=True)

# mapping english labels to indonesian labels (classes without a mapping are dropped)
mapped_selected_crops = {crop: label_mapping[crop] for crop in selected_crops if crop in label_mapping}


def _is_usable_image(path):
    """Return True if `path` is a regular file with an accepted extension that can be opened.

    Applies the same extension filter and open check as the augmentation step,
    so files reported there as invalid or corrupted are not copied into the
    training directories.
    """
    if not os.path.isfile(path) or not is_valid_format(os.path.basename(path)):
        return False
    try:
        load_img(path)
    except OSError:  # includes PIL's UnidentifiedImageError
        return False
    return True


def copy_images(img_list, split, src_dir, label):
    """Copy the files named in `img_list` from `src_dir` to `output_path/<split>/<label>`.

    `label` is the Indonesian class label, used as the class directory name.
    The destination directory is created even when `img_list` is empty.
    """
    dest_path = os.path.join(output_path, split, label)
    os.makedirs(dest_path, exist_ok=True)
    for img in img_list:
        shutil.copy(os.path.join(src_dir, img), os.path.join(dest_path, img))


# splitting data for selected crops
for crop, indonesian_label in mapped_selected_crops.items():
    crop_path = os.path.join(original_dataset_path, crop)
    if not os.path.isdir(crop_path):
        print(f"Warning: {crop_path} does not exist.")
        continue

    # Sort before shuffling: os.listdir order is arbitrary, so the seed alone
    # would not make the split reproducible.
    images = [
        name for name in sorted(os.listdir(crop_path))
        if _is_usable_image(os.path.join(crop_path, name))
    ]
    split_rng.shuffle(images)

    train_size = int(len(images) * train_ratio)
    val_size = int(len(images) * val_ratio)

    train_imgs = images[:train_size]
    val_imgs = images[train_size:train_size + val_size]
    # The test split takes the remainder, so rounding never drops an image.
    test_imgs = images[train_size + val_size:]

    copy_images(train_imgs, 'train', crop_path, indonesian_label)
    copy_images(val_imgs, 'val', crop_path, indonesian_label)
    copy_images(test_imgs, 'test', crop_path, indonesian_label)

print("Dataset splitting complete with Indonesian labels.")

### Data Generators

In [None]:
# Training images are rescaled by 1/255 and randomly transformed on the fly.
train_transform_options = dict(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(rescale=1./255, **train_transform_options)

# Validation and test images are only rescaled.
val_test_datagen = ImageDataGenerator(rescale=1./255)

# Split directories produced by the dataset-splitting step.
train_path = os.path.join(output_path, 'train')
val_path = os.path.join(output_path, 'val')
test_path = os.path.join(output_path, 'test')

# Settings shared by all three directory iterators.
shared_flow_options = dict(target_size=(224, 224), batch_size=32, class_mode='categorical')

train_generator = train_datagen.flow_from_directory(train_path, **shared_flow_options)
val_generator = val_test_datagen.flow_from_directory(val_path, **shared_flow_options)
# Shuffling is turned off for the test iterator only.
test_generator = val_test_datagen.flow_from_directory(
    test_path, shuffle=False, **shared_flow_options
)

### Model Definition, Training, and Evaluation

In [None]:
# ResNet50 backbone without its classification top; weights=None means no
# pretrained weights are loaded.
base_model = ResNet50(include_top=False, weights=None, input_shape=(224, 224, 3))

# Classification head: global average pooling, a 256-unit ReLU layer, then one
# softmax unit per class found by the training generator.
num_classes = len(train_generator.class_indices)
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(256, activation='relu')(x)
x = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=x)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Callbacks: both monitor validation loss. Early stopping uses a patience of 5
# epochs; the checkpoint keeps only the best model seen so far.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
best_checkpoint = ModelCheckpoint(
    '/kaggle/working/best_model_2.keras', monitor='val_loss', save_best_only=True
)
callbacks = [early_stopping, best_checkpoint]

# Train for at most 15 epochs, validating on the validation generator.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=15,
    callbacks=callbacks,
)

# Evaluate on the held-out test generator and report accuracy as a percentage.
test_metrics = model.evaluate(test_generator)
test_loss, test_accuracy = test_metrics
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")