In [None]:
# Importing Libraries for Image Classification Pipeline
#
# This cell imports all necessary libraries and modules for:
# - Loading, processing, and augmenting images
# - Building a Convolutional Neural Network (CNN) using Keras
# - Preprocessing labels for multi-class or multi-label classification
# - Splitting the dataset into training and test sets
# - Visualizing results

import numpy as np
import pickle  
import cv2 
from os import listdir 

# Preprocessing and encoding
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer  

# Keras deep learning imports
from keras.models import Sequential  
from keras.layers.normalization import BatchNormalization 
from keras.layers.convolutional import Conv2D, MaxPooling2D 
from keras.layers.core import Activation, Flatten, Dropout, Dense 
from keras import backend as K  

# Keras utilities for preprocessing images
from keras.preprocessing.image import ImageDataGenerator 
from keras.optimizers import Adam  
from keras.preprocessing import image   
from keras.preprocessing.image import img_to_array 

# Scikit-learn utilities
from sklearn.model_selection import train_test_split

# Matplotlib for plotting
import matplotlib.pyplot as plt 

In [None]:
# Configuration for model training and image processing.
#
# Everything a reader might want to tune lives in this cell:
# training hyperparameters, image geometry, and the dataset location.

# --- Training hyperparameters -------------------------------------------

# int: number of epochs (full passes over the training data).
EPOCHS = 25

# float: initial learning rate handed to the Adam optimizer.
INIT_LR = 1e-3

# int: batch size, i.e. images processed per weight update.
BS = 32

# --- Image geometry ------------------------------------------------------

# tuple: default image dimensions as a 2-tuple.
default_image_size = (256, 256)

# int: placeholder; a later cell overwrites it with the number of loaded images.
image_size = 0

# int: target image width in pixels.
width = 256

# int: target image height in pixels.
height = 256

# int: number of colour channels per image (3 for RGB).
depth = 3

# --- Dataset location ----------------------------------------------------

# str: root directory that contains the image dataset.
directory_root = '../input/plantvillage/'
  
  



In [None]:
# Load images and their labels from the dataset directory tree.
#
# The code walks three levels below `directory_root`:
#   <directory_root>/<plant folder>/<disease folder>/<image file>
# The disease folder name is used as the label of every image inside it.
#
# NOTE(review): `convert_image_to_array` is not defined in any cell visible
# here -- confirm it is defined before this cell runs. If it is missing, the
# `except` below only prints a NameError and both lists stay empty.
image_list, label_list = [], []

try:
    print("[INFO] Loading images ...")

    # Build filtered copies rather than calling .remove() on a list while
    # iterating over it (mutating during iteration skips the next entry).
    root_dir = [entry for entry in listdir(directory_root) if entry != ".DS_Store"]

    # Loop through each plant category folder
    for plant_folder in root_dir:
        plant_disease_folder_list = [
            entry
            for entry in listdir(f"{directory_root}/{plant_folder}")
            if entry != ".DS_Store"
        ]

        # Loop through each plant disease folder
        for plant_disease_folder in plant_disease_folder_list:
            print(f"[INFO] Processing {plant_disease_folder} ...")

            plant_disease_image_list = [
                entry
                for entry in listdir(f"{directory_root}/{plant_folder}/{plant_disease_folder}/")
                if entry != ".DS_Store"
            ]

            # Only the first 200 directory entries per disease folder are
            # considered; the JPG filter is applied after this cap.
            # The loop variable is deliberately NOT called `image`: that name
            # is the `keras.preprocessing.image` module imported at the top of
            # the notebook, and rebinding it here would clobber the module.
            for image_name in plant_disease_image_list[:200]:
                image_directory = f"{directory_root}/{plant_folder}/{plant_disease_folder}/{image_name}"

                # Only process JPG files
                if image_directory.endswith((".jpg", ".JPG")):
                    image_array = convert_image_to_array(image_directory)
                    image_list.append(image_array)

                    # The label is the disease folder name
                    label_list.append(plant_disease_folder)

    print("[INFO] Image loading completed")

except Exception as e:
    # Best-effort loading: report the error and keep whatever was collected
    # so far. Check the output before relying on image_list / label_list.
    print(f"Error : {e}")


In [None]:
# Record how many images were loaded (replaces the placeholder value of image_size).
image_size = len(image_list)

In [None]:
# Encode the string labels as a binary indicator matrix.
# LabelBinarizer learns the set of distinct labels in `label_list` and
# transforms each label into a row of 0/1 indicators.
label_binarizer = LabelBinarizer()
image_labels = label_binarizer.fit_transform(label_list)

# Persist the fitted binarizer so the same label mapping can be reused later
# (e.g. during inference). The `with` block guarantees the file handle is
# flushed and closed even if pickling fails.
with open('label_transform.pkl', 'wb') as label_file:
    pickle.dump(label_binarizer, label_file)

# Number of distinct classes seen in the labels
n_classes = len(label_binarizer.classes_)


In [None]:
# Show the class labels learned by the fitted LabelBinarizer.
print(label_binarizer.classes_)

In [None]:
# Convert Image List to NumPy Array and Normalize
#
# Dividing by 255.0 scales 8-bit pixel intensities (0-255) into [0, 1].
# The divisor was previously 225.0, which let bright pixels exceed 1.0.
# NOTE(review): assumes convert_image_to_array yields 8-bit pixel values and
# equally shaped arrays -- confirm. Any inference code must use the same 255.0.

np_image_list = np.array(image_list, dtype=np.float16) / 255.0

In [None]:
# Train / test split
#
# 20% of the samples are held out for testing; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.

print("[INFO] Spliting data to train, test")
x_train, x_test, y_train, y_test = train_test_split(
    np_image_list,
    image_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Data augmentation
#
# `aug` is an ImageDataGenerator that applies random transformations to the
# images it yields, so the network sees varied versions of the training
# images instead of the exact same pixels every epoch.

augmentation_options = dict(
    rotation_range=25,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest",
)
aug = ImageDataGenerator(**augmentation_options)

In [None]:
# CNN architecture
#
# A Sequential model made of three convolutional stages followed by a dense
# classifier head:
#   stage 1: Conv(32)                -> pool 3x3 -> dropout
#   stage 2: Conv(64)  -> Conv(64)   -> pool 2x2 -> dropout
#   stage 3: Conv(128) -> Conv(128)  -> pool 2x2 -> dropout
#   head   : Flatten -> Dense(1024) -> dropout -> Dense(n_classes) -> softmax
# Every convolution / dense layer is followed by ReLU and batch normalization.

model = Sequential()

# Input shape and batch-norm channel axis depend on the backend's data layout.
if K.image_data_format() == "channels_first":
    inputShape, chanDim = (depth, height, width), 1
else:
    inputShape, chanDim = (height, width, depth), -1

cnn_layers = [
    # Stage 1
    Conv2D(32, (3, 3), padding="same", input_shape=inputShape),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    MaxPooling2D(pool_size=(3, 3)),
    Dropout(0.25),
    # Stage 2
    Conv2D(64, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    Conv2D(64, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 3
    Conv2D(128, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    Conv2D(128, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Classifier head
    Flatten(),
    Dense(1024),
    Activation("relu"),
    BatchNormalization(),
    Dropout(0.5),
    Dense(n_classes),
    Activation("softmax"),
]

# Layers are added in list order, which is the order they are stacked.
for cnn_layer in cnn_layers:
    model.add(cnn_layer)

In [None]:
# Print a layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Adam optimizer starting at INIT_LR, with a decay of INIT_LR / EPOCHS.
opt = Adam(lr=INIT_LR, decay=INIT_LR / EPOCHS)

# The network ends in a softmax over n_classes and the labels carry one
# indicator column per class, so this is single-label multi-class
# classification: categorical cross-entropy is the matching loss.
# binary_crossentropy (used previously) scores every output unit as an
# independent yes/no decision and makes Keras report binary accuracy for
# "accuracy", which overstates how often the predicted class is correct.
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

# Training itself happens in the next cell
print("[INFO] training network...")

In [None]:
# Train the CNN on augmented batches
#
# `aug.flow` yields batches of BS augmented training images; the held-out
# test split is passed as validation data. The returned `history` object
# holds the per-epoch training and validation metrics.

train_batches = aug.flow(x_train, y_train, batch_size=BS)
batches_per_epoch = len(x_train) // BS

history = model.fit_generator(
    train_batches,
    steps_per_epoch=batches_per_epoch,
    epochs=EPOCHS,
    validation_data=(x_test, y_test),
    verbose=1,
)

In [None]:
# Plot the per-epoch training curves recorded in `history`.
recorded = history.history

# Keras releases differ in how they name the accuracy metric in the history
# dict ('acc' in older releases, 'accuracy' in newer ones); use whichever
# key is present instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in recorded else 'accuracy'

acc = recorded[acc_key]
val_acc = recorded[f'val_{acc_key}']
loss = recorded['loss']
val_loss = recorded['val_loss']
epochs = range(1, len(acc) + 1)

# Train and validation accuracy
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
plt.title('Training and Validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Start a second figure so the loss curves do not overlay the accuracy curves
plt.figure()

# Train and validation loss
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Evaluate the trained model on the held-out test split.
# `scores` holds the loss first, then the metrics passed to compile().
print("[INFO] Calculating model accuracy")
scores = model.evaluate(x_test, y_test)
test_accuracy_percent = scores[1]*100
print(f"Test Accuracy: {test_accuracy_percent}")