# CISC5800: Machine Learning Final Project
### *Creating a Convolutional Neural Network using a Plant Leaf Diseases Training Dataset from Kaggle*
## By: Ryan Cruise, SJ

### Part 8: Creating a Stratified K-Fold CNN

Unfortunately, I forgot to save any intermediary metrics before beginning the model. And since the model took about 14 hours to run, I will not be running it again. Nonetheless, this model has the best and most consistent performance of any model I used. I was able to save the models and will include them in the GitHub.

In [57]:
import os

#silencing warnings from tensorflow re: gpu use
os.environ["GRPC_VERBOSITY"] = "ERROR"
os.environ["GLOG_minloglevel"] = "2"
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from pathlib import Path
from PIL import Image
import glob
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

#verify local GPU being used by Tensorflow
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
#directory path of parent folder of all plant images
plant_dir = "/home/ryan/plant_data/plant-diseases-training-dataset"
parent = Path(plant_dir)

#list of names of each folder (class)
classes = [entry.name for entry in parent.iterdir() if entry.is_dir()] 
class_onehot = [x for x in range(71)]

health_dir = []

#initialize empty class df
df_class = pd.DataFrame({'class': classes, 'y':class_onehot, 'plant': pd.NA, 'health': ['unhealthy'for x in range(len(classes))], 'count':np.nan})

#creates df of classes and thier counts
for cls in classes:
    #assigns class plant, health, and class count to each class
    p=Path(f'{plant_dir}/{cls}')
    count = sum(1 for entry in p.rglob('*') if entry.is_file())
    df_class.loc[df_class['class'] == cls, 'count'] = count

    #assigns plant type
    split = cls.split('___',1)
    df_class.loc[df_class['class'] == cls, 'plant'] = split[0]
    
    #create dir list to create classes of only <plant>_healthy and <plant>_unhealthy 
    if split[1] == 'healthy':
        health_dir.append(cls)
    elif (split[1] != 'healthy') & (f'{split[0]}___unhealthy' not in health_dir):
        health_dir.append(f'{split[0]}___unhealthy')       

    #assigns health of class
    if split[1] == 'healthy':
        df_class.loc[df_class['class'] == cls, 'health'] = 'healthy'

print(df_class.sort_values(by='count', ascending=False))

                       class   y       plant     health    count
45  Cassava___mosaic_disease  45     Cassava  unhealthy  13158.0
51  Orange___citrus_greening  51      Orange  unhealthy   5507.0
57        Tomato___leaf_curl  57      Tomato  unhealthy   5357.0
49         Soybean___healthy  49     Soybean    healthy   5090.0
63        Rose___slug_sawfly  63        Rose  unhealthy   4979.0
..                       ...  ..         ...        ...      ...
69        Apple___brown_spot  69       Apple  unhealthy    215.0
2       Watermelon___healthy   2  Watermelon    healthy    205.0
21  Coffee___red_spider_mite  21      Coffee  unhealthy    167.0
65  Watermelon___anthracnose  65  Watermelon  unhealthy    155.0
67         Potato___nematode  67      Potato  unhealthy     68.0

[71 rows x 5 columns]


In [114]:
skf_dir = "/home/ryan/plants_skf"
plant_dir = "/home/ryan/plant_data/plant-diseases-training-dataset"
parent = Path(skf_dir)
parent_class = Path(plant_dir)

#list of names of each folder (class)
classes = [entry.name for entry in parent_class.iterdir() if entry.is_dir()]
files = [entry.name for entry in parent.iterdir() if entry.is_file()] 
class_onehot = [x for x in range(71)]

df_onehot = pd.DataFrame({"class":classes, "one_hot":class_onehot})

# Initialize an empty DataFrame with columns
df_class = pd.DataFrame(columns=['file','class','img','onehot'])

for file in files:
    #assigns plant type
    split = file.split('-',1)
    cls = split[0]
    img = split[1]
    onehot = df_onehot.loc[df_onehot["class"] == cls, "one_hot"].values[0]
    df_class.loc[len(df_class)] = {'file':file,'class': cls, 'img': img, 'onehot':onehot}

print(df_class)
    

                                      file                     class  \
0             Tomato___leaf_curl-89828.jpg        Tomato___leaf_curl   
1             Corn___common_rust-50205.jpg        Corn___common_rust   
2               Grape___healthy-105584.jpg           Grape___healthy   
3          Tomato___early_blight-80856.jpg     Tomato___early_blight   
4           Sugercane___mosaic-106479.jpeg        Sugercane___mosaic   
...                                    ...                       ...   
116142        Grape___black_rot-110744.jpg         Grape___black_rot   
116143                Rose___rust-7365.jpg               Rose___rust   
116144     Potato___early_blight-67452.jpg     Potato___early_blight   
116145              Rose___healthy-123.jpg            Rose___healthy   
116146  Cassava___mosaic_disease-25259.jpg  Cassava___mosaic_disease   

                img  onehot  
0         89828.jpg      57  
1         50205.jpg      39  
2        105584.jpg      20  
3         80856

In [115]:
idg = ImageDataGenerator(
    width_shift_range=0.1, 
    height_shift_range=0.1, 
    zoom_range=0.3, 
    fill_mode='nearest', 
    horizontal_flip=True)

In [116]:
y  = np.array(df_class['class'])
nsamp = y.shape[0]
print(y)
print(nsamp)

['Tomato___leaf_curl' 'Corn___common_rust' 'Grape___healthy' ...
 'Potato___early_blight' 'Rose___healthy' 'Cassava___mosaic_disease']
116147


In [117]:
print(len(np.unique(y)))

71


In [133]:
def custom_cnn(input_shape, nclasses):

    inputs = keras.Input(shape=input_shape)

    #Rescale pixels to [-1,1]
    x = layers.Rescaling(1./127.5, offset=-1.0)(inputs)

    #Stack 1 - convolution round 1, maxpooling
    x = layers.Conv2D(32, (3,3), padding="same", activation="relu")(x)
    x = layers.Conv2D(32, (3,3), padding="same", activation="relu")(x)
    x = layers.MaxPooling2D((2,2))(x)

    #Stack 2 - convolution round 2, maxpooling
    x = layers.Conv2D(64, (3,3), padding="same", activation="relu")(x)
    x = layers.Conv2D(64, (3,3), padding="same", activation="relu")(x)
    x = layers.MaxPooling2D((2,2))(x)

     #Stack 3 - Convolution round 3, maxpooling
    x = layers.Conv2D(128, (3,3), padding="same", activation="relu")(x)
    x = layers.Conv2D(128, (3,3), padding="same", activation="relu")(x)
    x = layers.Conv2D(128, (3,3), padding="same", activation="relu")(x)
    x = layers.MaxPooling2D((2,2))(x)

    #reduces dimensionality
    x = layers.GlobalAveragePooling2D()(x)

    #drop 50% neurons to prevent overfitting before sending to FC layer
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(128, activation="relu")(x) #the only FC layer outside of the output layer
    x = layers.Dropout(0.5)(x) #drop 50% neurons to prevent overfitting before sending to output layer

    #output layer - softmax function
    outputs = layers.Dense(nclasses, activation="softmax")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)

    return model

In [135]:
y  = np.array(df_class['class'])
nsamp = y.shape[0]

skf_save="/home/ryan/skf_save"

y_mod = []
ypred_mod = []
val_loss = []
val_acc = []

input_shape = (256,256,3)

skf = StratifiedKFold(n_splits=5, shuffle=True)

for i, (tr_idx, ts_idx) in enumerate(skf.split(np.zeros(nsamp),y)):

    print(f"========================FOLD {i}=========================")
    
    #load tr and ts indices
    tr_data = df_class.iloc[tr_idx]
    ts_data = df_class.iloc[ts_idx]

    #load training and testing images
    tr_dg = idg.flow_from_dataframe(tr_data, directory=skf_dir, x_col="file", y_col="class", 
                                    class_mode="categorical", shuffle=True, batch_size=64, target_size=(256,256))
    
    ts_dg = idg.flow_from_dataframe(ts_data, directory=skf_dir, x_col="file", y_col="class", 
                                    class_mode="categorical",shuffle=False, batch_size=64, target_size=(256,256))

    #create new instance of model
    model = cnn_model(input_shape, len(np.unique(y)))

    #compile model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="categorical_crossentropy",
        metrics=["accuracy"])

    #create callbacks
    chkpoints = [tf.keras.callbacks.ModelCheckpoint(f"{skf_save}/model_{i}.keras", monitor='val_loss', verbose=1, 
                                                    save_best_only=True, mode='min')]

    #Early Stopping for efficiency
    earlystop = [keras.callbacks.EarlyStopping(
            monitor = "val_loss",
            patience = 3,
            restore_best_weights=True)]

    callbacks = chkpoints + earlystop
    
    history = model.fit(tr_dg, steps_per_epoch=len(tr_dg), validation_data=ts_dg, validation_steps=len(ts_dg), epochs=20, callbacks=callbacks)

    pred = model.evaluate(ts_dg)
    pred = dict(zip(model.metrics_names, pred))
    val_acc.append(pred['compile_metrics'])
    val_loss.append(pred['loss'])

    print(val_acc)
    print(val_loss)
    tf.keras.backend.clear_session()    

Found 92917 validated image filenames belonging to 71 classes.
Found 23230 validated image filenames belonging to 71 classes.
Epoch 1/20
[1m1452/1452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step - accuracy: 0.2019 - loss: 3.2517
Epoch 1: val_loss improved from None to 1.92138, saving model to /home/ryan/skf_save/model_0.keras
[1m1452/1452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m497s[0m 339ms/step - accuracy: 0.2783 - loss: 2.7928 - val_accuracy: 0.4521 - val_loss: 1.9214
Epoch 2/20
[1m1452/1452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step - accuracy: 0.4120 - loss: 2.0638
Epoch 2: val_loss improved from 1.92138 to 1.44925, saving model to /home/ryan/skf_save/model_0.keras
[1m1452/1452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m480s[0m 330ms/step - accuracy: 0.4396 - loss: 1.9359 - val_accuracy: 0.5550 - val_loss: 1.4492
Epoch 3/20
[1m1452/1452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step - accuracy: 0.5126 - 

In [136]:
print(val_acc)
print(val_loss)

[0.8826087117195129, 0.8818338513374329, 0.8760170340538025, 0.8801498413085938, 0.8810538649559021]
[0.34899380803108215, 0.3553321957588196, 0.37334582209587097, 0.3736894130706787, 0.3564313054084778]
