## Import Libraries

Since we will download a dataset from Kaggle, we have to provide our Kaggle API token (`kaggle.json`). You can create and download the token in your Kaggle account settings.

In [None]:
# Upload the Kaggle API token (kaggle.json) from the local machine into the Colab runtime.
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns {filename: file content}, and echoing that dict would
# print the token's contents (the API key) into the saved notebook output.
uploaded = files.upload()

In [None]:
!pip install -q tensorflow tensorflow-datasets

In [None]:
!pip install mlflow

In [1]:
import os, sys 
sys.path.append(os.path.dirname(os.path.realpath('/Users/paulosgidyelew/Desktop/cassava-classification-capstone/src')))
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D, Flatten, InputLayer, Dense, Dropout, BatchNormalization, Conv2D, Activation, MaxPooling2D
from tensorflow.keras.optimizers import RMSprop, Adam, SGD, Adagrad
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.losses import SparseCategoricalCrossentropy

from keras.preprocessing.image import ImageDataGenerator
import tensorflow_hub as hub

import warnings
import mlflow

from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, fbeta_score
from sklearn.utils import class_weight

import itertools, cv2
# user defined module
from src import confusion_matrix

# MLflow parameters: experiment name and tracking server used by the logging cells.
EXPERIMENT_NAME = "Classava_capstone"
TRACKING_URI = "https://hudsju377cddpoevnjdkfnvpwovniewnipcdsnkvn.mlflow.neuefische.de"

# NOTE: this suppresses every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Seed both random number generators the pipeline draws from: TensorFlow's, and
# NumPy's global generator, which ImageDataGenerator uses for shuffling and for
# the random augmentations. Seeding TensorFlow alone leaves the data pipeline
# non-reproducible.
RSEED = 42
np.random.seed(RSEED)
tf.random.set_seed(RSEED)



## Simple Convolutional Neural Network with balanced data... have to include that here. 


Now we want to use the first model again, but this time with balanced data. We chose a simple convolutional model in order to get a first glance at the results. We want to use this model as a low benchmark that we want to beat with the more complex models that we will use afterwards. We used the following tutorial as a guideline for the construction of the network: <a href="https://www.youtube.com/watch?v=cAICT4Al5Ow&t=334s">https://www.youtube.com/watch?v=cAICT4Al5Ow&t=334s</a>


First we will set up MLflow to keep track of our experiments.

In [None]:
# Connect to the MLflow tracking server, select the experiment and open a run.
# start_run() hands back the run it has just made active, so it is kept directly.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
run = mlflow.start_run(run_name='First, simple convolutional model')

Then we will create the architecture of the model. Here we are building three convolutional layers, followed by one hidden dense layer and the softmax output layer.

In [None]:
# Simple CNN baseline: three Conv2D + max-pooling stages on 380x380 RGB input,
# followed by a small dense head with dropout and a 5-class softmax output.
#
# The kernel size is written as a tuple on purpose: the third positional
# argument of tf.keras Conv2D is `strides`, so `Conv2D(64, 3, 3)` would build a
# 3x3 kernel that moves with stride 3 instead of a plain 3x3 convolution.
model = Sequential()
model.add(Conv2D(64, (3, 3), input_shape=(380, 380, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Classification head: flatten the feature maps, one hidden layer, dropout
# against overfitting, then one probability per class.
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))



In [None]:
# Configure training: Adam optimiser, categorical cross-entropy loss (the
# generators below deliver labels with class_mode='categorical'), accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)
# Print the layer-by-layer overview of the network.
model.summary()

The ImageDataGenerator is used to produce the train and validation sets.

In [None]:
# Training generator: rescales pixel values to [0, 1] and applies random
# augmentation (rotation, shear, zoom, flips). 20% of the images are reserved
# for validation.
image_data_generator = ImageDataGenerator(rescale=1./255,
                                          rotation_range=90,
                                          shear_range=0.2,
                                          zoom_range=0.2,
                                          horizontal_flip=True,
                                          vertical_flip=True,
                                          validation_split=0.2)

# Validation generator: same rescaling and the same validation_split, so it
# selects the images the training subset leaves out, but WITHOUT random
# augmentation. Validation metrics and the predictions used for the confusion
# matrix should be computed on the untouched images.
val_data_generator = ImageDataGenerator(rescale=1./255,
                                        validation_split=0.2)

train_set = image_data_generator.flow_from_directory('/content/train',
                                                     subset='training',
                                                     target_size=(380, 380),
                                                     class_mode='categorical',
                                                     batch_size=32,
                                                     shuffle=True,
                                                     interpolation='nearest',
                                                     color_mode="rgb",
                                                     )
# shuffle=False keeps the batches in file order, so val_set.classes lines up
# with the rows returned by model.predict(val_set).
val_set = val_data_generator.flow_from_directory('/content/train',
                                                 subset='validation',
                                                 target_size=(380, 380),
                                                 class_mode='categorical',
                                                 batch_size=32,
                                                 shuffle=False,
                                                 interpolation='nearest',
                                                 color_mode="rgb"
                                                 )

We can look at the pictures and labels of one batch of the validation set:

In [None]:
# Inspect one batch of the validation set.
# The first index selects the batch, the second index selects either the
# images (0) or the labels (1) of that batch. Here: the labels of batch 1.
val_set[1][1]

In [None]:
# The images of the same batch (batch 1, element 0 of the batch).
val_set[1][0]

We can check out one instance of our set and its corresponding label:

In [None]:
# Show image 30 of validation batch 1 and print the label belonging to it.
plt.imshow(val_set[1][0][30])
print (val_set[1][1][30])

In [None]:
#the amount of batches in the train set are:
# Number of batches in the training set; the fit below uses the same value as
# steps_per_epoch.
len(train_set)

We can compute class weights from the class frequencies of the training set and pass them to the fit, to balance out the training.

In [None]:
from collections import Counter

# Weight every class relative to the most frequent one: the largest class gets
# weight 1.0, rarer classes get proportionally larger weights.
counter = Counter(train_set.classes)
max_val = float(max(counter.values()))

class_weights = {}
for class_id, num_images in counter.items():
    class_weights[class_id] = max_val / num_images

In [None]:
# Alternative via scikit-learn (returns an array of weights ordered like `classes`, not a dict):
#class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(train_set.classes), y=train_set.classes)

In [None]:
# Checkpointing: whenever the monitored val_loss improves, the weights (not the
# full model) are written to the checkpoint path.
model_checkpoint_filepath = 'checkpoints/simple_conv_model_balanced.ckpt'
model_check_point = ModelCheckpoint(
    filepath=model_checkpoint_filepath,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# train_set and val_set deliver images together with their labels, so no
# separate label argument is needed. class_weights rebalances the loss.
fit_options = dict(
    epochs=10,
    steps_per_epoch=len(train_set),
    validation_data=val_set,
    validation_steps=len(val_set),
    class_weight=class_weights,
    callbacks=[model_check_point],
    verbose=1,
)
history = model.fit(train_set, **fit_options)

Let us plot the training-process.

In [None]:
# Accuracy per epoch: training curve first, validation curve second (the
# legend entries below follow that order).
for accuracy_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[accuracy_key])

plt.title('Model Accuracy')
plt.xlabel('epoch')
plt.ylabel('Accuracy')
plt.legend(['training', 'validation'], loc='lower right')
plt.show()

In [None]:
# Loss per epoch: training loss first, validation loss second (matching the
# legend entries).
for loss_key in ('loss', 'val_loss'):
    plt.plot(history.history[loss_key])

plt.title('Loss Function')
plt.xlabel('epoch')
plt.ylabel('Loss')
plt.legend(['loss', 'val_loss'], loc='upper right')
plt.show()

### Construction of the confusion matrix

In [None]:
#we can use model.predict to predict the validation set and argmax gives us the the highest number for each element
results = model.predict(val_set)
results = np.argmax(results, axis=1)

In [None]:
# Per-class precision / recall / F1: true labels come from the validation
# generator, predictions from the argmax above.
report = classification_report(y_true=val_set.classes, y_pred=results)
print(report)

In [None]:
# In this notebook the bare name `confusion_matrix` refers to the user-defined
# plotting module (`from src import confusion_matrix`), which shadows the
# scikit-learn function of the same name. Calling it directly would raise
# "TypeError: 'module' object is not callable", so the scikit-learn function
# is imported under an alias to compute the matrix.
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

cm = sk_confusion_matrix(val_set.classes, results)

# Plot with the helper from the user-defined module.
# NOTE(review): the title says 'Pre-trained' although this chapter trains the
# simple convolutional model; confirm the intended title.
confusion_matrix.plot_confusion_matrix(
    cm, classes=['CBB', 'CBSD', 'CGM', 'CMD', 'Healthy'],
    title='Pre-trained'
)

Calculation of the F2 score (description can be found in the simple model chapter)

In [None]:
#Due to imbalance in our dataset we have to use 'macro' for averaging
F2_score = fbeta_score(val_set.classes,results, average='macro', beta=2)
print(F2_score)

Now let us save the parameters of the model to MLflow:

In [None]:
#These are the parameters that will be transfered to MlFlow for logging our experiments

#Find meaningful parameters!
# Parameters sent to MLflow for this run.
# TODO: find more meaningful parameters.
params = {
    # Taken from the training history rather than hard-coded, so the logged
    # value always matches the number of epochs that were actually run.
    "number of epochs": len(history.history['loss']),
    # Shape of a single image of the first validation batch.
    "input_shape": val_set[0][0][0].shape,
    "confusion matrix": cm,
}

In [None]:
# Send parameters, a tag and the metrics of this run to MLflow, then close the run.
mlflow.log_params(params)
mlflow.set_tag("colab", "True")

# Last-epoch values from the training history, logged as
# train-accuracy, val-accuracy, train-loss, val-loss.
for metric_name in ("accuracy", "loss"):
    mlflow.log_metric("train-" + metric_name, history.history[metric_name][-1])
    mlflow.log_metric("val-" + metric_name, history.history["val_" + metric_name][-1])
mlflow.log_metric("F2-score", F2_score)

# The model itself is not logged: according to the original authors this does
# not work without an AWS connection set up (too complex for now), but it is
# possible when MLflow runs locally.
mlflow.end_run()

Using the weighted classes for the fit did not deliver good results. Oversampling of the data would be needed.