# Modelling and Evaluation

## Objectives

* Answer the business requirement: the stakeholder wishes to predict whether a lemon is of good or poor quality from a photograph.

## Inputs

* lemon-quality-dataset/lemon_dataset/train
* lemon-quality-dataset/lemon_dataset/validation
* lemon-quality-dataset/lemon_dataset/test
* Image shape embeddings

## Outputs

* Class indices for datasets
* TensorFlow machine learning model
* Hyperparameter optimisation pipeline
* Model trained on best hyperparameters
* Learning curve plot to evaluate model performance
* Model evaluation
* Confusion Matrix for Test Set




---

## Import Packages

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread

## Change working directory

* We are assuming you will store the notebooks in a subfolder; therefore, when running the notebook in the editor, you will need to change the working directory.

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
current_dir = os.getcwd()
current_dir

Make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
current_dir = os.getcwd()
current_dir

### Set Input Directories

In [None]:
my_data_dir = 'inputs/lemon-quality-dataset/lemon_dataset'
train_path = my_data_dir + '/train'
val_path = my_data_dir + '/validation'
test_path = my_data_dir + '/test'

### Set output directory

In [None]:
version = 'v3'
file_path = f'outputs/{version}'

# The version folder is treated as present only when an 'outputs' entry exists
# in the working directory AND it already lists this version.
has_outputs = 'outputs' in os.listdir(current_dir)
if not (has_outputs and version in os.listdir(current_dir + '/outputs')):
  # Nothing there yet: create outputs/<version> (and 'outputs' itself if needed)
  os.makedirs(name=file_path)
else:
  print('Old version is already available create a new version.')

Set labels

In [None]:
labels = os.listdir(train_path)

print(
    f"Project Labels: {labels}"
    )

Set image shape

In [None]:
import joblib
# NOTE(review): duplicates the `version` value assigned in the output-directory
# cell; the two must be kept in sync by hand.
version = 'v3'
# Load the object pickled at outputs/<version>/image_shape.pkl. Its first two
# entries are used as `target_size` by the data generators further down.
# joblib.load unpickles the file, so only load files produced by this project.
image_shape = joblib.load(filename=f"outputs/{version}/image_shape.pkl")
image_shape

## Number of images in train, validation, and test sets

In [None]:
def frequency_plots(folders: list):
  """Print and plot how many images each label has in each dataset folder.

  For every folder in `folders` and every entry of the module-level `labels`,
  the files under `my_data_dir/<folder>/<label>` are counted. The per-folder
  counts and the per-label totals are printed, then drawn as a grouped bar
  chart and a pie chart that is saved to `<file_path>/labels_distribution.png`
  and shown.

  Args:
    folders: names of the sub-folders of `my_data_dir` to inspect.

  Returns:
    None. The function is called for its printed output and figure.
  """
  # Collect plain records and build the frame once: DataFrame.append was
  # removed in pandas 2.0 and copied the whole frame on every call before that.
  records = []
  for folder in folders:
    for label in labels:
      n_images = len(os.listdir(my_data_dir + '/' + folder + '/' + label))
      records.append({'Set': folder, 'Label': label, 'Frequency': n_images})
      print(f"* {folder} - {label}: {n_images} images")
  # Explicit columns keep the frame well-formed even when nothing was counted
  df_freq = pd.DataFrame(records, columns=['Set', 'Label', 'Frequency'])
  print("\n")
  print(df_freq)

  # Total per label across all folders; a boolean mask avoids building a
  # query string from the label text.
  totals = [
    {'Label': label,
     'Total': df_freq.loc[df_freq['Label'] == label, 'Frequency'].sum()}
    for label in labels
  ]
  df_total_freq = pd.DataFrame(totals, columns=['Label', 'Total'])
  print(df_total_freq)

  sns.set_style("whitegrid")
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize =(12, 5))

  sns.barplot(data=df_freq, x='Set', y='Frequency', hue='Label', ax=ax1)
  ax2.pie(df_total_freq['Total'], labels=df_total_freq['Label'], autopct='%.1f%%')
  plt.savefig(f'{file_path}/labels_distribution.png', bbox_inches='tight', dpi=150)
  plt.show()

In [None]:
# frequency_plots has no return statement, so freq_plots is bound to None;
# the call is made for its printed counts and the saved/shown figure.
freq_plots = frequency_plots(folders = ['train', 'validation', 'test'])

## Image Data Augmentation

### Image Data Generator

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

Initialise image data generator and define parameters

In [None]:
augmented_image_data = ImageDataGenerator(rotation_range=20,
                                   width_shift_range=0.10, 
                                   height_shift_range=0.10,
                                   shear_range=0.1,
                                   zoom_range=0.1,
                                   horizontal_flip=True,
                                   vertical_flip=True,
                                   fill_mode='nearest',
                                   rescale=1./255
                              )

Augment training image dataset

In [None]:
batch_size = 20 
train_set = augmented_image_data.flow_from_directory(train_path,
                                              target_size=image_shape[:2],
                                              color_mode='rgb',
                                              batch_size=batch_size,
                                              class_mode='binary',
                                              shuffle=True
                                              )

train_set.class_indices

Load validation image dataset (rescaled only, no augmentation)

In [None]:
validation_set = ImageDataGenerator(rescale=1./255).flow_from_directory(val_path,
                                                          target_size=image_shape[:2],
                                                          color_mode='rgb',
                                                          batch_size=batch_size,
                                                          class_mode='binary',
                                                          shuffle=False
                                                          )

validation_set.class_indices

Load test image dataset (rescaled only, no augmentation)

In [None]:
test_set = ImageDataGenerator(rescale=1./255).flow_from_directory(test_path,
                                                    target_size=image_shape[:2],
                                                    color_mode='rgb',
                                                    batch_size=batch_size,
                                                    class_mode='binary',
                                                    shuffle=False
                                                    )

test_set.class_indices

---

## Plot Augmented training images

In [None]:
# Draw three batches from the training generator and show the first image of each
for _ in range(3):
    # Built-in next() works across Keras versions; the iterator's own
    # .next() method is absent from newer Keras releases.
    img, label = next(train_set)
    print(img.shape)  # shape of the whole batch, not of a single image
    plt.imshow(img[0])
    plt.show()

Plot validation images (rescaled only, not augmented)

In [None]:
# Draw three batches from the validation generator and show the first image of each
for _ in range(3):
    # Built-in next() works across Keras versions; the iterator's own
    # .next() method is absent from newer Keras releases.
    img, label = next(validation_set)
    print(img.shape)  # shape of the whole batch, not of a single image
    plt.imshow(img[0])
    plt.show()

Plot test set images (rescaled only, not augmented)

In [None]:
# Draw three batches from the test generator and show the first image of each
for _ in range(3):
    # Built-in next() works across Keras versions; the iterator's own
    # .next() method is absent from newer Keras releases.
    img, label = next(test_set)
    print(img.shape)  # shape of the whole batch, not of a single image
    plt.imshow(img[0])
    plt.show()

Save class indices

In [None]:
joblib.dump(value=train_set.class_indices ,
            filename=f"{file_path}/class_indices.pkl")

---

## Model Creation

Import layers for model creation and tuner for hyperparameter optimisation

In [None]:
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, Conv2D, MaxPooling2D



Define hypermodel for hypertuning

In [None]:
def model_builder(hp):
    """Build and compile the CNN hypermodel used by the tuner.

    Three Conv2D + MaxPooling2D stages feed a Flatten layer, one tunable
    Dense layer, a Dropout(0.5) layer and a single sigmoid output unit.
    The input shape is taken from the module-level `image_shape`.

    Args:
        hp: hyperparameter object supplied by the tuner. Two values are
            sampled from it: 'units' (64 to 512 in steps of 64) and
            'learning_rate' (1e-3 or 1e-4).

    Returns:
        The Sequential model, compiled with binary cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    model = Sequential()

    # Only the first layer declares the input shape; later layers infer
    # theirs from the previous layer's output.
    model.add(Conv2D(filters=32, kernel_size=(3, 3), input_shape=image_shape, activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())
    # Tunable width of the fully connected layer
    hp_units = hp.Int('units', min_value=64, max_value=512, step=64)
    model.add(Dense(units=hp_units, activation='relu'))  # vary activation hyperparameter for future models

    model.add(Dropout(0.5))  # vary amount for future models
    # One sigmoid unit: a single probability, paired with binary cross-entropy below
    model.add(Dense(1, activation='sigmoid'))

    # Tunable learning rate for the optimizer
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4])
    opt = keras.optimizers.Adam(learning_rate=hp_learning_rate)

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    return model

Initialise the tuner for hyperparameter tuning - Hyperband chosen as tuner for processing speed

In [None]:
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory=file_path,
                     project_name='hypertuning'
                     )

Import early stopping

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss',patience=3)

Perform tuner search

In [None]:
# Run the hyperparameter search: trials are trained on train_set and
# validated on validation_set, with early stopping on val_loss.
# steps_per_epoch uses floor division, so the count excludes any final
# partial batch.
# NOTE(review): the tuner is created with max_epochs=10 while epochs=25 is
# passed here; Hyperband appears to assign each trial's epoch budget itself,
# so this value may have no effect — confirm against the KerasTuner docs.
tuner.search(train_set, 
epochs=25, 
steps_per_epoch = len(train_set.classes) // batch_size, 
validation_data=validation_set, 
callbacks=[early_stop], 
verbose=2)

# Keep only the single best hyperparameter set found by the search
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

Create model using defined best hyperparameters

In [None]:
model = tuner.hypermodel.build(best_hps)

In [None]:
model.summary()

Fit model with best hyperparameters to train set

In [None]:
model.fit(train_set,
          epochs=25,
          steps_per_epoch = len(train_set.classes) // batch_size,
          validation_data=validation_set,
          callbacks=[early_stop],
          verbose=1
          )

Save model

In [None]:
model.save(f'outputs/{version}/lemon_quality_model.h5')

### Model Performance

Model learning curve

In [None]:
# Tabulate the per-epoch metrics recorded on `model` by its most recent fit
losses = pd.DataFrame(model.history.history)

sns.set_style("whitegrid")

# (columns to draw, plot title, output file name) for each learning curve
curves = (
    (['loss', 'val_loss'], "Loss", 'model_training_losses.png'),
    (['accuracy', 'val_accuracy'], "Accuracy", 'model_training_acc.png'),
)
for position, (columns, title, filename) in enumerate(curves):
    if position > 0:
        # Blank separator between consecutive figures
        print("\n")
    losses[columns].plot(style='.-')
    plt.title(title)
    plt.savefig(f'{file_path}/{filename}', bbox_inches='tight', dpi=150)
    plt.show()

### Model Evaluation

Load saved model

In [None]:
# Import from tensorflow.keras, like every other Keras import in this notebook,
# so the model is saved and reloaded by the same Keras implementation.
from tensorflow.keras.models import load_model
model = load_model(f'outputs/{version}/lemon_quality_model.h5')

Evaluate saved model on test set

In [None]:
evaluation = model.evaluate(test_set)

Save evaluation pickle

In [None]:
joblib.dump(value=evaluation ,
            filename=f"outputs/{version}/evaluation.pkl")

## Confusion Matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid outputs, one row per test image, in the generator's (unshuffled) order
pred = model.predict(test_set)
# Round each probability to a 0/1 class index and flatten to a 1-D vector
y_pred = np.round(pred).astype(int).ravel()
# NOTE(review): assumes class index 0 is the bad-quality class and 1 the
# good-quality class — confirm against test_set.class_indices.
target_names = ['Bad Quality', 'Good Quality']
# Stored under a different name so the imported confusion_matrix function is
# not overwritten and the cell can be re-run.
cm = confusion_matrix(test_set.classes, y_pred)
plt.figure(figsize=(7,6))
sns.heatmap(cm,
            annot=True,
            fmt='g',
            cmap='Blues',
            xticklabels=['Predicted Bad Quality', 'Predicted Good Quality'],
            yticklabels=[' Actual Bad Quality', 'Actual Good Quality'],
)
plt.savefig(f'{file_path}/confusion_matrix.png', bbox_inches='tight', dpi=150)
plt.show()
print(classification_report(test_set.classes, y_pred, target_names=target_names))

## Predict on new data

Load a sample image from the test set (selected by a fixed index, not at random)

In [None]:
from tensorflow.keras.preprocessing import image

pointer = 49  # position of the image within the folder listing
label = labels[0] # select bad or good quality

# List the label folder once so the index and the path use the same listing
label_dir = test_path + '/' + label
image_files = os.listdir(label_dir)

# target_size takes (height, width) only, matching the image_shape[:2] passed
# to the data generators.
pil_image = image.load_img(label_dir + '/' + image_files[pointer],
                          target_size=image_shape[:2], color_mode='rgb')
print(f'Image shape: {pil_image.size}, Image mode: {pil_image.mode}')
pil_image

Convert image to array and prepare for prediction

In [None]:
my_image = image.img_to_array(pil_image)
my_image = np.expand_dims(my_image, axis=0)/255
print(my_image.shape)

Predict class probabilities

In [None]:
# Single sigmoid output for the one image in the batch
pred_proba = model.predict(my_image)[0,0]

# Invert class_indices (name -> index) into index -> name
target_map = {index: name for name, index in train_set.class_indices.items()}

# Above 0.5 selects index 1, otherwise index 0
predicted_index = int(pred_proba > 0.5)
pred_class = target_map[predicted_index]

# When index 0 is predicted, report 1 - p so the printed value is the
# probability of the predicted class.
if pred_class == target_map[0]:
    pred_proba = 1 - pred_proba

print(pred_proba)
print(pred_class)