# Import needed modules

In [None]:
# Mount Google Drive first: the dataset folders and the saved weight files
# referenced later in this notebook are all addressed under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Backbone names; each one is interpolated into the filename of the
# corresponding saved-weights file when the models are built.
model1_name = 'ConvNeXtTiny'
model2_name = 'MobileNetV2'
model3_name = 'InceptionV3'
model4_name = 'DenseNet121'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import system libs
import os
import time
import shutil
import pathlib
import itertools

# import data handling tools
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# import Deep learning Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras import regularizers

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

print ('modules loaded')

modules loaded


# Create needed functions

#### **Function to create data frame**

In [None]:
# Generate data paths with labels
def define_paths(dir):
    """Collect file paths and class labels from a two-level folder tree.

    The expected layout is ``dir/<fold>/<class>/<file>``. The name of the
    ``<class>`` folder is used as the label of every entry found inside it.

    Directory listings are sorted so the returned order is the same on every
    run (``os.listdir`` gives no ordering guarantee, and the seeded
    train/validation split downstream depends on row order). Non-directory
    entries at the fold or class level are skipped instead of raising
    ``NotADirectoryError``.

    Args:
        dir: Root directory of one dataset split.

    Returns:
        A ``(filepaths, labels)`` tuple of two equal-length lists.
    """
    filepaths = []
    labels = []

    for fold in sorted(os.listdir(dir)):
        foldpath = os.path.join(dir, fold)
        if not os.path.isdir(foldpath):
            continue  # stray file next to the fold folders

        for label in sorted(os.listdir(foldpath)):
            classpath = os.path.join(foldpath, label)
            if not os.path.isdir(classpath):
                continue  # stray file next to the class folders

            for fname in sorted(os.listdir(classpath)):
                filepaths.append(os.path.join(classpath, fname))
                labels.append(label)

    return filepaths, labels


# Concatenate data paths with labels into one dataframe ( to later be fitted into the model )
def define_df(files, classes):
    """Return a two-column dataframe pairing each file path with its label.

    Column names are 'filepaths' and 'labels'; rows follow the order of the inputs.
    """
    columns = [
        pd.Series(files, name='filepaths'),
        pd.Series(classes, name='labels'),
    ]
    return pd.concat(columns, axis=1)


# Function that creates the dataframes for train, validation, and test data
def create_df(data_dir):
    """Build the train, validation and test dataframes.

    ``data_dir[0]`` is scanned and split 85/15 into train and validation
    frames (shuffled, stratified by label, fixed random_state). ``data_dir[1]``
    is scanned and used whole as the test frame.

    Returns:
        ``(train_df, valid_df, test_df)``
    """
    # Everything under the first directory feeds the train/validation split
    full_df = define_df(*define_paths(data_dir[0]))
    train_df, valid_df = train_test_split(
        full_df,
        train_size=0.85,
        shuffle=True,
        random_state=123,
        stratify=full_df['labels'],
    )

    # The second directory is kept apart as the test set
    test_df = define_df(*define_paths(data_dir[1]))

    return train_df, valid_df, test_df

#### Function to generate images from dataframe

In [None]:
def create_model_data(train_df, valid_df, test_df, batch_size):
    '''
    Wrap the train, validation and test dataframes in Keras image data generators.

    Each dataframe needs a 'filepaths' column and a 'labels' column. Images are
    resized to 224x224 and read in 'rgb' colour mode; labels use class_mode
    'categorical'. Only the training generator augments (rotation up to 50
    degrees, horizontal and vertical flips). No pixel rescaling is applied: the
    preprocessing function returns each image unchanged.

    Args:
        train_df, valid_df, test_df: dataframes as produced by create_df.
        batch_size: batch size for the train and validation generators.

    Returns:
        (train_gen, valid_gen, test_gen). The test generator is not shuffled and
        uses the largest batch size <= 80 that divides len(test_df) exactly.

    Raises:
        ValueError: if test_df is empty (no batch size can be derived).
    '''

    # define model parameters
    img_size = (224, 224)
    color = 'rgb'

    # Recommended : use custom function for test data batch size, else we can use normal batch size.
    # Picking a divisor of the test set size means whole batches cover every sample exactly once.
    ts_length = len(test_df)
    if ts_length == 0:
        raise ValueError('test_df is empty; cannot derive a test batch size')
    test_batch_size = max(ts_length // n for n in range(1, ts_length + 1)
                          if ts_length % n == 0 and ts_length // n <= 80)

    # Identity preprocessing hook for the image data generators: it takes the image and returns it unchanged.
    def scalar(img):
        return img

    tr_gen = ImageDataGenerator(
      preprocessing_function= scalar,
      rotation_range=50,
      horizontal_flip=True,
      vertical_flip=True
    )

    # Validation and test images are not augmented
    ts_gen = ImageDataGenerator(preprocessing_function= scalar)

    train_gen = tr_gen.flow_from_dataframe( train_df, x_col= 'filepaths', y_col= 'labels', target_size= img_size, class_mode= 'categorical',
                                        color_mode= color, shuffle= True, batch_size= batch_size)

    valid_gen = ts_gen.flow_from_dataframe( valid_df, x_col= 'filepaths', y_col= 'labels', target_size= img_size, class_mode= 'categorical',
                                        color_mode= color, shuffle= True, batch_size= batch_size)

    # Note: we will use custom test_batch_size, and make shuffle= false
    # so that predictions stay aligned with test_gen.classes.
    test_gen = ts_gen.flow_from_dataframe( test_df, x_col= 'filepaths', y_col= 'labels', target_size= img_size, class_mode= 'categorical',
                                        color_mode= color, shuffle= False, batch_size= test_batch_size)

    return train_gen, valid_gen, test_gen

# **Model Structure**

#### **Start Reading Dataset**

In [None]:
# Dataset roots: [directory split into train/validation, directory used as the test set]
data_dir = ['/content/drive/MyDrive/C-NMC_Leukemia/training_data','/content/drive/MyDrive/C-NMC_Leukemia/validation_data']
batch_size = 40

try:
    # Get splitted data
    train_df, valid_df, test_df = create_df(data_dir)

    # Get Generators
    train_gen, valid_gen, test_gen = create_model_data(train_df, valid_df, test_df, batch_size)

except Exception as err:
    # Show the underlying cause and stop here: every later cell needs
    # train_gen / test_gen, so swallowing the error only turns it into a
    # confusing NameError further down.
    print(f'Invalid Input: {err}')
    raise

Found 6830 validated image filenames belonging to 2 classes.
Found 1205 validated image filenames belonging to 2 classes.
Found 4522 validated image filenames belonging to 2 classes.


#### **Generic Model Creation**

In [None]:
# Create Model Structure
img_size = (224, 224)
channels = 3
img_shape = (img_size[0], img_size[1], channels)
class_count = len(list(train_gen.class_indices.keys())) # to define number of classes in dense layer

# Saved weights are stored per backbone name under this pattern
WEIGHTS_PATH_TEMPLATE = '/content/drive/MyDrive/Models/ALL-final-{}-weights.h5'


def build_classifier(backbone_cls, weights_path, input_shape, num_classes):
    """Build one backbone + classification head, compile it and load saved weights.

    All four models in this notebook share the same head and compile settings;
    only the backbone constructor and the weights file differ.

    Args:
        backbone_cls: a ``tf.keras.applications`` constructor (e.g. ``MobileNetV2``).
        weights_path: path of the ``.h5`` weights file to load into the full model.
        input_shape: ``(height, width, channels)`` of the input images.
        num_classes: number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model with the saved weights loaded.
    """
    base_model = backbone_cls(include_top= False, weights= "imagenet", input_shape= input_shape, pooling= 'max')
    model = Sequential([
        base_model,
        BatchNormalization(axis= -1, momentum= 0.99, epsilon= 0.001),
        Dense(256, kernel_regularizer= regularizers.l2(l= 0.016), activity_regularizer= regularizers.l1(0.006),
                    bias_regularizer= regularizers.l1(0.006), activation= 'relu'),
        Dropout(rate= 0.45, seed= 123),
        Dense(num_classes, activation= 'softmax')
    ])
    model.compile(Adamax(learning_rate= 0.0012), loss= 'categorical_crossentropy', metrics= ['accuracy'])
    model.load_weights(weights_path)
    return model


# MODEL 1
model1 = build_classifier(tf.keras.applications.ConvNeXtTiny, WEIGHTS_PATH_TEMPLATE.format(model1_name), img_shape, class_count)
model1.summary()

# MODEL 2
model2 = build_classifier(tf.keras.applications.MobileNetV2, WEIGHTS_PATH_TEMPLATE.format(model2_name), img_shape, class_count)
model2.summary()

# MODEL 3
model3 = build_classifier(tf.keras.applications.InceptionV3, WEIGHTS_PATH_TEMPLATE.format(model3_name), img_shape, class_count)
model3.summary()

# MODEL 4
model4 = build_classifier(tf.keras.applications.DenseNet121, WEIGHTS_PATH_TEMPLATE.format(model4_name), img_shape, class_count)
model4.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 convnext_tiny (Functional)  (None, 768)               27820128  
                                                                 
 batch_normalization_2 (Batc  (None, 768)              3072      
 hNormalization)                                                 
                                                                 
 dense_4 (Dense)             (None, 256)               196864    
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_5 (Dense)             (None, 2)                 514       
                                                                 
Total params: 28,020,578
Trainable params: 28,019,042
Non-trainable params: 1,536
______________________________________

# **Evaluate model**

# **Get Predictions**

In [None]:
# Class names taken from the test generator's class_indices mapping;
# used as target_names in the classification reports below.
g_dict = test_gen.class_indices
classes = list(g_dict.keys())

# Largest batch size <= 80 that divides the test set exactly, so that
# test_steps whole batches cover every test sample once.
ts_length = len(test_df)
test_batch_size = max(ts_length // n for n in range(1, ts_length + 1)
                      if ts_length % n == 0 and ts_length // n <= 80)
test_steps = ts_length // test_batch_size

#### **Tasks per model:**

1. Make predictions
2. Adjust thresholds and produce classification report for each threshold pair
3. Get best threshold levels for each result metric
4. Display adjusted confusion matrix and metrics with improvement
5. Append the best value of the chosen metric (weighted F1-score) to the scores list

In [None]:
# ---- Threshold sweep settings ----
THRESHOLD_STEP = 0.005     # how far both thresholds move away from 0.5 per sweep step
N_THRESHOLD_STEPS = 81     # steps 0..80 -> class-0 threshold 0.5..0.9, class-1 threshold 0.5..0.1
SELECTION_METRIC = 'w_f1'  # Here we use Weighted F1 score. Change here if you want a different criteria

# (key, label in the "Best ..." report, label in the "Change in ..." report, scorer)
# The "ALL" rows score class index 0 only (pos_label=0); the "weighted" rows average over both classes.
METRIC_SPECS = [
  ('acc',   'accuracy',           'accuracy',           lambda t, p: accuracy_score(t, p)),
  ('pre',   'ALL precision',      'ALL precision',      lambda t, p: precision_score(t, p, pos_label= 0)),
  ('w_pre', 'weighted precision', 'weighted precision', lambda t, p: precision_score(t, p, average='weighted')),
  ('rec',   'ALL recall',         'ALL recall',         lambda t, p: recall_score(t, p, pos_label= 0)),
  ('w_rec', 'weighted recall',    'weighted recall',    lambda t, p: recall_score(t, p, average='weighted')),
  ('f1',    'ALL f1-score',       'ALL F1-score',       lambda t, p: f1_score(t, p, pos_label= 0)),
  ('w_f1',  'weighted f1-score',  'weighted F1-score',  lambda t, p: f1_score(t, p, average='weighted')),
]


def threshold_pair(step):
  """Return [class-0 threshold, class-1 threshold] for a sweep step.

  Rounded to 3 decimals because the step size is 0.005: rounding to 2 decimals
  would collapse odd steps (e.g. 0.505) onto a neighbouring threshold, so the
  reported/applied threshold would differ from the one that was scored.
  """
  shift = step * THRESHOLD_STEP
  return [round(0.5 + shift, 3), round(0.5 - shift, 3)]


def apply_thresholds(probs, thresholds):
  """Turn per-class probabilities into class indices using per-class thresholds.

  Each column is compared against its own threshold and the first class that
  clears it wins (argmax over the 0/1 flags; class 0 if none or both clear).
  """
  return np.argmax((probs >= thresholds).astype(int), axis=1)


# NOTE(review): the thresholds are tuned and scored on the same test split,
# so the "adjusted" metrics are optimistic; tune on validation data if an
# unbiased estimate is needed.
best_scores = []
y_true = test_gen.classes

for model_no, model in enumerate([model1, model2, model3, model4], start=1):
  print('**********************************')
  print(f'Model # {model_no}')
  print('**********************************')

  test_score = model.evaluate(test_gen, steps= test_steps, verbose= 1)

  print('-' * 20)
  print('Original threshold')
  print("Test Loss: ", test_score[0])
  print("Test Accuracy: ", test_score[1])

  # make predictions (Model.predict replaces the deprecated predict_generator;
  # verbose=0 keeps the cell output the same as before)
  y_pred_probs = model.predict(test_gen, verbose= 0)
  y_pred_original = np.argmax(y_pred_probs, axis=1)
  cm = confusion_matrix(y_true, y_pred_original)
  print()
  print('Confusion matrix:')
  print(cm)
  print(classification_report(y_true, y_pred_original, target_names= classes, digits= 4))

  # experimenting thresholds: score every metric at every sweep step
  scores = {key: [] for key, _, _, _ in METRIC_SPECS}
  for step in range(N_THRESHOLD_STEPS):
    y_pred_adjusted = apply_thresholds(y_pred_probs, threshold_pair(step))
    for key, _, _, scorer in METRIC_SPECS:
      scores[key].append(scorer(y_true, y_pred_adjusted))

  # results: sweep step at which each metric peaks (first peak on ties)
  best_steps = {key: int(np.argmax(np.array(values))) for key, values in scores.items()}

  print()
  print(f'For the threshold adjustments on test set predictions for Model {model_no}')
  for key, best_label, _, _ in METRIC_SPECS:
    step = best_steps[key]
    t0, t1 = threshold_pair(step)
    print(f'Best {best_label:<18} - threshold [{t0},{t1}] -> {round(scores[key][step]*100,2)}%')
  print()

  # Apply the chosen threshold (exactly the pair that was scored in the sweep)
  best_ndx = best_steps[SELECTION_METRIC]
  thresholds = threshold_pair(best_ndx)
  y_pred_adjusted = apply_thresholds(y_pred_probs, thresholds) # Getting the updated predictions
  cm = confusion_matrix(y_true, y_pred_adjusted)

  #print adjusted metrics
  print(f'Threshold used for adjusted predictions is: [ {thresholds[0]} , {thresholds[1]} ]')
  print('')
  print('Confusion matrix:')
  print(cm)
  print(classification_report(y_true, y_pred_adjusted, target_names= classes, digits= 4))

  # Difference between the chosen step and step 0 (the original 0.5/0.5 thresholds)
  for key, _, change_label, _ in METRIC_SPECS:
    delta = scores[key][best_ndx] - scores[key][0]
    print(f'Change in {change_label:<18} : {round(delta*100,3)} %')
  print()

  best_scores.append(scores[SELECTION_METRIC][best_ndx]*100)

**********************************
Model # 1
**********************************
--------------------
Original threshold
Test Loss:  0.20593611896038055
Test Accuracy:  0.9489164352416992

Confusion matrix:
[[2998   43]
 [ 188 1293]]
              precision    recall  f1-score   support

         all     0.9410    0.9859    0.9629      3041
         hem     0.9678    0.8731    0.9180      1481

    accuracy                         0.9489      4522
   macro avg     0.9544    0.9295    0.9405      4522
weighted avg     0.9498    0.9489    0.9482      4522


For the threshold adjustments on test set predictions for Model 1
Best accuracy           - threshold [0.7,0.3] -> 95.4%
Best ALL precision      - threshold [0.9,0.1] -> 97.23%
Best weighted precision - threshold [0.7,0.3] -> 95.39%
Best ALL recall         - threshold [0.5,0.5] -> 98.59%
Best weighted recall    - threshold [0.7,0.3] -> 95.4%
Best ALL f1-score       - threshold [0.7,0.3] -> 96.62%
Best weighted f1-score  - threshold [0.

#### **Score results**

In [None]:
# Summarise the per-model scores collected by the evaluation loop.
# Iterating over best_scores (instead of a fixed range(4)) keeps this cell
# correct if models are added to or removed from the evaluation list.
print('Results (Weighted F1 Score comparison):')
for model_no, score in enumerate(best_scores, start=1):
  print(f'Model {model_no}: {round(score,3)} %')

print()
best_ndx = np.argmax(np.array(best_scores))  # first model wins on ties
print(f'Best result is Model {best_ndx+1} with score of {round(best_scores[best_ndx],3)} %')

Results (Weighted F1 Score comparison):
Model 1: 95.371 %
Model 2: 92.844 %
Model 3: 92.61 %
Model 4: 91.108 %

Best result is Model 1 with score of 95.371 %
