In [5]:
# Import base libraries
import sys
import os
import json
from pathlib import Path
import time

# Import scientific libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import Tensorflow and Keras
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa

# Import scikit-learn
from sklearn.utils import class_weight

# Check running environment: `google.colab` can only be imported on Colab.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Only the failed import means "not on Colab". A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    IN_COLAB = False

if IN_COLAB:
    print("We're running on Colab")

    # Mount google drive
    mounting_point = "/content/drive/"
    drive.mount(mounting_point, force_remount=True)

    # Add project directory to kernel paths so that `src.*` can be imported
    drive_folder = "MyDrive/pneumonia_detection/T-DEV-810-PAR_10"

    sys.path.append(mounting_point + drive_folder)
else:
    print("We're running localy")

    # Add project directory to kernel paths so that `src.*` can be imported
    sys.path.append('../..')


We're running localy


In [6]:
# Import custom functions
from src.data.file_manager import FileManager
from src.data.tf_utils import load_image_dataset_from_tfrecord, define_distribute_strategy
from src.data.evaluation import Evaluation

# Project-level helper objects shared by the rest of the notebook.
# `zoidbergManager` exposes the project directories used below (`data_dir`, `model_dir`).
zoidbergManager = FileManager()
# Distribution strategy under which the models are built later (`strategy.scope()`).
strategy = define_distribute_strategy()
# Provides the metrics handed to `compile` (`evaluation.get_training_metrics()`).
evaluation = Evaluation(strategy)

# Set default graphics visualization
%matplotlib inline

Selected distribution strategy:                     _DefaultDistributionStrategy


In [None]:
# Set random seed for keras, numpy, tensorflow, and the 'random' module
SEED = 42
tf.keras.utils.set_random_seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes — confirm it is needed.
os.environ['PYTHONHASHSEED'] = str(SEED)

# ***2. Loading dataset***

In [30]:
# Number of samples per batch fed to the models.
BATCH_SIZE = 64
# Fraction of the train / val datasets kept for the quick model comparison.
SMALL_TRAIN_SPLIT = 0.2
SMALL_VAL_SPLIT = 0.15
# Class labels, in the order of the one-hot label vector (index -> name).
# The first label was misspelled 'batceria', which leaked into every printed
# count and class-weight report.
class_names = ['bacteria', 'normal', 'virus']

First, let's load the train and val datasets

In [None]:
# Folder holding the pre-processed TFRecord files.
processed_dir_path = zoidbergManager.data_dir / 'processed'

# The loader is given plain string paths, one per split.
train_path, val_path = (
    str(processed_dir_path / 'train_512x512_rgb_ds.tfrecord'),
    str(processed_dir_path / 'val_512x512_rgb_ds.tfrecord'),
)

train_ds, val_ds = (
    load_image_dataset_from_tfrecord(train_path),
    load_image_dataset_from_tfrecord(val_path),
)

Then, we extract a small part of each dataset so that each model can be trained quickly on a small dataset.


In [None]:
# Count the samples by iterating once over each dataset.
num_train_img = train_ds.reduce(0, lambda x, _: x + 1).numpy()
num_val_img = val_ds.reduce(0, lambda x, _: x + 1).numpy()

# Shuffle data.
# `reshuffle_each_iteration=False` freezes the shuffled order. By default
# `shuffle` draws a new order every time the dataset is iterated, so the
# `take()` below would return a *different* sample at every epoch (and a
# different one for the class count than for the training itself).
train_ds = train_ds.shuffle(buffer_size=num_train_img, seed=SEED,
                            reshuffle_each_iteration=False)
val_ds = val_ds.shuffle(buffer_size=num_val_img, seed=SEED,
                        reshuffle_each_iteration=False)

# Extract a sample
small_train_size = int(num_train_img * SMALL_TRAIN_SPLIT)
small_val_size = int(num_val_img * SMALL_VAL_SPLIT)

# The training subset is now fixed; shuffle it again so the order of its
# samples still changes at every epoch. `max(..., 1)` keeps the buffer size
# valid when the subset is empty.
small_train_ds = train_ds.take(small_train_size).shuffle(
    buffer_size=max(small_train_size, 1), seed=SEED)
small_val_ds = val_ds.take(small_val_size)

def count_img_by_class(dataset, class_names=class_names):
    """Count how many samples of `dataset` belong to each class.

    Args:
        dataset: iterable of `(image, label)` pairs; each label must expose
            `.numpy()`. The class of a sample is the position of the first
            non-zero entry of its label vector.
        class_names: class labels, indexed like the label vector.

    Returns:
        dict mapping every class name to its number of samples. Samples whose
        index has no matching class name are ignored.
    """
    counts = dict.fromkeys(class_names, 0)
    name_of = dict(enumerate(class_names))
    for _, label in dataset:
        hot_index = np.nonzero(label.numpy())[0][0]
        if hot_index in name_of:
            counts[name_of[hot_index]] += 1
    return counts

# Report the class distribution of both small datasets.
for header, subset in (
    ("In training dataset, there are :", small_train_ds),
    ("\nIn val dataset, there are :", small_val_ds),
):
    print(header)
    for class_name, num_img in count_img_by_class(subset).items():
        print(f"  - {num_img} files for class {class_name}")

In [31]:
# Batch & prefetch data to improve computation time.
# Both subsets go through the same batch -> prefetch pipeline.
small_train_ds, small_val_ds = (
    subset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
    for subset in (small_train_ds, small_val_ds)
)

NameError: name 'small_train_ds' is not defined

3. Compare a bunch of models

Let's now train models on this small dataset. We set useful variables below.

⚠️⚠️⚠️ WARNING: Depending on your hardware, the training cells can be computationally expensive and take a really long time to run!
That's why each of these cells is wrapped in an `if` condition (see the TRAIN_*MODEL* booleans below).

In [None]:
# Training hyper-parameters shared by every model below.
EPOCHS = 30
LEARNING_RATE = 0.0001

# Switches guarding the expensive training cells: a model is only (re)trained
# when its flag is True.
TRAIN_VGG16 = False
TRAIN_RESNET50 = False
TRAIN_INCEPTION_RESNET = False
# NOTE(review): TRAIN_XCEPTION is not read by any cell visible in this
# notebook — confirm whether an Xception section is missing.
TRAIN_XCEPTION = False
TRAIN_EFFICIENTNETB0 = False

We also define 2 callbacks:

checkpoint_cb : saves the model during training, keeping only the best one according to val_MCC (`save_best_only=True`).
earlystopping_cb : stops training if the model stops improving. It is faster and helps against overfitting.

def checkpoint_cb(model):
    """Build the checkpoint callback used while training `model`.

    The checkpoint is written to
    `<model_dir>/checkpoints/ckpt_smallds_<model.name>.h5` and is only
    overwritten when the monitored `val_MCC` reaches a new maximum.

    Args:
        model: object exposing a `name` attribute, used in the file name.

    Returns:
        The configured `keras.callbacks.ModelCheckpoint`.
    """
    target_file = (
        zoidbergManager.model_dir / 'checkpoints' / f'ckpt_smallds_{model.name}.h5'
    )
    return keras.callbacks.ModelCheckpoint(
        filepath=target_file,
        monitor='val_MCC',
        mode='max',
        save_best_only=True,
    )

Next, we compute class weights to compensate for the class imbalance (which we observed when we analyzed the data):



In [None]:
# Collect the one-hot label of every training sample and turn it into a class index.
y_train_iterator = train_ds.map(lambda x, y: y).as_numpy_iterator()
y_train = np.argmax(list(y_train_iterator), axis=1)

# 'balanced' weights are inversely proportional to the class frequencies.
classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)

# Key each weight by the class id it was computed for (not by its position in
# the result), so the mapping stays right even if a class is absent.
dic_class_weights = {}
for class_idx, weight in zip(classes, class_weights):
    dic_class_weights[int(class_idx)] = weight
    # `.2f` = two decimals; the previous `2f` spec only set a minimum width of 2.
    print(f'class {class_names[class_idx]} => weight : {weight:.2f}')

In [None]:
def save_history(model, history, training_time):
    """Write the training history and timings of `model` to a JSON file.

    The file is `<model_dir>/histories/hty_smallds_<model.name>.json` and
    holds three keys: 'history', 'training_time' and 'epoch_time'.

    Args:
        model: object exposing a `name` attribute, used in the file name.
        history: object whose `history` attribute is a dict of per-epoch
            metric lists containing at least 'loss'.
        training_time: total training duration.

    Raises:
        ZeroDivisionError: if no epoch was recorded in `history`.
    """
    dic = {
        'history': history.history,
        'training_time': training_time,
        # Average duration of one epoch, from the number of recorded losses.
        'epoch_time': training_time / len(history.history['loss']),
    }
    history_dir = zoidbergManager.model_dir / 'histories'
    # Make sure the target folder exists before writing into it.
    history_dir.mkdir(parents=True, exist_ok=True)
    history_filepath = history_dir / f'hty_smallds_{model.name}.json'
    # `with` guarantees the file is flushed and closed, even if dumping fails.
    with open(history_filepath, 'w') as history_file:
        json.dump(dic, history_file)

def train_model(model, save=False):
    """Fit `model` on the small training dataset and time the run.

    Training uses `small_val_ds` for validation, `EPOCHS` epochs, the class
    weights of `dic_class_weights` and the checkpoint callback of the model.

    Args:
        model: object exposing `fit` (and `name`, read by the callback).
        save: when true, the history and timings are stored with
            `save_history`.

    Returns:
        Tuple `(history, training_time)`: what `model.fit` returned and the
        elapsed wall-clock time in seconds.
    """
    started_at = time.time()
    fit_options = dict(
        validation_data=small_val_ds,
        epochs=EPOCHS,
        class_weight=dic_class_weights,
        callbacks=[checkpoint_cb(model)],
    )
    history = model.fit(small_train_ds, **fit_options)
    training_time = time.time() - started_at

    if not save:
        return history, training_time

    save_history(model, history, training_time)
    return history, training_time

We selected 4 kinds of models to try. These models were chosen based on their performance:

VGG and ResNet have been used several times on this dataset with good results. Many notebooks can be found on Kaggle.
Xception and EfficientNet achieve good results on ImageNet with few parameters.
The performance of each model on ImageNet can be checked here: keras.applications

3.1 VGG

In [None]:
def make_vgg16():
    """Build and compile a VGG16-based classifier with a frozen ImageNet backbone.

    The model takes 512x512 RGB images, resizes them to 224x224, multiplies
    pixel values by 1/255, runs them through VGG16 (without its top) and ends
    with two dense layers and a 3-way softmax.

    NOTE(review): the Keras VGG16 ImageNet weights come with their own
    `preprocess_input`; confirm that a plain 1/255 rescaling is intended.

    Returns:
        The compiled `tf.keras.Sequential` model named 'vgg16'.
    """
    backbone = tf.keras.applications.VGG16(
        include_top=False,
        weights='imagenet',
        input_shape=(224, 224, 3),
    )
    # Freeze every pre-trained layer: only the new head is trained.
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False

    preprocessing = [
        keras.layers.InputLayer(input_shape=(512, 512, 3), name='input'),
        keras.layers.Resizing(224, 224, interpolation="bilinear", name='resize'),
        keras.layers.Rescaling(scale=1./255., name='rescale'),
    ]
    head = [
        keras.layers.Flatten(name='flatten'),
        keras.layers.Dense(1024, activation='relu', name='fully_conn1'),
        keras.layers.Dense(512, activation='relu', name='fully_conn2'),
        keras.layers.Dense(3, activation='softmax', name='out_softmax'),
    ]
    model = tf.keras.Sequential([*preprocessing, backbone, *head], name='vgg16')

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=evaluation.get_training_metrics(),
    )
    return model


# Create the model inside the distribution strategy scope.
with strategy.scope():
    vgg16 = make_vgg16()

In [None]:
# Expensive cell: only runs when TRAIN_VGG16 is switched on.
# `save=True` also stores the history and timings as JSON.
if TRAIN_VGG16:
    vgg16_history, vgg16_time = train_model(vgg16, save=True)

3.2 ResNet

In [None]:
def make_resnet50():
    """Build and compile a ResNet50V2-based classifier with a frozen ImageNet backbone.

    The model takes 512x512 RGB images, resizes them to 224x224, multiplies
    pixel values by 1/255, runs them through ResNet50V2 (without its top),
    applies global average pooling and ends with two dense layers and a 3-way
    softmax. The model is named 'resnet50' although the backbone is the V2
    variant.

    NOTE(review): the Keras ResNetV2 ImageNet weights come with their own
    `preprocess_input`; confirm that a plain 1/255 rescaling is intended.

    Returns:
        The compiled `tf.keras.Sequential` model named 'resnet50'.
    """
    backbone = tf.keras.applications.ResNet50V2(
        include_top=False,
        weights='imagenet',
        input_shape=(224, 224, 3),
    )
    # Freeze every pre-trained layer: only the new head is trained.
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False

    preprocessing = [
        keras.layers.InputLayer(input_shape=(512, 512, 3), name='input'),
        keras.layers.Resizing(224, 224, interpolation="bilinear", name='resize'),
        keras.layers.Rescaling(scale=1./255., name='rescale'),
    ]
    head = [
        keras.layers.GlobalAveragePooling2D(name='avg_pool'),
        keras.layers.Dense(1024, activation='relu', name='fully_conn1'),
        keras.layers.Dense(512, activation='relu', name='fully_conn2'),
        keras.layers.Dense(3, activation='softmax', name='out_softmax'),
    ]
    model = tf.keras.Sequential([*preprocessing, backbone, *head], name='resnet50')

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=evaluation.get_training_metrics(),
    )
    return model


# Create the model inside the distribution strategy scope.
with strategy.scope():
    resnet50 = make_resnet50()

In [None]:
# Expensive cell: only runs when TRAIN_RESNET50 is switched on.
# `save=True` also stores the history and timings as JSON.
if TRAIN_RESNET50:
    resnet50_history, resnet50_time = train_model(resnet50, save=True)

In [None]:
def make_inception_resnet():
    """Build and compile an InceptionResNetV2-based classifier with a frozen ImageNet backbone.

    The model takes 512x512 RGB images, resizes them to 299x299, multiplies
    pixel values by 1/255, runs them through InceptionResNetV2 (without its
    top), applies global average pooling and ends with two dense layers and a
    3-way softmax.

    NOTE(review): the Keras InceptionResNetV2 ImageNet weights come with their
    own `preprocess_input`; confirm that a plain 1/255 rescaling is intended.

    Returns:
        The compiled `tf.keras.Sequential` model named 'inception_resnet'.
    """
    backbone = tf.keras.applications.InceptionResNetV2(
        include_top=False,
        weights='imagenet',
        input_shape=(299, 299, 3),
    )
    # Freeze every pre-trained layer: only the new head is trained.
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False

    preprocessing = [
        keras.layers.InputLayer(input_shape=(512, 512, 3), name='input'),
        keras.layers.Resizing(299, 299, interpolation="bilinear", name='resize'),
        keras.layers.Rescaling(scale=1./255., name='rescale'),
    ]
    head = [
        keras.layers.GlobalAveragePooling2D(name='avg_pool'),
        keras.layers.Dense(1024, activation='relu', name='fully_conn1'),
        keras.layers.Dense(512, activation='relu', name='fully_conn2'),
        keras.layers.Dense(3, activation='softmax', name='out_softmax'),
    ]
    model = tf.keras.Sequential(
        [*preprocessing, backbone, *head], name='inception_resnet')

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=evaluation.get_training_metrics(),
    )
    return model


# Create the model inside the distribution strategy scope.
with strategy.scope():
    inception_resnet = make_inception_resnet()

In [None]:
# Expensive cell: only runs when TRAIN_INCEPTION_RESNET is switched on.
# `save=True` also stores the history and timings as JSON.
if TRAIN_INCEPTION_RESNET:
    inception_resnet_history, inception_resnet_time = train_model(inception_resnet, save=True)

3.4 EfficientNet

In [None]:
def make_efficientnetb0():
    """Build and compile an EfficientNetV2B0-based classifier with a frozen ImageNet backbone.

    The model takes 512x512 RGB images, resizes them to 224x224, runs them
    through EfficientNetV2B0 (without its top), applies global average pooling
    and ends with two dense layers and a 3-way softmax. Unlike the other
    builders of this notebook, no rescaling layer is added here. The model is
    named 'efficientnetb0' although the backbone is the V2 variant.

    Returns:
        The compiled `tf.keras.Sequential` model named 'efficientnetb0'.
    """
    backbone = tf.keras.applications.EfficientNetV2B0(
        include_top=False,
        weights='imagenet',
        input_shape=(224, 224, 3),
    )
    # Freeze every pre-trained layer: only the new head is trained.
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False

    preprocessing = [
        keras.layers.InputLayer(input_shape=(512, 512, 3), name='input'),
        keras.layers.Resizing(224, 224, interpolation="bilinear", name='resize'),
    ]
    head = [
        keras.layers.GlobalAveragePooling2D(name='avg_pool'),
        keras.layers.Dense(1024, activation='relu', name='fully_conn1'),
        keras.layers.Dense(512, activation='relu', name='fully_conn2'),
        keras.layers.Dense(3, activation='softmax', name='out_softmax'),
    ]
    model = tf.keras.Sequential(
        [*preprocessing, backbone, *head], name='efficientnetb0')

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=evaluation.get_training_metrics(),
    )
    return model


# Create the model inside the distribution strategy scope.
with strategy.scope():
    efficientnetb0 = make_efficientnetb0()

In [None]:
# Expensive cell: only runs when TRAIN_EFFICIENTNETB0 is switched on.
# `save=True` also stores the history and timings as JSON.
if TRAIN_EFFICIENTNETB0:
    efficientnetb0_history, efficientnetb0_time = train_model(efficientnetb0, save=True)

3.5 Results

In [None]:
# Models compared in this section, in display order.
compared_models = ('vgg16', 'resnet50', 'inception_resnet', 'xception', 'efficientnetb0')

histories_dir = zoidbergManager.model_dir / 'histories'
checkpoints_dir = zoidbergManager.model_dir / 'checkpoints'

# Same file naming as the one used when the histories/checkpoints are saved.
history_path = {
    name: histories_dir / f'hty_smallds_{name}.json' for name in compared_models
}
checkpoint_path = {
    name: checkpoints_dir / f'ckpt_smallds_{name}.h5' for name in compared_models
}

summary_columns = ['model', 'size', 'training_time', 'max MCC', 'max val_MCC']
histories = {}
summary_rows = []
summary_index = []
for idx_model, (model_name, path) in enumerate(history_path.items()):
    checkpoint_file = checkpoint_path[model_name]
    # A model that has not been trained yet has no files on disk: skip it
    # instead of aborting the whole summary.
    if not (path.exists() and checkpoint_file.exists()):
        print(f"Skipping {model_name}: no saved history/checkpoint found")
        continue

    with open(path) as file:
        history = json.load(file)
    histories[model_name] = history['history']

    summary_rows.append([
        model_name,
        f'{checkpoint_file.stat().st_size / (1e6):.1f} MB',
        f"{history['training_time']:.3f} s",
        f"{np.max(history['history']['MCC']):.3f}",
        f"{np.max(history['history']['val_MCC']):.3f}",
    ])
    # Keep the position of the model in `history_path` as row label.
    summary_index.append(idx_model)

# Build the frame once from the collected rows rather than growing it row by row.
sumup_result_df = pd.DataFrame(summary_rows, columns=summary_columns, index=summary_index)

sumup_result_df.head()

In [None]:
# Keep only the validation MCC curve of each model.
mcc_history = {
    name: model_history['val_MCC'] for name, model_history in histories.items()
}
# One column per model, restricted to the last 5 epochs.
last_epochs_df = pd.DataFrame.from_dict(mcc_history).iloc[-5:]

sns.set_style('ticks')
bp = sns.boxplot(data=last_epochs_df)
bp.set_title('val_MCC over last 5 epochs')
sns.despine(offset=10, trim=True)

In [None]:
# Build one long-format frame (epoch, model, MCC, val_MCC) for all models.
full_mcc_history = []
for model, history in histories.items():
    # Use the number of epochs actually recorded for this model: it differs
    # from EPOCHS if training stopped early or if EPOCHS was changed since
    # the history was saved, and mismatched lengths would make pandas raise.
    num_epochs = len(history['MCC'])
    data = {
        'epoch': list(range(1, num_epochs + 1)),
        'model': [model] * num_epochs,
        'MCC': history['MCC'],
        'val_MCC': history['val_MCC'],
    }
    full_mcc_history.append(pd.DataFrame.from_dict(data))
full_mcc_history_df = pd.concat(full_mcc_history)

sns.set_style('ticks')
# One panel per model, with the train (MCC) and validation (val_MCC) curves.
rp = sns.relplot(
    data=pd.melt(full_mcc_history_df, ['epoch', 'model']),
    x="epoch",
    y="value",
    hue="variable",
    col="model",
    col_wrap=2,
    height=2.5,
    aspect=2.2,
    kind="line"
)
# `figure` replaces the deprecated `fig` alias of the grid.
rp.figure.suptitle('Learning curves', fontsize=14)
rp.figure.subplots_adjust(top=0.9)
sns.move_legend(rp,
                'center',
                bbox_to_anchor=(.7, .18),
                fontsize=14,
                title_fontsize=14,
                frameon=True,
                borderpad=1)