# 1 - Libraries

In [None]:
%pip install plot_keras_history

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from time import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, LabelEncoder
from sklearn.utils import class_weight, shuffle
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from tensorflow.keras.models import Model, Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import VGG16
from keras.models import Sequential
from keras.layers import Dense, Dropout, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import to_categorical

from plot_keras_history import show_history, plot_history

import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input

# For normalization
import cv2
from skimage.exposure import match_histograms

# os.environ["TF_KERAS"]='1'
#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# 2 - Data collection

In [None]:
# Loading data
df = pd.read_csv('/kaggle/input/UBC-OCEAN/train.csv')

# Resolve the file to use for every image: the thumbnail when one exists,
# otherwise the full-size image.
# The thumbnail directory is listed ONCE into a set; the previous version
# called os.listdir() again for every single row of the dataframe.
thumbnail_dir = '/kaggle/input/UBC-OCEAN/train_thumbnails'
full_image_dir = '/kaggle/input/UBC-OCEAN/train_images'
thumbnail_names = set(os.listdir(thumbnail_dir))

df['image_path'] = [
    f'{thumbnail_dir}/{image_id}_thumbnail.png'
    if f'{image_id}_thumbnail.png' in thumbnail_names
    else f'{full_image_dir}/{image_id}.png'
    for image_id in df['image_id']
]

In [None]:
# For testing: build a small stratified 10% sample of the dataset in `data`.
# Note: the following cell rebinds `data` to the full dataframe, so this
# sample is only used if that cell is skipped.
X_df = df.drop(columns='label')
y_df = df['label']

# random_state makes the sample reproducible across kernel restarts
X, X_test, y, y_test = train_test_split(X_df, y_df, train_size=0.1,
                                        stratify=y_df, random_state=42)

# Re-attach the labels (aligned on the index). The previous code passed
# `columns=list(df.columns).remove('label')`, but list.remove() returns None,
# so the column selection was silently ignored.
data = X.assign(label=y)

In [None]:
# Work on the full dataset: this rebinds `data` (replacing the 10% sample built
# in the previous cell). `data` is the same object as `df`, not a copy.
data = df
data.shape

In [None]:
# Count how many times each category appears in the 'label' column
label_counts = data['label'].value_counts()

# Show the class distribution as a pie chart (percentages with one decimal)
fig, ax = plt.subplots()
ax.pie(label_counts, labels=label_counts.index, autopct='%1.1f%%')
plt.show()

In [None]:
# Empty summary table; one row is appended per experiment further down
# (method name, validation accuracy, elapsed time).
results = pd.DataFrame(columns=['method','val_accuracy', 'processing_time'])

# 3 - Model1 : Data augmentation

In [None]:
# Start the stopwatch for the first experiment (time() returns seconds)
chrono = time()

In [None]:
# Class list: fit the encoder on the labels; `classes_` holds the distinct labels
le = LabelEncoder()
le.fit(data['label'])
list_lab = le.classes_

# Number of classes
nb_lab = len(data['label'].unique())

In [None]:
# Training hyper-parameters shared by the experiments below
batch_size = 32  # images per batch passed to the data iterators
epochs = 30      # upper bound on epochs passed to model.fit

In [None]:
# Data-flow (augmentation) helper
def data_flow_fct(data, datagen, data_type=None, batch_size=None,
                  target_size=(256, 256), shuffle=None):
    """Build a dataframe-backed image iterator from a generator.

    Parameters
    ----------
    data : dataframe with an 'image_path' column (file paths) and a 'label'
        column (class names).
    datagen : object exposing ``flow_from_dataframe`` (an ImageDataGenerator).
    data_type : subset forwarded as ``subset`` ('training', 'validation' or
        None for everything).
    batch_size : images per batch; ``None`` falls back to 32.
    target_size : (height, width) every image is resized to. Defaults to the
        previously hard-coded (256, 256).
    shuffle : whether the iterator shuffles samples. ``None`` (default) means
        "shuffle everything except the validation subset".

    Returns
    -------
    The iterator returned by ``datagen.flow_from_dataframe``.
    """
    if shuffle is None:
        # Validation batches must come back in a stable order, otherwise labels
        # read from the iterator after model.predict() do not line up with the
        # predictions. Training (and unsplit) data keeps being shuffled.
        shuffle = data_type != 'validation'

    if batch_size is None:
        batch_size = 32

    return datagen.flow_from_dataframe(
        data,
        x_col='image_path',   # column holding the image paths
        y_col='label',
        weight_col=None,
        target_size=target_size,
        classes=None,
        class_mode='categorical',
        batch_size=batch_size,
        shuffle=shuffle,
        seed=42,
        subset=data_type,
    )

In [None]:
# Data augmentation configuration for Model 1
datagen_train = ImageDataGenerator(
#    featurewise_center=True,
#    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.25,  # sets the training/validation ratio
    preprocessing_function=preprocess_input)  # VGG16 preprocess_input imported at the top

In [None]:
# Model-building function
def create_model_fct(nb_lab, input_shape=(256, 256, 3), weights='imagenet'):
    """Build and compile a frozen-VGG16 transfer-learning classifier.

    Parameters
    ----------
    nb_lab : number of output classes (size of the softmax layer).
    input_shape : shape of the input images. The default now matches the
        (256, 256) target size produced by ``data_flow_fct``; the model used to
        be declared for (224, 224, 3) while being fed 256x256 batches.
    weights : forwarded to ``VGG16(weights=...)``. Keep 'imagenet' when online;
        pass the path of a local "notop" weights file when working offline.

    Returns
    -------
    The compiled model (categorical cross-entropy, rmsprop, accuracy metric).
    """
    # Pre-trained convolutional base without its classification head
    base_model = VGG16(include_top=False, weights=weights, input_shape=input_shape)

    # Freeze the base: the pre-trained weights are kept as they are
    for layer in base_model.layers:
        layer.trainable = False

    # New classification head on top of the base output
    x = GlobalAveragePooling2D()(base_model.output)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(nb_lab, activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=predictions)
    model.compile(loss="categorical_crossentropy", optimizer='rmsprop', metrics=["accuracy"])

    # summary() prints by itself and returns None; wrapping it in print()
    # used to add a stray "None" line to the output.
    model.summary()

    return model

In [None]:
# Build the training and validation iterators from the same dataframe and generator
# NOTE(review): both subsets go through datagen_train, so its augmentation settings
# presumably also apply to the validation images - confirm this is intended.
train_flow = data_flow_fct(data, datagen_train, data_type='training',batch_size=batch_size)
val_flow = data_flow_fct(data, datagen_train, data_type='validation',batch_size=batch_size)

In [None]:
%%time
# Model creation: one softmax output per class (nb_lab)
# Author's timing note: 408ms
# NOTE(review): the device string '/gpu:1' assumes the runtime exposes more than
# one GPU - confirm for the target environment.
with tf.device('/gpu:1'): 
    model = create_model_fct(nb_lab)

In [None]:
%%time
# Author's timing note: 4min35 for epochs = 1 and batch_size = 32
# Model 1 training
with tf.device('/gpu:1'):
    
    # Callbacks: save the model whenever val_accuracy improves, and stop after
    # 5 epochs without improvement of val_accuracy.
    # NOTE(review): EarlyStopping is not asked to restore the best weights, so
    # presumably `model` ends up with the last-epoch weights, not the saved best - confirm.
    model_save_path = "model_best_weights_1.h5"
    checkpoint = ModelCheckpoint(model_save_path, monitor='val_accuracy', verbose=1, mode='max', save_best_only=True)
    es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5)
    callbacks_list = [checkpoint, es]
    
    # Training: one full pass over each iterator per epoch
    history = model.fit(train_flow, epochs=epochs, 
                        steps_per_epoch=len(train_flow),
                        callbacks=callbacks_list, 
                        validation_data=val_flow,
                        validation_steps=len(val_flow),
                        verbose=1)

In [None]:
%%time
# Performances
print('1/6-val accuracy/epochs')
show_history(history)
plot_history(history, path="history.png")
plt.close()

print('2/6-predicting y_pred')
# Predictions and true labels are taken from the very same batches, so they stay
# aligned even if the iterator shuffles or re-orders samples between passes.
# (Previously the labels were read in a second pass, after model.predict.)
pred_batches, label_batches = [], []
for i in range(len(val_flow)):
    batch_x_val, batch_y_val = val_flow[i]
    pred_batches.append(model.predict_on_batch(batch_x_val))
    label_batches.append(batch_y_val)
y_pred = np.concatenate(pred_batches)

print('3/6-getting y_val')
y_val = np.concatenate(label_batches)

print('4/6-building the basic confusion matrix')
# One-hot rows / probability rows -> class indices
y_val_indices = y_val.argmax(axis=1)
y_pred_indices = y_pred.argmax(axis=1)

# Class indices that actually occur, and their names
list_num_labels = sorted(set(y_val_indices) | set(y_pred_indices))
list_cat_labels = le.inverse_transform(list_num_labels)

# Explicit label order: rows/columns of cm match list_cat_labels
cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_num_labels)
print(cm)

print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices,
                            labels=list_num_labels, target_names=list_cat_labels))

print('6/6-building the sns confusion matrix')
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt='d': show plain integer counts instead of the default short float format
ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap="Blues")
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

In [None]:
# Log this experiment: method name, last-epoch validation accuracy, elapsed seconds
elapsed_seconds = time() - chrono
last_val_accuracy = history.history['val_accuracy'][-1]
results.loc[len(results)] = ['Model1 : Data augmentation', last_val_accuracy, elapsed_seconds]

# Restart the stopwatch for the next experiment
chrono = time()

# 4 - Model2 : Data augmentation + Normalization

In [None]:
# Normalization function (input : image array - output : uint8 np.array)
_REFERENCE_IMAGE_CACHE = {}


def _load_reference_image(ref_path, size):
    """Read (and optionally resize) the reference image once per (path, size)."""
    key = (ref_path, size)
    if key not in _REFERENCE_IMAGE_CACHE:
        ref_img = cv2.imread(ref_path, cv2.IMREAD_COLOR)
        if ref_img is None:
            # cv2.imread signals failure by returning None instead of raising
            raise FileNotFoundError(f"Reference image could not be read: {ref_path}")
        if size is not None:
            ref_img = cv2.resize(ref_img, size, interpolation=cv2.INTER_AREA)
        _REFERENCE_IMAGE_CACHE[key] = ref_img
    return _REFERENCE_IMAGE_CACHE[key]


def normalization_processing_np_np(np_image,
                                   ref_path=r"/kaggle/input/UBC-OCEAN/train_thumbnails/4_thumbnail.png",
                                   work_size=(2000, 2000)):
    """Match the histogram of an image to the histogram of a reference image.

    Parameters
    ----------
    np_image : image as a numpy array (height, width, channels).
    ref_path : path of the reference image (same default as before).
    work_size : size both images are resized to before matching; the result has
        this size. ``None`` skips resizing, so the result keeps the shape of
        ``np_image``.
        NOTE(review): a Keras ``preprocessing_function`` is expected to return
        the shape it received - presumably ``work_size=None`` (or the generator
        target size) is needed when this runs inside a generator; confirm.

    Returns
    -------
    uint8 numpy array holding the histogram-matched image.

    Raises
    ------
    FileNotFoundError if the reference image cannot be read.
    """
    size = None if work_size is None else tuple(work_size)

    # The reference used to be re-read from disk and resized on every call
    ref_img = _load_reference_image(ref_path, size)

    image = np_image
    if size is not None:
        image = cv2.resize(np_image, size, interpolation=cv2.INTER_AREA)

    # NOTE(review): no channel axis is passed to match_histograms; depending on the
    # scikit-image version this may match all channels jointly - confirm.
    aft_img = match_histograms(image, ref_img)

    # Convert to uint8 (absolute value, saturated). The second positional argument
    # of convertScaleAbs is `dst`, not a colour code, so the former
    # cv2.COLOR_LAB2BGR argument performed no colour conversion and is dropped.
    aft_img = cv2.convertScaleAbs(aft_img)

    return aft_img

In [None]:
# Example of an image Before and After normalization
image_path = '/kaggle/input/UBC-OCEAN/train_thumbnails/10143_thumbnail.png'

# cv2.imread returns BGR while matplotlib expects RGB: convert once so that
# both panels are displayed with their true colours.
np_image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)

fig, ax = plt.subplots(1, 2, figsize=(12, 6))

ax[0].imshow(np_image)
ax[0].set_title('Original')

ax[1].imshow(normalization_processing_np_np(np_image))
ax[1].set_title('Normalized')

plt.show()

In [None]:
# Data augmentation configuration for Model 2 (same settings as Model 1, with the
# histogram-matching function as preprocessing instead of preprocess_input)
# NOTE(review): normalization_processing_np_np resizes its result to 2000x2000;
# confirm this is compatible with the image size the generator works with.
datagen_train = ImageDataGenerator(
#    featurewise_center=True,
#    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.25,  # sets the training/validation ratio
    preprocessing_function=normalization_processing_np_np)

In [None]:
# Example of data augmentation + normalization
image_path = '/kaggle/input/UBC-OCEAN/train_thumbnails/10143_thumbnail.png'
img = cv2.imread(image_path)
# cv2.imread returns BGR; matplotlib displays RGB
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(img, (2000, 2000), interpolation=cv2.INTER_AREA)

# Add a batch dimension: shape (1, height, width, channels)
img = np.expand_dims(img, axis=0)

# Take one augmented batch (the iterator is endless) and scale it to [0, 1] for display
img_batch = next(datagen_train.flow(img, batch_size=1))
img_augmented = img_batch[0] / 255

# Display image Before and After
fig, axes = plt.subplots(1, 2)

axes[0].imshow(img.squeeze())
axes[0].set_title('Original Image')
axes[0].axis('off')

axes[1].imshow(img_augmented)
axes[1].set_title('Augmented Image')
axes[1].axis('off')

plt.show()

In [None]:
%%time
# Author's timing note: 4min35 for epochs = 1 and batch_size = 32
# Model 2 training
# NOTE(review): no model or iterators are created in this section before this cell,
# so `model`, `train_flow` and `val_flow` are the objects built for Model 1; the
# `datagen_train` redefined above is not passed to data_flow_fct here. This looks
# like a continuation of Model 1 training - confirm whether they should be rebuilt.
with tf.device('/gpu:1'):
    
    # Callbacks: save on val_accuracy improvement, stop after 5 epochs without improvement
    model_save_path = "model_best_weights_2.h5"
    checkpoint = ModelCheckpoint(model_save_path, monitor='val_accuracy', verbose=1, mode='max', save_best_only=True)
    es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5)
    callbacks_list = [checkpoint, es]
    
    # Training: one full pass over each iterator per epoch
    history = model.fit(train_flow, epochs=epochs, 
                        steps_per_epoch=len(train_flow),
                        callbacks=callbacks_list, 
                        validation_data=val_flow,
                        validation_steps=len(val_flow),
                        verbose=1)

In [None]:
%%time
# Performances
print('1/6-val accuracy/epochs')
show_history(history)
plot_history(history, path="history.png")
plt.close()

print('2/6-predicting y_pred')
# Predictions and true labels are taken from the very same batches, so they stay
# aligned even if the iterator shuffles or re-orders samples between passes.
# (Previously the labels were read in a second pass, after model.predict.)
pred_batches, label_batches = [], []
for i in range(len(val_flow)):
    batch_x_val, batch_y_val = val_flow[i]
    pred_batches.append(model.predict_on_batch(batch_x_val))
    label_batches.append(batch_y_val)
y_pred = np.concatenate(pred_batches)

print('3/6-getting y_val')
y_val = np.concatenate(label_batches)

print('4/6-building the basic confusion matrix')
# One-hot rows / probability rows -> class indices
y_val_indices = y_val.argmax(axis=1)
y_pred_indices = y_pred.argmax(axis=1)

# Class indices that actually occur, and their names
list_num_labels = sorted(set(y_val_indices) | set(y_pred_indices))
list_cat_labels = le.inverse_transform(list_num_labels)

# Explicit label order: rows/columns of cm match list_cat_labels
cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_num_labels)
print(cm)

print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices,
                            labels=list_num_labels, target_names=list_cat_labels))

print('6/6-building the sns confusion matrix')
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt='d': show plain integer counts instead of the default short float format
ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap="Blues")
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

In [None]:
# Log this experiment: method name, last-epoch validation accuracy, elapsed seconds
elapsed_seconds = time() - chrono
last_val_accuracy = history.history['val_accuracy'][-1]
results.loc[len(results)] = ['Model2 : Data augmentation + Normalization', last_val_accuracy, elapsed_seconds]

# Restart the stopwatch for the next experiment
chrono = time()

# 5 - Model3 : Data augmentation + Tiling

In [None]:
# Tiling function
def tiling_np_np(image_array, tile_size=224, pix_threshold=100, output_size=(2000, 2000)):
    """Cut an image into tiles, keep the bright ones and pack them into a square.

    Parameters
    ----------
    image_array : image as a 3-D numpy array (height, width, 3).
    tile_size : side of the square tiles, in pixels.
    pix_threshold : between 0 and 255, the higher the more discriminating. A tile
        is kept when the sum of its values exceeds
        ``pix_threshold * tile_size**2 * 3``. Border tiles are zero-padded to
        ``tile_size``, and the padding counts as zeros in that sum.
    output_size : (width, height) the packed image is resized to (same default as
        the previously hard-coded size). ``None`` returns the packed grid
        unresized.

    Returns
    -------
    uint8 numpy array: the kept tiles laid out row by row in the smallest square
    grid that holds them, completed with all-zero tiles. If no tile passes the
    threshold the result is a single all-zero tile (this case used to raise
    inside np.vstack).
    """
    def pad_to_size(tile, target_size):
        # Zero-pad a border tile symmetrically up to target_size (height, width, ...)
        pad_height = max(0, target_size[0] - tile.shape[0])
        pad_width = max(0, target_size[1] - tile.shape[1])
        pad_top = pad_height // 2
        pad_left = pad_width // 2
        return np.pad(tile,
                      ((pad_top, pad_height - pad_top), (pad_left, pad_width - pad_left), (0, 0)),
                      mode='constant', constant_values=0)

    # 1. Cut the image into tiles, row by row
    tiles = [pad_to_size(image_array[i:i+tile_size, j:j+tile_size], (tile_size, tile_size, 3))
             for i in range(0, image_array.shape[0], tile_size)
             for j in range(0, image_array.shape[1], tile_size)]

    # 2. Drop the tiles whose total intensity does not exceed the threshold
    threshold = pix_threshold * tile_size**2 * 3
    filtered_tiles = [tile for tile in tiles if np.sum(tile) > threshold]

    # 3. Smallest square grid able to hold the kept tiles (at least 1 x 1)
    num_kept = len(filtered_tiles)
    num_tiles_side = int(np.sqrt(num_kept))
    if num_tiles_side**2 < num_kept:
        num_tiles_side += 1
    num_tiles_side = max(num_tiles_side, 1)

    # 4. Complete the grid with exactly the number of missing all-zero tiles
    #    (the previous count was based on the grid size only, not on num_kept)
    num_blank_tiles = num_tiles_side**2 - num_kept
    blank_tile = np.zeros((tile_size, tile_size, 3), dtype=np.uint8)
    filled_tiles = filtered_tiles + [blank_tile] * num_blank_tiles

    # 5. Rebuild the square array row by row
    reconstructed_array = np.vstack([
        np.hstack(filled_tiles[row*num_tiles_side:(row+1)*num_tiles_side])
        for row in range(num_tiles_side)
    ])

    # 6. Convert to uint8. The second positional argument of convertScaleAbs is
    #    `dst`, not a colour code, so the former cv2.COLOR_LAB2BGR argument
    #    performed no colour conversion and is dropped.
    reconstructed_image = cv2.convertScaleAbs(reconstructed_array)
    if output_size is not None:
        reconstructed_image = cv2.resize(reconstructed_image, tuple(output_size),
                                         interpolation=cv2.INTER_AREA)

    return reconstructed_image

In [None]:
# Example of an image Before and After tiling
image_path = '/kaggle/input/UBC-OCEAN/train_thumbnails/5251_thumbnail.png'

# cv2.imread returns BGR while matplotlib expects RGB: convert once so that
# both panels are displayed with their true colours.
image_array = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)

fig, ax = plt.subplots(1, 2, figsize=(12, 6))

ax[0].imshow(image_array)
ax[0].set_title('Original')

ax[1].imshow(tiling_np_np(image_array))
ax[1].set_title('Tiled')

plt.show()

In [None]:
# Data augmentation configuration for Model 3 (same settings as Model 1, with the
# tiling function as preprocessing instead of preprocess_input)
# NOTE(review): tiling_np_np resizes its result to 2000x2000; confirm this is
# compatible with the image size the generator works with.
datagen_train = ImageDataGenerator(
#    featurewise_center=True,
#    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.25,  # sets the training/validation ratio
    preprocessing_function=tiling_np_np)

In [None]:
# Example of data augmentation + tiling
image_path = '/kaggle/input/UBC-OCEAN/train_thumbnails/10143_thumbnail.png'
img = cv2.imread(image_path)
# cv2.imread returns BGR; matplotlib displays RGB
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(img, (2000, 2000), interpolation=cv2.INTER_AREA)

# Add a batch dimension: shape (1, height, width, channels)
img = np.expand_dims(img, axis=0)

# Take one augmented batch (the iterator is endless) and scale it to [0, 1] for display
img_batch = next(datagen_train.flow(img, batch_size=1))
img_augmented = img_batch[0] / 255

# Display the original and the augmented image
fig, axes = plt.subplots(1, 2)

axes[0].imshow(img.squeeze())
axes[0].set_title('Original Image')
axes[0].axis('off')

axes[1].imshow(img_augmented)
axes[1].set_title('Augmented Image')
axes[1].axis('off')

plt.show()

In [None]:
%%time
# Author's timing note: 4min35 for epochs = 1 and batch_size = 32
# Model 3 training
# NOTE(review): no model or iterators are created in this section before this cell,
# so `model`, `train_flow` and `val_flow` are the objects built for Model 1; the
# `datagen_train` redefined above is not passed to data_flow_fct here. This looks
# like a continuation of the previous training - confirm whether they should be rebuilt.
with tf.device('/gpu:1'):
    
    # Callbacks: save on val_accuracy improvement, stop after 5 epochs without improvement
    model_save_path = "model_best_weights_3.h5"
    checkpoint = ModelCheckpoint(model_save_path, monitor='val_accuracy', verbose=1, mode='max', save_best_only=True)
    es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5)
    callbacks_list = [checkpoint, es]
    
    # Training: one full pass over each iterator per epoch
    history = model.fit(train_flow, epochs=epochs, 
                        steps_per_epoch=len(train_flow),
                        callbacks=callbacks_list, 
                        validation_data=val_flow,
                        validation_steps=len(val_flow),
                        verbose=1)

In [None]:
%%time
# Performances
print('1/6-val accuracy/epochs')
show_history(history)
plot_history(history, path="history.png")
plt.close()

print('2/6-predicting y_pred')
# Predictions and true labels are taken from the very same batches, so they stay
# aligned even if the iterator shuffles or re-orders samples between passes.
# (Previously the labels were read in a second pass, after model.predict.)
pred_batches, label_batches = [], []
for i in range(len(val_flow)):
    batch_x_val, batch_y_val = val_flow[i]
    pred_batches.append(model.predict_on_batch(batch_x_val))
    label_batches.append(batch_y_val)
y_pred = np.concatenate(pred_batches)

print('3/6-getting y_val')
y_val = np.concatenate(label_batches)

print('4/6-building the basic confusion matrix')
# One-hot rows / probability rows -> class indices
y_val_indices = y_val.argmax(axis=1)
y_pred_indices = y_pred.argmax(axis=1)

# Class indices that actually occur, and their names
list_num_labels = sorted(set(y_val_indices) | set(y_pred_indices))
list_cat_labels = le.inverse_transform(list_num_labels)

# Explicit label order: rows/columns of cm match list_cat_labels
cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_num_labels)
print(cm)

print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices,
                            labels=list_num_labels, target_names=list_cat_labels))

print('6/6-building the sns confusion matrix')
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt='d': show plain integer counts instead of the default short float format
ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap="Blues")
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

In [None]:
# Log this experiment: method name, last-epoch validation accuracy, elapsed seconds
elapsed_seconds = time() - chrono
last_val_accuracy = history.history['val_accuracy'][-1]
results.loc[len(results)] = ['Model3 : Data augmentation + Tiling', last_val_accuracy, elapsed_seconds]

# Restart the stopwatch for the next experiment
chrono = time()

# 6 - Model4-5-6 : Data augmentation + Balancing

In [None]:
# Count how many times each category appears in the 'label' column
label_counts = data['label'].value_counts()

# Show the class distribution as a pie chart (percentages with one decimal)
fig, ax = plt.subplots()
ax.pie(label_counts, labels=label_counts.index, autopct='%1.1f%%')
plt.show()

## 6-1 Balancing classes

In [None]:
# Keep an independent copy of the dataset as it is before the balancing steps
data_0 = data.copy()

In [None]:
# Sanity check: dimensions of the copy
data_0.shape

In [None]:
# For stage 1
# The original dataset holds 41.3% HGSC - 23% EC - 18.4% CC - 8.7% LGSC - 8.6% MC.
# Target: X EC + X CC + 8.7 LGSC + 8.6 MC = 41.3 (the HGSC share),
# hence X = [41.3 - (8.7 + 8.6)] / 2 = 12.
# Bring EC from 23 down to 12 by keeping the first rows only:
data_EC = data[data['label'] == 'EC']
n_EC_kept = int(len(data_EC) * 12 / 23)
data_EC_reduced = data_EC.head(n_EC_kept)
data_EC_reduced['label'].value_counts()

In [None]:
# Bring CC from 18.4 down to 12 by keeping the first rows only
data_CC = data[data['label'] == 'CC']
n_CC_kept = int(len(data_CC) * 12 / 18.4)
data_CC_reduced = data_CC.head(n_CC_kept)
data_CC_reduced['label'].value_counts()

In [None]:
# Stage-1 dataset: all HGSC, LGSC and MC rows, followed by the reduced EC rows
# and then the reduced CC rows
keep_all_mask = data['label'].isin(['HGSC','LGSC','MC'])
data_HGSC_LGSC_MC = data[keep_all_mask]
new_data_1 = pd.concat([data_HGSC_LGSC_MC, data_EC_reduced, data_CC_reduced])
new_data_1['label'].value_counts()

In [None]:
# Preview the first rows of the stage-1 dataset
new_data_1.head()

In [None]:
# Merge the classes EC, CC, LGSC and MC into a single class named 'Other'
labels_to_merge = ['EC', 'CC', 'LGSC', 'MC']
is_merged = new_data_1['label'].isin(labels_to_merge)
new_data_1['label'] = new_data_1['label'].where(~is_merged, 'Other')
# Check
new_data_1['label'].value_counts()

In [None]:
# For stage 2
# Keep only the labels EC - CC - LGSC - MC,
# then split them into EC, CC and Other (= LGSC + MC).
# .copy() makes new_data_2 an independent frame: assigning a column on the bare
# .loc selection triggers pandas' SettingWithCopyWarning.
new_data_2 = data.loc[data['label'].isin(['EC','CC','LGSC','MC']),:].copy()
new_data_2['label'] = new_data_2['label'].apply(lambda x : 'Other' if x in ['LGSC','MC'] else x)
# Check
new_data_2['label'].value_counts()

In [None]:
# For stage 3
# Only the LGSC and MC rows are kept
stage3_mask = data['label'].isin(['LGSC','MC'])
new_data_3 = data[stage3_mask]
# Check
new_data_3['label'].value_counts()

## 6-2 Data augmentation

In [None]:
# Data augmentation configuration for the staged models (same settings as Model 1)
datagen_train = ImageDataGenerator(
#    featurewise_center=True,
#    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.25,  # sets the training/validation ratio
    preprocessing_function=preprocess_input)  # VGG16 preprocess_input imported at the top

## 6-3 Stage 1 : HGSC vs Others

### 6-3-1 Training

In [None]:
print('Model training on HGSC - other data')
# From here on `data` refers to the stage-1 frame (same object as new_data_1)
data = new_data_1

# Number of classes
nb_lab_1 = len(data['label'].unique())

# Class list (the transformed values are discarded; only the fitted classes_ are used)
le1 = LabelEncoder()
le1.fit_transform(data['label'])
list_lab_1 = le1.classes_
# Expected classes: ['HGSC', 'Other']

print('1-Model creation (!new nb_lab!)')
# Model creation: a fresh network sized for the stage-1 classes
# Author's timing note: 408ms

with tf.device('/gpu:1'): 
    model = create_model_fct(nb_lab_1)
    # Callbacks: save on val_accuracy improvement, stop after 5 epochs without improvement
    model_save_path = "model_best_weights_4.h5"
    checkpoint = ModelCheckpoint(model_save_path, monitor='val_accuracy', verbose=1, mode='max', save_best_only=True)
    es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5)
    callbacks_list = [checkpoint, es]

print('2-Data augmentation - train-val split')
# Build the training and validation iterators from the stage-1 frame
# NOTE(review): both subsets go through datagen_train, so its augmentation settings
# presumably also apply to the validation images - confirm this is intended.
train_flow = data_flow_fct(data, datagen_train, data_type='training',batch_size=batch_size)
val_flow = data_flow_fct(data, datagen_train, data_type='validation',batch_size=batch_size)

print('3-Model training')
# Training: one full pass over each iterator per epoch
history = model.fit(train_flow, epochs=epochs, 
                    steps_per_epoch=len(train_flow),
                    callbacks=callbacks_list, 
                    validation_data=val_flow,
                    validation_steps=len(val_flow),
                    verbose=1)

### 6-3-2 Performances

In [None]:
%%time
# Performances
print('1/6-val accuracy/epochs')
show_history(history)
plot_history(history, path="history.png")
plt.close()

print('2/6-predicting y_pred')
# Predictions and true labels are taken from the very same batches, so they stay
# aligned even if the iterator shuffles or re-orders samples between passes.
# (Previously the labels were read in a second pass, after model.predict.)
pred_batches, label_batches = [], []
for i in range(len(val_flow)):
    batch_x_val, batch_y_val = val_flow[i]
    pred_batches.append(model.predict_on_batch(batch_x_val))
    label_batches.append(batch_y_val)
y_pred = np.concatenate(pred_batches)

print('3/6-getting y_val')
y_val = np.concatenate(label_batches)

print('4/6-building the basic confusion matrix')
# One-hot rows / probability rows -> class indices
y_val_indices = y_val.argmax(axis=1)
y_pred_indices = y_pred.argmax(axis=1)

# Class indices that actually occur, and their names (stage-1 encoder)
list_num_labels = sorted(set(y_val_indices) | set(y_pred_indices))
list_cat_labels = le1.inverse_transform(list_num_labels)

# Explicit label order: rows/columns of cm match list_cat_labels
cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_num_labels)
print(cm)

print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices,
                            labels=list_num_labels, target_names=list_cat_labels))

print('6/6-building the sns confusion matrix')
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt='d': show plain integer counts instead of the default short float format
ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap="Blues")
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

### 6-3-3 Predictions

In [None]:
# Prediction
# Generator without augmentation and without train/validation split
datagen_test = ImageDataGenerator(
    validation_split=None,
    preprocessing_function=preprocess_input)

# Build the prediction iterator directly instead of redefining data_flow_fct:
# the redefinition (shuffle=False) silently replaced the training helper for
# every cell executed afterwards.
test_flow = datagen_test.flow_from_dataframe(data,
                                             x_col='image_path',
                                             y_col='label',
                                             weight_col=None,
                                             target_size=(256, 256),
                                             classes=None,
                                             class_mode='categorical',
                                             batch_size=1,
                                             shuffle=False,  # keep dataframe order so predictions line up with rows
                                             seed=42,
                                             subset=None)

# Predict the class probabilities
y_pred = model.predict(test_flow)

# Add the predicted class names to the dataframe
# (note: `data` is the same object as new_data_1, which therefore gets the column too)
data['prediction'] = le1.inverse_transform(y_pred.argmax(axis=1))
data_1 = data.copy()
data_1.head()

In [None]:
print('4/6-building the basic confusion matrix')
# True and predicted class names for every row that was scored
y_val_indices = data_1['label']
y_pred_indices = data_1['prediction']

# Class names that actually occur; passing them explicitly guarantees that the
# rows/columns of cm match the axis labels used for the heatmap
list_cat_labels = sorted(set(y_val_indices) | set(y_pred_indices))

cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_cat_labels)
print(cm)

print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices))

print('6/6-building the sns confusion matrix')
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt='d': show plain integer counts instead of the default short float format
ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap="Reds")
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

## 6-4 Stage 2 : EC vs CC vs Others

### 6-4-1 Training

In [None]:
print('Model training on EC vs CC vs Others')
# From here on `data` refers to the stage-2 frame (same object as new_data_2)
data = new_data_2

# Number of classes
nb_lab_2 = len(data['label'].unique())

# Class list (the transformed values are discarded; only the fitted classes_ are used)
le2 = LabelEncoder()
le2.fit_transform(data['label'])
list_lab_2 = le2.classes_
# Expected classes: presumably ['CC', 'EC', 'Other'], given how new_data_2 is built

print('1-Model creation (!new nb_lab!)')
# Model creation: a fresh network sized for the stage-2 classes
# Author's timing note: 408ms

with tf.device('/gpu:1'): 
    model = create_model_fct(nb_lab_2)
    # Callbacks: save on val_accuracy improvement, stop after 5 epochs without improvement
    model_save_path = "model_best_weights_5.h5"
    checkpoint = ModelCheckpoint(model_save_path, monitor='val_accuracy', verbose=1, mode='max', save_best_only=True)
    es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5)
    callbacks_list = [checkpoint, es]

print('2-Data augmentation - train-val split')
# Build the training and validation iterators from the stage-2 frame
# NOTE(review): data_flow_fct resolves to whichever definition was executed last;
# make sure it is the shuffling version when building the training iterator.
train_flow = data_flow_fct(data, datagen_train, data_type='training',batch_size=batch_size)
val_flow = data_flow_fct(data, datagen_train, data_type='validation',batch_size=batch_size)

print('3-Model training')
# Training: one full pass over each iterator per epoch
history = model.fit(train_flow, epochs=epochs, 
                    steps_per_epoch=len(train_flow),
                    callbacks=callbacks_list, 
                    validation_data=val_flow,
                    validation_steps=len(val_flow),
                    verbose=1)

### 6-4-2 Performances

In [None]:
%%time
# Performances
print('1/6-val accuracy/epochs')
show_history(history)
plot_history(history, path="history.png")
plt.close()

print('2/6-predicting y_pred')
# Predictions and true labels are taken from the very same batches, so they stay
# aligned even if the iterator shuffles or re-orders samples between passes.
# (Previously the labels were read in a second pass, after model.predict.)
pred_batches, label_batches = [], []
for i in range(len(val_flow)):
    batch_x_val, batch_y_val = val_flow[i]
    pred_batches.append(model.predict_on_batch(batch_x_val))
    label_batches.append(batch_y_val)
y_pred = np.concatenate(pred_batches)

print('3/6-getting y_val')
y_val = np.concatenate(label_batches)

print('4/6-building the basic confusion matrix')
# One-hot rows / probability rows -> class indices
y_val_indices = y_val.argmax(axis=1)
y_pred_indices = y_pred.argmax(axis=1)

# Class indices that actually occur, and their names (stage-2 encoder)
list_num_labels = sorted(set(y_val_indices) | set(y_pred_indices))
list_cat_labels = le2.inverse_transform(list_num_labels)

# Explicit label order: rows/columns of cm match list_cat_labels
cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_num_labels)
print(cm)

print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices,
                            labels=list_num_labels, target_names=list_cat_labels))

print('6/6-building the sns confusion matrix')
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt='d': show plain integer counts instead of the default short float format
ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap="Blues")
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

### 6-4-3 Predictions

In [None]:
# Prediction
# Generator without augmentation and without train/validation split
datagen_test = ImageDataGenerator(
    validation_split=None,
    preprocessing_function=preprocess_input)

# Build the prediction iterator directly instead of redefining data_flow_fct:
# the redefinition (shuffle=False) silently replaced the training helper for
# every cell executed afterwards.
test_flow = datagen_test.flow_from_dataframe(data,
                                             x_col='image_path',
                                             y_col='label',
                                             weight_col=None,
                                             target_size=(256, 256),
                                             classes=None,
                                             class_mode='categorical',
                                             batch_size=1,
                                             shuffle=False,  # keep dataframe order so predictions line up with rows
                                             seed=42,
                                             subset=None)

# Predict the class probabilities
y_pred = model.predict(test_flow)

# Add the predicted class names to the dataframe
# (note: `data` is the same object as new_data_2, which therefore gets the column too)
data['prediction'] = le2.inverse_transform(y_pred.argmax(axis=1))
data_2 = data.copy()
data_2.head()

In [None]:
print('4/6-building the basic confusion matrix')
# True and predicted class names for every row of data_2.
# (The *_indices names are kept for the cells below; the values are class names.)
y_val_indices = data_2['label']
y_pred_indices = data_2['prediction']

# Class names that occur as a true or predicted label, in sorted order.
# Passing them explicitly ties the matrix rows/columns to the axis names below.
list_cat_labels = sorted(set(y_val_indices) | set(y_pred_indices))

# Build and print the confusion matrix
cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_cat_labels)
print(cm)

# Print the classification report
print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices))

print('6/6-building the sns confusion matrix')
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt="d" prints the counts as integers; the default format switches to
# scientific notation for counts of 100 or more.
ax = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Reds")

# Axis labels
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

## 6-5 Stage 3 : LGSC and MC

### 6-5-1 Training

In [None]:
print('Model training on LGSC and MC data')
data = new_data_3

# Number of classes
nb_lab_3 = len(data['label'].unique())

# Class list: fit the encoder only (the encoded labels are not needed here).
le3 = LabelEncoder()
le3.fit(data['label'])
list_lab_3 = le3.classes_  # class names of this stage, in the encoder's order

print('1-Model creation (!new nb_lab!)')
# Model creation
# 408ms

# NOTE(review): '/gpu:1' is the second GPU; confirm the runtime exposes two.
with tf.device('/gpu:1'): 
    model = create_model_fct(nb_lab_3)
    # Callbacks: save the best model to disk and stop once val_accuracy has
    # not improved for 5 epochs.
    model_save_path = "model_best_weights_6.h5"
    checkpoint = ModelCheckpoint(model_save_path, monitor='val_accuracy', verbose=1, mode='max', save_best_only=True)
    # restore_best_weights=True: when early stopping triggers, the in-memory
    # model is rolled back to the best epoch, so the evaluation cells below
    # score the same weights that were checkpointed rather than the last epoch.
    es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=5,
                       restore_best_weights=True)
    callbacks_list = [checkpoint, es]

print('2-Data augmentation - train-val split')
# Augmented flows, split into training and validation subsets.
# NOTE(review): when the notebook is run top to bottom, `data_flow_fct` is the
# version redefined in the 6-4-3 prediction cell, which hard-codes
# shuffle=False, so training batches are not shuffled - confirm this is intended.
train_flow = data_flow_fct(data, datagen_train, data_type='training', batch_size=batch_size)
val_flow = data_flow_fct(data, datagen_train, data_type='validation', batch_size=batch_size)

print('3-Model training')
# Training
history = model.fit(train_flow, epochs=epochs, 
                    steps_per_epoch=len(train_flow),
                    callbacks=callbacks_list, 
                    validation_data=val_flow,
                    validation_steps=len(val_flow),
                    verbose=1)

### 6-5-2 Performances

In [None]:
%%time
# Performances
print('1/6-val accuracy/epochs')
# Training curves for the stage-3 fit, shown inline
# (presumably one panel per metric recorded in `history` - see plot_keras_history).
show_history(history)
# Same curves, given a file path this time ("history.png").
# NOTE(review): the file name carries no stage suffix, so any other cell using
# the same path would overwrite it - confirm.
plot_history(history, path="history.png")
# Close the current figure after the plot_history call.
plt.close()

print('2/6-predicting y_pred')
# Predict batch by batch and keep each batch's labels next to its predictions.
# Doing both in the same pass means y_pred and y_val always describe the same
# samples, and every validation image is loaded once instead of twice.
# `batch_size` is not passed to the model: the generator already batches.
# (The earlier predict-then-reiterate version was timed at ~1min 28 for batch_size = 32.)
y_pred_batches = []
y_val_batches = []
for i in range(len(val_flow)):
    batch_x_val, batch_y_val = val_flow[i]  # assumes the generator yields (X_val, y_val) pairs
    y_pred_batches.append(np.asarray(model.predict_on_batch(batch_x_val)))
    y_val_batches.append(np.asarray(batch_y_val))
y_pred = np.concatenate(y_pred_batches, axis=0)

print('3/6-getting y_val')
# Ground-truth labels as returned by the generator, one row per predicted
# sample (no zero padding for an incomplete last batch).
y_val = np.concatenate(y_val_batches, axis=0)
nombre_total_val = len(y_val)

print('4/6-building the basic confusion matrix')
# Class indices of the true and predicted labels. y_val is cut to the number
# of predictions in case it holds padding rows beyond the last real sample.
y_val_indices = y_val.argmax(axis=1)[0:len(y_pred)]
y_pred_indices = y_pred.argmax(axis=1)

# Classes that actually occur (true or predicted), in sorted order. Passing
# them explicitly ties the matrix rows/columns to the names used below.
list_num_labels = sorted(set(y_val_indices) | set(y_pred_indices))
list_cat_labels = le3.inverse_transform(list_num_labels)

# Build and print the confusion matrix
cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_num_labels)
print(cm)

# Print the classification report, with class names instead of bare indices
print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices,
                            labels=list_num_labels,
                            target_names=[str(name) for name in list_cat_labels]))

print('6/6-building the sns confusion matrix')
# Label the matrix with the categorical class names
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt="d" prints the counts as integers; the default format switches to
# scientific notation for counts of 100 or more.
ax = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues")

# Axis labels
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

### 6-5-3 Predictions

In [None]:
# Prediction
# Generator for inference: only the VGG16 preprocessing is applied,
# no augmentation and no train/validation split.
datagen_test = ImageDataGenerator(
    validation_split=None,
    preprocessing_function=preprocess_input)

# NOTE(review): this redefines `data_flow_fct`. Every cell executed after this
# one gets this shuffle=False version.
def data_flow_fct(data, datagen, data_type=None, batch_size=None):
    """Build an unshuffled image flow from a dataframe.

    Args:
        data: dataframe with an 'image_path' column (file to load) and a
            'label' column (class name).
        datagen: ImageDataGenerator used to read and preprocess the images.
        data_type: forwarded as `subset` ('training', 'validation' or None).
        batch_size: number of images per batch.

    Returns:
        The iterator returned by `flow_from_dataframe`: 256x256 images,
        class_mode='categorical', shuffle=False so that batches follow the
        generator's row order.
    """
    data_flow = datagen.flow_from_dataframe(data,
                                            x_col='image_path',
                                            y_col='label',
                                            weight_col=None,
                                            target_size=(256, 256),
                                            classes=None,
                                            class_mode='categorical',
                                            batch_size=batch_size,
                                            shuffle=False,  # keep row order so predictions line up with the dataframe
                                            seed=42,
                                            subset=data_type)
    return data_flow

# Test flow over the whole stage dataframe. The order is fixed (shuffle=False),
# so the batch size only affects speed; use the notebook-wide `batch_size`
# rather than one image per step.
test_flow = data_flow_fct(data, datagen_test, data_type=None, batch_size=batch_size)

# Class probabilities for every row
y_pred = model.predict(test_flow)

# Store the predicted class names on a copy, so the source dataframe that
# `data` refers to is left untouched and the cell can be re-run safely.
data_3 = data.copy()
data_3['prediction'] = le3.inverse_transform(y_pred.argmax(axis=1))
data_3.head()

In [None]:
print('4/6-building the basic confusion matrix')
# True and predicted class names for every row of data_3.
# (The *_indices names are kept for the cells below; the values are class names.)
y_val_indices = data_3['label']
y_pred_indices = data_3['prediction']

# Class names that occur as a true or predicted label, in sorted order.
# Passing them explicitly ties the matrix rows/columns to the axis names below.
list_cat_labels = sorted(set(y_val_indices) | set(y_pred_indices))

# Build and print the confusion matrix
cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_cat_labels)
print(cm)

# Print the classification report
print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices))

print('6/6-building the sns confusion matrix')
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt="d" prints the counts as integers; the default format switches to
# scientific notation for counts of 100 or more.
ax = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Reds")

# Axis labels
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

In [None]:
# Quick look at the true labels used for the confusion matrix in the cell above
# (y_val_indices was last set there to data_3['label']).
y_val_indices

## 6-6 Stacking predictions from model_1, model_2 and model_3

### 6-6-1 Data concatenation

In [None]:
# Stack the per-stage prediction frames on top of each other, in stage order.
stage_frames = [data_1, data_2, data_3]
data_stack = pd.concat(stage_frames)
print(data_stack.shape)

# Keep only the rows whose true label is a real class, i.e. not 'Other'.
is_real_class = data_stack['label'] != 'Other'
data_stack = data_stack.loc[is_real_class, :]
print(data_stack.shape)

In [None]:
# Preview the first rows of the stacked frame built in the previous cell.
data_stack.head()

### 6-6-2 Performances

In [None]:
# Label processing
data = data_stack

# Number of classes
nb_lab_4 = len(data['label'].unique())

# Class list: fit the encoder only (the encoded labels are not needed here).
le4 = LabelEncoder()
le4.fit(data['label'])
# Class names present in the stacked frame, in the encoder's order.
# Rows labelled 'Other' were filtered out when data_stack was built.
list_lab_4 = le4.classes_

In [None]:
print('4/6-building the basic confusion matrix')
# True and predicted class names for every row of the stacked frame.
# (The *_indices names are kept for the cells below; the values are class names.)
y_val_indices = data_stack['label']
y_pred_indices = data_stack['prediction']

# Class names that occur as a true or predicted label, in sorted order.
# Passing them explicitly ties the matrix rows/columns to the axis names below.
list_cat_labels = sorted(set(y_val_indices) | set(y_pred_indices))

# Build and print the confusion matrix
cm = confusion_matrix(y_val_indices, y_pred_indices, labels=list_cat_labels)
print(cm)

# Print the classification report
print("\n5/6-building the classification report")
print(classification_report(y_val_indices, y_pred_indices))

print('6/6-building the sns confusion matrix')
df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

plt.figure(figsize=(6, 4))
# fmt="d" prints the counts as integers; the default format switches to
# scientific notation for counts of 100 or more.
ax = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Reds")

# Axis labels
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

plt.show()

In [None]:
# Elapsed time since `chrono` was set (assumed to hold a time() start stamp -
# TODO confirm). It goes into its own name so that re-running this cell does
# not overwrite the start stamp and produce a meaningless duration.
processing_time = time() - chrono

# recording method, val_accuracy, processing_time
# Accuracy is computed straight from data_stack, so it does not depend on
# which cell last assigned y_val_indices / y_pred_indices.
results.loc[len(results)] = ['Model4-5-6 : Data augmentation + Balancing',
                             accuracy_score(data_stack['label'], data_stack['prediction']),
                             processing_time]
results