### Lors de cette compétition Kaggle il est demandé de réaliser une classification supervisée d'images d'histopathologie des cinq sous-types de cancer ovarien (high-grade serous carcinoma - HGSC, clear-cell ovarian carcinoma - CC, endometrioid - EC, low-grade serous - LGSC, and mucinous carcinoma - MC). 
### Ces images proviennent de plusieurs établissements. Elles sont de deux types : Whole Slide Image (WSI) ou Tissue MicroArray (TMA). Les WSI sont magnifiées 20x et les TMA 40x.

# 1 - Import

In [None]:
%pip install plot_keras_history

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from time import time
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, LabelEncoder
from sklearn.utils import class_weight, shuffle
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from tensorflow.keras.models import Model, Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import VGG16
from keras.models import Sequential
from keras.layers import Dense, Dropout, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import to_categorical

from plot_keras_history import show_history, plot_history

import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.optimizers import SGD

# For normalization
import cv2
from skimage.exposure import match_histograms

os.environ["TF_KERAS"]='1'
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# 2 - Data Collection

In [None]:
# Load the training metadata and resolve, for each image, the file to read:
# the thumbnail when one exists in train_thumbnails, otherwise the full-size
# picture from train_images.
thumbnails_dir = '/kaggle/input/UBC-OCEAN/train_thumbnails'
images_dir = '/kaggle/input/UBC-OCEAN/train_images'

df = pd.read_csv('/kaggle/input/UBC-OCEAN/train.csv')

# The thumbnail directory is listed once and kept in a set; the original listed
# the directory again for every row of the dataframe.
available_thumbnails = set(os.listdir(thumbnails_dir))
df['image_path'] = [
    f'{thumbnails_dir}/{x}_thumbnail.png'
    if f'{x}_thumbnail.png' in available_thumbnails
    else f'{images_dir}/{x}.png'
    for x in df['image_id']
]

In [None]:
df.head()

In [None]:
%%time
# Wall time 18s
# Visualisation de quelques images en vrac
# Quick look at the first nine images of the dataset on a 3x3 grid.
plt.figure(figsize=(10, 10))

# enumerate() provides the subplot slot. The original used the DataFrame index
# label, which is only correct while the index is the default 0..n-1 range.
for position, (_, row) in enumerate(df.head(9).iterrows(), start=1):
    plt.subplot(3, 3, position)
    plt.imshow(plt.imread(row['image_path']))
    plt.title(f"Label: {row['label']}")
    plt.axis('off')

plt.show()

In [None]:
%%time
# Wall time s
# Visualisation de quelques images TMA
# 3x3 grid with the first nine TMA images and their label.
plt.figure(figsize=(10, 10))

tma_preview = df.loc[df['is_tma']==True].reset_index().head(9)
for slot, (_, tma_row) in enumerate(tma_preview.iterrows()):
    plt.subplot(3, 3, slot + 1)
    picture = plt.imread(tma_row['image_path'])
    plt.imshow(picture)
    plt.title(f"Label: {tma_row['label']}")
    plt.axis('off')

plt.show()

In [None]:
# Add a "zoom" column giving the magnification of the raw picture in the database:
# raw TMA images are 40x, raw WSI images are 20x.
# One vectorised assignment replaces the placeholder 0 followed by two masked
# writes based on `== True` / `== False` comparisons.
df['zoom'] = np.where(df['is_tma'], 40, 20)
df

In [None]:
%%time
# Walltime 2min 9s
# What is the reduction rate between the raw image and the thumbnail?
# Put the width of the file actually read (thumb_width) next to the width of the
# raw picture (image_width).
# cv2.imread returns an array shaped (height, width, channels), so the width is
# shape[1]; the original read shape[0], which is the height.
# NOTE(review): every zoom value derived from thumb_width changes with this fix,
# including values copied by hand elsewhere in the notebook (e.g. 17.381955).
df['thumb_width'] = df['image_path'].apply(lambda x : cv2.imread(x).shape[1])
df = df[['image_id', 'label', 'image_width', 'thumb_width', 'image_height', 'is_tma',
       'image_path', 'zoom']]
df

In [None]:
# Multiply the raw magnification by thumb_width / image_width: this gives the
# effective zoom of the file referenced by image_path.
# NOTE(review): 'zoom' is overwritten in place, so running this cell a second time
# applies the ratio twice.
df['zoom'] = df['zoom']*df['thumb_width']/df['image_width']
df

In [None]:
df.info()

## 3 - Functions

In [None]:
# Resized reference images keyed by (path, size). The reference is read from disk
# and resized once, instead of on every call (the function is applied per image
# when used as a preprocessing function).
_REFERENCE_IMAGE_CACHE = {}


def _load_reference_image(ref_path, size=(256, 256)):
    """Return the image stored at ``ref_path`` resized to ``size`` (cached)."""
    key = (ref_path, size)
    if key not in _REFERENCE_IMAGE_CACHE:
        ref_img = cv2.imread(ref_path, cv2.IMREAD_COLOR)
        if ref_img is None:
            # cv2.imread reports an unreadable file by returning None
            raise FileNotFoundError(f"Reference image not found or unreadable: {ref_path}")
        _REFERENCE_IMAGE_CACHE[key] = cv2.resize(ref_img, size, interpolation=cv2.INTER_AREA)
    return _REFERENCE_IMAGE_CACHE[key]


# Normalization function (input : np.array image - output : np.array)
def normalization_processing_np_np(np_image):
    """Match the histogram of ``np_image`` to the one of a fixed reference image.

    Parameters
    ----------
    np_image : np.ndarray
        Image to normalise. It is resized to 256x256 before matching.

    Returns
    -------
    np.ndarray
        256x256 uint8 image whose histogram follows the reference image.

    Notes
    -----
    ``match_histograms`` is called without ``channel_axis``; presumably the values
    of all channels are then matched as a single pooled histogram, in which case
    the BGR order of the cv2-loaded reference does not matter -- TODO confirm
    against the installed scikit-image version.
    """
    # Reference picture used as the colour model
    ref_path = r"/kaggle/input/UBC-OCEAN/train_thumbnails/12522_thumbnail.png"
    ref_img = _load_reference_image(ref_path)

    # Both images must have the same size
    image = cv2.resize(np_image, (256, 256), interpolation=cv2.INTER_AREA)

    # Apply the normalisation
    aft_img = match_histograms(image, ref_img)

    # Saturate to uint8 (scale 1, offset 0); the channel order is left unchanged
    aft_img = cv2.convertScaleAbs(aft_img, alpha=1.0, beta=0.0)

    return aft_img

In [None]:
# Example: the resized reference thumbnail (left) next to its normalised version (right)
image_path = '/kaggle/input/UBC-OCEAN/train_thumbnails/12522_thumbnail.png'
fig, ax = plt.subplots(1, 2, figsize=(10, 10))
example_picture = plt.imread(image_path)
resized_picture = cv2.resize(example_picture, (256, 256), interpolation=cv2.INTER_AREA)
ax[0].imshow(resized_picture)
ax[1].imshow(normalization_processing_np_np(example_picture))

In [None]:
#NEW NEW
def tile_generator(image_path, tile_size, zoom):
    """Yield the tiles of an image, each rescaled to ``tile_size`` x ``tile_size``.

    The image is cut row by row, left to right, into squares whose side in source
    pixels is ``int(tile_size * zoom / 40)``, so that a tile covers the same
    amount of tissue as a ``tile_size`` square at 40x.

    Parameters
    ----------
    image_path : str
        Path of the image to cut.
    tile_size : int
        Side, in pixels, of the tiles that are yielded.
    zoom : float
        Magnification of the image stored at ``image_path``.

    Yields
    ------
    np.ndarray
        BGR tile of shape (tile_size, tile_size, 3). Tiles on the right and
        bottom borders cover a smaller area and are stretched by the resize.

    Raises
    ------
    ValueError
        If the tile side computed from ``tile_size`` and ``zoom`` is below 1 pixel.
    FileNotFoundError
        If the image cannot be read.
    """
    # Side of one tile in pixels of the source image
    int_tile_size = int(tile_size * zoom / 40)
    if int_tile_size < 1:
        # range() below would otherwise be called with a step of 0
        raise ValueError(
            f"tile_size={tile_size} and zoom={zoom} give a tile side of "
            f"{int_tile_size} pixel(s); it must be at least 1"
        )

    # Read the image (cv2.imread returns None when the file cannot be read)
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Image not found or unreadable: {image_path}")

    # Perform tiling
    for i in range(0, img.shape[0], int_tile_size):
        for j in range(0, img.shape[1], int_tile_size):
            tile = img[i:i+int_tile_size, j:j+int_tile_size]

            # Bring every tile to tile_size x tile_size
            yield cv2.resize(tile, (tile_size, tile_size))

def tile_cutter(image_path, tile_dir, tile_size, zoom):
    """Cut an image into tiles and save them as PNG files in ``tile_dir``.

    Tiles are named ``<image name>_<i>.png`` where ``i`` starts at 0 and follows
    the order in which ``tile_generator`` yields them.

    Parameters
    ----------
    image_path : str
        Path of the image to cut.
    tile_dir : str
        Output directory, created when missing.
    tile_size : int
        Side, in pixels, of the saved tiles.
    zoom : float
        Magnification of the image stored at ``image_path``.
    """
    # Create the tile directory if it doesn't exist
    os.makedirs(tile_dir, exist_ok=True)

    # Base name shared by every tile of this image (does not change in the loop)
    name = os.path.basename(image_path).replace('.png','')

    # Save each tile
    for i, tile in tqdm(enumerate(tile_generator(image_path, tile_size, zoom))):
        # Tiles are read by cv2 in BGR order while plt.imsave expects RGB.
        # The original passed cv2.COLOR_BGR2RGB to cv2.convertScaleAbs, which is
        # not a colour conversion, so the channels were saved swapped.
        # NOTE(review): tiles already written to disk must be regenerated.
        img_tile = cv2.cvtColor(tile, cv2.COLOR_BGR2RGB)
        plt.imsave(os.path.join(tile_dir, f'{name}_{i}.png'), img_tile)

In [None]:
# RGB values of the marker colours accepted by tile_marker
TILE_MARKER_COLORS = {
    'red': [255, 0, 0],
    'blue': [0, 0, 255],
    'white': [255, 255, 255],
    'green': [0, 255, 0],
    'yellow': [255, 255, 0],  # was [0, 255, 255], which is cyan in RGB
    'black': [0, 0, 0],
}


def tile_marker(image_path, color, margin=20):
    """Return the image at ``image_path`` with a coloured frame painted on it.

    Parameters
    ----------
    image_path : str
        Path of the image to mark.
    color : str
        One of the keys of ``TILE_MARKER_COLORS``.
    margin : int, optional
        Width of the frame in pixels (default 20, the original fixed value).

    Returns
    -------
    np.ndarray
        The marked image in RGB channel order.

    Raises
    ------
    KeyError
        If ``color`` is not a known colour name.
    FileNotFoundError
        If the image cannot be read.
    """
    # Look the colour up before any I/O so that an unknown name fails immediately.
    # The table is in RGB, the cv2 image in BGR: reverse the channel order.
    color_value = np.array(TILE_MARKER_COLORS[color][::-1])

    # cv2.imread is called with its default flag, which yields a 3-channel BGR
    # image; the original grayscale branch could therefore not be reached (and it
    # would have assigned the colour *name* to the pixels).
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Image not found or unreadable: {image_path}")

    # Paint the four sides of the frame (img[-0:] would select the whole image,
    # hence the guard on a zero margin)
    if margin > 0:
        img[:margin, :] = color_value
        img[-margin:, :] = color_value
        img[:, :margin] = color_value
        img[:, -margin:] = color_value

    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
# Example usage
plt.imshow(tile_marker('/kaggle/input/UBC-OCEAN/train_thumbnails/10077_thumbnail.png','red'))

In [None]:
# Take tiles and assemble them back into a reconstructed picture, taking the zoom
# into account. The output is the reconstructed thumbnail, written to output_path.
def tile_assembler(tile_dir, output_path, rows, columns, tile_width, zoom, tile_prefix=None):
    """Rebuild an image from the tiles stored in ``tile_dir``.

    Tiles are expected to be named ``<tile_prefix>_<index>.png`` with ``index``
    starting at 0 and increasing row by row, left to right (the naming used by
    ``tile_cutter``). Each tile is shrunk back to the size it had in the source
    image, ``int(tile_width * zoom / 40)`` pixels, and pasted at its position.

    Parameters
    ----------
    tile_dir : str
        Directory holding the tiles.
    output_path : str
        Path of the assembled image to write.
    rows, columns : int
        Number of tile rows and tile columns of the grid.
    tile_width : int
        Side, in pixels, of the stored tiles.
    zoom : float
        Magnification of the image the tiles were cut from.
    tile_prefix : str, optional
        Common prefix of the tile file names. When omitted it is inferred from
        the first PNG file of ``tile_dir`` (alphabetical order), which replaces
        the file name previously hard-coded in the function.

    Raises
    ------
    ValueError
        If the tile side computed from ``tile_width`` and ``zoom`` is below 1 pixel.
    FileNotFoundError
        If ``tile_prefix`` is omitted and ``tile_dir`` holds no PNG file.
    """
    int_tile_size = int(tile_width * zoom / 40)
    if int_tile_size < 1:
        raise ValueError(
            f"tile_width={tile_width} and zoom={zoom} give a tile side of "
            f"{int_tile_size} pixel(s); it must be at least 1"
        )
    img_assembled = np.zeros((rows * int_tile_size, columns * int_tile_size, 3), dtype=np.uint8)

    # List the directory once instead of once per tile
    tile_files = sorted(f for f in os.listdir(tile_dir) if f.endswith('.png'))
    if tile_prefix is None:
        if not tile_files:
            raise FileNotFoundError(f"No tile found in {tile_dir}")
        # '<prefix>_<index>.png' -> '<prefix>'
        tile_prefix = tile_files[0].rsplit('_', 1)[0]
    available_tiles = set(tile_files)

    for i in range(rows):
        for j in range(columns):
            # Tile numbering starts at 0; the original counter started at 1,
            # which dropped the first tile and shifted all the others by one slot.
            c = i * columns + j
            img_filename = f'{tile_prefix}_{c}.png'
            if img_filename not in available_tiles:
                continue
            img_path = os.path.join(tile_dir, img_filename)

            # Check the read before resizing: cv2.resize fails on None
            tile_img = cv2.imread(img_path)
            if tile_img is None or tile_img.size == 0:
                print(f"Error loading image: {img_path}")
                continue

            tile_img = cv2.resize(tile_img, (int_tile_size, int_tile_size))

            # Position of the tile in the assembled image
            row_start = i * int_tile_size
            col_start = j * int_tile_size

            # Paste the tile into the assembled image
            img_assembled[row_start:row_start + int_tile_size, col_start:col_start + int_tile_size] = tile_img

    # Save the assembled image
    cv2.imwrite(output_path, img_assembled)

### On va créer une classe supplémentaire pour les tiles qui ne représentent pas un type de cancer
### Pour cela on va découper une image en tiles et récupérer les tiles appropriés pour les stocker dans un dossier et les répertorier dans la base

In [None]:
# On choisit l'image is_tma false avec le zoom le plus grand
df.loc[df['zoom']!=40].sort_values(by='zoom', ascending=False)
#c'est l'image_id 63897 zoom = 17.381955

In [None]:
img_str = df.loc[df['image_id']==63897,'image_path'].iloc[0]
img = plt.imread(img_str)
plt.imshow(img)

In [None]:
%%time
# 1min 11s
# On la coupe en tiles
tile_cutter(df[df['image_id'] == 63897]['image_path'].iloc[0], '/kaggle/working/classe_sup', 256, 17.381955)

In [None]:
# Show a random sample of tiles in order to pick the ones to label 'other'
plt.figure(figsize=(10, 10))

tile_names = os.listdir("/kaggle/working/classe_sup")
# Sample without replacement so the same tile is never displayed twice
# (np.random.choice draws with replacement by default)
sampled_tiles = np.random.choice(tile_names, min(16, len(tile_names)), replace=False)

for i, img in enumerate(sampled_tiles):
    plt.subplot(4, 4, i + 1)
    plt.imshow(plt.imread("/kaggle/working/classe_sup/"+img))
    plt.title(f"Label: {img}",fontsize=6)
    plt.axis('off')

plt.show()

In [None]:
# Comparison with the HGSC TMA images
hgsc_tma_paths = df.loc[(df['is_tma']==True) & (df['label']=='HGSC'),'image_path']

# Size the grid from the number of images: the original fixed 3x2 grid fails as
# soon as more than six images match.
n_cols = 2
n_rows = max(1, math.ceil(len(hgsc_tma_paths) / n_cols))

plt.figure(figsize=(10, 10))
for i, img in enumerate(hgsc_tma_paths):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(plt.imread(img))
    plt.title(f"Label: {img}",fontsize=6)
    plt.axis('off')

plt.show()

In [None]:
# A ce point on garde une copie du df au cas où
df_save = df.copy()
df_save

In [None]:
# Add one row per hand-picked tile of the new 'other' class.
# All rows are built in one DataFrame and appended with a single concat; the
# original called pd.concat inside the loop, copying df on every iteration.
other_tile_numbers = [2325, 367, 948,2214,240,2102,2215,583,1707]
df_other = pd.DataFrame(
    {
        'image_id': 0,
        'label': 'other',
        'image_path': [
            "/kaggle/working/classe_sup/63897_thumbnail_"+str(number)+".png"
            for number in other_tile_numbers
        ],
        'is_tma': True,
        'zoom': 40,
    },
    index=[f'other_{number}' for number in other_tile_numbers],
)

df = pd.concat([df, df_other])
df.tail()

# 4 - Model preparation

### Pour classifier des images j'ai choisi d'utiliser le réseau de neurones VGG16 en mode transfer learning avec Data Augmentation.

In [None]:
# Step parameter definition: each dictionary maps a small integer key to one of
# the options that can be selected for a run.
# Number of training epochs
epochs_dic = {1:1, 2:15, 3:30}
# Batch size of the train/validation flows
batch_size_dic = {1:16, 2:32, 3:64}
# Optimizer identifiers given to model.compile
optimizer_dic = {1:'Adam', 2:'rmsprop'}
# Label names (or None) keyed for a normalization choice
normalization_dic = {1:None, 2:'CC', 3:'EC', 4:'HGSC', 5:'LGSC', 6:'MC', 7:'other'}
# Label sets: the five sub-types plus 'other', or a two-class variant
label_dic = {1:['CC', 'EC', 'HGSC', 'LGSC', 'MC', 'other'], 2:['HGSC', 'Others']}
# Preprocessing functions: VGG16 preprocess_input or the histogram-matching function
preprocessing_input_dic = {1:preprocess_input, 2:normalization_processing_np_np}

In [None]:
# Step class management (to be reviewed !)
# Encoder fitted on every label of the dataframe; its classes_ attribute is the
# class list
le = LabelEncoder()
le.fit(df['label'])
list_lab = le.classes_

# Number of distinct labels in the dataframe
nb_lab = len(df['label'].unique())

In [None]:
# Flow builder shared by the train, validation and test generators
def data_flow_fct(data, datagen, data_type=None, batch_size=None, shuffle=False, target_size=(256, 256)) :
    """Build an image flow from a dataframe with ``datagen.flow_from_dataframe``.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe with an 'image_path' column (file paths) and a 'label' column.
    datagen : ImageDataGenerator
        Generator the flow is created from.
    data_type : str or None
        Passed as ``subset`` ('training', 'validation' or None).
    batch_size : int or None
        Batch size of the flow, passed through unchanged.
    shuffle : bool, optional
        Whether the flow shuffles the rows. Defaults to False (the original
        fixed value) because the performance helpers of this notebook pair
        predictions with labels by position; pass True for a flow only used
        for training.
    target_size : tuple of int, optional
        Size the images are resized to, (256, 256) by default (the original
        fixed value).

    Returns
    -------
    The iterator returned by ``flow_from_dataframe`` (one-hot labels,
    ``class_mode='categorical'``).
    """
    data_flow = datagen.flow_from_dataframe(data,
                                            x_col='image_path',  # column holding the image paths
                                            y_col='label',
                                            weight_col=None,
                                            target_size=target_size,
                                            classes=None,
                                            class_mode='categorical',
                                            batch_size=batch_size,
                                            shuffle=shuffle,
                                            seed=42,
                                            subset=data_type)
    return data_flow

In [None]:
# Augmenting generator used for the train/validation flows
def datagen_trainer(preprocessing_input):
    """Return an ``ImageDataGenerator`` with augmentation and a 25% validation split.

    ``preprocessing_input`` is the function applied to each image by the generator.
    """
    augmentation_options = dict(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        horizontal_flip=True,
        validation_split=0.25,  # training/validation ratio
        preprocessing_function=preprocessing_input,
    )
    return ImageDataGenerator(**augmentation_options)

def datagen_tester(preprocessing_input):
    """Return an ``ImageDataGenerator`` without augmentation nor validation split.

    ``preprocessing_input`` is the function applied to each image. The original
    ignored this argument and always used VGG16 ``preprocess_input``, so a model
    trained with another preprocessing function was tested on differently
    prepared images.
    """
    datagen_test = ImageDataGenerator(
        validation_split=0,
        preprocessing_function=preprocessing_input)
    return datagen_test

In [None]:
# Model creation function
def create_model_fct(nb_lab, optimizer=None) :
    """Build and compile a VGG16 transfer-learning classifier.

    The VGG16 convolutional base (ImageNet weights, 256x256x3 input) is frozen and
    followed by global average pooling, a 256-unit ReLU layer, a 0.5 dropout and a
    softmax layer with ``nb_lab`` outputs.

    Parameters
    ----------
    nb_lab : int
        Number of output classes.
    optimizer : str or optimizer instance, optional
        Optimizer given to ``model.compile``. When omitted, ``optimizer_dic[1]``
        is used, as the original always did.
        NOTE(review): callers that record an optimizer choice in their results
        must pass it here, otherwise the model is still compiled with
        ``optimizer_dic[1]``.

    Returns
    -------
    The compiled model.
    """
    if optimizer is None:
        optimizer = optimizer_dic[1]

    #weights_path = "/kaggle/input/vgg16-weights/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5" # WARNING: enable when offline
    weights_path = 'imagenet'
    # Load the pre-trained VGG16 model, without its classification head
    model0 = VGG16(include_top=False, weights=weights_path, input_shape=(256, 256, 3))

    # Freeze the layers: the pre-trained weights are kept as they are
    for layer in model0.layers:
        layer.trainable = False

    # Classification head on top of the VGG16 output
    x = model0.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(nb_lab, activation='softmax')(x)

    # Define the new model
    model = Model(inputs=model0.input, outputs=predictions)

    # Compile the model
    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

    # summary() prints by itself and returns None: wrapping it in print() added
    # a stray "None" line
    model.summary()

    return model

In [None]:
# Step flow definition
def define_flow(preprocessing_flow, batch_size_flow):
    """Return the (train, validation, test) flows built from the global ``data``.

    The train and validation flows each come from a fresh ``datagen_trainer``
    generator and use ``batch_size_flow``; the test flow comes from
    ``datagen_tester``, covers all the rows of ``data`` and uses a batch size of 1.

    NOTE(review): the validation flow is built with the augmenting generator, and
    the test flow contains the rows also used for training.
    """
    flows = {}
    for subset in ('training', 'validation'):
        flows[subset] = data_flow_fct(
            data,
            datagen_trainer(preprocessing_flow),
            data_type=subset,
            batch_size=batch_size_flow,
        )
    test_flow = data_flow_fct(data, datagen_tester(preprocessing_flow), data_type=None, batch_size=1)
    return flows['training'], flows['validation'], test_flow

In [None]:
# Step model creation and training
def model_creation_training(step,train_flow, val_flow,epochs_entry):
    """Create the model, train it and return ``(model, history)``.

    Parameters
    ----------
    step : str
        Name of the run, used in the checkpoint file name.
    train_flow, val_flow
        Training and validation flows.
    epochs_entry : int
        Maximum number of epochs.

    Notes
    -----
    The best weights (highest val_accuracy) are written to
    ``model_best_weights_<step>.h5``. NOTE(review): the returned model is not
    reloaded from that file.
    """
    # 4min35 for epochs = 1 and batch_size = 32
    # Pick a device that exists: the second GPU when there are several (the
    # original hard-coded '/gpu:1'), else the only GPU, else the CPU.
    gpus = tf.config.list_logical_devices('GPU')
    if len(gpus) > 1:
        device_name = '/gpu:1'
    elif gpus:
        device_name = '/gpu:0'
    else:
        device_name = '/cpu:0'

    # Model creation and training
    with tf.device(device_name):

        # Model creation
        print('1/3-Model creation')
        model = create_model_fct(nb_lab)

        # Callbacks: keep the best weights on disk, stop after 10 epochs without
        # improvement of val_accuracy
        print('2/3-Callbacks')
        model_save_path = f"model_best_weights_{step}.h5"
        checkpoint = ModelCheckpoint(model_save_path, monitor='val_accuracy', verbose=1, mode='max', save_best_only=True)
        es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=10)
        callbacks_list = [checkpoint, es]

        # Training
        print('3/3-Training')
        history = model.fit(train_flow, epochs=epochs_entry,
                            steps_per_epoch=len(train_flow),
                            callbacks=callbacks_list,
                            validation_data=val_flow,
                            validation_steps=len(val_flow),
                            verbose=1)
    return model, history

In [None]:
def collect_flow_labels(flow):
    """Return the label batches of ``flow`` stacked in iteration order.

    ``flow`` only needs to support ``len()`` and integer indexing returning
    ``(x, y)`` pairs. Batches are concatenated as they come, so the size of the
    last (possibly smaller) batch does not matter.
    """
    label_batches = [flow[batch_index][1] for batch_index in range(len(flow))]
    if not label_batches:
        return np.zeros((0, 1))
    return np.concatenate(label_batches, axis=0)


# Step performance train_val
def performance_train_val(step,history, model, val_flow, batch_size_entry):
    """Report the validation performance of a trained model.

    Plots and saves the training history, predicts the validation flow, then
    prints the confusion matrix and the classification report and draws the
    confusion matrix as a heatmap.

    Parameters
    ----------
    step : str
        Name of the run, used in the history file name.
    history
        Object returned by ``model.fit``.
    model
        Trained model.
    val_flow
        Validation flow; it must not be shuffled, since labels and predictions
        are paired by position.
    batch_size_entry : int
        Kept for compatibility with existing callers; the labels are now stacked
        batch by batch, so the batch size is no longer needed.
    """
    # Performances
    print('1/6-val accuracy/epochs')
    show_history(history)
    plot_history(history, path=f"history_{step}.png")
    plt.close()

    print('2/6-predicting y_pred')
    #1min 28 for batch_size = 32
    # The flow already yields batches, so no batch_size is given to predict
    y_pred = model.predict(val_flow, steps=len(val_flow))

    print('3/6-getting y_val')
    # The original wrote each batch at offset i * batch_size_dic[2] into a buffer
    # sized with batch_size_entry, which misplaces the labels whenever the two
    # batch sizes differ.
    y_val = collect_flow_labels(val_flow)

    print('4/6-building the basic confusion matrix')
    # Indices of the true and predicted classes
    y_val_indices = y_val.argmax(axis=1)[0:len(y_pred)]
    y_pred_indices = y_pred.argmax(axis=1)

    # Build and print the confusion matrix
    cm = confusion_matrix(y_val_indices, y_pred_indices)
    print(cm)

    # Print the classification report
    print("\n5/6-building the classification report")
    print(classification_report(y_val_indices, y_pred_indices))

    print('6/6-building the sns confusion matrix')
    # Finding the matching categorical labels for the numerical labels
    list_num_labels = sorted(set(y_val_indices) | set(y_pred_indices))
    list_cat_labels = le.inverse_transform(list_num_labels)

    # Proceding with sns
    df_cm = pd.DataFrame(cm, index=list_cat_labels, columns=list_cat_labels)

    plt.figure(figsize=(6, 4))
    ax = sns.heatmap(df_cm, annot=True, cmap="Blues")

    # Axis labels
    ax.set_xlabel("Prediction")
    ax.set_ylabel("True")

    plt.show()

In [None]:
# Step performance test
# Performance
def performance_test(step, model, test_flow):
    """Evaluate ``model`` on ``test_flow`` against the labels of the global ``data``.

    Prints the confusion matrix and the classification report, saves the
    confusion-matrix heatmap to ``confusion_matrix_<step>.png`` and shows it.

    Parameters
    ----------
    step : str
        Name of the run, used in the image file name.
    model
        Trained model.
    test_flow
        Unshuffled flow built from ``data``: predictions are paired with
        ``data['label']`` by position.

    Returns
    -------
    tuple
        ``(y_val_cat, y_pred_cat, y_pred)``: true labels, predicted labels and
        raw prediction scores.
    """
    print('getting y_pred')
    # Testing on whole dataset; the flow already yields batches, so no batch_size
    # is given to predict
    y_pred = model.predict(test_flow, steps=len(test_flow))

    print('4/6-building the basic confusion matrix')
    # Predicted labels, decoded with the global label encoder
    y_pred_indices = y_pred.argmax(axis=1)
    y_pred_cat = le.inverse_transform(y_pred_indices)

    # True labels, taken by position
    y_val_cat = data['label'].iloc[0:len(y_pred_cat)]

    # The matrix is built on the encoder classes so that its size always matches
    # the labels of the heatmap; the original labelled it with the hard-coded
    # label_dic[1], which fails when a class is absent from the data.
    class_names = list(le.classes_)
    cm = confusion_matrix(y_val_cat, y_pred_cat, labels=class_names)

    # Print the confusion matrix
    print(cm)

    # Print the classification report
    print("\n5/6-building the classification report")
    print(classification_report(y_val_cat, y_pred_cat))

    print('6/6-building the sns confusion matrix')

    # Proceding with sns
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

    plt.figure(figsize=(6, 4))
    ax = sns.heatmap(df_cm, annot=True, cmap="Blues")

    # Axis labels
    ax.set_xlabel("Prediction")
    ax.set_ylabel("True")

    # Save the picture to a file
    plt.savefig(f'confusion_matrix_{step}.png')

    plt.show()
    return y_val_cat, y_pred_cat, y_pred

# 5 - Etape 1 : Model training

### -Model_0 : Entraînement et test du modèle VGG uniquement sur les données is_tma

In [None]:
%%time
# 2min 30s
# Pipeline for step_0: train and evaluate the model on the rows flagged is_tma only.
# Step parameter definition (same dictionaries as in the model-preparation section)
epochs_dic = {1:1, 2:15, 3:30}
batch_size_dic = {1:16, 2:32, 3:64}
optimizer_dic = {1:'Adam', 2:'rmsprop'}
normalization_dic = {1:None, 2:'CC', 3:'EC', 4:'HGSC', 5:'LGSC', 6:'MC', 7:'other'}
label_dic = {1:['CC', 'EC', 'HGSC', 'LGSC', 'MC', 'other'], 2:['HGSC', 'Others']}
preprocessing_input_dic = {1:preprocess_input, 2:normalization_processing_np_np}
step = 'step_0'
# `data` is the global dataframe read by define_flow and performance_test
data = df.loc[df['is_tma']==True]

# One result row per (optimizer, preprocessing) combination
results = pd.DataFrame(columns=['optimizer','method','test_accuracy', 'processing_time'])
chrono = time()
for i in range(2,3):# only key 2 of optimizer_dic ('rmsprop')
    for j in range(1,2):# only key 1 of preprocessing_input_dic (preprocess_input)
        epochs_entry = epochs_dic[2]#15
        batch_size_entry = batch_size_dic[1]#16
        # NOTE(review): optimizer_entry is only written to `results`;
        # create_model_fct compiles the model with optimizer_dic[1].
        optimizer_entry = optimizer_dic[i]
        preprocessing_entry = preprocessing_input_dic[j]

        print('Step 1 : class management (to be reviewed !)')
        # Step class management (to be reviewed !)
        # Number of classes, read as a global by model_creation_training
        nb_lab = len(data['label'].unique())

        # Class list; `le` is read as a global by the performance functions
        le = LabelEncoder()
        le.fit_transform(data['label'])
        list_lab = le.classes_

        print('Step 2 : flow definition')
        # Step flow definition
        train_flow, val_flow, test_flow = define_flow(preprocessing_entry, batch_size_entry)

        print('Step 3 : model creation and training')
        # Step model creation and training
        model_0, history_0 = model_creation_training(step,train_flow, val_flow, epochs_entry)

        print('Step 4 : performance train_val')
        # Step performance train_val
        performance_train_val(step,history_0, model_0, val_flow, batch_size_entry)

        #print('Step 5 : préparation des images')
        # Step image preparation (disabled: the images are not processed)
        #images_np = image_prep_fct(data)
        #print(images_np.shape)

        print('Step 6 : performance test')
        # Step performance test (test_flow covers every row of `data`)
        #data=df.loc[df['is_tma']==False,:]
        #test_flow = data_flow_fct(df.loc[df['is_tma']==False,:], datagen_tester(preprocessing_entry), data_type=None, batch_size=1)
        y_val_cat, y_pred_cat, y_pred = performance_test(step, model_0, test_flow)

        print('Step 7 : recording performance')
        # Elapsed time of this combination, then restart the timer
        chrono = time() - chrono
        results.loc[len(results)] = [optimizer_entry, preprocessing_entry.__name__, accuracy_score(y_val_cat, y_pred_cat), chrono]
        chrono = time()
results

In [None]:
print(results)
plt.imshow(plt.imread('/kaggle/working/confusion_matrix_step_0.png'))
plt.axis('off')

# 6 - Prediction sur une image

In [None]:
# on prend une image au hasard. Par exemple la 32035 qui est labellisée EC
df[df['image_id']==32035]

In [None]:
%%time
# 23s - 3min 7s
# on la découpe en tiles nommés 32035_thumbnails_{i}.png stockés dans un répertoire /kaggle/working/32035
tile_cutter(image_path='/kaggle/input/UBC-OCEAN/train_thumbnails/32035_thumbnail.png', tile_dir='/kaggle/working/example_32035', tile_size=256, zoom=10)

In [None]:
# Build a dataframe with one row per tile: the tile path in 'image_path' and a
# placeholder label
tile_names_32035 = os.listdir('/kaggle/working/example_32035')
df_32035 = pd.DataFrame()
df_32035['image_path'] = ['/kaggle/working/example_32035/'+x for x in tile_names_32035]
df_32035['label'] = 'a'

In [None]:
# On définit le X_test
X_test = df_32035

In [None]:
%%time
# 7ms
# Transformer X_test en test_flow
test_flow = data_flow_fct(X_test, datagen_tester(preprocessing_entry), data_type=None, batch_size=1)

In [None]:
%%time
# 18s
# On prédit le test_flow => np array à 6 col (nb de labels)
y_pred = model_0.predict(test_flow)
y_pred[0:10]

In [None]:
# On convertit le y_pred en une liste des labels catégoriels
y_pred_ = np.argmax(y_pred, axis=1)
y_pred_ = le.inverse_transform(y_pred_)
y_pred_[0:10]

In [None]:
# On ajoute cette liste en colonne aux data d'entrée
data = df_32035
data.loc[:,'pred'] = y_pred_

In [None]:
# To visualise the result, first paint on each tile a frame whose colour depends
# on the predicted class; the marked tile overwrites the tile file.
# NOTE(review): tile_marker returns an RGB array while cv2.imwrite interprets
# arrays as BGR -- confirm the channel order expected downstream.
marker_color_by_pred = {
    'CC': 'red',
    'EC': 'blue',
    'HGSC': 'yellow',
    'LGSC': 'green',
    'MC': 'white',
    'other': 'black',
}

for img_path, pred in zip(data['image_path'], data['pred']):
    marker_color = marker_color_by_pred.get(pred)
    if marker_color is None:
        # Unknown prediction: the tile is left untouched
        continue
    img = tile_marker(img_path, marker_color)
    cv2.imwrite(img_path, img)

In [None]:
%%time
#2s
# Then assemble the marked tiles into one picture
tile_width = 256
zoom = 10
# NOTE(review): the shape of an array returned by cv2.imread is
# (rows, columns, channels), so `image_width` receives the number of rows and
# `image_height` the number of columns. The names are swapped but used
# consistently below (rows from the first value, columns from the second).
image_width, image_height, color_channels = (cv2.imread('/kaggle/input/UBC-OCEAN/train_thumbnails/32035_thumbnail.png')).shape
# Number of tiles along each axis, using the same tile side as the cutting step
rows = math.ceil(image_width / int(tile_width * zoom / 40))
columns = math.ceil(image_height / int(tile_width * zoom / 40))
tile_dir = '/kaggle/working/example_32035'
output_path = '/kaggle/working/example_img_assembled_32035.png'
tile_assembler(tile_dir, output_path, rows, columns, tile_width, zoom)

In [None]:
plt.figure(figsize=(20, 20))
plt.imshow(cv2.imread('/kaggle/working/example_img_assembled_32035.png'))

In [None]:
# On comptabilise le nombre de tiles par sous-type de cancer
data['pred'].value_counts()

In [None]:
# The image label is the most frequent tile prediction that is not 'other', i.e.
# the second most frequent one only when 'other' comes first.
# The original always took the second entry, which picks the wrong class when
# 'other' is not the most frequent and fails when a single class was predicted.
tile_counts = data['pred'].value_counts()
cancer_labels = [lab for lab in tile_counts.index if lab != 'other']
label = cancer_labels[0] if cancer_labels else 'other'
label

In [None]:
# On ajoute une colonne pred au df de départ
df['pred'] = 0
# Et on attribue la DEUXIEME valeur majoritaire à l'image dans le df initial
df.loc[df['image_id']==32035, 'pred'] = label

In [None]:
df.loc[df['image_id'] == 32035]

# 7 - Généralisation

In [None]:
%%time
# 37min 31s
# (timing noted for tile_size=512, zoom=38)
# Cut every image of df into tiles. The tiles are stored in a directory named
# after the image_id.
# NOTE(review): the per-row `zoom` read from df is not used; tile_cutter is called
# with a fixed zoom=20. Rows sharing the same image_id write into the same
# directory.
for img_path, zoom, img_id in zip(df['image_path'], df['zoom'], df['image_id']):
    tile_dir = '/kaggle/working/'+ str(img_id)
    tile_cutter(image_path=img_path, tile_dir=tile_dir, tile_size=512, zoom=20)

In [None]:
%%time
# For each bag of tiles, predict every tile and store the image-level label in
# the 'pred' column of df.
# The loop runs over the distinct image ids and filters df on the id itself: the
# original compared the integer image_id column with the directory basename
# (a string), so no row was ever updated.
for img_id in df['image_id'].unique():
    tile_dir = '/kaggle/working/'+ str(img_id)
    if not os.path.isdir(tile_dir) or not os.listdir(tile_dir):
        print(f'No tiles found in {tile_dir}, image skipped')
        continue
    print('1 - On va créer un df qui prend le chemin de chaque tile sur la colonne image_path')
    df_ = pd.DataFrame()
    df_['image_path'] = os.listdir(tile_dir)
    df_['image_path'] = list(map(lambda x : tile_dir + '/'+ x, df_['image_path']))
    df_['label'] = 'a'
    print('2 - Ensuite on va créer le test_flow')
    test_flow = data_flow_fct(df_, datagen_tester(preprocessing_entry), data_type=None, batch_size=1)
    print('3 - On prédit le test_flow')
    y_pred = model_0.predict(test_flow)
    print('4 - On convertit le np.array y_pred en un liste des labels catgoriels')
    y_pred= le.inverse_transform(np.argmax(y_pred, axis=1))
    print('5 - On met ce y_pred dans la colonne label de df_')
    df_['label'] = y_pred
    print('6 - On enregistre la prédiction majoritaire dans une variable label')
    # Most frequent prediction that is not 'other' ('other' when nothing else
    # was predicted); the original always took the second most frequent entry.
    tile_counts = df_['label'].value_counts()
    cancer_labels = [lab for lab in tile_counts.index if lab != 'other']
    label = cancer_labels[0] if cancer_labels else 'other'
    print('7 - On attribue cette label à l\'image_id dans la colonne pred du df de départ')
    df.loc[df['image_id']==img_id, 'pred'] = label

# 8 - Performance

In [None]:
step = '8_performance'

# Keep only the images that received a predicted label: 'pred' is initialised
# with the number 0, and scikit-learn rejects a mix of numbers and strings.
scored = df[df['pred'].isin(label_dic[1])]
y_val_cat = scored['label']
y_pred_cat = scored['pred']

# Build the confusion matrix on the fixed label list so that it always matches
# the labels of the heatmap, even when a class is absent
cm = confusion_matrix(y_val_cat, y_pred_cat, labels=label_dic[1])

# Print the confusion matrix
print(cm)

# Print the classification report
print("\n5/6-building the classification report")
print(classification_report(y_val_cat, y_pred_cat, labels=label_dic[1], zero_division=0))

print('6/6-building the sns confusion matrix')

# Proceding with sns
df_cm = pd.DataFrame(cm, index=label_dic[1], columns=label_dic[1])

plt.figure(figsize=(6, 4))
ax = sns.heatmap(df_cm, annot=True, cmap="Blues")

# Axis labels
ax.set_xlabel("Prediction")
ax.set_ylabel("True")

# Save the picture to a file
plt.savefig(f'confusion_matrix_{step}.png')

In [None]:
# Display the saved confusion matrix. plt.show() does not take an image: the
# picture has to be drawn with plt.imshow() first.
plt.imshow(plt.imread('/kaggle/working/confusion_matrix_8_performance.png'))
plt.axis('off')
plt.show()