# <font color='white'>**Libraries**</font>

In [1]:
import os
# GPU / TensorFlow environment switches. They are written to the process
# environment here, in the cell that runs before TensorFlow is imported.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    "TF_ENABLE_ONEDNN_OPTS": "0",
})

In [2]:
import tensorflow as tf
import sklearn
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, ConfusionMatrixDisplay 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Activation, Flatten, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import regularizers
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.preprocessing import image
from tensorflow import keras
import imageio
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
AUTOTUNE = tf.data.AUTOTUNE
from tqdm import tqdm
from numpy import asarray
import random
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from tensorflow import keras

In [4]:
# for keras
from classification_models.keras import Classifiers

# <font color='red'>**Helper functions**</font>
Prerequisite: the "splitsCsv.ipynb" notebook must have been executed before running this notebook.

# <font color='red'>**Data**</font>

### Loading csv files and dataframes by fold

In [None]:
def load_folds(fold):
    """
    Builds the train, validation and test DataFrames for one fold directory.

    Parameters:
    fold (str): Name of the sub-directory of '../../../data/binary/' that contains the files
                'train_NBI.csv' and 'test_NBI.csv'. It is inserted verbatim into both paths.

    Returns:
    train_df (pandas.DataFrame): 90% of the rows of 'train_NBI.csv', with columns 'path' and 'label'.
    val_df (pandas.DataFrame): The remaining 10% of 'train_NBI.csv', with columns 'path' and 'label'.
    test_df (pandas.DataFrame): Every row of 'test_NBI.csv', with columns 'path' and 'label'.

    Notes:
    - Both CSV files are read with pandas' default header handling, i.e. their first row is consumed as the
      header and the two columns are then renamed to 'path' and 'label'.
    - The train/validation split is random (test_size=0.1, random_state=14) and is not stratified by label.
    - train_df and val_df keep the row index of the original training file.
    """
    column_names = ['path', 'label']
    fold_dir = '../../../data/binary/' + fold

    def read_split(file_name):
        # Read one CSV of the fold and normalise its column names
        frame = pd.read_csv(fold_dir + '/' + file_name)
        frame.columns = column_names
        return frame

    full_train = read_split('train_NBI.csv')

    # One shuffled 90/10 split shared by the paths and their labels
    paths_train, paths_val, labels_train, labels_val = train_test_split(
        full_train['path'], full_train['label'], test_size=0.1, random_state=14)

    # Re-attach each subset of paths to its labels
    train_df = pd.concat([paths_train, labels_train], axis=1)
    val_df = pd.concat([paths_val, labels_val], axis=1)

    # The held-out test split is used as-is
    test_df = read_split('test_NBI.csv')

    return train_df, val_df, test_df


In [None]:
def load_dataframes():
    """
    Reads the pre-computed training, validation and test splits from header-less CSV files.

    Returns:
    train_df (pandas.DataFrame): Rows of 'trainNBI.csv', with columns 'path' and 'label'.
    val_df (pandas.DataFrame): Rows of 'valNBI.csv', with columns 'path' and 'label'.
    test_df (pandas.DataFrame): Rows of 'testNBI.csv', with columns 'path' and 'label'.

    Notes:
    - The three files are looked up in '../data/csv_files/adeVshyp/NBI/', relative to the current
      working directory.
    - Each file is read with header=None, so its first line is treated as data, and the two columns are
      then named 'path' and 'label'.
    """
    csv_dir = '../data/csv_files/adeVshyp/NBI/'

    frames = []
    # Same loading recipe for every split, in train / validation / test order
    for file_name in ('trainNBI.csv', 'valNBI.csv', 'testNBI.csv'):
        frame = pd.read_csv(csv_dir + file_name, header=None)
        frame.columns = ['path', 'label']
        frames.append(frame)

    train_df, val_df, test_df = frames
    return train_df, val_df, test_df


# <font color='red'>**Networks**</font>

In [None]:
def transfer_learning(arquitectura, base_model, IMG_HEIGHT, IMG_WIDTH, num_clases, activacion_final='softmax'):
    """
    Creates a new transfer learning model by adding a classification head on top of the given base model.

    Parameters:
    arquitectura (str): Selects the head that is stacked on the base model: 'ResNet18', 'Vgg16', or any other
                        value for the generic head.
    base_model: The pre-trained base model to which the custom layers will be added.
    IMG_HEIGHT (int): The height of the input images.
    IMG_WIDTH (int): The width of the input images.
    num_clases (int): The number of units of the output layer.
    activacion_final (str): Activation of the output layer. Defaults to 'softmax', which was the value
                            previously hard-coded in every head.

    Returns:
    model (tensorflow.keras.models.Model): The base model wrapped with an input/rescaling stem and the
                                           selected classification head.

    Notes:
    - The first 10 layers of base_model are frozen; every later layer is left trainable. base_model is
      modified in place.
    - Heads:
        'ResNet18' -> global average pooling + output layer.
        'Vgg16'    -> flatten + two 4096-unit ReLU layers ('fc1', 'fc2') + output layer ('predictions').
        otherwise  -> global average pooling + 1024-unit ReLU layer + dropout(0.5) + output layer.
    - NOTE(review): inputs are rescaled to [-1, 1] for every backbone. Confirm that this matches the
      preprocessing each pre-trained backbone expects (some Keras applications rescale internally).
    """
    print("making transfer learning...")

    # Freeze the first 10 layers of the backbone and make sure the rest are trainable
    for index, layer in enumerate(base_model.layers):
        layer.trainable = index >= 10

    layers = tf.keras.layers

    # Stem shared by all heads: image input -> rescale pixel values from [0, 255] to [-1, 1] -> backbone
    inputs = layers.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))
    x = layers.Rescaling(scale=1 / 127.5, offset=-1)(inputs)
    x = base_model(x)

    # Architecture-specific classification head
    if arquitectura == 'ResNet18':
        x = layers.GlobalAveragePooling2D()(x)
        predictions = layers.Dense(num_clases, activation=activacion_final)(x)
    elif arquitectura == 'Vgg16':
        x = layers.Flatten(name='flatten')(x)
        x = layers.Dense(4096, activation='relu', name='fc1')(x)
        x = layers.Dense(4096, activation='relu', name='fc2')(x)
        predictions = layers.Dense(num_clases, activation=activacion_final, name='predictions')(x)
    else:
        x = layers.GlobalAveragePooling2D()(x)
        x = layers.Dense(1024, activation='relu')(x)
        x = layers.Dropout(0.5)(x)
        predictions = layers.Dense(num_clases, activation=activacion_final)(x)

    return tf.keras.models.Model(inputs=inputs, outputs=predictions)


In [None]:
def make_model(arquitectura, HEIGHT, WIDTH):
    """
    Creates a pre-trained base model (without its classification top) for transfer learning.

    Parameters:
    arquitectura (str): The architecture of the base model. One of 'MobileNet', 'Vgg16' or 'EfficientNetV2B0'.
    HEIGHT (int): The height of the input images.
    WIDTH (int): The width of the input images.

    Returns:
    base_model (tensorflow.keras.models.Model): The requested tf.keras.applications model built with
                                                weights='imagenet', include_top=False and
                                                input_shape=(HEIGHT, WIDTH, 3).

    Raises:
    ValueError: If arquitectura is not one of the supported names. Previously this case fell through every
                branch and failed with an UnboundLocalError on the return statement.
    """
    # Supported architecture name -> name of the constructor inside tf.keras.applications
    constructores = {
        'MobileNet': 'MobileNet',
        'Vgg16': 'VGG16',
        'EfficientNetV2B0': 'EfficientNetV2B0',
    }

    # Fail early, before any weights are loaded, with a message listing the valid choices
    if arquitectura not in constructores:
        raise ValueError("Unsupported architecture {!r}; expected one of {}".format(
            arquitectura, sorted(constructores)))

    print("loading : ", arquitectura)

    constructor = getattr(tf.keras.applications, constructores[arquitectura])
    base_model = constructor(weights='imagenet', include_top=False,
                             input_shape=(HEIGHT, WIDTH, 3))

    return base_model


## Data generator

In [None]:
def make_generator(df_train, df_val, HEIGHT, WIDTH, tipo, batch_size):
    """
    Builds the Keras DataFrame iterators that feed training and validation images to the model.

    Parameters:
    df_train (pandas.DataFrame): Training data with the columns 'path' and 'label'.
    df_val (pandas.DataFrame): Validation data with the columns 'path' and 'label'.
    HEIGHT (int): Height the images are resized to.
    WIDTH (int): Width the images are resized to.
    tipo (str): Passed as class_mode to flow_from_dataframe (e.g. 'binary' or 'categorical').
    batch_size (int): Number of images per batch.

    Returns:
    train_generator: Iterator over df_train.
    valid_generator: Iterator over df_val.

    Notes:
    - Both iterators come from a default ImageDataGenerator(), i.e. no augmentation or rescaling is
      configured here.
    - Image locations are taken directly from the 'path' column (directory=None).
    - Both iterators shuffle their samples (shuffle=True) using seed=42.
    - A later cell of this notebook defines another function named make_generator with a different
      signature; once that cell has run, this definition is shadowed.
    """
    # Everything except the source DataFrame is identical for the two iterators
    shared_options = dict(directory=None,
                          x_col='path',
                          y_col='label',
                          target_size=(HEIGHT, WIDTH),
                          class_mode=tipo,
                          batch_size=batch_size,
                          seed=42,
                          shuffle=True)

    # Each split gets its own (un-augmented) ImageDataGenerator
    train_generator = ImageDataGenerator().flow_from_dataframe(dataframe=df_train, **shared_options)
    valid_generator = ImageDataGenerator().flow_from_dataframe(dataframe=df_val, **shared_options)

    return train_generator, valid_generator


## Train

In [10]:
#this cell is for wide training using different nets
def train(arquitectura, HEIGHT, WIDTH, df_train, df_val, lr, train_epochs, batch_size, loss,
          tipo, clases, augmenting_factor, activacion_final,
          save_path="../path/to_save/your_model.h5"):
    """
    Builds, compiles and fits a transfer-learning classifier for the given architecture.

    Parameters:
    arquitectura (str): Architecture name forwarded to make_model and transfer_learning.
    HEIGHT (int): Height the images are resized to.
    WIDTH (int): Width the images are resized to.
    df_train (pandas.DataFrame): Training data with the columns 'path' and 'label'.
    df_val (pandas.DataFrame): Validation data with the columns 'path' and 'label'.
    lr (float): Learning rate of the Adam optimizer.
    train_epochs (int): Maximum number of epochs (early stopping may end training sooner).
    batch_size (int): Number of images per batch.
    loss: Loss passed to model.compile.
    tipo (str): class_mode for the data generators (e.g. 'binary' or 'categorical').
    clases (int): Number of output units of the classifier.
    augmenting_factor (int): Multiplier applied to len(df_train) when computing the steps per epoch.
    activacion_final (str): Activation of the output layer, forwarded to transfer_learning.
    save_path (str): Where ModelCheckpoint stores the best model. Defaults to the placeholder path that
                     was previously hard-coded (without its stray trailing space).

    Returns:
    finetune_model: The fitted Keras model.
    history: The History object returned by fit.
    train_generator: The training data iterator.
    valid_generator: The validation data iterator.

    Notes:
    - Training is monitored on 'val_auc' (maximised): early stopping with patience 4, and only the best
      model is checkpointed.
    - Each class is weighted by total / (n_classes * class_count), computed from df_train['label'].
    """
    base_model = make_model(arquitectura, HEIGHT, WIDTH)

    # activacion_final is forwarded as the sixth positional argument, so transfer_learning must accept it
    finetune_model = transfer_learning(arquitectura, base_model, HEIGHT, WIDTH, clases, activacion_final)

    print("getting generators...")
    train_generator, valid_generator = make_generator(df_train, df_val, HEIGHT, WIDTH, tipo, batch_size)

    # Steps per epoch over each dataset
    NUM_EPOCHS = train_epochs
    num_train_images = len(df_train) * augmenting_factor
    STEP_SIZE_TRAIN = num_train_images // train_generator.batch_size
    STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size

    print("amount of epocas: ", NUM_EPOCHS)
    print("amount of images to train: ", num_train_images)
    print("amount of images to valid: ", valid_generator.n)

    # The checkpoint location used to be chosen by branching on an undefined name (`task`), which raised
    # NameError; both branches used the same path, so a single `save_path` parameter replaces them.
    callback_list = [tf.keras.callbacks.EarlyStopping(patience=4, monitor='val_auc', mode='max'),
                     tf.keras.callbacks.ModelCheckpoint(filepath=save_path,
                                                        monitor='val_auc',
                                                        verbose=1,
                                                        save_best_only=True,
                                                        mode='max',
                                                        save_weights_only=False,
                                                        save_freq='epoch')]

    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated/removed in recent Keras
    solver = Adam(learning_rate=lr)

    print("class weights")
    total = df_train.shape[0]
    # Rows per label, in sorted label order.
    # NOTE(review): assumes this order matches train_generator.class_indices - confirm.
    label_counts = df_train.groupby('label').count().iloc[:, 0]
    n_classes = len(label_counts)
    class_weight = {index: (total / count) / n_classes for index, count in enumerate(label_counts)}

    print("compiling model")
    # Name the metric explicitly so the logged key is always 'val_auc', which the callbacks monitor;
    # an unnamed AUC may be auto-named 'auc_1', 'auc_2', ... when several are created in one session.
    finetune_model.compile(solver, loss=loss, metrics=[tf.keras.metrics.AUC(name='auc')])

    print("model training...")
    history = finetune_model.fit(train_generator,
                                 steps_per_epoch=STEP_SIZE_TRAIN,
                                 epochs=NUM_EPOCHS,
                                 validation_data=valid_generator,
                                 validation_steps=STEP_SIZE_VALID,
                                 class_weight=class_weight,
                                 callbacks=callback_list)

    return finetune_model, history, train_generator, valid_generator

In [11]:
# Load the pre-split train / validation / test DataFrames (columns: 'path', 'label')
train_df, val_df, test_df = load_dataframes()

In [12]:
# Report how many rows each split contains
print(f"lengh of train: {len(train_df)}, lengh of valid: {len(val_df)}, lengh of test: {len(test_df)}")

lengh of train: 19508, lengh of valid: 5115, lengh of test: 5538


In [None]:
# Stack the training and validation rows into a single training frame.
# NOTE(review): the training loop below passes this frame as training data AND val_df as validation
# data, so every validation image is also seen during training - confirm this is intended, because
# it makes val_auc (used for early stopping and checkpointing) optimistic.
# The original row indices are kept (no ignore_index), so index values may repeat.
current_train_df = pd.concat([train_df, val_df], axis=0)
len(current_train_df)

In [None]:
# Train one classifier per backbone architecture with a shared set of hyper-parameters.
nets = ['Vgg16', 'MobileNet', 'EfficientNetV2B0']

# Number of output units of the classifier
num_clases=2

# Image size, output activation, generator class mode and optimisation settings shared by all runs
HEIGHT, WIDTH = 256, 256
activacion_final = 'softmax'
# x_col_name / y_col_name are not referenced by any other visible cell
x_col_name = 'path'
y_col_name = 'label'
tipo = 'categorical'
batch_size = 16
augmenting_factor = 2
train_epochs = 20
lr=0.00001
loss='categorical_crossentropy' 

for net in nets:
    arquitectura = net 
    # Reset the NumPy / TensorFlow seeds and the Keras session before each run
    np.random.seed(42)
    tf.random.set_seed(42)
    tf.keras.backend.clear_session()

    # These names are rebound on every iteration, so after the loop they refer to the last
    # architecture in `nets` only.
    finetune_model, history, train_generator, valid_generator = train(arquitectura, HEIGHT, WIDTH, 
                                                                      current_train_df, val_df, lr, train_epochs,
                                                                      batch_size, loss, tipo, num_clases,
                                                                      augmenting_factor, activacion_final)

## Full test frames and serrated samples 

In [None]:
# Read the header-less full-frame CSV of fold 1 and name its two columns
path = '../imgs_results/binary/embcBaseline/fold1/full_frames/test/embcBaselineArtifNbifold1.csv'
general_df = pd.read_csv(path, header=None)
general_df.columns = ['path', 'label']

# Rows labelled 'serrated' are kept in their own frame
serrated_df = general_df[general_df['label']=='serrated']

# Everything else becomes the test set. This rebinds `test_df`, replacing the frame returned earlier
# by load_dataframes().
test_df = general_df[general_df['label']!='serrated']
test_df.tail()

In [None]:
# Same split as the previous cell, but the CSV location is derived from `experiment`.
# NOTE(review): `experiment` is not defined in any visible cell - it must be set before running this.
# Previously used location: '../data/csv_files/adeVshyp/NBI/test.csv'
path = '../imgs_results/binary/' +  experiment + '/full_frames/' + experiment+ 'ArtifNbi.csv'
general_df = pd.read_csv(path, header=None)
general_df.columns = ['path', 'label']

# Rows labelled 'serrated' are kept in their own frame
serrated_df = general_df[general_df['label']=='serrated']

# Everything else becomes the test set (this rebinds `general_df`, `serrated_df` and `test_df`)
test_df = general_df[general_df['label']!='serrated']
test_df.tail()

In [None]:
# Number of test rows per label
test_df.groupby(['label']).count()

In [None]:
# Show the image path of the first test row
test_df.iloc[0]['path']

## Loading test dataframes

### Making generator

In [13]:
def make_generator(df_test, HEIGHT, WIDTH, tipo, batch_size):
    """
    Builds the Keras DataFrame iterator used to evaluate a model on the test data.

    Parameters:
    df_test (pandas.DataFrame): Test data with the columns 'path' and 'label'.
    HEIGHT (int): Height the images are resized to.
    WIDTH (int): Width the images are resized to.
    tipo (str): Passed as class_mode to flow_from_dataframe (e.g. 'binary' or 'categorical').
    batch_size (int): Number of images per batch.

    Returns:
    test_generator: Iterator over df_test with shuffle=False, so batches follow the row order of df_test.

    Notes:
    - This definition reuses the name of the train/validation make_generator defined earlier in the
      notebook and has a different signature; after this cell runs, the earlier function is shadowed.
    - The iterator comes from a default ImageDataGenerator(), i.e. no augmentation or rescaling.
    """
    flow_options = {
        'directory': None,
        'dataframe': df_test,
        'x_col': 'path',
        'y_col': 'label',
        'target_size': (HEIGHT, WIDTH),
        'class_mode': tipo,
        'batch_size': batch_size,
        'seed': 42,
        # Keep the sample order fixed
        'shuffle': False,
    }

    return ImageDataGenerator().flow_from_dataframe(**flow_options)

In [14]:
# Evaluation settings: image size, generator class mode and batch size
# (same values as the ones assigned in the training cell)
HEIGHT, WIDTH = 256, 256
tipo = 'categorical'
batch_size = 16

In [15]:
# Build the (unshuffled) test iterator from whichever frame `test_df` currently refers to
test_gen = make_generator(test_df, HEIGHT, WIDTH, tipo, batch_size)

Found 5538 validated image filenames belonging to 2 classes.


In [16]:
# Load the trained classifier from disk; compile=True asks Keras to compile the model after loading
trained_model_path = "/path/classifier/model.h5"
model = keras.models.load_model(trained_model_path, compile=True)

modelo cargado!


In [17]:
# Confusion matrix and classification report
test_gen.reset()

# Predict over exactly one full pass of the test iterator. `steps` must be passed by keyword: the second
# positional argument of Model.predict is `batch_size`, not `steps`. len(test_gen) is the number of
# batches per pass, so the number of predictions always equals the number of test samples (the old
# `n // batch_size + 1` is one batch too many whenever n is a multiple of batch_size).
# `logits` holds the model outputs (one column per class); the name is kept for later cells.
logits = model.predict(test_gen, steps=len(test_gen))
y_pred_class = np.argmax(logits, axis=1)

# NOTE(review): assumes class index 0 is adenoma and 1 is hyperplastic - confirm with test_gen.class_indices
target_names = ['Adenoma', 'Hyperplastic']

print('Confusion Matrix')
print(confusion_matrix(test_gen.classes, y_pred_class))
print('Classification Report')
print(classification_report(test_gen.classes, y_pred_class, target_names=target_names))

Confusion Matrix
[[2816 1345]
 [ 450  927]]
Classification Report
              precision    recall  f1-score   support

     Adenoma       0.86      0.68      0.76      4161
Hyperplastic       0.41      0.67      0.51      1377

    accuracy                           0.68      5538
   macro avg       0.64      0.67      0.63      5538
weighted avg       0.75      0.68      0.70      5538



In [None]:
# Plot the confusion matrix normalised over the true labels (each row sums to 1)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# NOTE(review): assumes class index 0 is adenoma and 1 is hyperplastic - confirm with test_gen.class_indices
target_names = ['adenoma', 'hyperplastic']
cm = confusion_matrix(test_gen.classes, y_pred_class, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp = disp.plot(include_values=True, cmap=plt.cm.Blues, xticks_rotation='horizontal', values_format='.2f')

In [None]:
# ROC AUC on the test set.
# AUC has to be computed from continuous scores: feeding the arg-maxed class ids (0/1) collapses the
# ROC curve to a single operating point. Use the predicted score of class index 1 instead.
# NOTE(review): assumes `logits` has one column per class and that class index 1 is the positive class.
AUC = tf.keras.metrics.AUC()
AUC.update_state(test_gen.classes, logits[:, 1])
AUC.result()