## TRANSFER LEARNING: IMPROVING ALGORITHM
---

### Libraries

In [None]:
import pickle
from os import listdir

#basic
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import plotly.graph_objects as go

#tensorflow and keras
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Input, Dense, GlobalAveragePooling2D, Flatten, MaxPooling2D, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import Model

#sklearn
from sklearn.model_selection import train_test_split
from fast_ml.model_development import train_valid_test_split

#open cv
import cv2 as cv

#functions
from utils.images import load_images, read_image_file
from utils.train import series_to_array, get_classes_dictionary

from cascid import config

### Constants

In [None]:
# Legacy hard-coded data location; paths below are derived from `config` instead.
# PATH = "/home/fernandofincatti/Documents/insper/pfe/ComputerAidedSkinCancerIdentificationAndDiagnosis/data/"
RANDOM_STATE = 42  # seed handed to train_valid_test_split
# Fractions of the dataset assigned to each split.
TRAIN_SIZE = 0.7
VALIDATION_SIZE = 0.15
TEST_SIZE = 0.15
EPOCHS = 250  # number of epochs handed to model.fit
IMAGE_SHAPE = (128, 128, 3)  # passed to read_image_file and used as ResNet50 input_shape

# Directory for this experiment's cached artifacts; created if it is missing.
FERNANDO_PATH = config.DATA_DIR / 'experiments' / 'fernando'
FERNANDO_PATH.mkdir(exist_ok=True, parents=True)

# Pickle caches for the merged dataframe and the extracted training features.
FULL_DATAFRAME_FILE = FERNANDO_PATH / 'full_dataframe.pkl'
FEATURES_FILE = FERNANDO_PATH / 'features.pkl'

# True: rebuild the artifact and overwrite its pickle; False: load the pickle.
READ_FULL_DATAFRAME = False
COMPUTE_FEATURES = False


### Loading data

In [None]:
def read_images():
    """Load every PNG under ``config.IMAGE_DIR`` into a dataframe.

    Returns:
        A ``pandas.DataFrame`` with one row per file and two columns:
        ``image_array`` (whatever ``read_image_file`` returns for that file
        when given ``IMAGE_SHAPE``) and ``img_id`` (the file name, extension
        included).
    """
    # Directory listing order depends on the filesystem. Sorting keeps the row
    # order stable between runs, so the seeded split downstream selects the
    # same rows and any cached features stay aligned with their labels.
    image_paths = sorted(config.IMAGE_DIR.glob('*.png'))
    return pd.DataFrame({
        "image_array": [
            read_image_file(str(path), IMAGE_SHAPE) for path in image_paths
        ],
        "img_id": [path.name for path in image_paths],
    })


def read_metadata():
    """Read the metadata CSV located at ``config.DATA_FILE`` into a dataframe."""
    metadata_frame = pd.read_csv(config.DATA_FILE)
    return metadata_frame


def read_data():
    """Join the metadata table with the loaded images on ``img_id``.

    The merge is a right join with the image table on the right-hand side, so
    every loaded image keeps a row even if no metadata row shares its
    ``img_id``.
    """
    return read_metadata().merge(read_images(), how="right", on="img_id")

In [None]:
# Reuse the pickled dataframe unless READ_FULL_DATAFRAME asks for a rebuild.
if not READ_FULL_DATAFRAME:
    # NOTE: unpickling is only appropriate because this file is a cache that
    # the other branch of this cell writes.
    with open(FULL_DATAFRAME_FILE, 'rb') as cache_file:
        full_dataframe = pickle.load(cache_file)
else:
    full_dataframe = read_data()

    # Refresh the cache so later runs can skip reading the images again.
    with open(FULL_DATAFRAME_FILE, 'wb') as cache_file:
        pickle.dump(full_dataframe, cache_file)


In [None]:
# Preview the first rows of the merged metadata + image table.
full_dataframe.head()

### Test image

In [None]:
# Take the image and its file name from the row labelled 0 as a sanity sample.
img_test, file_test = (
    full_dataframe[column][0] for column in ("image_array", "img_id")
)

In [None]:
# Print the sample's file name and render the image as a visual sanity check.
print(file_test)
plt.imshow(img_test)
plt.show()

### Split train, validation and test

In [None]:
# Column views of the images and of the diagnostic labels.
# NOTE(review): `features` is rebound in the Model section and `target` is not
# read again in the visible cells — confirm these two names are still needed.
features = full_dataframe["image_array"]
target = full_dataframe["diagnostic"]

In [None]:
# Seeded three-way split of the full table, with "diagnostic" as the target.
split_parts = train_valid_test_split(
    full_dataframe,
    target="diagnostic",
    train_size=TRAIN_SIZE,
    valid_size=VALIDATION_SIZE,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
x_train, y_train, x_valid, y_valid, x_test, y_test = split_parts

# Report how many rows ended up in each split.
for split_name, split_frame in (
    ("Train", x_train),
    ("Validation", x_valid),
    ("Test", x_test),
):
    print(f"{split_name} examples: {split_frame.shape[0]}")

In [None]:
# Class lookup built from the training labels; it is passed to series_to_array
# below when the label series are converted.
# NOTE(review): the exact structure comes from utils.train.get_classes_dictionary
# — check that helper before relying on key/value order.
predictions_dictionary = get_classes_dictionary(y_train)

In [None]:
# Display the class lookup for inspection.
predictions_dictionary

In [None]:
def _images_to_float32(frame):
    """Turn a split's ``image_array`` column into a single float32 ndarray."""
    stacked = np.asarray(frame["image_array"].tolist())
    return stacked.astype(np.float32)


# One float32 array per split, in the same train / test / valid order as before.
x_train_to_array = _images_to_float32(x_train)
x_test_to_array = _images_to_float32(x_test)
x_valid_to_array = _images_to_float32(x_valid)


In [None]:
# Convert each label series with the shared class lookup (train, test, valid).
y_train_to_array, y_test_to_array, y_valid_to_array = (
    series_to_array(label_series, predictions_dictionary)
    for label_series in (y_train, y_test, y_valid)
)


### Data augmentation

In [None]:
# train_data_augmentation_generator = ImageDataGenerator(
#     rotation_range=20,
#     width_shift_range=0.2,
#     height_shift_range=0.2,
#     horizontal_flip=True,
# )

### Model

In [None]:
# Embed the training images with a frozen, ImageNet-pretrained ResNet50
# (global-average-pooled, no classification top) and cache the result, or load
# the cached embeddings when COMPUTE_FEATURES is False.
# NOTE(review): keras.applications.resnet50.preprocess_input is not applied
# here — confirm read_image_file already scales pixels the way ResNet50 expects.
if COMPUTE_FEATURES:
    feature_extractor = keras.applications.ResNet50(
        weights='imagenet',
        input_shape=IMAGE_SHAPE,
        pooling='avg',
        include_top=False,
    )
    feature_extractor.trainable = False  # the backbone is never fine-tuned
    # predict() works through the data in mini-batches and returns a NumPy
    # array; calling the model directly would push the whole training set
    # through in a single forward pass and return a tensor instead.
    features = feature_extractor.predict(x_train_to_array)

    with open(FEATURES_FILE, 'wb') as file:
        pickle.dump(features, file)
else:
    # NOTE: only load this pickle when it is the cache written by the branch
    # above, and when it was computed for the current train split — the rows
    # must line up with y_train_to_array.
    with open(FEATURES_FILE, 'rb') as file:
        features = pickle.load(file)

In [None]:
# Show the feature matrix shape; the classifier below expects 2048 columns.
features.shape

In [None]:
# Classifier head only: a single softmax layer with 6 outputs that sits
# directly on top of the 2048-wide feature vectors.
input_layer = Input(shape=(2048, ))
softmax_head = Dense(units=6, activation='softmax')
output_layer = softmax_head(input_layer)
model = Model(inputs=input_layer, outputs=output_layer)

In [None]:
# feature_extractor.trainable = False #to make sure it's not being trained
# input_layer = Input(shape=IMAGE_SHAPE)
# pre_treined_model = feature_extractor(input_layer, training=False) #add input layer
# pre_treined_model_with_polling = GlobalAveragePooling2D()(pre_treined_model) #add pooling layer
# output_layer = Dense(6, activation='softmax')(pre_treined_model_with_polling)
# model = Model(input_layer, output_layer)

In [None]:
# Adam + categorical cross-entropy, with accuracy tracked during training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Print the layer-by-layer summary of the classifier head.
model.summary()

### Train

In [None]:
# Stop training once validation accuracy has gone 100 epochs without improving,
# and restore the best weights seen.
best_model_checkpoint_early_stopping = EarlyStopping(monitor='val_accuracy',
                                                     mode='max',
                                                     verbose=1,
                                                     patience=100,
                                                     restore_best_weights=True)

# Keep checkpoints next to the other cached artifacts of this experiment rather
# than under one user's home directory, so the notebook runs on any machine.
CHECKPOINT_DIR = FERNANDO_PATH / 'transfer-learning' / 'test02'
CHECKPOINT_DIR.mkdir(exist_ok=True, parents=True)

# Save weights only, and only when validation accuracy improves.
best_model_checkpoint = ModelCheckpoint(
    filepath=str(CHECKPOINT_DIR / 'model'),
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=True)


In [None]:
# training_history = model.fit(
#     x = train_data_augmentation_generator.flow(
#         x_train_to_array,
#         y_train_to_array),
#     epochs = EPOCHS,
#     validation_data = (x_valid_to_array, y_valid_to_array),
#     batch_size=8,
#     callbacks=[best_model_checkpoint, best_model_checkpoint_early_stopping]
# )

In [None]:
# Train the softmax head on the extracted features. Validation data comes from
# validation_split on these features; the separate x_valid split is not used
# by this call, and the callbacks stay disabled.
training_history = model.fit(
    x=features,
    y=y_train_to_array,
    batch_size=256,
    epochs=EPOCHS,
    validation_split=0.2,
    #callbacks=[best_model_checkpoint, best_model_checkpoint_early_stopping]
)

In [None]:
# Accuracy per epoch: validation trace first, then training.
fig = go.Figure()
for history_key, trace_name in (
    ('val_accuracy', 'Acc - Validation'),
    ('accuracy', 'Acc - Training'),
):
    fig.add_trace(
        go.Scatter(y=training_history.history[history_key],
                   mode='lines',
                   name=trace_name))
# Leave the figure as the cell's last expression so the notebook renders it.
fig


In [None]:
# Loss per epoch: validation trace first, then training.
fig = go.Figure()
for history_key, trace_name in (
    ('val_loss', 'Validation loss'),
    ('loss', 'Training loss'),
):
    fig.add_trace(
        go.Scatter(y=training_history.history[history_key],
                   mode='lines',
                   name=trace_name))
# Leave the figure as the cell's last expression so the notebook renders it.
fig


### Evaluating the model

In [None]:
# The classifier head takes 2048-wide ResNet50 feature vectors, not raw images,
# so the test images must be embedded with the same frozen extractor
# configuration before they can be scored.
feature_extractor = keras.applications.ResNet50(
    weights='imagenet',
    input_shape=IMAGE_SHAPE,
    pooling='avg',
    include_top=False,
)
feature_extractor.trainable = False
test_features = feature_extractor.predict(x_test_to_array)

loss, acc = model.evaluate(x=test_features, y=y_test_to_array)
print("model loss: {0}".format(loss))
print("model accuracy: {0}".format(acc))

# Per-class probabilities for every test example, from the same embeddings.
predictions = model.predict(test_features)

In [None]:
# Distinct training labels, in order of first appearance in y_train.
# NOTE(review): the next cell indexes this list with the argmax of each
# prediction, which assumes this order matches the class indices encoded by
# predictions_dictionary / series_to_array — confirm against utils.train.
classe_names = y_train.unique().tolist()
classe_names

In [None]:
# Map each row of class scores to the name at its highest-scoring index.
predictions_categorical = [
    classe_names[np.argmax(class_scores)] for class_scores in predictions
]

In [None]:
# Relative frequency of each true label in the test split (normalize=True).
y_test.value_counts(True)

In [None]:
# Relative frequency of each predicted label, for comparison with the true
# label distribution of the test split.
pd.Series(predictions_categorical).value_counts(True)