### Imports

In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']
import csv
import datetime
import itertools
import json
import os
import tarfile
import zipfile
from shutil import copy, copyfile
from textwrap import wrap
from urllib.request import urlretrieve

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import tensorflow as tf
from google.colab import drive
from keras_preprocessing.image import ImageDataGenerator
from numpy import argmax
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import (EarlyStopping, ModelCheckpoint,
                                        TensorBoard)
from tensorflow.keras.metrics import (AUC, CategoricalAccuracy,
                                      CategoricalCrossentropy, FalseNegatives,
                                      FalsePositives,
                                      MeanAbsolutePercentageError, Precision,
                                      Recall, TrueNegatives, TruePositives)
from tensorflow.keras.models import load_model
from tensorflow_addons.metrics import F1Score

In [None]:
# Mount Google Drive at /content/drive; the trained model is saved below this
# mount point near the end of the notebook.
drive.mount('/content/drive')

### Download dataset

In [None]:
# Source locations of the GroZi-120 archives and the UPC label index.
in_situ_url = 'http://grozi.calit2.net/GroZi-120/inSitu.zip'
in_vitro_url = 'http://grozi.calit2.net/GroZi-120/inVitro.zip'
labels_url = 'http://grozi.calit2.net/GroZi-120/index2/UPC_index.txt'

# exist_ok=True keeps the cell re-runnable: a bare os.makedirs('temp') raises
# FileExistsError the second time the cell is executed in the same runtime.
os.makedirs('temp', exist_ok=True)
print("Downloading dataset.")
urlretrieve(in_situ_url, "./temp/inSitu.zip")
urlretrieve(in_vitro_url, "./temp/inVitro.zip")
urlretrieve(labels_url, "./temp/UPC_index.txt")
print("Done.")

In [None]:
# Absolute locations of the files fetched by the download cell.
# NOTE(review): the download cell writes to the relative folder ./temp, so these
# paths only match when the working directory is /content — confirm.
in_situ_path = '/content/temp/inSitu.zip'
in_vitro_path = '/content/temp/inVitro.zip'
labels_path = '/content/temp/UPC_index.txt'

In [None]:
!rm -rf grozi120/
dataset_path = '/content/grozi120/'
new_path = '/content/grozi120/processed'
os.makedirs('grozi120')
with zipfile.ZipFile(in_situ_path, 'r') as zip_ref:
    zip_ref.extractall(dataset_path)
with zipfile.ZipFile(in_vitro_path, 'r') as zip_ref:
    zip_ref.extractall(dataset_path)
copyfile(labels_path, os.path.join(dataset_path, 'UPC_index.txt'))

### Preprocessing


In [None]:
# Preprocessing / input pipeline settings.
# resize_pad: True -> images are letterboxed with resize_with_pad before being
#             written to the processed folders; False -> files are copied as-is.
# IMG_SIZE:   side length in pixels of the square images (padding target and
#             generator target_size).
# BATCH_SIZE: batch size passed to the data generators.
resize_pad = True
IMG_SIZE = 200
BATCH_SIZE = 128

In [None]:
from PIL import Image, ImageOps

def resize_with_pad(file_path, img_size):
  """Load an image, scale it to fit an img_size x img_size square and pad it.

  The longer side is scaled to img_size while keeping the aspect ratio; the
  result is centred on a new RGB canvas (black by default), so the output is
  always exactly (img_size, img_size).

  Args:
    file_path: path of the image file to load.
    img_size: side length in pixels of the square output image.

  Returns:
    A new PIL RGB image of size (img_size, img_size).
  """
  # The context manager closes the underlying file once the pixels are resized.
  with Image.open(file_path) as source_img:
    old_size = source_img.size
    ratio = float(img_size) / max(old_size)
    # Clamp to 1 px so extremely thin images cannot collapse to a zero-sized
    # side, which resize() rejects.
    new_size = tuple(max(1, int(side * ratio)) for side in old_size)
    # LANCZOS is the same filter formerly exposed as ANTIALIAS; the ANTIALIAS
    # alias was removed in Pillow 10.
    resized = source_img.resize(new_size, Image.LANCZOS)

  new_img = Image.new("RGB", (img_size, img_size))
  offset = ((img_size - new_size[0]) // 2,
            (img_size - new_size[1]) // 2)
  new_img.paste(resized, offset)
  return new_img

In [None]:
# Output folder for the processed in vitro images.
in_vitro_out = os.path.join(new_path, 'inVitro')
if not os.path.exists(in_vitro_out):
  os.makedirs(in_vitro_out)

# Parse UPC_index.txt: the first line is skipped, empty lines are dropped and
# the remaining lines are read in groups of three (product id, two text
# fields). The second text field is used as the class label below.
with open(os.path.join(dataset_path, 'UPC_index.txt'), 'r') as f:
  lines = []
  for line in f.readlines()[1:]:
    if line != '\n':
      lines.append(line.rstrip())
labels = {
  int(lines[start]): [lines[start + 1], lines[start + 2]]
  for start in range(0, len(lines) - 2, 3)
}

# Collect every in vitro JPEG together with the label of its product folder.
in_vitro_root = os.path.join(dataset_path, 'inVitro')
dataset = {'file_name': [], 'label': []}
for product in os.listdir(in_vitro_root):
  jpeg_dir = os.path.join(in_vitro_root, product, 'web', 'JPEG')
  for product_file in os.listdir(jpeg_dir):
    if product_file != 'Thumbs.db':
      dataset['file_name'].append(os.path.join(jpeg_dir, product_file))
      dataset['label'].append(labels[int(product)][1])

df = pd.DataFrame(dataset, columns=['file_name', 'label'])

In [None]:
# Output folder for the processed in situ (evaluation) images.
in_situ_out = os.path.join(new_path, 'inSitu')
if not os.path.exists(in_situ_out):
    os.makedirs(in_situ_out)

# Collect every in situ frame together with the label of its product folder.
in_situ_root = os.path.join(dataset_path, 'inSitu')
evaluation_dataset = {'file_name': [], 'label': []}
for product in os.listdir(in_situ_root):
  video_dir = os.path.join(in_situ_root, product, 'video')
  for product_file in os.listdir(video_dir):
    if product_file != 'Thumbs.db':
      evaluation_dataset['file_name'].append(os.path.join(video_dir, product_file))
      evaluation_dataset['label'].append(labels[int(product)][1])

evaluation_df = pd.DataFrame(evaluation_dataset, columns=['file_name', 'label'])

# Write the images into one flat folder as '<product>_<original name>' and
# record the new names with their labels in evaluation.csv.
with open(os.path.join(in_situ_out, 'evaluation.csv'), mode='w') as dataset_file:
  dataset_writer = csv.writer(dataset_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  dataset_writer.writerow(['file_name', 'label'])
  for file_name, label in zip(evaluation_dataset['file_name'], evaluation_dataset['label']):
    split = file_name.split('/')
    flat_name = '{}_{}'.format(split[-3], split[-1])
    target_file = os.path.join(in_situ_out, flat_name)
    dataset_writer.writerow([flat_name, label])
    if not resize_pad:
      copyfile(file_name, target_file)
    else:
      padded_img = resize_with_pad(file_name, IMG_SIZE)
      padded_img.save(target_file)

In [None]:
# Stratified 75/25 split of the in vitro images into training and validation.
train_filenames, validation_filenames, train_labels, validation_labels = train_test_split(df['file_name'], df['label'], train_size=0.75, random_state=42, stratify=df['label'])


def _export_split(split_name, file_names, split_labels):
    """Write one in vitro split to disk.

    Images go to <new_path>/inVitro/<split_name>/ under the flat name
    '<product>_<original name>' (letterboxed when resize_pad is set, copied
    unchanged otherwise) and the (file_name, label) pairs go to
    <new_path>/inVitro/<split_name>.csv.

    Args:
        split_name: 'train' or 'validation'; used for folder and CSV name.
        file_names: iterable of source image paths.
        split_labels: iterable of labels aligned with file_names.
    """
    target_dir = os.path.join(new_path, 'inVitro', split_name)
    os.makedirs(target_dir, exist_ok=True)
    csv_path = os.path.join(new_path, 'inVitro', split_name + '.csv')
    # newline='' is what the csv module expects for files it writes to.
    with open(csv_path, mode='w', newline='') as dataset_file:
        dataset_writer = csv.writer(dataset_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        dataset_writer.writerow(['file_name', 'label'])
        for file_name, label in zip(file_names, split_labels):
            # Source layout: .../inVitro/<product>/web/JPEG/<file>
            split = file_name.split('/')
            flat_name = '{}_{}'.format(split[-4], split[-1])
            dataset_writer.writerow([flat_name, label])
            target_file = os.path.join(target_dir, flat_name)
            if resize_pad:
                resize_with_pad(file_name, IMG_SIZE).save(target_file)
            else:
                copyfile(file_name, target_file)


_export_split('train', train_filenames, train_labels)
_export_split('validation', validation_filenames, validation_labels)

#### !!! Execute next cell **ONLY** if you want to train the model with both *in vitro* and *in situ* images

In [None]:
!rm -rf grozi120/processed/all
new_path = '/content/grozi120/processed/all'
if not os.path.exists(new_path):
    os.makedirs(new_path)

insitup = '/content/grozi120/processed/inSitu'
invitrot = '/content/grozi120/processed/inVitro/train'
invitrov = '/content/grozi120/processed/inVitro/validation'
folderp = [insitup, invitrot, invitrov]

for p in folderp:
  for f in os.listdir(p):
    copyfile(os.path.join(p,f), os.path.join(new_path, f))

!rm -R grozi120/processed/all/evaluation.csv
with open(os.path.join('/content/grozi120/processed/', 'all.csv'), mode='w') as dataset_file:
    dataset_writer = csv.writer(dataset_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    dataset_writer.writerow(['file_name', 'label'])
    for f in os.listdir(new_path):
      prod_num = int(f.split('_')[0])
      dataset_writer.writerow([f, labels[prod_num][1]])
dataset_df = pd.read_csv('/content/grozi120/processed/all.csv')

train_validate_filenames, evaluation_filenames, train_validate_labels, evaluation_labels = train_test_split(dataset_df['file_name'], dataset_df['label'], train_size=0.8, random_state=42, stratify=dataset_df['label'])

train_validate_df = pd.DataFrame(zip(train_validate_filenames, train_validate_labels), columns=['file_name', 'label'])

train_filenames, validation_filenames, train_labels, validation_labels = train_test_split(train_validate_df['file_name'], train_validate_df['label'], train_size=0.875, random_state=42, stratify=train_validate_df['label'])

train_df = pd.DataFrame(zip(train_filenames, train_labels), columns=['file_name', 'label'])

validation_df = pd.DataFrame(zip(validation_filenames, validation_labels), columns=['file_name', 'label'])

evaluation_df = pd.DataFrame(zip(evaluation_filenames, evaluation_labels), columns=['file_name', 'label'])

#### Define data augmentations

In [None]:
# Choose label tables and image folders for the generators.
# Default workflow: new_path is the 'processed' root, which holds the split
# CSVs and the inVitro/train, inVitro/validation and inSitu image folders.
# Combined workflow: the optional cell above re-pointed new_path at the flat
# 'all' folder and already built train_df / validation_df / evaluation_df in
# memory. Reading the split CSVs below new_path would then fail and would also
# discard the combined split, so the in-memory frames are kept instead.
if os.path.isfile(os.path.join(new_path, 'inVitro', 'train.csv')):
    train_df = pd.read_csv(os.path.join(new_path, 'inVitro', 'train.csv'))
    validation_df = pd.read_csv(os.path.join(new_path, 'inVitro', 'validation.csv'))
    evaluation_df = pd.read_csv(os.path.join(new_path, 'inSitu', 'evaluation.csv'))
    train_dir = os.path.join(new_path, 'inVitro', 'train')
    validation_dir = os.path.join(new_path, 'inVitro', 'validation')
    evaluation_dir = os.path.join(new_path, 'inSitu')
else:
    # Requires the combined-dataset cell to have been run (it defines the
    # three dataframes); all images then live directly in new_path.
    train_dir = validation_dir = evaluation_dir = new_path

# Only the training images are augmented; all generators rescale to [0, 1].
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    featurewise_center=False,
    featurewise_std_normalization=False,
    samplewise_center=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
    rotation_range=25,
    zoom_range = 0.5,
    shear_range = 30,
    width_shift_range=0.15,
    height_shift_range=0.15,
    brightness_range=[0.25,1],
    horizontal_flip=False,
    vertical_flip=False,
    rescale=1. / 255)

validation_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)

evaluation_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_dir,
    x_col="file_name",
    y_col="label",
    class_mode="categorical",
    target_size=(IMG_SIZE,IMG_SIZE),
    batch_size=BATCH_SIZE)

validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_df,
    directory=validation_dir,
    x_col="file_name",
    y_col="label",
    class_mode="categorical",
    target_size=(IMG_SIZE,IMG_SIZE),
    batch_size=BATCH_SIZE)

evaluation_generator  = evaluation_datagen.flow_from_dataframe(
    dataframe=evaluation_df,
    directory=evaluation_dir,
    x_col="file_name",
    y_col="label",
    class_mode="categorical",
    target_size=(IMG_SIZE,IMG_SIZE),
    batch_size=BATCH_SIZE)

# Name -> index mapping of the evaluation generator and its inverse; both are
# used when building the confusion matrix.
class_indices = evaluation_generator.class_indices
class_indices_inverted = {v: k for k, v in class_indices.items()}

# Show the first 25 augmented training images of one batch with their labels.
x, y = next(train_generator)
train_generator.reset()

# The one-hot labels in y come from train_generator, so its own mapping is the
# one to invert for the titles.
train_index_to_name = {index: name for name, index in train_generator.class_indices.items()}

fig, ax = plt.subplots(5, 5)
fig.set_size_inches(15, 15)
# Walk the grid with a single running index. The previous nested loops bumped
# the counter once more after every row, silently skipping every sixth image
# and reading up to index 28 of the batch.
for num, axis in enumerate(ax.flat):
  axis.axis('off')
  if num >= len(x):
    # Batch smaller than the grid: leave the remaining panels empty.
    continue
  axis.imshow(x[num])
  inverted = train_index_to_name[argmax(y[num])]
  axis.set_title("\n".join(wrap(inverted, 10)))

plt.tight_layout()

#### Save the mapping of the classes into a json file

In [None]:
# Persist the class-name -> class-index mapping of the training generator as
# JSON (indices.json in the current working directory).
indices = train_generator.class_indices
with open('indices.json', 'w') as indices_json:
    json.dump(indices, indices_json)

### Define Callbacks and Metrics

In [None]:
# One timestamped TensorBoard log folder per run.
# NOTE(review): 'content' has no leading slash, so the folder is created
# relative to the working directory — confirm this is intended.
cur_date = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = os.path.join('content', 'training', cur_date)
model_filename = 'weights.hdf5'

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Stop once the validation PR-AUC has not improved for 10 epochs and roll the
# model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_auc_pr',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True,
)

tensorboard = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Keep only the weights of the epoch with the best validation PR-AUC.
model_checkpoint = ModelCheckpoint(
    model_filename,
    monitor='val_auc_pr',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='max',
    save_freq='epoch',
)

callbacks = [early_stopping, tensorboard, model_checkpoint]

num_classes = len(train_generator.class_indices)

# Metrics tracked during training; 'auc_pr' is the one the callbacks monitor
# (as 'val_auc_pr').
metrics = [
    CategoricalAccuracy(name='categorical_accuracy'),
    Precision(name='precision'),
    Recall(name='recall'),
    AUC(name='auc_pr', curve='PR'),
    AUC(name='auc_roc', curve='ROC'),
    F1Score(name='f1score', num_classes=num_classes),
    TrueNegatives(name='tn'),
    TruePositives(name='tp'),
    FalseNegatives(name='fn'),
    FalsePositives(name='fp'),
]

### Define model

#### Create base model with ImageNet weights

In [None]:
# Input shape of the network: square RGB images of side IMG_SIZE.
IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)

# InceptionResNetV2 without its classification head, initialised with the
# ImageNet weights.
base_model = tf.keras.applications.InceptionResNetV2(
    input_shape=IMG_SHAPE,
    include_top=False,
    weights='imagenet')
                                    
# Freeze the base model so only the new classifier layers are trained at first.
base_model.trainable = False

#### Define layers for new classifier

In [None]:
# Layers of the new classification head, applied in this order when the model
# is built: global average pooling -> dropout -> dense (ReLU) -> softmax.
# NOTE(review): the L1/L2 regularizer is passed as activity_regularizer, i.e.
# it is attached to the output of the softmax layer rather than to its
# weights — confirm kernel_regularizer was not intended.
regularizer = tf.keras.regularizers.L1L2(l1=0.0001, l2=0.0001)
global_average_pooling_layer = tf.keras.layers.GlobalAveragePooling2D()
dropout_layer = tf.keras.layers.Dropout(0.3)
dense = tf.keras.layers.Dense(2048, activation='relu')
prediction_layer = tf.keras.layers.Dense(num_classes, activation='softmax', activity_regularizer=regularizer)

#### Build the model

In [None]:
# Assemble the functional model: frozen base model followed by the new head.
# Note that x and y are rebound here (they previously held a sample batch).
inputs = tf.keras.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
# The base model is called with training=False, so it runs in inference mode
# inside this model.
x = base_model(inputs, training=False)
x = global_average_pooling_layer(x)
x = dropout_layer(x)
x = dense(x)
y = prediction_layer(x)

model = tf.keras.Model(inputs, y)

In [None]:
# Compile for the first training stage (base model frozen).
base_learning_rate = 0.00005
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras optimizers.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=metrics)

model.summary()

### Training

#### Initial training with frozen base model layers to fit the new classifier

In [None]:
# Upper bound on the number of epochs; the EarlyStopping callback in
# `callbacks` can end training earlier.
num_epochs = 300

history = model.fit(train_generator, epochs=num_epochs, validation_data=validation_generator,  callbacks=callbacks)

#### Save training history to a json file

In [None]:
# One column per entry of history.history, one row per trained epoch.
hist_df = pd.DataFrame(history.history) 

# Store the per-epoch values of the first training stage as JSON.
hist_json_file = '/content/history.json'
with open(hist_json_file, mode='w') as f:
    hist_df.to_json(f)

### Fine-tuning

#### Definition of callbacks for fine-tuning

In [None]:
# Fresh timestamped log folder so the fine-tuning run gets its own TensorBoard
# logs.
cur_date = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = os.path.join('content', 'training', cur_date)
model_filename = 'fine_tuning_weights.hdf5'
# NOTE(review): checkpoint_path is computed but the checkpoint callback below
# is given model_filename, so weights are written to the working directory —
# confirm which location is intended.
checkpoint_path = os.path.join(log_dir, 'checkpoints', model_filename)

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Same stopping rule as in the first stage: no improvement of the validation
# PR-AUC for 10 epochs, then restore the best weights.
early_stopping_ft = EarlyStopping(
    monitor='val_auc_pr',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True,
)

tensorboard_ft = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Keep only the weights of the epoch with the best validation PR-AUC.
model_checkpoint_ft = ModelCheckpoint(
    model_filename,
    monitor='val_auc_pr',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='max',
    save_freq='epoch',
)

callbacks_ft = [early_stopping_ft, tensorboard_ft, model_checkpoint_ft]


#### Recompile model with unfrozen base model layers

In [None]:
# Unfreeze the base model; the change only takes effect after recompiling.
base_model.trainable = True

# Much smaller learning rate than in the first stage for fine-tuning.
base_learning_rate = 0.000001
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras optimizers.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=metrics)

model.summary()

#### Fine-tune the model

In [None]:
fine_tune_epochs = 20

# Continue the epoch numbering right after the last epoch of the first stage.
# history.epoch holds the 0-based indices of the epochs that actually ran, so
# its length is the index of the next epoch. Starting at history.epoch[-1]
# instead would reuse the last index and train fine_tune_epochs + 1 epochs.
initial_epoch = len(history.epoch)
total_epochs = initial_epoch + fine_tune_epochs

history_fine = model.fit(train_generator,
                         epochs=total_epochs,
                         initial_epoch=initial_epoch,
                         validation_data=validation_generator, callbacks=callbacks_ft)

#### Save fine-tuning history to a json file

In [None]:
# One column per entry of history_fine.history, one row per fine-tuning epoch.
hist_ft_df = pd.DataFrame(history_fine.history) 

# Store the per-epoch values of the fine-tuning stage as JSON.
hist_ft_json_file = '/content/history_fine.json'
with open(hist_ft_json_file, mode='w') as f:
    hist_ft_df.to_json(f)

### Evaluation

In [None]:
# Evaluate the model on the evaluation generator.
evaluation = model.evaluate(evaluation_generator)

# NOTE(review): the results are stored positionally, without metric names —
# presumably they follow the order loss, then the compiled metrics; confirm
# before relying on the JSON file.
evaluation_data_df = pd.DataFrame(evaluation) 

evaluation_json_file = '/content/evaluation.json'
with open(evaluation_json_file, mode='w') as f:
    evaluation_data_df.to_json(f)

### Save and load model


In [None]:
# Save the full model to Google Drive under a timestamped .h5 file name
# (day_month_year_hour_minute_second); requires the Drive mount from the top
# of the notebook.
model_path =  '/content/drive/My Drive/Colab Notebooks/models/grozi120/InceptionResNetV2_imagenet_{}.h5'.format(datetime.datetime.now().strftime("%d_%m_%y_%H_%M_%S"))
model.save(model_path) 
# Display the path as the cell output.
model_path

In [None]:
# Reload the model from the file written by the previous cell, replacing the
# in-memory model.
model = load_model(model_path)

### Plot confusion matrix

In [None]:
# from https://deeplizard.com/learn/video/km7pxKy4UHU
def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues, file_name='confusion_matrix'):
    """
    Plot a confusion matrix as an annotated heat map and save it as SVG.

    Args:
        cm: square confusion matrix (rows: true class, columns: predicted).
        classes: iterable of class names in matrix order; iterating a dict
            yields its keys, so a name -> index mapping is accepted too.
        normalize: if True, every row is divided by its sum before plotting,
            so both the colours and the printed numbers show fractions.
        title: figure title.
        cmap: matplotlib colour map for the heat map.
        file_name: output file name without extension; the figure is written
            to 'content/<file_name>.svg' relative to the working directory.
    """
    cm = np.asarray(cm)
    class_names = list(classes)

    # Normalise before drawing: previously the heat map was drawn from the raw
    # counts and only the text annotations were normalised.
    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True).astype('float')
        # Rows without any sample stay 0 instead of turning into NaN.
        cm = np.divide(cm.astype('float'), row_sums,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_sums != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.figure(figsize=(64,64))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=90)
    plt.yticks(tick_marks, class_names)

    # Two decimals for fractions, plain integers for counts.
    cell_format = '.2f' if np.issubdtype(cm.dtype, np.floating) else 'd'
    thresh = cm.max() / 2. if cm.size else 0.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    # Set the axis labels before tight_layout so they are part of the layout.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

    # savefig does not create missing folders.
    output_dir = 'content'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, file_name) + '.svg')

In [None]:
def get_image (path):
    """Load one image as a model-ready batch.

    Args:
        path: path of the image file.

    Returns:
        float64 array of shape (1, IMG_SIZE, IMG_SIZE, 3) with values scaled
        to [0, 1], matching the 1/255 rescaling of the data generators.
    """
    # `image` was never imported in this notebook, so the bare
    # image.load_img call raised NameError; go through the tf namespace.
    img = tf.keras.preprocessing.image.load_img(path, target_size=(IMG_SIZE, IMG_SIZE))
    x = np.array(img).astype('float64')/255
    x = np.expand_dims(x, axis=0)
    return x

In [None]:
# Predict every evaluation image one by one and collect predicted and true
# class indices for the confusion matrix.
class_numbers_predicted = []
class_numbers_actual = []

# Read the images from the folder the evaluation generator was built on. The
# previously hard-coded '.../processed/all' folder only exists when the
# optional combined-dataset cell was run.
evaluation_image_dir = evaluation_generator.directory

for actual, label_rows in evaluation_df.groupby("label"):
  for path in label_rows["file_name"]:
    x = get_image(os.path.join(evaluation_image_dir, path))
    p = model.predict(x)
    class_numbers_predicted.append(argmax(p))
    class_numbers_actual.append(class_indices[actual])

In [None]:
# Class names ordered by their class index, so position k in the list is the
# name of class k.
name_to_index = evaluation_generator.class_indices
cm_plot_labels = sorted(name_to_index, key=name_to_index.get)
# Pass the full index list explicitly: without `labels`, classes that never
# occur in the true or predicted values are dropped from the matrix and the
# tick labels no longer line up with its rows and columns.
cm = confusion_matrix(y_true=class_numbers_actual,
                      y_pred=class_numbers_predicted,
                      labels=sorted(name_to_index.values()))
plot_confusion_matrix(cm=cm, classes=cm_plot_labels, normalize=False, title='Confusion Matrix GroZi-120', file_name='confusion_matrix_grozi120')