<a href="https://colab.research.google.com/github/Angelvj/Alzheimer-disease-classification/blob/main/code/experiment5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment switch: True selects the Kaggle-specific imports and repository path,
# False selects the Google Colab ones (see the `if kaggle:` branches in the cells below).
kaggle = False

# Imports

In [2]:
import tensorflow as tf
from google.colab import drive
from tensorflow import keras
import os, shutil, re
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
if kaggle:
    from kaggle_datasets import KaggleDatasets
    from kaggle_secrets import UserSecretsClient
else:
    from google.colab import drive
import nibabel as nib

# Import the most used layers
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten, Dense, Input, BatchNormalization, Dropout

In [3]:
import shutil
import sys
import tensorflow as tf

# Remove any previous checkout so the clone below always starts from a clean directory.
if tf.io.gfile.exists('Alzheimer-disease-classification'):
    shutil.rmtree('Alzheimer-disease-classification')
# IPython shell escape: clones the project repository into the current working directory.
! git clone https://github.com/Angelvj/Alzheimer-disease-classification.git

# Put the repository's `code` folder first on sys.path so that the `functions`
# and `models` packages imported in the next cell can be resolved.
if not kaggle:
    sys.path.insert(0,'/content/Alzheimer-disease-classification/code')
else:
    sys.path.insert(0, './Alzheimer-disease-classification/code')

Cloning into 'Alzheimer-disease-classification'...
remote: Enumerating objects: 1010, done.[K
remote: Counting objects: 100% (259/259), done.[K
remote: Compressing objects: 100% (207/207), done.[K
remote: Total 1010 (delta 150), reused 137 (delta 52), pack-reused 751[K
Receiving objects: 100% (1010/1010), 20.23 MiB | 22.18 MiB/s, done.
Resolving deltas: 100% (508/508), done.


In [4]:
import functions.lr_schedules as lr_schedules
import functions.io_utils as io
from functions.model_evaluation import repeated_kfold, plot_epochs_history, get_rkf_history
from functions.tfrec_loading import count_data_items
from functions.general import change_input_shape
from models import feedforward_models_pet, ResNet

# Hardware configuration

In [5]:
# Requested accelerator: 'TPU' or 'GPU'. Any other value uses TensorFlow's default strategy.
DEVICE = 'TPU'
# TPU cluster resolver; stays None unless a TPU was successfully initialised.
# evaluate_model_kfold checks it to decide whether to re-initialise the TPU per fold.
tpu = None

if DEVICE == 'TPU':
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        STRATEGY = tf.distribute.experimental.TPUStrategy(tpu)
    except ValueError:
        # No TPU could be resolved: fall back to the default strategy.
        print('Could not connect to TPU, setting default strategy')
        tpu = None
        STRATEGY = tf.distribute.get_strategy()
elif DEVICE == 'GPU':
    STRATEGY = tf.distribute.MirroredStrategy()
else:
    # Previously STRATEGY was left undefined for any other DEVICE value, which
    # made the REPLICAS line below fail with a NameError.
    STRATEGY = tf.distribute.get_strategy()

AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = STRATEGY.num_replicas_in_sync

print(f'Number of accelerators: {REPLICAS}')

Could not connect to TPU, setting default strategy
Number of accelerators: 1


In [6]:
# Mount Google Drive at /content/drive; the pretrained ResNet weights and the
# dataset paths used further down live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Create model with two inputs

In [7]:
# Model with two inputs
def build_model():
    """Build the two-input (PET + MRI) classification model.

    The PET branch is ``feedforward_models_pet.model_4`` cut at its
    third-to-last layer. The MRI branch is a pretrained ResNet loaded from
    Google Drive, cut at its second-to-last layer, adapted to the MRI input
    shape with ``change_input_shape`` and frozen (``trainable = False``).
    The features of both branches are concatenated and fed to a 3-unit
    softmax layer.

    Returns:
        keras.Model: model with inputs named ``pet_input`` and ``mri_input``
        and a single output named ``label`` holding class probabilities.
    """

    # Base pet model
    pet_model = feedforward_models_pet.model_4(input_shape=(79, 95, 68, 1))
    # Delete classifier
    pet_model = keras.Model(pet_model.input, pet_model.layers[-3].output)

    # Input for pet images
    pet_input = keras.Input(shape=(79, 95, 68, 1), name='pet_input')
    pet_features = pet_model(pet_input)

    # Base mri model
    pretrained_resnet = keras.models.load_model('/content/drive/MyDrive/pretrained_models/pretrained_3D_resnet18.h5')
    # Delete classifier
    pretrained_resnet = keras.Model(pretrained_resnet.input, pretrained_resnet.layers[-2].output)
    # Change input shape
    pretrained_resnet = change_input_shape(pretrained_resnet, (121, 145, 121, 1), 'new_input')
    # Freeze the pretrained branch so only the PET branch and the classifier are trained
    pretrained_resnet.trainable = False
    # Input for mri images
    mri_input = keras.Input(shape=(121, 145, 121, 1), name='mri_input')
    mri_features = pretrained_resnet(mri_input)

    x = keras.layers.concatenate([pet_features, mri_features])
    # The model is compiled with CategoricalCrossentropy() using its default
    # from_logits=False, which expects probabilities; without a softmax the
    # layer would emit raw logits and the loss would be computed incorrectly.
    pred = keras.layers.Dense(3, activation='softmax', name='label')(x)
    model = keras.Model(inputs=[pet_input, mri_input], outputs = [pred])

    return model

# Read tfrecords

In [8]:
# Now each tfrecord has two images and one label
def read_tfrecord(example):
    """Decode one serialized example into model inputs and target.

    Every feature is stored as a variable-length float list, so each one is
    densified and reshaped to its known shape (``PET_SHAPE``, ``MRI_SHAPE``
    and ``[NUM_CLASSES]``).

    Returns:
        tuple: ``({'pet_input': ..., 'mri_input': ...}, {'label': ...})``,
        keyed by the input/output layer names of the model.
    """
    feature_spec = {
        name: tf.io.VarLenFeature(tf.float32)
        for name in ("image_pet", "image_mri", "one_hot_label")
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    def as_dense(feature_name, shape):
        # Sparse -> dense, then restore the original tensor shape.
        return tf.reshape(tf.sparse.to_dense(parsed[feature_name]), shape)

    target = as_dense('one_hot_label', [NUM_CLASSES])
    pet_volume = as_dense('image_pet', PET_SHAPE)
    mri_volume = as_dense('image_mri', MRI_SHAPE)

    inputs = {'pet_input': pet_volume, 'mri_input': mri_volume}
    outputs = {'label': target}
    return inputs, outputs

def load_dataset(filenames, labels, no_order=True):
    """Create a dataset of parsed examples from TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labels: not used by this function (the label is stored inside each record).
        no_order: when True, allow order-altering optimizations
            (non-deterministic element order).

    Returns:
        tf.data.Dataset: elements produced by ``read_tfrecord``.
    """
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads = AUTO)

    if no_order:
        # Allow order-altering optimizations
        relaxed_order = tf.data.Options()
        relaxed_order.experimental_deterministic = False
        records = records.with_options(relaxed_order)

    return records.map(read_tfrecord, num_parallel_calls = AUTO)

def get_dataset(filenames, labels=None, batch_size = 4, train=False, cache=True, no_order=True):
    """Build the batched, prefetched input pipeline for a set of TFRecord files.

    Args:
        filenames: TFRecord file paths.
        labels: forwarded to ``load_dataset``, which does not use it.
        batch_size: number of examples per batch.
        train: when True the dataset is shuffled and repeated indefinitely
            (the caller must then pass ``steps_per_epoch`` to ``fit``).
        cache: cache the parsed examples in memory; only do it if the
            dataset fits in RAM.
        no_order: forwarded to ``load_dataset``.

    Returns:
        tf.data.Dataset: batched and prefetched dataset.
    """
    dataset = load_dataset(filenames, labels, no_order)

    if cache:
        dataset = dataset.cache() # Do it only if dataset fits in ram
    if train:
        # Shuffle BEFORE repeat: repeating first mixes epoch boundaries, so a
        # single pass could contain duplicated examples and miss others.
        # Shuffling first still reshuffles on every repetition.
        dataset = dataset.shuffle(count_data_items(filenames))
        dataset = dataset.repeat()

    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTO)
    return dataset

# Evaluation functions

We have to redefine some functions because of problems caused by using two
inputs (this will be fixed in the future).

In [9]:
# Training and evaluation
def evaluate_model_kfold(model_builder, train_filenames, n_folds, batch_size, epochs, 
                         plot_fold_results = True, plot_avg_results = True, train_labels=None, 
                         stratify=False, shuffle=True, random_state=None, cbks=None):
    """Train and validate a freshly built model on every split of a k-fold partition.

    For each fold a new model is created with ``model_builder`` inside
    ``STRATEGY.scope()``, compiled with Adam (learning rate ``LR``),
    categorical cross-entropy and ``METRICS``, and fitted for ``epochs`` epochs.

    Args:
        model_builder: zero-argument callable returning an uncompiled Keras model.
        train_filenames: array of TFRecord paths, indexable with integer arrays.
        n_folds: number of folds.
        batch_size: batch size used for training and validation.
        epochs: number of training epochs per fold.
        plot_fold_results: plot the history of every fold.
        plot_avg_results: plot and print the results averaged over folds.
        train_labels: array of class labels aligned with ``train_filenames``;
            required when ``stratify`` is True.
        stratify: use ``StratifiedKFold`` instead of ``KFold``.
        shuffle: shuffle the samples before splitting.
        random_state: seed of the splitter; ignored when ``shuffle`` is False.
        cbks: Keras callbacks passed to ``fit``.

    Returns:
        list of dict: ``history.history`` of every fold, in fold order.

    Raises:
        ValueError: if ``stratify`` is True and ``train_labels`` is None.
    """
    if stratify and train_labels is None:
        raise ValueError('train_labels is required when stratify=True')

    # scikit-learn rejects a random_state when shuffle is False, so only
    # forward the seed when it can actually have an effect.
    split_seed = random_state if shuffle else None
    splitter_cls = StratifiedKFold if stratify else KFold
    skf = splitter_cls(n_splits=n_folds, shuffle=shuffle, random_state=split_seed)

    def n_steps(filenames):
        # Number of batches needed to cover the files once (at least one).
        return max(1, int(np.rint(count_data_items(filenames)/batch_size)))

    folds_histories = []

    for fold, (idx_train, idx_val) in enumerate(skf.split(train_filenames, train_labels)):
        if tpu is not None:
            tf.tpu.experimental.initialize_tpu_system(tpu)

        X_train = train_filenames[idx_train]
        X_val = train_filenames[idx_val]
        # Labels are optional for plain KFold; do not index into None.
        y_train = None if train_labels is None else train_labels[idx_train]
        y_val = None if train_labels is None else train_labels[idx_val]

        # Build model
        tf.keras.backend.clear_session()
        with STRATEGY.scope():
            model = model_builder()
            # Optimizers and Losses create TF variables --> should always be initialized in the scope
            OPT = tf.keras.optimizers.Adam(learning_rate=LR)
            LOSS = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.00)
            model.compile(optimizer=OPT, loss=LOSS, metrics=METRICS) #steps_per_execution=8)

        # Train
        print(f'Training for fold {fold + 1} of {n_folds}...')
        history = model.fit(
            get_dataset(X_train, y_train, train=True, batch_size=batch_size), 
            # Use the `epochs` argument; the global EPOCHS was used here before,
            # silently ignoring the value requested by the caller.
            epochs = epochs, callbacks = cbks,
            steps_per_epoch = n_steps(X_train),
            validation_data = get_dataset(X_val, y_val, batch_size = batch_size, train=False) ,
            validation_steps= n_steps(X_val))
    
        if tf.__version__ == "2.4.1": # TODO: delete when tensorflow fixes the bug
            # Workaround: overwrite the last-epoch training metrics with the
            # values obtained by re-evaluating on the training files.
            scores = model.evaluate(get_dataset(X_train, y_train, batch_size = batch_size, train=False), 
                                    batch_size = batch_size, steps = n_steps(X_train))
            for i in range(len(model.metrics_names)):
                history.history[model.metrics_names[i]][-1] = scores[i]
            
        folds_histories.append(history.history)
        
        if plot_fold_results:
            plot_epochs_history(epochs, history.history)
        
    avg_history = avg_results_per_epoch(folds_histories)
            
    if plot_avg_results:
        
        plot_epochs_history(epochs, avg_history)

        print('-'*80)
        print('Results per fold')
        for i, fold_history in enumerate(folds_histories):
            print('-'*80)
            out = f"> Fold {i + 1} - loss: {fold_history['loss'][-1]} - accuracy: {fold_history['accuracy'][-1]}"
            out += f" - val_loss.: {fold_history['val_loss'][-1]} - val_accuracy: {fold_history['val_accuracy'][-1]}"
            print(out)

        print('-'*80)
        print('Average results over folds (on last epoch):')
        print(f"> loss: {avg_history['loss'][-1]}")
        print(f"> accuracy: {avg_history['accuracy'][-1]}")
        print(f"> cval_loss: {avg_history['val_loss'][-1]}")
        print(f"> cval_accuracy: {avg_history['val_accuracy'][-1]}")
        print('-'*80)

    return folds_histories

def repeated_kfold(model_builder, train_filenames, n_folds, batch_size, epochs, reps=5, train_labels=None,
                   stratify=True, shuffle=True, random_state=None, cbks=None):
    """Run ``evaluate_model_kfold`` several times and collect every result.

    Args:
        model_builder: zero-argument callable returning an uncompiled Keras model.
        train_filenames: array of TFRecord paths.
        n_folds: number of folds per repetition.
        batch_size: batch size used for training and validation.
        epochs: number of training epochs per fold.
        reps: number of repetitions of the k-fold procedure.
        train_labels: array of class labels aligned with ``train_filenames``.
        stratify: use stratified folds.
        shuffle: shuffle the samples before splitting.
        random_state: base seed for the splits. An integer seed is offset by
            the repetition index so each repetition gets a different, but
            still reproducible, partition. Other values are forwarded as is.
        cbks: Keras callbacks passed to ``fit``.

    Returns:
        list: one entry per repetition, each being the list of per-fold
        histories returned by ``evaluate_model_kfold``.
    """
    reps_histories = []
    
    for i in range(reps):
        print(f'Repetition {i + 1}')
        # Passing the same integer seed on every repetition would reproduce
        # the exact same folds each time, so the repetitions would not sample
        # different partitions of the data.
        if isinstance(random_state, (int, np.integer)):
            rep_random_state = random_state + i
        else:
            rep_random_state = random_state

        folds_histories = evaluate_model_kfold(model_builder, train_filenames, n_folds,
                                             batch_size, epochs, train_labels=train_labels, stratify=stratify,
                                             shuffle=shuffle, random_state=rep_random_state, cbks=cbks)

        reps_histories.append(folds_histories)

    return reps_histories

def test_model_rkfold(model_builder, results_filename):
    """Evaluate a model with repeated k-fold and store the histories on disk.

    Uses the notebook-level configuration (``X_train``, ``y_train``, ``FOLDS``,
    ``BATCH_SIZE``, ``EPOCHS``, ``REPS``, ``SEED``, ``CBKS``).

    Args:
        model_builder: zero-argument callable returning an uncompiled Keras model.
        results_filename: path of the text file that receives ``repr`` of the
            results (the format read back by ``show_rkfold_results``).
    """
    # Evaluate model with repeated k-fold (because of the high variance)
    reps_results = repeated_kfold(model_builder, X_train, FOLDS, BATCH_SIZE, EPOCHS, reps=REPS, train_labels=y_train,
                   random_state=SEED, cbks=CBKS)
    
    # Save results to disk; the context manager closes the file even if the write fails
    with open(results_filename, 'w') as f:
        f.write(repr(reps_results))
    
    
def avg_results_per_epoch(histories):
    """Average several training histories epoch by epoch.

    Args:
        histories: non-empty list of dicts mapping a metric name to its list
            of per-epoch values. The metric names and the number of epochs
            are taken from the first history.

    Returns:
        dict: metric name -> list with the mean over all histories for each epoch.
    """
    reference = histories[0]
    metric_names = list(reference.keys())
    n_epochs = len(reference[metric_names[0]])

    averaged = {}
    for name in metric_names:
        per_epoch_means = []
        for epoch in range(n_epochs):
            values_at_epoch = [history[name][epoch] for history in histories]
            per_epoch_means.append(np.mean(values_at_epoch))
        averaged[name] = per_epoch_means

    return averaged

def avg_reps_results(reps_histories):
    """Average repeated k-fold results: first over folds, then over repetitions.

    Args:
        reps_histories: list (one entry per repetition) of lists of per-fold
            history dicts.

    Returns:
        dict: metric name -> list of per-epoch means.
    """
    per_repetition = []
    for folds_histories in reps_histories:
        per_repetition.append(avg_results_per_epoch(folds_histories))
    return avg_results_per_epoch(per_repetition)
    
def show_rkfold_results(results_file):
    """Load repeated k-fold results from disk, plot them and print a summary.

    Args:
        results_file: path of a text file written by ``test_model_rkfold``
            (``repr`` of a list of repetitions, each a list of fold histories).
    """
    # Load results from disk; the context manager makes sure the file is closed
    with open(results_file, 'r') as f:
        # NOTE(review): eval() executes whatever the file contains. Only use
        # this on files produced by test_model_rkfold; consider a safer
        # format (e.g. JSON) or ast.literal_eval.
        reps_results = eval(f.read())
    
    reps_avgd_per_kfold = [avg_results_per_epoch(history) for history in reps_results]
    reps_avg = avg_results_per_epoch(reps_avgd_per_kfold)
    
    # Plot final result over epochs
    plot_epochs_history(EPOCHS, reps_avg)
    
    print('-'*80)
    print('Results per repetition (on last epoch)')
    # Iterate over the repetitions actually stored in the file instead of the
    # global REPS, which may not match and could raise an IndexError.
    for i, rep_avg in enumerate(reps_avgd_per_kfold):
        print('-'*80)
        print(f"> Repetition {i + 1} - Loss: {rep_avg['val_loss'][-1]} - Accuracy : {rep_avg['val_accuracy'][-1]}")

    print('-'*80)
    print('Average results over repetitions (on last epoch):')
    print(f"> Train Accuracy: {reps_avg['accuracy'][-1]}")
    print(f"> Train Loss: {reps_avg['loss'][-1]}")
    print(f"> CV accuracy: {reps_avg['val_accuracy'][-1]}")
    print(f"> CV Loss: {reps_avg['val_loss'][-1]}")
    print('-'*80)

# Train and evaluate

In [None]:
# Passed as random_state to the k-fold splitter (see test_model_rkfold).
SEED = 268 # arbitrary seed

# Datasets
# Folder names of the available TFRecord datasets; select_dataset() picks one by index.
TFREC_DATASETS = ['tfrec-ensemble_mri_pet']

# Shapes used by read_tfrecord to reshape the flat image features.
PET_SHAPE = (79, 95, 68, 1)
MRI_SHAPE = (121, 145, 121, 1)

# Training / evaluation settings consumed by test_model_rkfold and evaluate_model_kfold.
EPOCHS = 50
BATCH_SIZE = 4
# Number of repetitions of the k-fold procedure.
REPS = 5
# Number of folds in each repetition.
FOLDS = 10
# Learning rate of the Adam optimizer.
LR = 0.00001
# Keras callbacks passed to model.fit (None = no callbacks).
CBKS = None

CLASSES = ['NOR', 'AD', 'MCI']
NUM_CLASSES = len(CLASSES)

# Metrics passed to model.compile; the result printouts read the 'accuracy' keys.
METRICS = ['accuracy']

drive.mount('/content/drive')
# Base folder of the datasets; when None, select_dataset() resolves the path
# through KaggleDatasets instead.
INPUT_DATAPATH = '/content/drive/MyDrive/data/'
# Base folder holding the per-dataset train/test summary CSV files.
METADATA_PATH = '/content/drive/MyDrive/data/'


def select_dataset(ds_id):
    """Select the active dataset and resolve its location.

    Sets the module-level ``DS`` (dataset name) and ``DS_PATH`` (dataset
    location). When ``INPUT_DATAPATH`` is None the location is obtained from
    ``KaggleDatasets`` after registering the user's Google Cloud credential
    with TensorFlow; otherwise it is ``INPUT_DATAPATH + DS``.

    Args:
        ds_id: index into ``TFREC_DATASETS``.
    """
    # Only the names assigned here need a global declaration.
    global DS, DS_PATH
    DS = TFREC_DATASETS[ds_id]
    if INPUT_DATAPATH is None:
        user_secrets = UserSecretsClient()
        user_credential = user_secrets.get_gcloud_credential()
        user_secrets.set_tensorflow_credential(user_credential)
        DS_PATH = KaggleDatasets().get_gcs_path(DS)
    else:
        DS_PATH = INPUT_DATAPATH + DS


# Activate the first (and only) dataset in TFREC_DATASETS; sets DS and DS_PATH.
select_dataset(0)

# Summary CSVs describing the train and test splits of the selected dataset.
metadata_train = pd.read_csv(METADATA_PATH + DS + '/train/train_summary.csv', encoding='utf-8')
metadata_test = pd.read_csv(METADATA_PATH + DS + '/test/test_summary.csv', encoding='utf-8')

# The first CSV column is appended to the split folder to form each file path.
# The last len(CLASSES) columns are reduced to a class index with argmax
# (presumably one-hot label columns -- confirm against the CSV layout).
X_train = DS_PATH + '/train/' + metadata_train.iloc[:, 0].to_numpy()
y_train = np.argmax(metadata_train.iloc[:,-len(CLASSES):].to_numpy(), axis=1)
X_test = DS_PATH + '/test/' + metadata_test.iloc[:, 0].to_numpy()
y_test = np.argmax(metadata_test.iloc[:,-len(CLASSES):].to_numpy(), axis=1)

# Run the repeated k-fold evaluation, save the histories, then plot and print them.
test_model_rkfold(build_model, 'ensemble_results.txt')
show_rkfold_results('ensemble_results.txt')