**SCC0276 — Machine Learning**

**Project:** Alzheimer's disease detection

**Authors:**
- Alice Valença De Lorenci - 11200289
- Gabriel Soares Gama - 10716511
- Marcos Antonio Victor Arce - 10684621

In [6]:
# libraries
import sklearn
import tensorflow as tf
import numpy as np
import pandas as pd
import joblib
import random
import sys

# sklearn
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# project layout: every location below is expressed relative to `path`
path = '../'
data_path = path + 'data/'
SVM_path = path + 'models/SVM/'

# make the project's own modules importable
# (presumably load_dataframe / confusionMatrix live in these folders -- confirm)
for module_dir in ( 'preprocessing/', 'models/' ):
    sys.path.append( path + module_dir )

# custom libs
from load_dataframe import load_dataframe
from confusionMatrix import plot_confusion_matrix

In [7]:
# reproducibility: pin both global random number generators to seed 0
random.seed(0)      # Python's built-in `random` module
np.random.seed(0)   # NumPy's legacy global generator

# Load Dataset

In [8]:
# load data frame
df = load_dataframe( data_path + 'features.npz', data_path + 'name_class.csv' )
display( df.head() )
print("Dataframe shape:", df.shape)

# class encoding: pandas sorts the categories, so label -> code is deterministic
classes = df["class"].unique()
class_names = df["class"].astype('category').cat.categories
class_code = dict( zip( class_names, np.arange( 0, len( class_names ) ) ) )
print( "\nClass encoding: ", class_code )

# data split: every column except the label and the image name is a feature
X = df.drop( columns=["class", "img_name"] )
y = df["class"].copy()

print('\nX shape: ',X.shape)
print('Y shape: ',y.shape)

# train and test sets (split BEFORE normalizing so that no test-set statistics
# leak into the preprocessing)
x_train, x_test, y_train, y_test = train_test_split( X, y, test_size=0.1, random_state=0 )

# normalize (Z-score) with the mean / population std of the training rows only
feature_mean = x_train.mean()
feature_std = x_train.std( ddof=0 )
# constant features have std == 0; divide by 1 instead so they become 0, not NaN
feature_std = feature_std.where( feature_std > 0, 1.0 )

x_train = ( x_train - feature_mean ) / feature_std
x_test = ( x_test - feature_mean ) / feature_std

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1272,1273,1274,1275,1276,1277,1278,1279,class,img_name
0,1.110234,1.507227,0.0,0.0,0.118538,0.0,0.61166,0.0,0.216659,0.0,...,0.0,0.582409,0.0,0.0,0.0,0.292292,0.080777,0.342733,CN,ADNI_941_S_4292_MR_ADNI_gre_field_mapping_br_r...
1,0.377716,0.804323,0.0,0.044572,0.469181,0.0,1.109573,0.0,0.427474,0.0,...,0.018944,0.851525,0.0,0.239364,0.111541,0.0,0.363659,0.683264,CN,ADNI_941_S_4292_MR_ADNI_gre_field_mapping_br_r...
2,1.291019,0.071864,0.177774,1.815525,0.103777,0.035584,0.019683,0.0,0.0,0.0,...,0.0,0.226819,0.0,0.445593,0.652496,0.022925,0.0,0.357433,CN,ADNI_941_S_4292_MR_ADNI_gre_field_mapping_br_r...
3,0.390894,0.746412,0.039011,0.298714,1.402811,0.0,0.052603,0.098031,0.058295,0.0,...,0.00499,0.480598,0.0,0.0,1.700429,0.416542,0.03975,2.452053,CN,ADNI_941_S_4292_MR_ADNI_gre_field_mapping_br_r...
4,1.896747,0.958916,0.002194,0.096892,0.260261,0.0,0.86198,0.04371,0.040442,0.0,...,0.079554,0.058523,0.0,0.10071,0.003879,0.023642,0.020626,0.359759,CN,ADNI_941_S_4292_MR_ADNI_gre_field_mapping_br_r...


Dataframe shape: (18748, 1282)

Class encoding:  {'AD': 0, 'CN': 1, 'EMCI': 2, 'LMCI': 3}

X shape:  (18748, 1280)
Y shape:  (18748,)


# Training

In [None]:
# cross-validation scheme: shuffled K-fold with a fixed seed
nfolds = 5
kf = KFold( n_splits=nfolds, shuffle=True, random_state=0 )

# candidate models: position i of each list below describes candidate i
kernels = ['linear', 'poly', 'poly', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf']
degrees = [       3,      3,      5,     3,     3,     3,     3,     3]
gammas  = [ 'scale','scale','scale',   0.1,   1.0,  10.0,   0.1,   0.1]
Cs      = [     1.0,    1.0,    1.0,   1.0,   1.0,   1.0,    10,   100]


def _cv_scores( kernel, degree, gamma, C ):
    """Return the per-fold cross-validation scores of one SVC configuration."""
    candidate = SVC( kernel=kernel, degree=degree, gamma=gamma, C=C, class_weight='balanced' )
    return cross_val_score( candidate, x_train, y_train, cv=kf )


# fit models: one array of `nfolds` scores per candidate, in list order
results = [ _cv_scores( *params ) for params in zip( kernels, degrees, gammas, Cs ) ]

In [None]:
# print results
pd.options.display.float_format = "{:,.4f}".format

# one (mean, std) pair of the cross-validation scores per candidate model
results_data = [ ( scores.mean(), scores.std() ) for scores in results ]

results_df = pd.DataFrame( data = results_data, columns = ["mean", "std"] )
results_df.insert( 0, "kernel", kernels )
results_df.insert( 1, "degree", degrees )
results_df.insert( 2, "gamma", gammas )
results_df.insert( 3, "C", Cs )

display( results_df )

# best result: the candidate with the highest MEAN cross-validation score
# (the argmax must run over the column of means, not over a single row)
idx = int( np.argmax( results_df["mean"].to_numpy() ) )
best_param = results_df.loc[ idx ]
print( "Best SVM parameters:\n", best_param )

# retrain the selected configuration on the whole training set
bestSVM = SVC( kernel=kernels[idx], degree=degrees[idx], gamma=gammas[idx], C=Cs[idx], class_weight='balanced' )
bestSVM.fit(x_train, y_train)
print("Best SVM accuracy (retrained using all training data):", bestSVM.score(x_test, y_test))

# save SVM model (the context manager closes the file handle once written)
filename = SVM_path + 'SVM' + '.pkl'
with open(filename, 'wb') as model_file:
    joblib.dump(bestSVM, model_file)

# Evaluation metrics

In [None]:
# evaluate the model
y_pred = pd.Series( bestSVM.predict(x_test) )
y_true = y_test

# Encode predictions and ground truth with the SAME ordered category list
# (the keys of class_code). Encoding each vector on its own would shift the
# integer codes whenever one of them is missing a class, and the confusion
# matrix rows/columns would no longer match the labels passed below.
label_dtype = pd.CategoricalDtype( categories=list(class_code.keys()) )

# plot confusion matrix and compute accuracy, specificity and sensitivity per class
y_pred = y_pred.astype(label_dtype).cat.codes
y_true = y_true.astype(label_dtype).cat.codes
plot_confusion_matrix(y_true, y_pred, list(class_code.keys()), save=True, path=data_path+"SVM_cm.pdf")

# Load model

In [None]:
# load saved model
# NOTE: joblib files are pickles -- only load model files from a trusted source.
filename = SVM_path + 'SVM.pkl'
with open(filename, 'rb') as model_file:   # closes the handle after loading
    svm = joblib.load(model_file)