In [15]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score 

In [16]:
df_train = pd.read_csv("ADNIMERGE_train.csv")
df_test = pd.read_csv("ADNIMERGE_test.csv")

In [17]:
df_train.columns

Index(['RID', 'DX_bl', 'PTGENDER', 'PTEDUCAT', 'PTRACCAT_Asian',
       'PTRACCAT_Black', 'PTRACCAT_Hawaiian/Other_PI',
       'PTRACCAT_More_than_one', 'PTRACCAT_Unknown', 'PTRACCAT_White',
       'PTETHCAT_Not_Hisp/Latino', 'PTMARRY_Married', 'PTMARRY_Never_married',
       'PTMARRY_Widowed', 'APOE4', 'APOE4_slope', 'CSF_ABETA',
       'CSF_ABETA_slope', 'CSF_TAU', 'CSF_TAU_slope', 'CSF_PTAU',
       'CSF_PTAU_slope', 'FDG', 'FDG_slope', 'AV45', 'AV45_slope', 'CDRSB',
       'CDRSB_slope', 'ADAS13', 'ADAS13_slope', 'MMSE', 'MMSE_slope',
       'RAVLT_immediate', 'RAVLT_immediate_slope', 'RAVLT_learning',
       'RAVLT_learning_slope', 'RAVLT_forgetting', 'RAVLT_forgetting_slope',
       'RAVLT_perc_forgetting', 'RAVLT_perc_forgetting_slope', 'MOCA',
       'MOCA_slope', 'EcogPtMem', 'EcogPtMem_slope', 'EcogPtLang',
       'EcogPtLang_slope', 'EcogPtVisspat', 'EcogPtVisspat_slope',
       'EcogPtPlan', 'EcogPtPlan_slope', 'EcogPtOrgan', 'EcogPtOrgan_slope',
       'EcogPtDivatt', 'Eco

### LDA & QDA

In [18]:
# normalization
cols_continuous = ['APOE4', 'APOE4_slope', 'CSF_ABETA',
                   'CSF_ABETA_slope', 'CSF_TAU', 'CSF_TAU_slope', 'CSF_PTAU',
                   'CSF_PTAU_slope', 'FDG', 'FDG_slope', 'AV45', 'AV45_slope', 'CDRSB',
                   'CDRSB_slope', 'ADAS13', 'ADAS13_slope', 'MMSE', 'MMSE_slope',
                   'RAVLT_immediate', 'RAVLT_immediate_slope', 'RAVLT_learning',
                   'RAVLT_learning_slope', 'RAVLT_forgetting', 'RAVLT_forgetting_slope',
                   'RAVLT_perc_forgetting', 'RAVLT_perc_forgetting_slope', 'MOCA',
                   'MOCA_slope', 'EcogPtMem', 'EcogPtMem_slope', 'EcogPtLang',
                   'EcogPtLang_slope', 'EcogPtVisspat', 'EcogPtVisspat_slope',
                   'EcogPtPlan', 'EcogPtPlan_slope', 'EcogPtOrgan', 'EcogPtOrgan_slope',
                   'EcogPtDivatt', 'EcogPtDivatt_slope', 'EcogSPMem', 'EcogSPMem_slope',
                   'EcogSPLang', 'EcogSPLang_slope', 'EcogSPVisspat',
                   'EcogSPVisspat_slope', 'EcogSPPlan', 'EcogSPPlan_slope', 'EcogSPOrgan',
                   'EcogSPOrgan_slope', 'EcogSPDivatt', 'EcogSPDivatt_slope', 'FAQ',
                   'FAQ_slope', 'Ventricles', 'Ventricles_slope', 'Hippocampus',
                   'Hippocampus_slope', 'WholeBrain', 'WholeBrain_slope', 'Entorhinal',
                   'Entorhinal_slope', 'Fusiform', 'Fusiform_slope', 'MidTemp',
                   'MidTemp_slope', 'ICV', 'ICV_slope']

X_train = df_train.drop(['RID', 'DX_bl'], axis=1).copy()
y_train = df_train['DX_bl'].copy()
X_test = df_test.drop(['RID', 'DX_bl'], axis=1).copy()
y_test = df_test['DX_bl'].copy()
for i in cols_continuous:
    col_mean = np.mean(X_train[i])
    col_sd = np.std(X_train[i])
    if col_sd < 1e-10*col_mean:
        X_train.loc[i] = (X_train[i]-col_mean)/col_sd
        X_test.loc[i] = (X_test[i]-col_mean)/col_sd
        

In [19]:
## LDA, QDA (standardization needed)

lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()

lda.fit(X_train,y_train)
qda.fit(X_train,y_train)

print('LDA Train Score: ',lda.score(X_train,y_train))
print('QDA Train Score: ',qda.score(X_train,y_train))

print('LDA Test Score: ',lda.score(X_test,y_test))
print('QDA Test Score: ',qda.score(X_test,y_test))


def score(model, X_train, y_train, X_test, y_test):
    train_acc = model.score(X_train,y_train)
    test_acc = model.score(X_test,y_test)
    test_class0 = model.score(X_test[y_test==0], y_test[y_test==0])
    test_class1 = model.score(X_test[y_test==1], y_test[y_test==1])
    test_class2 = model.score(X_test[y_test==2], y_test[y_test==2])
    return pd.Series([train_acc, test_acc, test_class0, test_class1, test_class2],
                    index = ['Train accuracy', 'Test accuracy', 
                             "Test accuracy CN", "Test accuracy CI", "Test accuracy AD"])

lda_score = score(lda, X_train, y_train, X_test, y_test)
qda_score = score(qda, X_train, y_train, X_test, y_test)

LDA Train Score:  0.869565217391
QDA Train Score:  0.933977455717
LDA Test Score:  0.827160493827
QDA Test Score:  0.802469135802




### KNN

In [20]:
## KNN
X_train = df_train.drop(['RID', 'DX_bl'], axis=1).copy()
y_train = df_train['DX_bl'].copy()
X_test = df_test.drop(['RID', 'DX_bl'], axis=1).copy()
y_test = df_test['DX_bl'].copy()

max_score = 0
max_k = 0 

for k in range(1,60):
    knn = KNeighborsClassifier(n_neighbors = k)
    knn_val_score = cross_val_score(knn, X_train, y_train).mean()
    if knn_val_score > max_score:
        max_k = k
        max_score = knn_val_score
        
knn = KNeighborsClassifier(n_neighbors = max_k)
knn.fit(X_train,y_train)

print("Optimal number of neighbours: ", max_k)
print('KNN Train Score: ', knn.score(X_train,y_train))
print('KNN Test Score: ', knn.score(X_test,y_test))

knn_score = score(knn, X_train, y_train, X_test, y_test)

Optimal number of neighbours:  37
KNN Train Score:  0.566827697262
KNN Test Score:  0.592592592593


In [21]:
score_df = pd.DataFrame({'knn': knn_score, 
                         'lda': lda_score,
                         'qda': qda_score})
score_df

Unnamed: 0,knn,lda,qda
Train accuracy,0.566828,0.869565,0.933977
Test accuracy,0.592593,0.82716,0.802469
Test accuracy CN,0.02381,0.690476,0.761905
Test accuracy CI,0.989247,0.88172,0.817204
Test accuracy AD,0.111111,0.851852,0.814815


## Bootstrap

In [22]:
iterations = 1000
boot = np.zeros((X_train.shape[1], iterations))
for i in range(iterations):
    boot_rows = np.random.choice(range(X_train.shape[0]),
                                 size=X_train.shape[0], replace=True)
    X_train_boot = X_train.values[boot_rows]
    y_train_boot = y_train.values[boot_rows]
    model_boot = LinearDiscriminantAnalysis()
    model_boot.fit(X_train_boot, y_train_boot)
    boot[:,i] = model_boot.coef_[2,:]
    
boot_ci_upper = np.percentile(boot, 97.5, axis=1)
boot_ci_lower = np.percentile(boot, 2.5, axis=1)
sig_b_ct = []
for i in range(X_train.shape[1]):
    if boot_ci_upper[i]<0 or boot_ci_lower[i]>0:
        sig_b_ct.append(i)
        
print("Most significant coefficients: ")
print(X_train.columns[sig_b_ct])



Most significant coefficients: 
Index(['PTEDUCAT', 'CSF_PTAU_slope', 'CDRSB', 'ADAS13', 'MMSE', 'FAQ'], dtype='object')
