In [14]:
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.linear_model
import sklearn.neighbors

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('talk')

import warnings
warnings.filterwarnings("ignore")

import mlxtend.feature_selection

In [15]:
full_data = pd.read_csv('full_data_short.csv')

In [16]:
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')
test_data = pd.read_csv('test.csv')

In [17]:
list_full_data_columns = list(full_data.columns)
list_train_data_columns = list(train_data.columns)

In [18]:
list_dropped_features = list(set(list_full_data_columns).difference(set(list_train_data_columns)))

In [19]:
print('train_data shape:',np.shape(np.array(train_data)))
print('val_data shape:',np.shape(np.array(val_data)))
print('test_data shape:',np.shape(np.array(test_data)))
print('full_data shape:',np.shape(np.array(full_data)))

train_data shape: (202, 1139)
val_data shape: (36, 1139)
test_data shape: (42, 1139)
full_data shape: (280, 1347)


Inspecting the data reveals that 200 or so proteins were excluded due to NaN values

Split the training and validation data into groups

In [20]:
train_data

Unnamed: 0,group,KV37,LV469,LV861,LVX54,LV746,LV218,LV316,LV312,LV310,...,ARMD4,C1QT5,B2L12,PLXA4,B3GN2,PCDGL,AGRB1,RECK,PSA1,F174A
0,Healthy,28.398290,23.968009,24.296234,21.073690,24.306090,20.834275,22.875021,21.873272,22.012922,...,18.424523,19.702409,28.571054,17.582277,19.833578,17.908683,21.082685,19.540313,20.402270,24.200471
1,Healthy,28.493880,21.512320,22.687815,22.182946,24.306486,23.766681,22.963114,22.786492,22.369101,...,17.826979,19.526250,27.953139,18.670746,18.612532,19.756105,21.337060,19.896682,20.481528,24.270255
2,Healthy,28.860446,22.780248,24.378107,23.365133,25.866827,22.743360,23.808291,22.685879,22.985000,...,18.153627,20.114505,27.944534,18.229467,19.640964,18.612921,21.079712,19.559118,20.400989,24.200733
3,Healthy,28.780151,22.422778,22.902736,21.364631,25.018863,24.221923,23.466170,22.392122,21.878784,...,14.228847,20.311589,28.000618,17.943092,20.497359,19.247203,21.691704,20.267159,20.199615,24.667232
4,Healthy,28.277717,24.656308,24.440401,22.411287,26.695574,26.185975,24.878200,23.932789,24.279117,...,16.580325,19.468810,28.342385,17.563858,19.788633,18.069438,21.078241,19.565659,20.400481,24.200606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,Healthy,28.158586,21.952626,23.201193,21.006439,25.086022,24.276773,23.159882,22.978791,22.439971,...,17.911971,20.330860,28.611495,18.646857,19.733981,17.915158,21.077880,19.101285,20.973856,24.481528
198,PD,28.185324,22.250517,23.253823,21.194145,25.206602,23.695250,22.543163,21.755287,22.051425,...,18.185042,19.820794,27.770040,18.378343,20.378248,19.015282,21.220598,18.624962,20.498130,24.121522
199,PD_MCI_LBD,27.312868,22.469750,21.808856,22.077800,26.440536,24.833839,22.943242,22.217539,22.220150,...,18.413029,20.109011,28.308943,19.640486,20.524291,17.673355,21.588789,20.221517,21.116644,24.988787
200,Healthy,27.906681,22.416611,23.390858,18.371032,26.355765,23.609924,22.921723,21.936902,22.170309,...,18.103972,19.795203,28.039519,18.147929,19.988414,18.120577,21.360518,19.704548,20.530018,24.254393


In [21]:
train_AD_healthy = train_data.loc[(train_data['group'] == 'Healthy') | (train_data['group'] == 'AD_MCI')]
train_PD_healthy = train_data.loc[(train_data['group'] == 'Healthy') | (train_data['group'] == 'PD')]
train_PDMCI_healthy = train_data.loc[(train_data['group'] == 'Healthy') | (train_data['group'] == 'PD_MCI_LBD')]

In [22]:
val_AD_healthy = val_data.loc[(val_data['group'] == 'Healthy') | (val_data['group'] == 'AD_MCI')]
val_PD_healthy = val_data.loc[(val_data['group'] == 'Healthy') | (val_data['group'] == 'PD')]
val_PDMCI_healthy = train_data.loc[(train_data['group'] == 'Healthy') | (train_data['group'] == 'PD_MCI_LBD')]

In [23]:
test_AD_healthy = test_data.loc[(test_data['group'] == 'Healthy') | (test_data['group'] == 'AD_MCI')]
test_PD_healthy = test_data.loc[(test_data['group'] == 'Healthy') | (test_data['group'] == 'PD')]
test_PDMCI_healthy = test_data.loc[(test_data['group'] == 'Healthy') | (test_data['group'] == 'PD_MCI_LBD')]

In [24]:
X_train_AD_healthy = train_AD_healthy.drop(columns='group')
y_train_AD_healthy = train_AD_healthy['group']
X_val_AD_healthy = val_AD_healthy.drop(columns='group')
y_val_AD_healthy = val_AD_healthy['group']
X_test_AD_healthy = test_AD_healthy.drop(columns='group')
y_test_AD_healthy = test_AD_healthy['group']


X_train_PD_healthy = train_PD_healthy.drop(columns='group')
y_train_PD_healthy = train_PD_healthy['group']
X_val_PD_healthy = val_PD_healthy.drop(columns='group')
y_val_PD_healthy = val_PD_healthy['group']
X_test_PD_healthy = test_PD_healthy.drop(columns='group')
y_test_PD_healthy = test_PD_healthy['group']


X_train_PDMCI_healthy = train_PDMCI_healthy.drop(columns='group')
y_train_PDMCI_healthy = train_PDMCI_healthy['group']
X_val_PDMCI_healthy = val_PDMCI_healthy.drop(columns='group')
y_val_PDMCI_healthy = val_PDMCI_healthy['group']
X_test_PDMCI_healthy = test_PDMCI_healthy.drop(columns='group')
y_test_PDMCI_healthy = test_PDMCI_healthy['group']

In [25]:
train_AD_healthy['group'].value_counts()

Healthy    110
AD_MCI      41
Name: group, dtype: int64

In [26]:
print('AD_healthy')
print('train\n',np.shape(np.array(X_train_AD_healthy)),'\n',np.shape(np.array(y_train_AD_healthy)),'\n',train_AD_healthy['group'].value_counts())
print('val\n',np.shape(np.array(X_val_AD_healthy)),'\n',np.shape(np.array(y_val_AD_healthy)),'\n',val_AD_healthy['group'].value_counts())
print('test\n',np.shape(np.array(X_test_AD_healthy)),'\n',np.shape(np.array(y_test_AD_healthy)),'\n',test_AD_healthy['group'].value_counts())

print('\nPD_healthy')
print('train\n',np.shape(np.array(X_train_PD_healthy)),'\n',np.shape(np.array(y_train_PD_healthy)),'\n',train_PD_healthy['group'].value_counts())
print('val\n',np.shape(np.array(X_val_PD_healthy)),'\n',np.shape(np.array(y_val_PD_healthy)),'\n',val_PD_healthy['group'].value_counts())
print('test\n',np.shape(np.array(X_test_PD_healthy)),'\n',np.shape(np.array(y_test_PD_healthy)),'\n',test_PD_healthy['group'].value_counts())

print('\nPDMCI_healthy')
print('train\n',np.shape(np.array(X_train_PDMCI_healthy)),'\n',np.shape(np.array(y_train_PDMCI_healthy)),'\n',train_PDMCI_healthy['group'].value_counts())
print('val\n',np.shape(np.array(X_val_PDMCI_healthy)),'\n',np.shape(np.array(y_val_PDMCI_healthy)),'\n',val_PDMCI_healthy['group'].value_counts())
print('test\n',np.shape(np.array(X_test_PDMCI_healthy)),'\n',np.shape(np.array(y_test_PDMCI_healthy)),'\n',test_PDMCI_healthy['group'].value_counts())

AD_healthy
train
 (151, 1138) 
 (151,) 
 Healthy    110
AD_MCI      41
Name: group, dtype: int64
val
 (26, 1138) 
 (26,) 
 Healthy    22
AD_MCI      4
Name: group, dtype: int64
test
 (30, 1138) 
 (30,) 
 Healthy    24
AD_MCI      6
Name: group, dtype: int64

PD_healthy
train
 (140, 1138) 
 (140,) 
 Healthy    110
PD          30
Name: group, dtype: int64
val
 (24, 1138) 
 (24,) 
 Healthy    22
PD          2
Name: group, dtype: int64
test
 (28, 1138) 
 (28,) 
 Healthy    24
PD          4
Name: group, dtype: int64

PDMCI_healthy
train
 (131, 1138) 
 (131,) 
 Healthy       110
PD_MCI_LBD     21
Name: group, dtype: int64
val
 (131, 1138) 
 (131,) 
 Healthy       110
PD_MCI_LBD     21
Name: group, dtype: int64
test
 (32, 1138) 
 (32,) 
 Healthy       24
PD_MCI_LBD     8
Name: group, dtype: int64


Inspecting the isolated condition groups for data size... PDMCI is pretty small

Define model to feature select with:

In [27]:
model = sklearn.linear_model.RidgeClassifier()
sfs = mlxtend.feature_selection.SequentialFeatureSelector(model, k_features=20, forward=True,verbose=2,n_jobs=-1)

Start with AD_MCI

In [63]:
sfs_AD = sfs.fit(X_train_AD_healthy,y_train_AD_healthy)
top20_AD = sfs_AD.k_feature_names_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 444 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 1138 out of 1138 | elapsed:    2.9s finished

[2023-02-26 06:16:33] Features: 1/20 -- score: 0.8212903225806452[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 696 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1137 out of 1137 | elapsed:    1.9s finished

[2023-02-26 06:16:35] Features: 2/20 -- score: 0.8681720430107529[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 696 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1136 out of 1136 | elapsed:    1.8s finished

[2023-02-26 06:16:37] Features: 3/20 -- score

In [89]:
top20_AD = list(top20_AD)
print('Top 20 AD Proteins:',top20_AD)

Top 20 AD Proteins: ['LVX54', 'KVD15', 'KV311', 'HV601', 'HV374', 'VTM2B', 'SIRB1', 'QSOX1', 'NRP1', 'CLU', 'CAH11', 'TNR21', 'DDAH1', 'PPN', 'PDYN', 'TAU', 'AK1C1', 'TSP2', 'GLT10', 'ENPP5']


In [28]:
top20_AD = ['LVX54', 'KVD15', 'KV311', 'HV601', 'HV374', 'VTM2B', 'SIRB1', 'QSOX1', 'NRP1', 'CLU', 'CAH11', 'TNR21', 'DDAH1', 'PPN', 'PDYN', 'TAU', 'AK1C1', 'TSP2', 'GLT10', 'ENPP5']

In [70]:
sfs_PD = sfs.fit(X_train_PD_healthy,y_train_PD_healthy)
top20_PD = sfs_PD.k_feature_names_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1138 out of 1138 | elapsed:    1.6s finished

[2023-02-26 06:20:13] Features: 1/20 -- score: 0.8071428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 696 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1137 out of 1137 | elapsed:    1.8s finished

[2023-02-26 06:20:15] Features: 2/20 -- score: 0.8285714285714285[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 696 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1136 out of 1136 | elapsed:    1.8s finished

[2023-02-26 06:20:17] Features: 3/20 -- score: 0.8571428571428571[Parallel(n_jobs=-1)]: Using backend LokyB

In [90]:
top20_PD = list(top20_PD)
print('Top 20 PD Proteins:',top20_PD)

Top 20 PD Proteins: ['LV218', 'KV240', 'HV404', 'KV127', 'KVD30', 'KVD15', 'LV136', 'A0A075B7F0', 'HV43D', 'HV428', 'A0A0J9YWU9', 'KVD12', 'APOL1', 'CHAD', 'ENO1', 'CO8A', 'H90B2', 'HLAC', 'PPIB', 'MMP17']


In [30]:
top20_PD = ['LV218', 'KV240', 'HV404', 'KV127', 'KVD30', 'KVD15', 'LV136', 'A0A075B7F0', 'HV43D', 'HV428', 'A0A0J9YWU9', 'KVD12', 'APOL1', 'CHAD', 'ENO1', 'CO8A', 'H90B2', 'HLAC', 'PPIB', 'MMP17']

In [72]:
sfs_PDMCI = sfs.fit(X_train_PDMCI_healthy,y_train_PDMCI_healthy)
top20_PDMCI = sfs_PDMCI.k_feature_names_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1138 out of 1138 | elapsed:    1.7s finished

[2023-02-26 06:22:10] Features: 1/20 -- score: 0.8549857549857549[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 696 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 1137 out of 1137 | elapsed:    1.9s finished

[2023-02-26 06:22:12] Features: 2/20 -- score: 0.8626780626780628[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 696 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1136 out of 1136 | elapsed:    1.9s finished

[2023-02-26 06:22:14] Features: 3/20 -- score: 0.8777777777777779[Parallel(n_jobs=-1)]: Using backend LokyB

In [91]:
top20_PDMCI = list(top20_PDMCI)
print('Top 20 PDMCI Proteins:',top20_PDMCI)

Top 20 PDMCI Proteins: ['LV861', 'LVX54', 'LV746', 'LV312', 'LV545', 'KVD15', 'KV621', 'HV309', 'TEX51', 'IGLO5', 'SHSA7', 'MATN2', 'VGF', 'PPN', 'PAI1', 'AK1C1', 'PLTP', 'LG3BP', 'KLH21', 'B2L12']


In [31]:
top20_PDMCI = ['LV861', 'LVX54', 'LV746', 'LV312', 'LV545', 'KVD15', 'KV621', 'HV309', 'TEX51', 'IGLO5', 'SHSA7', 'MATN2', 'VGF', 'PPN', 'PAI1', 'AK1C1', 'PLTP', 'LG3BP', 'KLH21', 'B2L12']

In [32]:
all_top_20 = []
all_top_20.append(top20_AD)
all_top_20.append(top20_PD)
all_top_20.append(top20_PDMCI)
all_top_20 = np.array(all_top_20)
all_top_20 = list(np.unique(all_top_20))
print(all_top_20)

['A0A075B7F0', 'A0A0J9YWU9', 'AK1C1', 'APOL1', 'B2L12', 'CAH11', 'CHAD', 'CLU', 'CO8A', 'DDAH1', 'ENO1', 'ENPP5', 'GLT10', 'H90B2', 'HLAC', 'HV309', 'HV374', 'HV404', 'HV428', 'HV43D', 'HV601', 'IGLO5', 'KLH21', 'KV127', 'KV240', 'KV311', 'KV621', 'KVD12', 'KVD15', 'KVD30', 'LG3BP', 'LV136', 'LV218', 'LV312', 'LV545', 'LV746', 'LV861', 'LVX54', 'MATN2', 'MMP17', 'NRP1', 'PAI1', 'PDYN', 'PLTP', 'PPIB', 'PPN', 'QSOX1', 'SHSA7', 'SIRB1', 'TAU', 'TEX51', 'TNR21', 'TSP2', 'VGF', 'VTM2B']


['A0A075B7F0', 'A0A0J9YWU9', 'AK1C1', 'APOL1', 'B2L12', 'CAH11', 'CHAD', 'CLU', 'CO8A', 'DDAH1', 'ENO1', 'ENPP5', 'GLT10', 'H90B2', 'HLAC', 'HV309', 'HV374', 'HV404', 'HV428', 'HV43D', 'HV601', 'IGLO5', 'KLH21', 'KV127', 'KV240', 'KV311', 'KV621', 'KVD12', 'KVD15', 'KVD30', 'LG3BP', 'LV136', 'LV218', 'LV312', 'LV545', 'LV746', 'LV861', 'LVX54', 'MATN2', 'MMP17', 'NRP1', 'PAI1', 'PDYN', 'PLTP', 'PPIB', 'PPN', 'QSOX1', 'SHSA7', 'SIRB1', 'TAU', 'TEX51', 'TNR21', 'TSP2', 'VGF', 'VTM2B']


Create three regressors for each disease conditions based on training data with only the top 20 biomarkers

In [33]:
X_train20_AD_healthy = train_AD_healthy[top20_AD]
y_train20_AD_healthy = train_AD_healthy['group']

X_train20_PD_healthy = train_PD_healthy[top20_PD]
y_train20_PD_healthy = train_PD_healthy['group']

X_train20_PDMCI_healthy = train_PDMCI_healthy[top20_PDMCI]
y_train20_PDMCI_healthy = train_PDMCI_healthy['group']

In [47]:
df = pd.DataFrame(data=y_train20_AD_healthy)

In [48]:
df.value_counts()

group  
Healthy    110
AD_MCI      41
dtype: int64

In [162]:
model = sklearn.linear_model.RidgeClassifier()
AD_binary_model = model.fit(X_train20_AD_healthy,y_train20_AD_healthy)
model = sklearn.linear_model.RidgeClassifier()
PD_binary_model = model.fit(X_train20_PD_healthy,y_train20_PD_healthy)
model = sklearn.linear_model.RidgeClassifier()
PDMCI_binary_model = model.fit(X_train20_PDMCI_healthy,y_train20_PDMCI_healthy)

Generate binary predictions for the validation data

In [166]:
val_data_conditions = pd.DataFrame(data=val_data['group'])

val_set_AD_predictions = []
for i in range(len(val_data)):
    ilocdf = pd.DataFrame(data=dict(val_data.iloc[i][top20_AD]),index=[0])
    prediction = AD_binary_model.predict(ilocdf)
    if prediction == 'Healthy':
        prediction = 'not_AD'
    # For some reason, the output of the model has brackets so this is elif ensures that the prediction column is just a string with no brackets
    elif prediction == 'AD_MCI':
        prediction = 'AD_MCI'
    val_set_AD_predictions.append(prediction)
val_data_conditions['AD_pred'] = val_set_AD_predictions


val_set_PD_predictions = []
for i in range(len(val_data)):
    ilocdf = pd.DataFrame(data=dict(val_data.iloc[i][top20_PD]),index=[0])
    prediction = PD_binary_model.predict(ilocdf)
    if prediction == 'Healthy':
        prediction = 'not_PD'
    elif prediction == 'PD':
        prediction = 'PD'
    val_set_PD_predictions.append(prediction)
val_data_conditions['PD_pred'] = val_set_PD_predictions

val_set_PDMCI_predictions = []
for i in range(len(val_data)):
    ilocdf = pd.DataFrame(data=dict(val_data.iloc[i][top20_PDMCI]),index=[0])
    prediction = PDMCI_binary_model.predict(ilocdf)
    if prediction == 'Healthy':
        prediction = 'not_PD_MCI_LBD'
    elif prediction == 'PD_MCI_LBD':
        prediction = 'PD_MCI_LBD'
    val_set_PDMCI_predictions.append(prediction)
val_data_conditions['PD_MCI_pred'] = val_set_PDMCI_predictions




val_data_conditions

Unnamed: 0,group,AD_pred,PD_pred,PD_MCI_pred
0,Healthy,not_AD,not_PD,not_PD_MCI_LBD
1,Healthy,not_AD,not_PD,not_PD_MCI_LBD
2,Healthy,not_AD,not_PD,not_PD_MCI_LBD
3,Healthy,not_AD,not_PD,not_PD_MCI_LBD
4,Healthy,not_AD,not_PD,not_PD_MCI_LBD
5,AD_MCI,AD_MCI,not_PD,PD_MCI_LBD
6,Healthy,not_AD,not_PD,not_PD_MCI_LBD
7,PD_MCI_LBD,AD_MCI,not_PD,not_PD_MCI_LBD
8,Healthy,not_AD,not_PD,not_PD_MCI_LBD
9,AD_MCI,not_AD,not_PD,not_PD_MCI_LBD


### Training the binary models on just the condition vs healthy frames

In [175]:
model = sklearn.linear_model.RidgeClassifier()
AD_binary_model = model.fit(X_train20_AD_healthy,y_train20_AD_healthy)
model = sklearn.linear_model.RidgeClassifier()
PD_binary_model = model.fit(X_train20_PD_healthy,y_train20_PD_healthy)
model = sklearn.linear_model.RidgeClassifier()
PDMCI_binary_model = model.fit(X_train20_PDMCI_healthy,y_train20_PDMCI_healthy)

In [176]:
def ridge_class_w_selected_features(frame,features,model,target='group'):
    model = sklearn.linear_model.RidgeClassifier()
    X = frame[features]
    y = frame[target]
    return model.score(X,y)    

In [205]:
print('AD_healthy binary classifier scores')
print('Training data:',ridge_class_w_selected_features(train_AD_healthy,top20_AD,AD_binary_model))
print('Val data:',ridge_class_w_selected_features(val_AD_healthy,top20_AD,AD_binary_model))
print('Test data:',ridge_class_w_selected_features(test_AD_healthy,top20_AD,AD_binary_model))

print('\nPD_healthy binary classifier scores')
print('Training data:',ridge_class_w_selected_features(train_PD_healthy,top20_PD,PD_binary_model))
print('Val data:',ridge_class_w_selected_features(val_PD_healthy,top20_PD,PD_binary_model))
print('Test data:',ridge_class_w_selected_features(test_PD_healthy,top20_PD,PD_binary_model))

print('\nPD_MCI_healthy binary classifier scores')
print('Training data:',ridge_class_w_selected_features(train_PDMCI_healthy,top20_PDMCI,PDMCI_binary_model))
print('Val data:',ridge_class_w_selected_features(val_PDMCI_healthy,top20_PDMCI,PDMCI_binary_model))
print('Test data:',ridge_class_w_selected_features(test_PDMCI_healthy,top20_PDMCI,PDMCI_binary_model))

AD_healthy binary classifier scores
Training data: 0.9271523178807947
Val data: 0.8846153846153846
Test data: 0.9

PD_healthy binary classifier scores
Training data: 0.9214285714285714
Val data: 0.8333333333333334
Test data: 0.8214285714285714

PD_MCI_healthy binary classifier scores
Training data: 0.9389312977099237
Val data: 0.9389312977099237
Test data: 0.6875


In [13]:
val_AD_healthy['group'].value_counts()

NameError: name 'val_AD_healthy' is not defined

In [231]:
def three_way_binary(frame,AD_features,PD_features,PDMCI_features,target='group'):
    three_way_df = pd.DataFrame()
    three_way_df['group'] = frame[target]
    
    AD_predictions = []
    for i in range(len(frame)):
        ilocdf = pd.DataFrame(data=dict(frame.iloc[i][AD_features]),index=[0])
        prediction = AD_binary_model.predict(ilocdf)
        if prediction == 'Healthy':
            prediction = 'none'
        # For some reason, the output of the model has brackets so this is elif ensures that the prediction column is just a string with no brackets
        elif prediction == 'AD_MCI':
            prediction = 'AD_MCI'
        AD_predictions.append(prediction)
    three_way_df['AD_pred'] = AD_predictions
    
    PD_predictions = []
    for i in range(len(frame)):
        ilocdf = pd.DataFrame(data=dict(frame.iloc[i][PD_features]),index=[0])
        prediction = PD_binary_model.predict(ilocdf)
        if prediction == 'Healthy':
            prediction = 'none'
        # For some reason, the output of the model has brackets so this is elif ensures that the prediction column is just a string with no brackets
        elif prediction == 'PD':
            prediction = 'PD'
        PD_predictions.append(prediction)
    three_way_df['PD_pred'] = PD_predictions    

    PDMCI_predictions = []
    for i in range(len(frame)):
        ilocdf = pd.DataFrame(data=dict(frame.iloc[i][PDMCI_features]),index=[0])
        prediction = PDMCI_binary_model.predict(ilocdf)
        if prediction == 'Healthy':
            prediction = 'none'
        # For some reason, the output of the model has brackets so this is elif ensures that the prediction column is just a string with no brackets
        elif prediction == 'PD':
            prediction = 'PD'
        PDMCI_predictions.append(prediction)
    three_way_df['PDMCI_pred'] = PDMCI_predictions
    
    return three_way_df

In [229]:
three_way_df = pd.DataFrame(data=three_way_binary(val_data,top20_AD,top20_PD,top20_PDMCI), columns=['actual','AD_pred','PD_pred','PDMCI_pred'])

In [234]:
three_way_df = three_way_binary(test_data,top20_AD,top20_PD,top20_PDMCI)

In [235]:
three_way_df

Unnamed: 0,group,AD_pred,PD_pred,PDMCI_pred
0,Healthy,none,none,none
1,PD_MCI_LBD,none,none,none
2,Healthy,none,none,[PD_MCI_LBD]
3,Healthy,none,PD,none
4,PD,none,none,none
5,PD,none,none,none
6,Healthy,none,none,none
7,Healthy,none,none,none
8,AD_MCI,AD_MCI,none,none
9,Healthy,none,none,[PD_MCI_LBD]


### Training the binary models on binary datasets with all conditions included

In [193]:
binary_data_AD_train = train_data.replace(to_replace=['PD','PD_MCI_LBD','Healthy'],value='not_AD')
binary_data_AD_val = val_data.replace(to_replace=['PD','PD_MCI_LBD','Healthy'],value='not_AD')
binary_data_AD_test = test_data.replace(to_replace=['PD','PD_MCI_LBD','Healthy'],value='not_AD')

binary_data_PD_train = train_data.replace(to_replace=['AD_MCI','PD_MCI_LBD','Healthy'],value='not_AD')
binary_data_PD_val = val_data.replace(to_replace=['AD_MCI','PD_MCI_LBD','Healthy'],value='not_PD')
binary_data_PD_test = test_data.replace(to_replace=['AD_MCI','PD_MCI_LBD','Healthy'],value='not_PD')

binary_data_PDMCI_train = train_data.replace(to_replace=['PD','AD_MCI','Healthy'],value='not_PDMCI')
binary_data_PDMCI_val = val_data.replace(to_replace=['PD','AD_MCI','Healthy'],value='not_PDMCI')
binary_data_PDMCI_test = test_data.replace(to_replace=['PD','AD_MCI','Healthy'],value='not_PDMCI')

In [199]:
binary_data_PD_test

Unnamed: 0,group,KV37,LV469,LV861,LVX54,LV746,LV218,LV316,LV312,LV310,...,ARMD4,C1QT5,B2L12,PLXA4,B3GN2,PCDGL,AGRB1,RECK,PSA1,F174A
0,not_PD,29.095311,22.914746,23.465673,24.543944,25.807818,24.247465,23.014983,22.783692,22.133837,...,18.365925,19.746515,28.184519,17.85797,19.919932,18.875353,21.328279,19.894098,20.480407,24.265616
1,not_PD,28.616975,22.266832,22.600108,18.431455,24.773448,23.408153,22.863952,22.578072,22.425962,...,14.231228,20.224879,28.149344,18.816604,20.509034,18.183611,22.333313,20.090041,20.840459,24.291441
2,not_PD,28.004649,22.263307,23.664214,21.614948,24.897457,23.717144,23.63504,22.718097,23.194275,...,17.253986,20.178723,28.817759,16.759801,19.488997,18.065153,21.283092,19.138285,20.659743,24.430869
3,not_PD,29.058831,22.567414,23.916463,21.914058,25.557151,23.946191,22.782418,22.172497,21.511763,...,18.079633,19.776229,28.034288,18.131934,19.962431,18.103656,21.35002,19.683708,20.529492,24.247193
4,PD,28.881609,22.65847,21.910208,20.46629,24.527,24.004398,21.884848,21.021232,20.963709,...,20.069476,19.639023,27.739303,19.326313,20.487923,18.559469,20.629128,18.917739,20.519102,24.136346
5,PD,28.833728,22.104417,23.626751,15.248723,24.636469,24.695328,22.307081,21.567867,21.492075,...,17.381783,20.219334,28.297581,18.084594,20.164663,15.770773,20.877399,19.236433,20.596072,24.198477
6,not_PD,28.215222,23.456092,23.94854,22.495857,25.648726,24.086456,22.618295,21.972739,21.755923,...,18.088279,20.327758,28.753138,18.044162,20.4019,17.933399,21.3435,20.138425,20.360648,24.511899
7,not_PD,26.758964,21.118196,22.778541,21.002959,23.999265,23.168366,22.174694,21.250284,21.075635,...,19.532082,19.998261,28.149196,19.710384,20.792139,19.127583,22.158528,19.883552,21.055747,24.668104
8,not_PD,28.403123,21.703692,20.310644,20.348068,23.664406,23.811062,22.956995,23.741478,22.513721,...,14.347183,20.141805,28.642463,17.506197,20.145225,19.185181,21.211442,20.375947,20.51776,24.612473
9,not_PD,28.468191,23.427728,23.605409,15.131982,25.252005,23.000942,22.159893,21.680271,21.465589,...,18.593808,19.932572,28.170378,19.002192,20.680688,18.725862,21.33256,19.889075,20.481241,24.267293


In [None]:
def 

In [187]:
print('AD_all binary classifier scores')
print('Training data:',ridge_class_w_selected_features(train_AD_healthy,top20_AD,AD_binary_model))
print('Val data:',ridge_class_w_selected_features(val_AD_healthy,top20_AD,AD_binary_model))
print('Test data:',ridge_class_w_selected_features(test_AD_healthy,top20_AD,AD_binary_model))

not_AD    161
AD_MCI     41
Name: group, dtype: int64

In [202]:
def train_ridge_selected_features (train,val,test,features,target='group'):
    # Train the model
    model = sklearn.linear_model.RidgeClassifier()
    train_X = train[features]
    train_y = train[target]
    model.fit(train_X,train_y)
    
    val_X = val[features]
    val_y = val[target]
    
    test_X = test[features]
    test_y = test[target]
    
    # Score the model
    print('Training score:',model.score(train_X,train_y))
    print('Val score:',model.score(val_X,val_y))
    print('Test score:',model.score(test_X,test_y))
    
    return model

In [204]:
print('AD Binary classifier scores:')
train_ridge_selected_features(binary_data_AD_train,binary_data_AD_val,binary_data_AD_test,top20_AD)

print('\nPD Binary classifier scores:')
train_ridge_selected_features(binary_data_PD_train,binary_data_PD_val,binary_data_PD_test,top20_PD)

print('\nPD_MCI Binary classifier scores:')
train_ridge_selected_features(binary_data_PDMCI_train,binary_data_PDMCI_val,binary_data_PDMCI_test,top20_PDMCI)

AD Binary classifier scores:
Training score: 0.8861386138613861
Val score: 0.9444444444444444
Test score: 0.9047619047619048

PD Binary classifier scores:
Training score: 0.8861386138613861
Val score: 0.0
Test score: 0.0

PD_MCI Binary classifier scores:
Training score: 0.9158415841584159
Val score: 0.7777777777777778
Test score: 0.7857142857142857


In [None]:
def multi_class_from_binary(frame,features,model,target='group')