In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, root_mean_squared_error, r2_score

In [2]:
dx_labels = {0: 'AD', 1: 'MCI', 2: 'CN', 3: 'SMC'}
gender_labels = {0: 'Male', 1: 'Female'}

In [3]:
data_dir = 'data'
df = pd.read_csv(os.path.join(data_dir, 'CSF_COG_PET_data_cleaned.csv'))
df.drop(columns=['TAU_bl', 'PTAU_bl', 'ABETA_bl', # CSF biomarkers
                 'DX_bl', 'DX', 'FHQDAD', 'FHQDADAD', 'FHQMOM', 'FHQMOMAD', # Clinical variables
                 'PTRACCAT', # Race since it is heavily imbalanced
                 'MMSE_bl', 'EcogPtTotal_bl', 'LDELTOTAL_BL', 'mPACCdigit_bl', 'mPACCtrailsB_bl', 'RAVLT_immediate_bl', 'RAVLT_forgetting_bl', 'RAVLT_learning_bl', 'RAVLT_perc_forgetting_bl'], # Cognitive scores
        inplace=True)
df.head()

Unnamed: 0,AGE,PTGENDER,PTEDUCAT,SUVr_ACC_pre_L.nii,SUVr_ACC_pre_R.nii,SUVr_ACC_sub_L.nii,SUVr_ACC_sub_R.nii,SUVr_ACC_sup_L.nii,SUVr_ACC_sup_R.nii,SUVr_Amygdala_L.nii,...,SUVr_Vermis_1_2.nii,SUVr_Vermis_3.nii,SUVr_Vermis_4_5.nii,SUVr_Vermis_6.nii,SUVr_Vermis_7.nii,SUVr_Vermis_8.nii,SUVr_Vermis_9.nii,SUVr_Vermis_10.nii,SUVr_VTA_L.nii,SUVr_VTA_R.nii
0,67.5,0,16,0.0338,0.042295,0.010627,0.008586,0.03932,0.038841,0.011054,...,0.003369,0.011351,0.03088,0.018253,0.009289,0.011917,0.008881,0.006231,0.000712,0.000659
1,71.8,1,15,0.05652,0.063619,0.01649,0.012617,0.057261,0.053242,0.013008,...,0.003325,0.011246,0.029007,0.016903,0.009108,0.012666,0.008033,0.006029,0.00077,0.000699
2,78.0,1,12,0.060682,0.068784,0.017864,0.013864,0.054272,0.053113,0.01362,...,0.003402,0.011694,0.033622,0.019224,0.010413,0.012902,0.007799,0.00573,0.000804,0.00076
3,64.9,0,16,0.042112,0.049571,0.011925,0.010232,0.044834,0.042373,0.011938,...,0.003367,0.011986,0.034076,0.019834,0.010011,0.012782,0.009789,0.006613,0.000748,0.000675
4,75.1,1,18,0.041158,0.046395,0.012714,0.009992,0.042674,0.039067,0.011457,...,0.002935,0.009448,0.02874,0.017897,0.009361,0.012595,0.008533,0.00517,0.000675,0.000675


# Age

In [16]:
X = df.drop(columns=['AGE', 'PTGENDER', 'PTEDUCAT'])
y = df.filter(['AGE'])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
model = KernelRidge(kernel='rbf', alpha=1.0, gamma=0.1)
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,kernel,'rbf'
,gamma,0.1
,degree,3
,coef0,1
,kernel_params,


In [19]:
y_pred_train = model.predict(X_train)
print("Train RMSE:", root_mean_squared_error(y_train, y_pred_train))
print("Train R2:", r2_score(y_train, y_pred_train))
print()
y_pred_test = model.predict(X_test)
print("Test RMSE:", root_mean_squared_error(y_test, y_pred_test))
print("Test R2:", r2_score(y_test, y_pred_test))

Train RMSE: 35.94452385880727
Train R2: -25.0119848576903

Test RMSE: 71.29014380776724
Test R2: -89.59661608788738


# Sex

In [26]:
X = df.drop(columns=['AGE', 'PTGENDER', 'PTEDUCAT'])
y = df.filter(['PTGENDER'])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=10)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
model = SVC(kernel='rbf', C=1, gamma='scale')
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [28]:
y_pred_train = model.predict(X_train)
print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Train F1 Score:", f1_score(y_train, y_pred_train, average='macro'))
print("Train AUROC:", roc_auc_score(y_train, y_pred_train))
print()
y_pred_test = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Test F1 Score:", f1_score(y_test, y_pred_test, average='macro'))
print("Test AUROC:", roc_auc_score(y_test, y_pred_test))

Train Accuracy: 0.8953488372093024
Train F1 Score: 0.8950986398061358
Train AUROC: 0.8947072072072072

Test Accuracy: 0.8240740740740741
Test F1 Score: 0.8228438228438228
Test AUROC: 0.8299438990182327


# Years of Education

In [10]:
X = df.drop(columns=['AGE', 'PTGENDER', 'PTEDUCAT'])
y = df.filter(['PTEDUCAT'])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
model = KernelRidge(kernel='rbf', alpha=1.0, gamma=0.1)
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,kernel,'rbf'
,gamma,0.1
,degree,3
,coef0,1
,kernel_params,


In [12]:
y_pred_train = model.predict(X_train)
print("Train RMSE:", root_mean_squared_error(y_train, y_pred_train))
print("Train R2:", r2_score(y_train, y_pred_train))
print()
y_pred_test = model.predict(X_test)
print("Test RMSE:", root_mean_squared_error(y_test, y_pred_test))
print("Test R2:", r2_score(y_test, y_pred_test))

Train RMSE: 8.131478601103183
Train R2: -8.714543405220013

Test RMSE: 16.188227901224373
Test R2: -41.51788040500489
