In [475]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
import funcs
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [748]:
# Load the imputed dataset; the first CSV column is used as the DataFrame index.
data = pd.read_csv(
    "../objects/df_imputed_not_complete_874.csv",
    index_col=0,
)
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,RID,PTHAND,PTMARRY,PTEDUCAT,PTNOTRT,PTHOME,PTPLANG,PTETHCAT,PTRACCAT,age,...,RCT29,RCT3,RCT392,RCT4,RCT5,RCT6,RCT8,RCT9,apoe,DIAGNOSIS
1,2,2,1,16,1,1,1,2,5,74.333333,...,0.2,19,1.1,21,19,15,5.6,3.0,2,Control
2,3,1,1,18,1,3,1,2,5,81.25,...,0.1,21,1.3,16,19,19,6.9,3.5,3,AZ
3,4,1,1,10,0,1,2,1,5,67.583333,...,0.1,16,1.2,23,18,25,6.0,2.9,2,MCI
4,5,1,1,16,1,1,1,2,5,73.666667,...,0.2,16,1.0,17,24,14,5.7,3.2,2,Control
5,6,1,1,13,1,1,1,2,5,80.416667,...,0.2,19,1.1,16,19,23,5.3,4.5,2,MCI


In [750]:
# Binary APOE indicator: codes 1 and 2 map to 0, codes 3 and 4 map to 1.
# NOTE(review): presumably separates risk-allele carriers from non-carriers —
# confirm against the coding of the `apoe` column.
data['apoe_2'] = data['apoe'].replace({1: 0, 2: 0, 3: 1, 4: 1})

# Bucket years of education into four ordinal levels:
# [0, 12] -> 1, (12, 17] -> 2, (17, 19] -> 3, (19, 100] -> 4.
# include_lowest=True keeps PTEDUCAT == 0 inside the first bucket; with the
# default (False) that value becomes NaN and the integer cast below raises.
data['education_category'] = pd.cut(
    data['PTEDUCAT'],
    bins=[0, 12, 17, 19, 100],
    labels=[1, 2, 3, 4],
    include_lowest=True,
).astype(np.int64)

# 1 when PTMARRY == 2, otherwise 0 (explicit int64 so the dtype does not
# depend on the platform's default integer width).
# NOTE(review): the meaning of PTMARRY code 2 is not visible here — confirm.
data['PTMARRY_bin'] = (data['PTMARRY'] == 2).astype(np.int64)

## Importing MCI cluster labels

In [695]:
# Load MCI cluster assignments (first CSV column becomes the index).
# NOTE(review): `df_mci` is re-read from '../objects/MCI_4.csv' a few cells
# below before it is used, so this load looks superseded — confirm and remove.
df_mci = pd.read_csv(
    '../objects/clusters_MCI_2.csv',
    index_col=0,
)

In [499]:
# Keep only the participant id and the cluster label.
# NOTE(review): `df_mci` is overwritten by a fresh read_csv in the next cell,
# so this subsetting has no effect on the later analysis — confirm and remove.
df_mci = df_mci.loc[:, ["RID", "cluster"]]

In [752]:
# Load the MCI cluster labels used by the cells below
# (the 'RID' and 'cluster' columns are the ones read downstream).
df_mci = pd.read_csv(
    '../objects/MCI_4.csv',
    index_col=0,
)

In [754]:
# Collect the RIDs belonging to each MCI cluster so that one cluster at a time
# can be removed from the full dataset.
# NOTE(review): the pairing of cluster labels with subgroup names
# (MCI2 -> "AD", MCI3 -> "Healthy", MCI1 -> "Middle") is taken from the
# variable names only — confirm against the clustering notebook.
ids_to_exclude_AD = df_mci.loc[df_mci['cluster'] == "MCI2", 'RID'].tolist()
ids_to_exclude_Healthy = df_mci.loc[df_mci['cluster'] == "MCI3", 'RID'].tolist()
ids_to_exclude_Middle = df_mci.loc[df_mci['cluster'] == "MCI1", 'RID'].tolist()

In [756]:
# Build three independent copies of the dataset, each with the participants of
# one MCI cluster removed (rows whose RID is in the corresponding id list).
data_no_MCI_AD = data.loc[~data['RID'].isin(ids_to_exclude_AD)].copy()
data_no_MCI_Healthy = data.loc[~data['RID'].isin(ids_to_exclude_Healthy)].copy()
data_no_MCI_Middle = data.loc[~data['RID'].isin(ids_to_exclude_Middle)].copy()

## No MCI_Healthy subgroup

In [758]:
# Encode the diagnosis labels as integers: Control -> 0, MCI -> 1, AZ -> 2.
# `.map` is used instead of `.replace`: replacing strings with ints relies on
# implicit object -> int downcasting, which pandas 2.2+ deprecates
# (FutureWarning) and which will leave an object-dtype target in the future.
outcome = data_no_MCI_Healthy['DIAGNOSIS'].map({"Control": 0,
                                                "MCI": 1,
                                                "AZ": 2})
# `.map` turns labels missing from the mapping into NaN; fail loudly instead of
# passing a corrupted target to the model.
assert outcome.notna().all(), "DIAGNOSIS contains a label other than Control/MCI/AZ"
outcome = outcome.astype(np.int64)

# Remove the identifier and the target from the frame. Reassigning (rather than
# inplace=True) keeps the mutation explicit.
data_no_MCI_Healthy = data_no_MCI_Healthy.drop(columns=['RID', 'DIAGNOSIS'])

In [641]:
# Earlier, longer candidate feature list.
# NOTE(review): `feat` is reassigned in the next cell before it is used, so
# this list appears to be superseded — confirm and remove.
feat = [
    'weight', 'height', 'VSPULSE',
    'LIMMTOTAL', 'LDELTOTAL',
    'GDTOTAL', 'HMT100', 'HMT15', 'HMT16', 'HMT8',
    'GDMEMORY', 'apoe', 'MHPSYCH', 'PTEDUCAT',
    'GDDROP', 'NXGAIT', 'PTHOME',
    'GDBETTER', 'GDBORED', 'GDHOPE', 'GDHELP',
]

In [764]:
# Final feature set used for every model below. The last three entries are the
# columns derived earlier in this notebook (apoe_2, PTMARRY_bin,
# education_category).
feat = [
    'NXGAIT', "MHPSYCH",
    "GDDROP", "GDBORED", "GDMEMORY", "GDBETTER", "GDTOTAL",
    "HMT15", "HMT8",
    "LDELTOTAL", "LIMMTOTAL",
    "apoe_2", "PTMARRY_bin", "education_category",
]

In [766]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# reproducible.
# NOTE(review): no `stratify=` is passed, so class proportions in the test set
# are left to chance — confirm this is intended.
X, X_test, y, y_test = train_test_split(
    data_no_MCI_Healthy[feat],
    outcome,
    random_state=43,
    test_size=0.2,
)

In [768]:
# Oversample the training split only (the test split is left untouched) with a
# fixed seed. `X` and `y` are rebound to the resampled data.
# NOTE(review): `feat` contains binary/ordinal columns (apoe_2, PTMARRY_bin,
# education_category); plain SMOTE interpolates between neighbours, so synthetic
# rows can presumably carry non-integer values there — consider SMOTENC.
smote_over = SMOTE(random_state=44)
X, y = smote_over.fit_resample(X, y)

In [257]:
# CatBoost hyper-parameters from the feature-selection stage.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
best_params_feat_selection = dict(
    iterations=1800,
    learning_rate=0.20649746303659136,
    l2_leaf_reg=4.37841702433753,
    bagging_temperature=1.6766419657563723,
    random_strength=1.9555985333019168,
    depth=7,
    min_data_in_leaf=91,
    random_seed=42,
    loss_function="MultiClass",
    colsample_bylevel=0.9759404466998405,
)

In [655]:
# CatBoost hyper-parameters shared by all three subgroup models below.
best_params = dict(
    iterations=1900,
    learning_rate=0.0980771768328211,
    l2_leaf_reg=1.0838481536171913,
    bagging_temperature=0.7706864112026646,
    random_strength=2.283941634872743,
    depth=8,
    random_seed=42,
    min_data_in_leaf=72,
)

In [770]:
# Fresh, unfitted classifier configured with the shared hyper-parameters;
# training output is silenced.
final_model = CatBoostClassifier(
    verbose=False,
    **best_params,
)

In [772]:
# Train on the SMOTE-resampled training split; the value returned by fit() is
# shown as the cell output.
final_model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x1aacb1df750>

In [774]:
# Score the held-out test split: per-class probabilities and hard labels.
predictions_proba = final_model.predict_proba(X_test)
predictions = final_model.predict(X_test)

In [776]:
# Per-class evaluation on the test split for the model trained without the
# "Healthy" MCI cluster; the project helper's result is displayed as the cell
# output (the recorded table has Recall, Precision, Specificity, NPV and AUC).
funcs.metrics_merged(y_test, predictions, predictions_proba)

Unnamed: 0,Class,Recall,Precision,Specificity,NPV,AUC
0,Control,0.909091,0.952381,0.98,0.960784,0.993182
1,MCI,0.753623,0.83871,0.866667,0.792683,0.907633
2,AD,0.806452,0.625,0.867257,0.942308,0.926349


## No MCI_Middle subgroup

In [778]:
# Encode the diagnosis labels as integers: Control -> 0, MCI -> 1, AZ -> 2.
# `.map` is used instead of `.replace`: replacing strings with ints relies on
# implicit object -> int downcasting, which pandas 2.2+ deprecates
# (FutureWarning) and which will leave an object-dtype target in the future.
outcome = data_no_MCI_Middle['DIAGNOSIS'].map({"Control": 0,
                                               "MCI": 1,
                                               "AZ": 2})
# `.map` turns labels missing from the mapping into NaN; fail loudly instead of
# passing a corrupted target to the model.
assert outcome.notna().all(), "DIAGNOSIS contains a label other than Control/MCI/AZ"
outcome = outcome.astype(np.int64)

# Remove the identifier and the target from the frame. Reassigning (rather than
# inplace=True) keeps the mutation explicit.
data_no_MCI_Middle = data_no_MCI_Middle.drop(columns=['RID', 'DIAGNOSIS'])

In [780]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# reproducible.
# NOTE(review): no `stratify=` is passed, so class proportions in the test set
# are left to chance — confirm this is intended.
X, X_test, y, y_test = train_test_split(
    data_no_MCI_Middle[feat],
    outcome,
    random_state=43,
    test_size=0.2,
)

In [782]:
# Oversample the training split only (the test split is left untouched) with a
# fixed seed. `X` and `y` are rebound to the resampled data.
# NOTE(review): `feat` contains binary/ordinal columns (apoe_2, PTMARRY_bin,
# education_category); plain SMOTE interpolates between neighbours, so synthetic
# rows can presumably carry non-integer values there — consider SMOTENC.
smote_over = SMOTE(random_state=44)
X, y = smote_over.fit_resample(X, y)

In [529]:
# Fresh, unfitted classifier for this subgroup, configured with the shared
# hyper-parameters; training output is silenced.
# NOTE(review): this cell's execution count is older than its neighbours' and
# the recorded fit outputs show the same object address in every section, so
# the saved run probably refit the existing instance — re-run top to bottom.
final_model = CatBoostClassifier(
    verbose=False,
    **best_params,
)

In [784]:
# Train on the SMOTE-resampled training split; the value returned by fit() is
# shown as the cell output.
final_model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x1aacb1df750>

In [785]:
# Score the held-out test split: per-class probabilities and hard labels.
predictions_proba = final_model.predict_proba(X_test)
predictions = final_model.predict(X_test)

In [786]:
# Per-class evaluation on the test split for the model trained without the
# "Middle" MCI cluster; the project helper's result is displayed as the cell
# output (the recorded table has Recall, Precision, Specificity, NPV and AUC).
funcs.metrics_merged(y_test, predictions, predictions_proba)

Unnamed: 0,Class,Recall,Precision,Specificity,NPV,AUC
0,Control,0.846154,0.846154,0.95082,0.95082,0.968474
1,MCI,0.8375,0.744444,0.716049,0.816901,0.833796
2,AD,0.595238,0.78125,0.941176,0.868217,0.887555


## No MCI_AD subgroup

In [790]:
# Encode the diagnosis labels as integers: Control -> 0, MCI -> 1, AZ -> 2.
# `.map` is used instead of `.replace`: replacing strings with ints relies on
# implicit object -> int downcasting, which pandas 2.2+ deprecates
# (FutureWarning) and which will leave an object-dtype target in the future.
outcome = data_no_MCI_AD['DIAGNOSIS'].map({"Control": 0,
                                           "MCI": 1,
                                           "AZ": 2})
# `.map` turns labels missing from the mapping into NaN; fail loudly instead of
# passing a corrupted target to the model.
assert outcome.notna().all(), "DIAGNOSIS contains a label other than Control/MCI/AZ"
outcome = outcome.astype(np.int64)

# Remove the identifier and the target from the frame. Reassigning (rather than
# inplace=True) keeps the mutation explicit.
data_no_MCI_AD = data_no_MCI_AD.drop(columns=['RID', 'DIAGNOSIS'])

In [792]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# reproducible.
# NOTE(review): no `stratify=` is passed, so class proportions in the test set
# are left to chance — confirm this is intended.
X, X_test, y, y_test = train_test_split(
    data_no_MCI_AD[feat],
    outcome,
    random_state=43,
    test_size=0.2,
)

In [794]:
# Oversample the training split only (the test split is left untouched) with a
# fixed seed. `X` and `y` are rebound to the resampled data.
# NOTE(review): `feat` contains binary/ordinal columns (apoe_2, PTMARRY_bin,
# education_category); plain SMOTE interpolates between neighbours, so synthetic
# rows can presumably carry non-integer values there — consider SMOTENC.
smote_over = SMOTE(random_state=44)
X, y = smote_over.fit_resample(X, y)

In [566]:
# Fresh, unfitted classifier for this subgroup, configured with the shared
# hyper-parameters; training output is silenced.
# NOTE(review): this cell's execution count is older than its neighbours' and
# the recorded fit outputs show the same object address in every section, so
# the saved run probably refit the existing instance — re-run top to bottom.
final_model = CatBoostClassifier(
    verbose=False,
    **best_params,
)

In [796]:
# Train on the SMOTE-resampled training split; the value returned by fit() is
# shown as the cell output.
final_model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x1aacb1df750>

In [797]:
# Score the held-out test split: per-class probabilities and hard labels.
predictions_proba = final_model.predict_proba(X_test)
predictions = final_model.predict(X_test)

In [798]:
# Per-class evaluation on the test split for the model trained without the
# "AD" MCI cluster; the project helper's result is displayed as the cell
# output (the recorded table has Recall, Precision, Specificity, NPV and AUC).
funcs.metrics_merged(y_test, predictions, predictions_proba)

Unnamed: 0,Class,Recall,Precision,Specificity,NPV,AUC
0,Control,0.891304,0.891304,0.936709,0.936709,0.970281
1,MCI,0.829268,0.755556,0.869048,0.9125,0.92712
2,AD,0.815789,0.911765,0.965517,0.923077,0.96703


In [802]:
# Confusion matrix (rows: true class, columns: predicted class) for the most
# recent model only — `y_test` and `predictions` hold the values from the
# "No MCI_AD" section at this point.
confusion_matrix(
    y_true=y_test,
    y_pred=predictions,
)

array([[41,  4,  1],
       [ 5, 34,  2],
       [ 0,  7, 31]], dtype=int64)