Import the NACC dataset and the config file.

In [None]:
import os
# Location of the NACC dataset CSV, relative to the working directory.
NACC_DATASET_PATH = os.path.join("data", "NACC.csv")
# Location of the data config file, relative to the working directory.
DATA_CONFIG_PATH = os.path.join("config", "data_config.yaml")
# Echo both paths (one per line) so a wrong working directory is easy to spot.
print(NACC_DATASET_PATH, DATA_CONFIG_PATH, sep="\n")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils.config_utils import get_config

# Load the data configuration through the project helper and echo it so the
# active settings are visible in the notebook output.
config = get_config(DATA_CONFIG_PATH)
print(config)

# Read the full dataset into memory. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-dtype inference
# on columns whose values differ between chunks.
df = pd.read_csv(NACC_DATASET_PATH, low_memory=False)

In [None]:
# Target column name(s) and the per-column missing-value cutoff, both read
# from the config.
target_variable = config['target_variable']
# list() on a bare string would split it into single characters, so accept
# both a single column name and a sequence of names.
if isinstance(target_variable, str):
    target_variable = [target_variable]
else:
    target_variable = list(target_variable)
missing_values_threshold = config['missing_values_threshold']

# Separate the target column(s) from the candidate feature columns.
Y = df[target_variable].copy()
X = df.drop(columns=target_variable)
print(X.shape)
# Fraction of missing entries in each feature column.
missing_ratio = X.isnull().mean()
# Keep only the columns whose missing fraction does not exceed the threshold.
variables_to_be_retained = X.columns[missing_ratio <= missing_values_threshold]
X_filtered = X[variables_to_be_retained]
print(X_filtered.shape)

In [None]:
# Alternative considered (disabled): turn every non-empty text cell into 1 and
# let the empty cells be filled with 0 below.
# X_filled = X_filtered.applymap(lambda x: 1 if isinstance(x, str) and x.strip() != '' else x)

# Simplest possible imputation: replace every missing value with 0.
X_filled = X_filtered.fillna(0)
# Confirm that no missing value survived the fill and report the outcome.
has_null_values = X_filled.isnull().to_numpy().any()
print("has null values." if has_null_values else "has no null values.")


In [None]:
# Keep the NACCID column before the text columns are dropped; it is used later
# as the grouping key for cross-validation.
groups = X_filled['NACCID']
# Columns stored with the generic object dtype, i.e. text rather than numbers.
string_columns = X_filled.select_dtypes(include='object').columns

# Report how many text columns there are, then list them one per line.
num_string_columns = len(string_columns)
print(f"String Columns in Dataframe are are listed below. There are {num_string_columns} colums in total.")
if num_string_columns:
    print("\n".join(map(str, string_columns)))

# TODO: discuss how to deal with the string columns
#   1. drop the columns (current approach)
# Remove the text columns so that only numeric features remain.
X_filled = X_filled.drop(columns=string_columns)


In [None]:
# One-hot encode whatever categorical columns are still present.
X_encoded = pd.get_dummies(X_filled)

# 0-8 - Diagnosis of Alzheimer's Disease (NACCALZD), collapsed to a binary
# label: code 1 -> 1, codes 0 and 8 -> 0. Any code missing from the mapping
# becomes NaN, which is how Series.map treats unmapped values.
# Y['NACCALZD'] = Y['NACCALZD'].map({0: 'Undiagnosed', 1: 'Diagnosed', 8: 'Undiagnosed'})
alzd_to_binary = {0: 0, 1: 1, 8: 0}
Y['NACCALZD'] = Y['NACCALZD'].map(alzd_to_binary)
# Y_encoded = pd.get_dummies(Y, columns=['NACCALZD'])
Y_encoded = Y.astype(float)

# Normalization
# 1. Min-max scaling, per column: (x - min) / (max - min).
column_min = X_filled.min(numeric_only=True)
column_range = X_filled.max(numeric_only=True) - column_min
X_min_max = (X_filled - column_min) / column_range

# 2. Z-score scaling, per column: (x - mean) / std.
column_mean = X_filled.mean(numeric_only=True)
column_std = X_filled.std(numeric_only=True)
X_z_score = (X_filled - column_mean) / column_std


In [None]:
# Z-scoring can leave NaN behind: a column whose standard deviation is 0
# (every row holds the same value) divides 0 by 0. Such columns carry no
# information, so they are identified and removed here.
null_mask = X_z_score.isnull().any()
columns_with_null = X_z_score.columns[null_mask]

# Show which columns are about to be dropped.
print("Columns with null values:")
print(columns_with_null)
X_z_score = X_z_score.drop(columns=columns_with_null)

In [None]:
# Display the float-encoded target frame as a visual sanity check.
print(Y_encoded)

In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, StratifiedGroupKFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, classification_report
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
# Feature matrix: the z-scored columns as a plain NumPy array.
features = X_z_score.values
# features = X_min_max
# Flatten the target column to 1-D once, up front. scikit-learn estimators
# expect y with shape (n_samples,) and warn when handed a column vector, so
# every section below now receives the same, correctly shaped labels.
labels = Y_encoded.values.ravel()
# Group keys as an array so they can be sliced with the fold indices.
group_ids = np.asarray(groups)
COL_NUM = features.shape[1]


print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

gbt_model = GradientBoostingClassifier()
# Define the hyper-parameter grid.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 6],
    'subsample': [0.6, 1.0]
}

# Outer splitter: stratified on the label while keeping all rows that share a
# group key in the same fold.
sgkf = StratifiedGroupKFold(n_splits=3)
# Inner splitter for hyper-parameter search. An integer `cv` would fall back to
# plain stratified folds that ignore the groups, letting rows with the same
# group key land on both sides of a tuning split.
inner_cv = StratifiedGroupKFold(n_splits=5)

# --- 1. Default model, evaluated with grouped stratified folds -------------
print('StratifiedGroupKFold:')
for train_indices, test_indices in sgkf.split(features, labels, group_ids):
    X_train, X_test = features[train_indices], features[test_indices]
    y_train, y_test = labels[train_indices], labels[test_indices]
    gbt_model.fit(X_train, y_train)
    y_pred = gbt_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

# --- 2. Grid search inside each grouped stratified fold --------------------
print('GridSearchCV + StratifiedGroupKFold:')
for train_indices, test_indices in sgkf.split(features, labels, group_ids):
    X_train, X_test = features[train_indices], features[test_indices]
    y_train, y_test = labels[train_indices], labels[test_indices]

    grid_search = GridSearchCV(gbt_model, param_grid, cv=inner_cv, scoring='accuracy')
    # Pass the training rows' group keys so the inner folds are group-aware.
    grid_search.fit(X_train, y_train, groups=group_ids[train_indices])
    # Report the best parameter combination and its cross-validated score.
    print("Best Parameters:", grid_search.best_params_)
    print("Best Score:", grid_search.best_score_)

    # Evaluate the best model on this fold's held-out rows.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", accuracy)

# --- 3. Grid search on a single random 80/20 split -------------------------
# NOTE(review): this split and its inner cv=5 ignore the group keys, so if a
# group key repeats across rows the score here can be optimistic compared with
# the grouped sections above. Kept as the ungrouped baseline.
print('GridSearchCV:')
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
grid_search = GridSearchCV(gbt_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

In [None]:

# Evaluation of the most recent `best_model` on the most recent hold-out data
# (`X_test`, `y_test` and `y_pred` are whatever the previous cell left behind).


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion Matrix; for binary 0/1 labels the layout is [[TN, FP], [FN, TP]].
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
tn, fp, fn, tp = conf_matrix.ravel()

# Sensitivity (True Positive Rate, a.k.a. recall)
sensitivity = _ratio(tp, tp + fn)
print("Sensitivity:", sensitivity)

# Specificity (True Negative Rate)
specificity = _ratio(tn, tn + fp)
print("Specificity:", specificity)

# F1 Score: harmonic mean of precision and recall, which simplifies to
# 2*TP / (2*TP + FP + FN). (It is not the harmonic mean of sensitivity and
# specificity.)
f1_score = _ratio(2 * tp, 2 * tp + fp + fn)
print("F1 Score:", f1_score)

# ROC & AUC: the curve must be built from continuous scores, not from hard 0/1
# predictions (which yield a single operating point). Column 1 of
# predict_proba is the probability of the positive class.
y_score = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print("AUC:", roc_auc)

# False Positive Rate & False Negative Rate
false_positive_rate = _ratio(fp, fp + tn)
false_negative_rate = _ratio(fn, fn + tp)
print("False Positive Rate:", false_positive_rate)
print("False Negative Rate:", false_negative_rate)

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))