Import the nacc dataset and config file.

In [1]:
import os
# The path of NACC Dataset
NACC_DATASET_PATH = os.path.join("data", "NACC.csv")
# The path of the Config file 
DATA_CONFIG_PATH = os.path.join("config", "data_config.yaml")
print(NACC_DATASET_PATH)
print(DATA_CONFIG_PATH)

data/NACC.csv
config/data_config.yaml


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils.config_utils import get_config

config = get_config(DATA_CONFIG_PATH)
print(config)

df = pd.read_csv(NACC_DATASET_PATH, low_memory=False)

{'batch_size': 32, 'num_epochs': 100, 'target_variable': ['NACCALZD'], 'missing_values_threshold': 0.5}


In [171]:
target_variable = list(config['target_variable'])
missing_values_threshold = config['missing_values_threshold']

Y = df[target_variable].copy()
X = df.drop(columns=target_variable, axis=1)
print(X.shape)
#calculate missing values ratio
missing_ratio = X.isnull().mean()
#retain the values which are less than the threshold
variables_to_be_retained = X.columns[missing_ratio <= missing_values_threshold]
X_filtered = X[variables_to_be_retained]
print(X_filtered.shape)

(25368, 996)
(25368, 835)


In [172]:
# # For being simple, text are converted to 1, emopty cols are converted to 0
# X_filled = X_filtered.applymap(lambda x: 1 if isinstance(x, str) and x.strip() != '' else x)

# The samplest way to fill missing numerical values is to fill them with 0
X_filled = X_filtered.fillna(0)
# check if there are any null values
has_null_values = X_filled.isnull().any().any()
if has_null_values:
    print("has null values.")
else:
    print("has no null values.")


has no null values.


In [173]:
# print the variables that are strings(not numerical)
string_columns = X_filled.select_dtypes(include='object').columns


# print the variables that are numerical(not strings)
num_string_columns = len(string_columns)
print(f"String Columns in Dataframe are are listed below. There are {num_string_columns} colums in total.")

for col in string_columns:
    print(col)
"""
    TODO: discuss how to deal with the string columns
    1. drop the columns (current approach)
""" 
# drop the string columns
X_filled = X_filled.drop(string_columns, axis=1)


String Columns in Dataframe are are listed below. There are 14 colums in total.
NACCID
PACKET
DRUG1
DRUG2
DRUG3
DRUG4
DRUG5
DRUG6
DRUG_ID1
DRUG_ID2
DRUG_ID3
DRUG_ID4
DRUG_ID5
DRUG_ID6


In [174]:
# One-Hot Encoding
X_encoded = pd.get_dummies(X_filled)
# 0-8 - Diagnosis of Alzheimer's Disease (NACCALZD)
Y['NACCALZD'] = Y['NACCALZD'].map({0: 'Undiagnosed', 1: 'Diagnosed', 8: 'Undiagnosed'})

Y_encoded = pd.get_dummies(Y, columns=['NACCALZD'])
Y_encoded = Y_encoded.astype(float)
# Normalization
# 1. Min-Max Normalization
X_min_max = (X_filled - X_filled.min(numeric_only=True)) / (X_filled.max(numeric_only=True) - X_filled.min(numeric_only=True))


# 2. Z-Score Normalization
X_z_score = (X_filled - X_filled.mean(numeric_only=True)) / X_filled.std(numeric_only=True)


In [175]:
# There maybe some identical columns in the dataset, so we need to remove them. (Maybe there are other ways to do this)
columns_with_null = X_z_score.columns[X_z_score.isnull().any()]

# Print the column names with null values
print("Columns with null values:")
print(columns_with_null)
X_z_score.drop(columns_with_null, axis=1, inplace=True)

Columns with null values:
Index(['DOWNS', 'HUNT', 'NACCMRSA'], dtype='object')


In [176]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

features = X_z_score
# features = X_min_max
labels = Y_encoded
COL_NUM = features.shape[1]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Define a custom PyTorch Dataset for your data
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# Create CustomDataset instances for training and testing sets
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)

# Create DataLoader for training and testing sets
batch_size = config['batch_size']
# batch_size = 2
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [177]:
import torch.nn as nn
import torch.optim as optim
import tqdm as tqdm
import numpy as np
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, classification_report

num_epochs = config['num_epochs']
# num_epochs = 10


# Define a simple MLP model
class MLP(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):    
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.bacth_norm = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(0.5)
        self.softmax = nn.Softmax(dim=1)
        self._init()
    def _init(self):
        nn.init.kaiming_normal_(self.fc1.weight)
        nn.init.kaiming_normal_(self.fc2.weight)
        nn.init.constant_(self.fc1.bias, 0.1)
        nn.init.constant_(self.fc2.bias, 0.1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bacth_norm(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# Create the MLP model
CLASS_NUM = 2
mlp_model = MLP(COL_NUM, 640, CLASS_NUM)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(mlp_model.parameters(), lr=0.01)
for epoch in tqdm.trange(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = mlp_model(inputs)
        loss = criterion(outputs, targets)        
        loss.backward()
        optimizer.step()


100%|██████████| 300/300 [10:43<00:00,  2.15s/it]


In [180]:
with torch.no_grad():
    mlp_model.eval()
    y_pred_tensor = mlp_model(X_test_tensor)
    y_pred = torch.argmax(y_pred_tensor, 1)
    y_pred = y_pred.numpy()
    y_test = torch.argmax(y_test_tensor, 1)
    y_test = y_test.numpy()

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Sensitivity (True Positive Rate)
sensitivity = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
print("Sensitivity:", sensitivity)

# Specificity (True Negative Rate)
specificity = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])
print("Specificity:", specificity)

# F1 Score
f1_score = 2 * (sensitivity * specificity) / (sensitivity + specificity)
print("F1 Score:", f1_score)

# ROC & AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("AUC:", roc_auc)

# False Positive Rate & False Negative Rate
false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 1] + conf_matrix[0, 0])
false_negative_rate = conf_matrix[1, 0] / (conf_matrix[1, 0] + conf_matrix[1, 1])
print("False Positive Rate:", false_positive_rate)
print("False Negative Rate:", false_negative_rate)

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 1.0
Confusion Matrix:
[[1704    0]
 [   0 3370]]
Sensitivity: 1.0
Specificity: 1.0
F1 Score: 1.0
AUC: 1.0
False Positive Rate: 0.0
False Negative Rate: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1704
           1       1.00      1.00      1.00      3370

    accuracy                           1.00      5074
   macro avg       1.00      1.00      1.00      5074
weighted avg       1.00      1.00      1.00      5074

