In [None]:
from torchtext.vocab import GloVe

# Load pre-trained GloVe word embeddings (vector set '6B', 100-dimensional vectors).
# These are GloVe vectors, not Word2Vec ones.
word_embeddings = GloVe(name='6B', dim=100)

In [6]:
import sys
sys.path.append('../')  # Add the parent folder to the system path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from artificial_nn import ANN
import util.preprocess as preprocess

# Option A (local use): load the already extracted CSV instead of the raw export.
# NOTE(review): './nacc_processed.csv' as written by this notebook also contains the
# NACCID and VISITDT columns - confirm they are numeric before relying on astype("float").
# extracted_df = pd.read_csv("./nacc_processed.csv").astype("float")


# Option B: start from the raw (unfiltered) export; the following cells derive
# everything else from `data`.

data = pd.read_csv("../../nacc/newer/investigator_ftldlbd_nacc65.csv")

  data = pd.read_csv("../../nacc/newer/investigator_ftldlbd_nacc65.csv")


In [18]:
# Build a sortable visit-date key (year + zero-padded month + zero-padded day).
# Day and month are padded to two characters so that string ordering follows
# chronological ordering.
for date_part in ('VISITDAY', 'VISITMO'):
    data[date_part] = data[date_part].astype(str).str.zfill(2)
data['VISITYR'] = data['VISITYR'].astype(str)
data['VISITDT'] = data['VISITYR'] + data['VISITMO'] + data['VISITDAY']

# Layout: two identifier columns first, the label (NACCUDSD) last, and the
# features in between. Later cells rely on this ordering.
columns_to_use = [
    'NACCID', 'VISITDT',
    'NACCMOCA', 'CRAFTDRE', 'COMMUN', 'NACCMMSE', 'HOMEHOBB', 'JUDGMENT',
    'LOGIMEM', 'CDRSUM', 'MEMORY', 'BOSTON',
    'NACCUDSD',
]

# Keep only the selected columns, ordered per patient and then by visit date.
extracted_df = data[columns_to_use].sort_values(by=["NACCID", "VISITDT"], ascending=True)

# Persist the extracted subset so it can be reloaded without the raw export.
new_csv = './nacc_processed.csv'
extracted_df.to_csv(new_csv, index=False)

In [19]:
# Skip the first two columns (NACCID, VISITDT) and the last one (NACCUDSD, the label):
# only the feature columns in between are imputed.
features_to_impute = columns_to_use[2:-1]

def forward_and_backward_impute(group, features=None):
    """Fill missing feature values within one group of rows.

    The sentinel value -4 is treated as missing. Each feature column is first
    forward-filled (carry the last known value down) and then backward-filled
    (pull the next known value up), so the only values left missing are those
    of columns that have no known value anywhere in the group.

    Args:
        group: DataFrame holding the rows of a single group, in the order the
            fills should follow.
        features: Names of the columns to impute. Defaults to the notebook-level
            ``features_to_impute`` list.

    Returns:
        A copy of ``group`` with the feature columns imputed; the input frame is
        left unmodified. Still-missing entries are NaN.
    """
    if features is None:
        features = features_to_impute

    imputed = group.copy()
    for feature in features:
        # np.nan (rather than pd.NA) keeps the column numeric; pd.NA in a plain
        # int/float column can turn it into object dtype.
        column = imputed[feature].replace(-4, np.nan)
        imputed[feature] = column.ffill().bfill()
    return imputed

# Impute each patient's visits independently (one group per NACCID).
# group_keys=False keeps the frame's original flat row index; without it, newer
# pandas versions prepend NACCID as an extra index level, which would make
# NACCID both an index level and a column.
extracted_df = extracted_df.groupby('NACCID', group_keys=False).apply(forward_and_backward_impute)

# Whatever is still missing after both fills is put back to the -4 sentinel so
# the next cell can treat it like any other missing-value code.
extracted_df[features_to_impute] = extracted_df[features_to_impute].fillna(-4)

new_csv = './fb_imputed.csv'
# write the DataFrame to a CSV file
extracted_df.to_csv(new_csv, index=False)

In [None]:
# Codes that stand for "no usable value", listed per column.
# NOTE(review): CDRSUM lists only 99, so a -4 sentinel in that column is kept as
# a real value - confirm this is intended.
missing_value_pairs = {
    "NACCMOCA": [-4, 88, 99],
    "CRAFTDRE": [-4, 95, 96, 97, 98],
    "LOGIMEM": [-4, 95, 96, 97, 98],
    "NACCMMSE": [-4, 88, 95, 96, 97, 98],
    "CDRSUM": [99],
    "BOSTON": [-4, 95, 96, 97, 98],
}

# Columns whose 95-98 codes are kept as information in their own right.
include_severity_vals = ["CRAFTDRE", "LOGIMEM", "NACCMMSE", "BOSTON"]

# Add one binary indicator column per (column, code) pair, named "<column>_<code>".
# This must run before the codes are blanked out below.
for col in include_severity_vals:
    source_values = extracted_df[col]
    for val in (95, 96, 97, 98):
        extracted_df[f"{col}_{val}"] = source_values.eq(val).astype(int)

# Turn every missing-value code into NaN so all columns share a single
# representation of "missing".
for col in missing_value_pairs:
    extracted_df[col] = extracted_df[col].replace(missing_value_pairs[col], np.nan)

# imputation_values maps each column name to a list with one mean per label,
# ordered by label value 1, 2, 3, 4 (NaN values are ignored by the mean).
label_masks = {label: extracted_df["NACCUDSD"] == label for label in range(1, 5)}

imputation_values = {}
for col in missing_value_pairs.keys():
    imputation_values[col] = [
        extracted_df.loc[mask, col].mean() for mask in label_masks.values()
    ]

# Replace each missing value with the mean of its column among rows that share
# the same label. Boolean masks are used instead of a row-by-row loop: this
# does not depend on the index layout or on the scalar type of the label, and
# rows whose label is outside 1..4 are simply left as NaN.
for col, means in imputation_values.items():
    missing = extracted_df[col].isna()
    if not missing.any():
        continue
    for mask, mean_value in zip(label_masks.values(), means):
        extracted_df.loc[missing & mask, col] = mean_value

# Optional: uncomment the lines below to restrict the dataset to a subset of labels.
# e.g. [1,4] keeps only the rows labelled 1 or 4.
#labels_to_include = [1,4]
#main_df = preprocess.filter_by_label(extracted_df, "NACCUDSD", labels_to_include)
#print(main_df)
# No filtering by default: main_df is the same object as extracted_df (not a copy).
main_df = extracted_df

# Dump the processed frame to CSV so it can be inspected by hand as a sanity check.

new_csv = './visualize_main_df.csv'
extracted_df.to_csv(new_csv, index=False)

In [None]:
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Columns whose values can be expressed as short phrases (see string_mapping)
# and therefore swapped for word-embedding vectors.
suitable_word_embedded_columns = ["COMMUN", "HOMEHOBB", "JUDGMENT", "MEMORY"]

# Drop every row that carries the missing-value code 99 in any of those columns.
has_missing_code = (main_df[suitable_word_embedded_columns] == 99).any(axis=1)
main_df = main_df[~has_missing_code]

# Every subset of the candidate columns, smallest first. The leading empty list
# is the baseline configuration in which no column is embedded.
column_combinations = [[]] + [
    list(subset)
    for size in range(1, len(suitable_word_embedded_columns) + 1)
    for subset in itertools.combinations(suitable_word_embedded_columns, size)
]

# Phrase used for each rating value before the embedding lookup.
# The code 99 has no entry: rows containing it were removed above.
string_mapping = {
    0.0: 'no symptom',
    0.5: 'uncertain symptom',
    1.0: 'mild symptom',
    2.0: 'moderate symptom',
    3.0: 'severe symptom',
}

RANDOM_SEED = 462  # same value as the random_state of the train/test split

# Train and evaluate one network per combination of word-embedded columns.
for word_embedded_columns in column_combinations:

    # Re-seed for every configuration so each one starts from the same weight
    # initialisation and batch order; this makes the runs repeatable and
    # comparable with each other.
    torch.manual_seed(RANDOM_SEED)

    dataframe = main_df.copy(deep=True)

    # Turn the selected rating columns into phrases for the embedding lookup.
    for col in word_embedded_columns:
        dataframe[col] = dataframe[col].map(string_mapping)

    # Features : X , Labels : y
    X, y = preprocess.sep_column(dataframe, "NACCUDSD")
    y = y.astype(int)

    # CrossEntropyLoss needs class indices in [0, n_classes - 1]. Map the sorted
    # label values onto that range: for labels 1..4 this equals `y - 1`, and it
    # keeps working when only a subset of labels (e.g. 1 and 4) is present.
    class_labels = sorted(int(label) for label in y.unique())
    label_to_index = {label: index for index, label in enumerate(class_labels)}
    print(f"Embedded columns: {word_embedded_columns} | class labels: {class_labels}")

    # split data into training and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED
    )

    y_train = y_train.map(label_to_index)
    y_test = y_test.map(label_to_index)

    X_train = preprocess.replace_with_word_embeddings(X_train, word_embeddings, word_embedded_columns)
    X_test = preprocess.replace_with_word_embeddings(X_test, word_embeddings, word_embedded_columns)

    X_train_tensor = torch.tensor(X_train.values.astype(np.float32))
    X_test_tensor = torch.tensor(X_test.values.astype(np.float32))
    # Class-index targets must be 64-bit integers regardless of the platform's
    # default integer width.
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

    input_size = X_train_tensor.shape[1]
    hidden_sizes = [512, 512, 256, 128, 64]           # hidden layer sizes are hyperparameters
    output_size = len(class_labels)

    model = ANN.ArtificialNeuralNetwork(input_size, hidden_sizes, output_size)

    criterion = nn.CrossEntropyLoss()                       # cross entropy value is used as loss function
    optimizer = optim.Adam(model.parameters(), lr=0.0001)    # learning rate is hyperparameter

    num_epochs = 100
    loss_list = []                                           # mean batch loss of every epoch

    model.train()
    for epoch in range(num_epochs):
        loss_of_epoch = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()               # clear the gradients left over from the previous batch
            outputs = model(inputs)             # forward pass; calling the module also runs its hooks
            loss = criterion(outputs, labels)   # loss of the predictions for this batch
            loss.backward()                     # back prop: gradients w.r.t. the model parameters
            optimizer.step()                    # update the parameters from the gradients

            loss_of_epoch += loss.item()

        loss_list.append(loss_of_epoch / len(train_loader))

        #print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

    fig, ax = plt.subplots()
    ax.plot(range(1, num_epochs + 1), loss_list, label='Training Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title(f'Training Loss Curve for replacing {" , ".join(word_embedded_columns)} attribute with word embedding vector')
    ax.legend()
    plt.show()
    plt.close(fig)                              # free the figure; one is created per combination

    # evaluation of model in test set
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        predicted = outputs.argmax(dim=1)       # index of the highest score = predicted class
        accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)

    print(f"\nTest accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test_tensor.cpu(), predicted.cpu(), digits=4))