In [None]:
import sys
# Make modules in the parent folder importable ('../' is resolved against the
# current working directory). Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [None]:
from torchtext.vocab import GloVe

# Load pre-trained GloVe word vectors (the '6B' set, 100 dimensions per word).
# Note: GloVe is a different embedding method from Word2Vec.
word_embeddings = GloVe(name='6B', dim=100)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from artificial_nn import ANN
from util import preprocess

# Read the raw NACC export; the path is relative to the notebook's working directory.
data = pd.read_csv("../../nacc/newer/investigator_ftldlbd_nacc65.csv")

# Columns kept for the experiments. NACCUDSD is the column that is later
# separated out as the label; the others are used as features.
columns_to_use = [
    'NACCMOCA', 'CRAFTDRE', 'COMMUN', 'NACCMMSE', 'HOMEHOBB', 'JUDGMENT',
    'LOGIMEM', 'CDRSUM', 'MEMORY', 'BOSTON', 'NACCUDSD',
]
extracted_df = data.loc[:, columns_to_use]

# Save the reduced table next to the notebook, without the index column.
new_csv = './nacc_processed.csv'
extracted_df.to_csv(new_csv, index=False)


In [None]:
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Columns whose numeric scores are candidates for being replaced by a word
# (via string_mapping) and then by that word's embedding vector.
suitable_word_embedded_columns = ['COMMUN', 'HOMEHOBB', 'JUDGMENT', 'MEMORY']

# Every subset of the candidate columns. r = 0 contributes one empty
# combination, so the first run is the baseline with no column replaced.
column_combinations = [
    list(combination)
    for r in range(len(suitable_word_embedded_columns) + 1)
    for combination in itertools.combinations(suitable_word_embedded_columns, r)
]

# Numeric score -> word that is later looked up in the embedding table.
# NOTE(review): Series.map yields NaN for any score that is not a key here;
# confirm that preprocess.replace_with_word_embeddings copes with that.
string_mapping = {
    0.0: 'none',
    0.5: 'ambiguous',
    1.0: 'mild',
    2.0: 'moderate',
    3.0: 'severe',
    99 : 'default'
}

# Single seed shared by the train/test split and torch's random number generator.
SEED = 462

for word_embedded_columns in column_combinations:
    # Re-seed for every combination so weight initialisation and batch
    # shuffling are repeatable across kernel restarts.
    torch.manual_seed(SEED)

    # Work on a private copy so extracted_df stays untouched between combinations.
    dataframe = extracted_df.copy(deep=True)

    for col in word_embedded_columns:
        dataframe[col] = dataframe[col].map(string_mapping)

    # Features : X , Labels : y
    X, y = preprocess.sep_column(dataframe, "NACCUDSD")

    # split data into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

    # Shift the labels down by one for PyTorch: the targets are used to index
    # the output tensor, so for this use case they should lie in [0, 3].
    y_train = y_train - 1
    y_test = y_test - 1

    X_train = preprocess.replace_with_word_embeddings(X_train, word_embeddings, word_embedded_columns)
    X_test = preprocess.replace_with_word_embeddings(X_test, word_embeddings, word_embedded_columns)

    X_train_tensor = torch.tensor(X_train.values.astype(np.float32))
    X_test_tensor = torch.tensor(X_test.values.astype(np.float32))
    # CrossEntropyLoss expects class-index targets as int64; cast explicitly
    # instead of relying on whatever dtype pandas inferred for the label column.
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

    input_size = X_train_tensor.shape[1]
    hidden_sizes = [512, 256, 128, 64, 32]           # hidden layer sizes are hyperparameters
    output_size = len(y.unique())                    # one output unit per distinct label

    model = ANN.ArtificialNeuralNetwork(input_size, hidden_sizes, output_size)

    criterion = nn.CrossEntropyLoss()                       # cross entropy value is used as loss function
    optimizer = optim.Adam(model.parameters(), lr=0.01)    # learning rate is hyperparameter

    num_epochs = 100
    loss_list = []                                   # mean training loss of each epoch

    for epoch in range(num_epochs):
        loss_of_epoch = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()               # clear the gradients before the new batch's back prop
            outputs = model(inputs)             # forward pass; calling the module also runs any hooks
            loss = criterion(outputs, labels)   # loss value of the predictions
            loss.backward()                     # back prop: gradients w.r.t. the model params
            optimizer.step()                    # update the weights according to LR and gradient

            loss_of_epoch += loss.item()

        # Log the same per-epoch mean that is plotted below, rather than the
        # loss of whichever mini-batch happened to come last.
        epoch_loss = loss_of_epoch / len(train_loader)
        loss_list.append(epoch_loss)

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}")

    fig, ax = plt.subplots()
    ax.plot(range(1, num_epochs + 1), loss_list, label='Training Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title(f'Training Loss Curve for replacing {" , ".join(word_embedded_columns)} attribute with word embedding vector')
    ax.legend()
    plt.show()
    plt.close(fig)                              # one figure per combination; release it once shown

    # evaluation of model in test set
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)    # index of the largest output = predicted class
        accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)
    print(f"\nTest accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test_tensor.cpu(), predicted.cpu(), digits=4))