In [None]:
import numpy as np
import torch
from torch import nn
import pandas as pd
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from matplotlib import pyplot as plt
from utils import prob_to_binary

In [None]:
# Load the raw EEG recordings from disk.
eeg = pd.read_csv("EEG_data.csv")

# The 11 columns at positions 2..12 are the model inputs; each one is
# z-scored (zero mean, unit standard deviation).
# NOTE(review): the mean/std are computed over *all* rows, i.e. before the
# train/test split performed in the next cell, so test-set statistics leak
# into the features. Consider fitting the scaling on the training rows only.
x = eeg.iloc[:, 2:13]
normalized_x = (x - x.mean()) / x.std()
X = np.array(normalized_x)
y = np.array(eeg["user-definedlabeln"])

# One integer id per (SubjectID, VideoID) pair, built without a Python-level
# row loop. The previous encoding parsed the text "<subject>.<video>" as a
# float32, which maps e.g. video 1 and video 10 of the same subject to the
# same number (1.1 == 1.10). The integer key below is collision-free and
# sorts by (subject, video), which is the same ordering the float key had
# whenever every video id is below 10.
subject_ids = eeg["SubjectID"].astype(int).to_numpy()
video_ids = eeg["VideoID"].astype(int).to_numpy()
video_stride = int(video_ids.max()) + 1 if video_ids.size else 1
groups = subject_ids * video_stride + video_ids

In [None]:
# Split rows into train/test so that every group id (one subject-video
# recording) lands entirely on one side of the split.
gss = GroupShuffleSplit(n_splits=1, train_size=.7, random_state=0)
indices = list(gss.split(X, y, groups))
for i, (train_index, test_index) in enumerate(indices):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}, group={groups[train_index]}")
    print(f"  Test:  index={test_index}, group={groups[test_index]}")

# Only one split is requested, so use its index arrays directly instead of
# relying on the loop variables still being bound after the loop.
train_ind, test_ind = indices[0]
print(train_ind.size, test_ind.size)
print("Unique subj-video combinations in test: ", pd.unique(groups[test_ind]))
train_X, train_y, train_groups = X[train_ind], y[train_ind], groups[train_ind]
test_X, test_y, test_groups = X[test_ind], y[test_ind], groups[test_ind]

# Sanity check: no recording may contribute rows to both sides of the split.
assert np.intersect1d(train_groups, test_groups).size == 0, "group leakage between train and test"

# Model and Data Objects

The recordings do not all have the same number of time steps, so I need to pad the sequences to a common length before they can be stacked into same-sized batches.

In [None]:
class Data(Dataset):
    """Dataset with one padded sequence per unique group id.

    Rows of ``X`` that share a value in ``groups`` are collected into one
    sequence (in their original row order); all sequences are then padded to
    the length of the longest one.

    Args:
        X: NumPy array of per-row features, shape ``(n_rows, n_features)``.
        y: NumPy array of per-row labels, shape ``(n_rows,)``. The label of a
            sequence is the label of its first row.
        groups: 1-D array-like of group ids, one per row.
        padding_value: Filler written into the padded time steps.

    Attributes set on the instance:
        X: float32 tensor of shape ``(n_groups, max_len, 1, n_features)``.
            The singleton axis is kept because the training/evaluation code
            in this notebook removes it with ``X.view(..., X.size(3))``.
        y: float32 tensor of shape ``(n_groups,)``.
        group_lengths: int64 tensor of shape ``(n_groups,)`` holding the true
            (unpadded) length of every sequence.
        len: number of sequences.

    Sequences are ordered by ascending group id (the order returned by
    ``torch.unique``).
    """

    def __init__(self, X, y, groups, padding_value=-100):
        groups = torch.as_tensor(groups)
        uq_groups = torch.unique(groups)
        n_groups = uq_groups.size(0)

        subj_vids = []
        labels = torch.ones(n_groups)
        # Lengths are integers by nature; pack_padded_sequence requires an
        # int64 CPU tensor and rejects the float32 tensor used previously.
        lengths = torch.ones(n_groups, dtype=torch.int64)
        for i in range(n_groups):
            # nonzero() yields a (k, 1) index array, so X[member_rows] has
            # shape (k, 1, n_features) -- this is the singleton axis
            # mentioned in the class docstring.
            member_rows = (groups == uq_groups[i]).nonzero().numpy()
            lengths[i] = member_rows.shape[0]
            subj_vids.append(torch.tensor(X[member_rows], dtype=torch.float32))
            labels[i] = float(y[member_rows[0, 0]])
        padded_batches = pad_sequence(subj_vids, batch_first=True, padding_value=padding_value)

        self.X = padded_batches # n_batches x n_timesteps x 1 x 11
        self.y = labels
        self.group_lengths = lengths
        self.len = n_groups

    def __getitem__(self, index):
        """Return ``(padded_sequence, label, true_length)`` for one group."""
        return self.X[index], self.y[index], self.group_lengths[index]

    def __len__(self):
        """Return the number of sequences (unique group ids)."""
        return self.len


In [None]:
batch_size = 40

# Instantiate training and test data
train_data = Data(train_X, train_y, train_groups)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

# The test loader yields the whole test set as a single batch.
test_data = Data(test_X, test_y, test_groups)
test_dataloader = DataLoader(dataset=test_data, batch_size=len(test_data))

# Preview the first training batch. Dedicated names are used for the loop
# variables so that the builtin `len` and the full feature/label arrays
# `X` / `y` are not overwritten at notebook scope.
for batch, (sample_X, sample_y, sample_lens) in enumerate(train_dataloader):
    print(f"Batch: {batch+1}")
    print(f"X shape: {sample_X.shape}") # need to deal with the extra dimension during training, printing adjustment below
    print(f"X modified shape: {sample_X.view(sample_X.size(0), sample_X.size(1), sample_X.size(3)).shape}")
    print(f"y shape: {sample_y.shape}")
    print(f"true length: {sample_lens}")
    break


In [None]:
class EEG_RNN(nn.Module):
    """ReLU RNN followed by a two-layer head producing one probability per sequence.

    Args:
        input_size: Number of features per time step (11 by default).
        hidden_size: Width of the recurrent state and of the hidden layer.
        output_size: Number of outputs (1, since the task is binary).
        num_layers: Number of stacked recurrent layers.
        bias: Whether the recurrent layer uses bias terms.
        dropout: Dropout probability applied to the head inputs, and between
            recurrent layers when ``num_layers > 1``.
    """

    def __init__(self, input_size=11, hidden_size=11, output_size=1, num_layers=1, bias=False, dropout=0):
        # 11 features by default, only 1 output since binary
        super().__init__()
        self.hidden_size = hidden_size
        # nn.RNN applies dropout only *between* stacked layers; passing a
        # non-zero value to a single-layer RNN has no effect except a warning.
        rnn_dropout = dropout if num_layers > 1 else 0
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, nonlinearity="relu", bias=bias, batch_first=True, dropout=rnn_dropout)
        self.layer1 = nn.Linear(hidden_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=dropout)
        # Per-sample normalisation over the feature axis. Built once here
        # rather than inside forward(); it has no learnable parameters, so
        # the state_dict keys are the same as before.
        self.layer_norm = nn.LayerNorm(hidden_size, elementwise_affine=False)

    def forward(self, x, lens, hidden_state=None):
        """Return sigmoid probabilities of shape ``(batch, output_size)``.

        Args:
            x: Either a ``PackedSequence`` or a padded ``(batch, time,
                features)`` tensor.
            lens: True sequence lengths, one per batch element. Only used to
                locate the last valid time step when ``x`` is a padded tensor.
            hidden_state: Optional initial hidden state forwarded to the RNN.
        """
        rnn_output, h_n = self.rnn(x, hidden_state)
        if isinstance(x, nn.utils.rnn.PackedSequence):
            # For packed input the RNN stops at each sequence's true end, so
            # the final hidden state of the top layer *is* the last valid
            # step, already in the original batch order. Indexing the packed
            # `.data` buffer with (lens - 1) would be wrong: that buffer is
            # interleaved by time step across the batch.
            last_step = h_n[-1]
        else:
            final_indices = (lens - 1).to(dtype=torch.long, device=rnn_output.device)
            batch_indices = torch.arange(rnn_output.size(0), device=rnn_output.device)
            last_step = rnn_output[batch_indices, final_indices]
        h_1 = self.dropout(last_step)
        h_2 = self.relu(self.layer1(h_1))
        # Normalise each sample on its own so a prediction does not depend on
        # which other sequences share the batch.
        h_2 = self.layer_norm(h_2)
        h_2 = self.dropout(h_2)
        output = self.sigmoid(self.layer2(h_2))
        return output
    

# Training Loop

In [None]:
def snapshot_state(module):
    """Return a detached copy of ``module``'s state_dict.

    ``state_dict()`` hands back references to the live parameter tensors, so
    keeping it without cloning would silently track every later optimizer
    step instead of preserving the weights at snapshot time.
    """
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


torch.manual_seed(100)
model = EEG_RNN(dropout=0.75)
num_epochs = 100
loss_values = []  # one entry per optimisation step (mini-batch)

optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCELoss()
model.train()
best_loss = np.inf
optimal_params = snapshot_state(model)
for epoch in range(num_epochs):
    # Dedicated loop names keep the notebook-level `X` / `y` arrays intact.
    for batch_X, batch_y, batch_lens in train_dataloader:
        # zero the parameter gradients
        optimizer.zero_grad()
        # drop the singleton axis: (batch, time, 1, features) -> (batch, time, features)
        batch_X = batch_X.view(batch_X.size(0), batch_X.size(1), batch_X.size(3))
        # pack_padded_sequence needs the lengths as an int64 CPU tensor
        batch_lens = batch_lens.to(dtype=torch.int64, device="cpu")
        packed_X = pack_padded_sequence(batch_X, batch_lens, batch_first=True, enforce_sorted=False) # tell model what to ignore

        # forward + backward
        pred = model(packed_X, batch_lens)
        loss = loss_fn(pred, batch_y.unsqueeze(-1))
        loss_values.append(loss.item())
        loss.backward()

        # store best performing model: snapshot *before* the optimizer step,
        # so the saved weights are the ones that produced this loss
        if loss.item() <= best_loss:
            best_loss = loss.item()
            optimal_params = snapshot_state(model)

        optimizer.step()

# load best model
model.load_state_dict(optimal_params)


In [None]:
# evaluate reduction of loss
# One loss value was recorded per optimisation step, so the x axis is sized
# from the recorded curve instead of assuming a fixed number of steps.
loss_curve = np.asarray(loss_values)
epoch_index = np.linspace(0, num_epochs, loss_curve.size)

print(loss_curve.shape)
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(epoch_index, loss_curve)
ax.set_title("Loss per Train Step")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
plt.show()
plt.close(fig)

# evaluate success on test and training sets
def evaluate_split(loader, split_name):
    """Score the global ``model`` on one split, print accuracy and plot its ROC curve.

    Args:
        loader: DataLoader that yields the whole split as a single
            ``(X, y, lengths)`` batch; only the first batch is used.
        split_name: Label used in the printed message and the plot legend,
            e.g. ``"Test"``.

    Returns:
        ``(labels, probabilities, auc, fpr, tpr)`` with labels and
        probabilities as NumPy arrays.
    """
    features, labels, seq_lens = next(iter(loader))
    print(features.size(0))
    # drop the singleton axis: (batch, time, 1, features) -> (batch, time, features)
    features = features.view(features.size(0), features.size(1), features.size(3))
    # pack_padded_sequence needs the lengths as an int64 CPU tensor
    seq_lens = seq_lens.to(dtype=torch.int64, device="cpu")
    packed = pack_padded_sequence(features, seq_lens, batch_first=True, enforce_sorted=False) # tell model what to ignore
    with torch.no_grad():
        probs = model(packed, seq_lens).numpy()
    labels = labels.numpy()

    # prob_to_binary comes from utils (not shown here); presumably it
    # thresholds probabilities into 0/1 predictions.
    accuracy = accuracy_score(labels, prob_to_binary(probs))
    print(f"{split_name.upper()} ACCURACY: ", accuracy)

    # Plot ROC
    auc = roc_auc_score(labels, probs)
    fpr, tpr, _thresholds = roc_curve(labels, probs)
    fig, ax = plt.subplots()
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve - Recurrent Network")
    ax.plot(fpr, tpr, label=f"{split_name} (AUC = {auc})")
    ax.legend()
    plt.show()
    plt.close(fig)
    return labels, probs, auc, fpr, tpr


# Switch off dropout for evaluation; otherwise the reported metrics are
# computed on randomly masked activations.
model.eval()

# `test_labels` and `rnn_preds` are reused by the model-comparison cell below.
test_labels, rnn_preds, test_auc, te_fpr, te_tpr = evaluate_split(test_dataloader, "Test")

# Re-wrap the training set so that it is scored in a single, unshuffled batch.
# (`__len__()` is called directly so this cell does not depend on the builtin
# name `len` being intact.)
full_train_loader = DataLoader(dataset=train_data, batch_size=train_data.__len__())
train_labels, train_preds, train_auc, tr_fpr, tr_tpr = evaluate_split(full_train_loader, "Train")

**TODO (paper):** Explain that the model had a tendency to either underfit or overfit, so prioritizing performance on the test data led to a model that did not do well on the training data.

In [None]:
# Hard-coded scores (30 values per list) for two baseline models. The next
# cell compares them against `test_labels` with ROC curves.
# NOTE(review): presumably pasted from another notebook's output for the same
# test split and in the same sample order -- TODO confirm; nothing in this
# notebook checks that alignment.

# Scores plotted as "Logistic Regression" in the comparison figure.
lr_preds = [0.52037767, 0.49901844, 0.56520837, 0.48942119, 0.51469586, 0.63132302,
 0.49844834, 0.42362165, 0.40950222, 0.42885294, 0.58069607, 0.6115527,
 0.51783915, 0.46300492, 0.52220805, 0.5962914,  0.52307969, 0.57684982,
 0.55639789, 0.54565716, 0.62444722, 0.59621355, 0.46035665, 0.49301864,
 0.50578362, 0.5414315,  0.55775697, 0.5598228,  0.54579473, 0.53754902]
# Scores plotted as "Gradient Boosted Trees" in the comparison figure.
gbf_preds = [0.56720806, 0.53665184, 0.62126978, 0.53470766, 0.55230807, 0.63431831,
 0.47839086, 0.42208998, 0.43901364, 0.42522622, 0.56182255, 0.60283442,
 0.56690808, 0.45534771, 0.54777996, 0.61411145, 0.48358263, 0.56873362,
 0.52046399, 0.58780633, 0.64780111, 0.64631479, 0.47580285, 0.54666121,
 0.53211724, 0.55532473, 0.54247439, 0.52227227, 0.60761442, 0.53030283]


In [None]:
# ROC curve and AUC (rounded to 3 decimals) of every model on the test labels.
# Each model keeps its own thresholds variable; the gradient-boosted model's
# thresholds were previously unpacked into `lr_thresholds` by mistake.
gbf_fpr, gbf_tpr, gbf_thresholds = roc_curve(test_labels, gbf_preds)
gbf_auc = np.round(roc_auc_score(test_labels, gbf_preds),3)
lr_fpr, lr_tpr, lr_thresholds = roc_curve(test_labels, lr_preds)
lr_auc = np.round(roc_auc_score(test_labels, lr_preds),3)
rnn_fpr, rnn_tpr, rnn_thresholds = roc_curve(test_labels, rnn_preds)
rnn_auc = np.round(roc_auc_score(test_labels, rnn_preds),3)

# Overlay the three curves in a single figure.
fig, ax = plt.subplots()
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Model Comparison")
ax.plot(gbf_fpr, gbf_tpr, label = f"Gradient Boosted Trees (AUC = {gbf_auc})")
ax.plot(lr_fpr, lr_tpr, label = f"Logistic Regression (AUC = {lr_auc})")
ax.plot(rnn_fpr, rnn_tpr, label = f"RNN (AUC = {rnn_auc})")
ax.legend()
plt.show()
plt.close(fig)