This notebook is for a Kaggle competition.

In [None]:
# Tons and tons of imports!
from collections import Counter, namedtuple
from itertools import chain
import json
import math
import os
from pathlib import Path
from tqdm.notebook import tqdm, trange
from typing import List, Tuple, Dict, Set, Union
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, TensorDataset
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from sklearn.decomposition import PCA
import matplotlib.pyplot as plot

In [None]:
# Colab-only: mount Google Drive at /content/drive/ so the CSV files read in the
# "Load Data" cell (paths under drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


**Load Data**

In [None]:
# Directory holding the eight competition CSVs. The path is relative, exactly as
# before, so it still resolves against the current working directory.
DATA_DIR = Path('drive/MyDrive/Colab Notebooks/kaggle_ml/data')

# One train/test pair per prefix (LF, LH, RF, RH).
LF_train = pd.read_csv(DATA_DIR / 'LF_train.csv')
LF_test = pd.read_csv(DATA_DIR / 'LF_test.csv')
LH_train = pd.read_csv(DATA_DIR / 'LH_train.csv')
LH_test = pd.read_csv(DATA_DIR / 'LH_test.csv')
RF_train = pd.read_csv(DATA_DIR / 'RF_train.csv')
RF_test = pd.read_csv(DATA_DIR / 'RF_test.csv')
RH_train = pd.read_csv(DATA_DIR / 'RH_train.csv')
RH_test = pd.read_csv(DATA_DIR / 'RH_test.csv')

**Data Processing**

In [None]:
### Preprocess LF_train into numeric numpy arrays and split them into train / validation sets.
### NOTE(review): this cell reassigns LF_train in place, so it is not idempotent —
### re-running it without reloading the CSV fails at the drop() below.

### turn literals into numericals
# Drop the 'dob', 'gait' and 'Gait' columns entirely.
LF_train = LF_train.drop(columns=['dob', 'gait', 'Gait'])
# Strip the dashes from the date string and store it as a single integer.
LF_train['forceplate_date'] = LF_train['forceplate_date'].str.replace("-","").astype(int)

### nan value
# Missing values and the three textual "no measurement" markers all become 0.
LF_train = LF_train.fillna(0)
LF_train = LF_train.replace('Not able to trot', 0)
LF_train = LF_train.replace('Not able to walk', 0)
LF_train = LF_train.replace('no data', 0)
LF_train['Speed'] = LF_train['Speed'].astype(float)

# For every column whose name contains 'V' or 'Speed', replace zeros with the column mean.
# NOTE(review): the mean is computed after the fill above, so the placeholder zeros are
# included in it; any genuine 0 measurement in these columns is overwritten as well.
for col in LF_train.columns:
    if 'V' in col or 'Speed' in col :
        LF_train[col] = LF_train[col].replace(0, LF_train[col].mean())

### sort by id
LF_train = LF_train.sort_values(by=['id'])

### extract the golden standard
# 'LF' is the target column; every remaining column (including 'id') stays a feature.
LF_train_labels = LF_train['LF']
LF_train_data = LF_train.drop("LF", axis='columns')

### turn the data frames into numpy data type
LF_train_labels = LF_train_labels.to_numpy()
LF_train_data = LF_train_data.to_numpy()

### split the data for training and evaluation - random_state is a seed
# No test_size is given, so train_test_split uses its default split proportions.
XLF_train, XLF_val, yLF_train, yLF_val = train_test_split(LF_train_data, LF_train_labels, random_state = 1)

### (optional)perform PCA and get rid of columns with low variance
### (optional)get rid of the id columns to reduce 'noise'
print("XLF_train shape: ", XLF_train.shape)
print("yLF_train shape: ", yLF_train.shape)

XLF_train shape:  (82, 367)
yLF_train shape:  (82,)


In [None]:
### Convert the numpy arrays into the torch tensors consumed by the neural networks.

# Features: float32 matrices of shape (n_samples, n_features).
XLF_train = torch.from_numpy(np.vstack(XLF_train).astype(float)).to(torch.float32)
XLF_val = torch.from_numpy(np.vstack(XLF_val).astype(float)).to(torch.float32)

# Labels: 1-D int64 class indices. The classification losses used by the models
# (CrossEntropyLoss / NLLLoss) require integer targets of shape (n_samples,); the
# previous float32 column vector of shape (n_samples, 1) is rejected by them.
# NOTE(review): assumes 'LF' holds integer class ids (the model is built with 2
# output classes) — confirm against the data, since astype(int64) truncates fractions.
yLF_train = torch.from_numpy(np.asarray(yLF_train).reshape(-1).astype(np.int64))
yLF_val = torch.from_numpy(np.asarray(yLF_val).reshape(-1).astype(np.int64))

In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 32

# Pair every feature row with its label so each loader yields (features, label) batches.
XLF_train_dset = TensorDataset(XLF_train, yLF_train)
XLF_val_dset = TensorDataset(XLF_val, yLF_val)

# Both loaders reshuffle their samples on every pass.
XLF_train_dloader = DataLoader(dataset=XLF_train_dset, batch_size=batch_size, shuffle=True)
XLF_val_dloader = DataLoader(dataset=XLF_val_dset, batch_size=batch_size, shuffle=True)

In [None]:
### sanity check for data frame, not python array - whether there's still nan values
# LF_train.isnull().any().any()

In [None]:
# # split by walk / trot
# trot_list = []
# name_list = []
# for col in LF_train.columns:
#     if 'trot' in col:
#         trot_list.append(col)

#         new_name = col.split('_trot')
#         name_list.append(new_name[0])

# LF_train_trot = pd.DataFrame(LF_train, columns = trot_list)
# LF_train_trot.columns = name_list

# both = ['id', 'gender', 'weight', 'forceplate_date', 'speed', 'age', 'Speed', 'LF']
# LF_train_both = pd.DataFrame(LF_train, columns = both)

# # add binary col is_trot
# LF_train_trot = LF_train_both.join(LF_train_trot)
# LF_train_trot['is_trot'] = 1

# LF_train_walk = LF_train.drop(columns=trot_list)
# LF_train_walk['is_trot'] = 0

# # concat walk and trot
# LF_train = pd.concat([LF_train_walk, LF_train_trot])

**FFNN construct**

In [None]:
def get_device():
    """Return "cuda:0" when a CUDA GPU is available, otherwise "cpu"."""
    if torch.cuda.is_available():
        return "cuda:0"
    return "cpu"


# Device string resolved once at cell execution time.
device = get_device()
# Setting seed ***DO NOT MODIFY***
torch.manual_seed(123)

################################################################################
#########################             ADDED             ########################
################################################################################
def weight_init(m):
    """Initialise one module in place; intended for use with ``model.apply(weight_init)``.

    ``nn.Linear`` layers get Xavier-uniform weights and a zero bias. Every other
    module type is left untouched.

    :parameter m: the module visited by ``nn.Module.apply``.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        # Linear(..., bias=False) has m.bias set to None; there is nothing to zero then.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.)

# Consult the PyTorch documentation for information on the functions used below:
# https://pytorch.org/docs/stable/torch.html

class FFNN(nn.Module):
    """Feed-forward classifier with one hidden layer.

    Pipeline: Linear(input_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, output_dim)
    -> LogSoftmax over the last axis. ``forward`` therefore returns log-probabilities.

    :parameter input_dim: number of input features per sample.
    :parameter hidden_dim: width of the hidden layer.
    :parameter output_dim: number of classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FFNN, self).__init__()
        # Uniform per-class weights. Kept as an attribute; it is not passed to the loss.
        self.loss_class_weights = torch.tensor([1, 1],
                                               dtype=torch.float)

        ### Linear layers:
        ###		https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        self.W = nn.Linear(input_dim, hidden_dim)
        self.W_x = nn.Linear(hidden_dim, output_dim)

        ### ReLU:
        ###		https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
        self.relu = nn.ReLU()
        ### LogSoftmax over the class axis (the last one, whatever the input rank):
        ###		https://pytorch.org/docs/stable/generated/torch.nn.LogSoftmax.html
        self.softmax = nn.LogSoftmax(dim=-1)
        ### NLLLoss:
        ###		https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html
        # forward() already returns log-probabilities, so NLLLoss is the matching
        # criterion. CrossEntropyLoss would apply log-softmax a second time, which
        # gives the same value but repeats the work.
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        """Return the mean negative log-likelihood of ``gold_label``.

        :parameter predicted_vector: log-probabilities of shape (N, output_dim).
        :parameter gold_label: integer (int64) class indices of shape (N,).
        """
        return self.loss(predicted_vector, gold_label)

    def forward(self, input_vector):
        """Map a float tensor of shape (..., input_dim) to log-probabilities of
        shape (..., output_dim).

        The leading axes are passed through unchanged. The previous version reshaped
        the result to (batch, input_dim, -1), which cannot succeed for plain
        (batch, input_dim) feature rows.
        """
        z1 = self.W(input_vector)
        z1_relu = self.relu(z1)
        z2 = self.W_x(z1_relu)
        predicted_vector = self.softmax(z2)
        return predicted_vector

    def load_model(self, save_path):
        """Load parameters previously written by ``save_model``.

        The checkpoint is read onto the CPU and then copied into this model's
        parameters, so a file saved on a GPU machine also loads without CUDA.
        """
        self.load_state_dict(torch.load(save_path, map_location="cpu"))

    def save_model(self, save_path):
        """Write this model's ``state_dict`` to ``save_path``."""
        torch.save(self.state_dict(), save_path)


**FFNN training**

In [None]:
# Setting seed ***DO NOT MODIFY***
torch.manual_seed(123)
################################################################################
#########################             CHANGED             ######################
################################################################################
def train_epoch(model, train_loader, optimizer):
  """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

  For every batch: zero the gradients, run the model, compute
  ``model.compute_Loss(output, labels)`` (two-argument form), back-propagate and
  step the optimizer. Prints the mean loss and the training accuracy.

  :parameter model: module exposing ``compute_Loss(predicted, gold)``.
  :parameter train_loader: iterable of ``(input_batch, expected_out)`` pairs.
  :parameter optimizer: optimizer built over ``model.parameters()``.
  :returns: average loss per batch as a Python float.
  :raises ValueError: if ``train_loader`` yields no batches.
  """
  model.train()
  # Run on whatever device the model already lives on, rather than mixing the
  # notebook-level `device` global with get_device().
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")

  total = 0
  batch = 0
  total_loss = 0
  correct = 0
  for (input_batch, expected_out) in tqdm(train_loader, leave=False, desc="Training Batches"):
    optimizer.zero_grad()
    batch += 1
    # The classification loss needs 1-D integer class indices, whatever dtype or
    # shape the dataset stores the labels in.
    flattened_expected_out = expected_out.reshape(-1).long().to(model_device)
    output = model(input_batch.to(model_device))
    flattened_output = output.reshape(-1, output.shape[-1])
    loss = model.compute_Loss(flattened_output, flattened_expected_out)
    # Tensor.numel() replaces the non-existent torch.size(...) call.
    total += flattened_expected_out.numel()
    flattened_predicted = flattened_output.argmax(dim=-1)
    correct += (flattened_predicted == flattened_expected_out).sum().item()
    total_loss += loss.item()
    loss.backward()
    optimizer.step()

  if batch == 0:
    raise ValueError("train_loader produced no batches")

  print("Loss: " + str(total_loss/batch))
  print("Training Accuracy: " + str(correct/total))
  return total_loss/batch

In [None]:
# Setting seed ***DO NOT MODIFY***
torch.manual_seed(123)

def evaluation(model, val_loader, optimizer):
  """Score ``model`` on ``val_loader`` without updating it; return the mean batch loss.

  Prints the mean validation loss and the validation accuracy.

  :parameter model: module exposing ``compute_Loss(predicted, gold)``.
  :parameter val_loader: iterable of ``(input_batch, expected_out)`` pairs.
  :parameter optimizer: unused; kept so existing callers keep working.
  :returns: average loss per batch as a Python float.
  :raises ValueError: if ``val_loader`` yields no batches.
  """
  model.eval()
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")

  loss_sum = 0.0
  batches = 0
  correct = 0
  total = 0
  # No gradients are needed for scoring; skipping them avoids building a graph.
  with torch.no_grad():
    for (input_batch, expected_out) in tqdm(val_loader, leave=False, desc="Validation Batches"):
      batches += 1
      gold = expected_out.reshape(-1).long().to(model_device)
      output = model(input_batch.to(model_device))
      flattened_output = output.reshape(-1, output.shape[-1])
      # Count samples (not the size of axis 1) and compare 1-D predictions with 1-D
      # labels, so the equality test cannot broadcast into an (N, N) matrix.
      total += gold.numel()
      correct += (flattened_output.argmax(dim=-1) == gold).sum().item()
      loss_sum += model.compute_Loss(flattened_output, gold).item()

  if batches == 0:
    raise ValueError("val_loader produced no batches")

  mean_loss = loss_sum / batches
  print("Validation Loss: " + str(mean_loss))
  print("Validation Accuracy: " + str(correct/total))
  print()
  return mean_loss

In [None]:
# Setting seed ***DO NOT MODIFY***
torch.manual_seed(123)
def train_and_evaluate(number_of_epochs, model, train_loader, val_loader, min_loss=0, lr=.001):
  """Alternate training and validation for up to ``number_of_epochs`` epochs.

  An AdamW optimizer (weight decay 0.01) is created over ``model.parameters()``.
  After each epoch the training and validation losses are recorded; training stops
  early once the training loss is at or below ``min_loss``.

  :returns: ``[train_losses, val_losses]``, one entry per completed epoch.
  """
  optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=.01)
  train_history = []
  val_history = []
  for _ in trange(number_of_epochs, desc="Epochs"):
    train_loss = train_epoch(model, train_loader, optimizer)
    train_history.append(train_loss)
    val_history.append(evaluation(model, val_loader, optimizer))
    # The validation pass above still runs for the epoch that triggers the stop.
    if train_loss <= min_loss:
      break
  return [train_history, val_history]

In [None]:
# Setting seed ***DO NOT MODIFY***
torch.manual_seed(123)

### Create the model. The input width is read from the training tensor instead of a
### hard-coded 367, so it follows any change in the preprocessing above.
### 150 is the hidden width and 2 the number of classes.
model = FFNN(XLF_train.shape[1], 150, 2)

### Initialize model weights
model.apply(weight_init)

### Train for up to 4 epochs (stops early once the training loss is <= 0.2; lr = 0.001).
### .to(get_device()) replaces .cuda() so the cell also runs on a CPU-only runtime.
### NOTE: result_model holds the [train_losses, val_losses] history, not a model.
result_model = train_and_evaluate(4, model.to(get_device()), XLF_train_dloader, XLF_val_dloader, 0.2, 0.001)


Epochs:   0%|          | 0/4 [00:00<?, ?it/s]

Training Batches:   0%|          | 0/3 [00:00<?, ?it/s]

RuntimeError: ignored

In [None]:
# TODO : add a single line code that saves your model in order to prevent re-training the model for later use.

# Writes the trained FFNN's state_dict to "ffnn_kaggle.pth" (relative to the working directory).
model.save_model("ffnn_kaggle.pth")

In [None]:
# Example of how to load
# The constructor takes (input_dim, hidden_dim, output_dim) and the dimensions must
# match the model that was saved, otherwise load_state_dict rejects the checkpoint.
ffnn = FFNN(XLF_train.shape[1], 150, 2)
ffnn.load_model("ffnn_kaggle.pth")
ffnn = ffnn.to(get_device())

**Single hidden Layer RNN**

In [None]:
################################################################################
#########################             CHANGED             ######################
################################################################################
class RNN(nn.Module):
    """Single-hidden-layer recurrent tagger built from plain linear layers.

    At every time step ``t`` the token embedding and the previous hidden state are
    combined as ``h_t = ReLU(hidden_layer(h_{t-1}) + input_layer(embedding_t))`` and
    ``LogSoftmax(output_layer(h_t))`` is emitted.

    :parameter embedding_dim: size of each token embedding.
    :parameter hidden_dim: size of the recurrent hidden state.
    :parameter output_dim: number of output classes.
    :parameter vocab_size: number of rows in the embedding table.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, max_norm=True)
        # Per-class weights. Kept as an attribute; it is not passed to the loss.
        self.loss_class_weights = torch.tensor([0.5, 1, 1, 1, 1, 1, 1, 1, 1],
                                               dtype=torch.float)
        # 1. An input layer
        self.input_layer = nn.Linear(embedding_dim, hidden_dim)
        # 2. A hidden (state-to-state) layer
        self.hidden_layer = nn.Linear(hidden_dim, hidden_dim)
        # 3. An output layer
        self.output_layer = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.lsmax = nn.LogSoftmax(dim=-1)
        # forward() returns log-probabilities, so NLLLoss is the matching criterion
        # (CrossEntropyLoss would apply log-softmax a second time).
        self.loss = nn.NLLLoss()

        # Template for the initial hidden state. It is a non-persistent buffer, so it
        # follows the module across .to()/.cuda() without entering the state_dict.
        # This replaces a tensor pinned to "cuda:0" that failed on CPU-only machines.
        self.register_buffer("h_layer",
                             torch.zeros(1, hidden_dim, dtype=torch.float),
                             persistent=False)

    def compute_Loss(self, predicted_vector, gold_label, masks):
        """Return the loss over the positions selected by ``masks``.

        ``masks`` is used to index both ``predicted_vector`` and ``gold_label``.
        """
        return self.loss(predicted_vector[masks], gold_label[masks])

    def forward(self, inputs):
        """Process token ids one time step at a time.

        :parameter inputs: integer tensor of shape (batch, seq_len) holding token ids.
        :returns: log-probabilities of shape (batch, seq_len, output_dim).
        """
        batch_size, seq_len = inputs.shape[0], inputs.shape[1]
        # (1, hidden_dim) zeros; broadcasts against the batch on the first step.
        h_i = torch.zeros_like(self.h_layer)
        step_outputs = []
        # The sequence length comes from the input itself, not a notebook-level
        # `max_len` global.
        for t in range(seq_len):
            embeddings = self.embedding(inputs[:, t])
            h_i = self.relu(self.hidden_layer(h_i) + self.input_layer(embeddings))
            step_outputs.append(self.lsmax(self.output_layer(h_i)))

        if not step_outputs:
            # Zero-length sequences: return an empty, correctly shaped tensor.
            return torch.zeros(batch_size, 0, self.output_layer.out_features,
                               device=self.h_layer.device)
        # One stack along a new time axis gives the same (batch, seq_len, output_dim)
        # layout as the previous repeated cat + reshape, without the quadratic copy.
        return torch.stack(step_outputs, dim=1)

    def load_model(self, save_path):
        """Load parameters written by ``save_model``. The checkpoint is read onto
        the CPU first, so it also loads on machines without CUDA."""
        self.load_state_dict(torch.load(save_path, map_location="cpu"))

    def save_model(self, save_path):
        """Write this model's ``state_dict`` to ``save_path``."""
        torch.save(self.state_dict(), save_path)

In [None]:
### TODO: add code for creating model (check updated header for RNN)
# Arguments: embedding_dim=150, hidden_dim=100, output_dim=9, vocab_size=12414.
rnn = RNN(150, 100, 9, 12414)

### Initialize model weights
rnn.apply(weight_init)

### TODO: train and evaluate the model with the functions and data above ###
# .to(get_device()) replaces .cuda() so the cell does not require a GPU.
# NOTE(review): train_loader / val_loader are not defined in the visible part of this
# notebook, and train_epoch calls compute_Loss with two arguments while
# RNN.compute_Loss also expects `masks` — confirm before running.
result_model = train_and_evaluate(6, rnn.to(get_device()), train_loader, val_loader, 0.05, 0.001)

# print("I'm not completed yet!")

In [None]:
# Write the RNN's state_dict to 'rnn_kaggle.pth', then read the same file back into the model.
rnn.save_model('rnn_kaggle.pth')
rnn.load_model('rnn_kaggle.pth')
# Place the model on the device reported by get_device().
rnn = rnn.to(get_device())

**Multiple hidden layer RNN**

In [None]:
class RNN2(nn.Module):
    """Recurrent tagger with ``hidden_layers`` stacked hidden states.

    At each time step, layer 0 combines its previous state with the token embedding.
    Layer ``k > 0`` combines its own previous state with the freshly updated state
    of layer ``k - 1`` (through ``hid2hid[k - 1]``). The top layer feeds the output
    layer followed by LogSoftmax.

    :parameter embedding_dim: size of each token embedding.
    :parameter hidden_dim: size of every hidden state.
    :parameter output_dim: number of output classes.
    :parameter vocab_size: number of rows in the embedding table.
    :parameter hidden_layers: number of stacked recurrent layers (>= 1).
    :raises ValueError: if ``hidden_layers`` is smaller than 1.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size, hidden_layers = 1):
        super(RNN2, self).__init__()
        if hidden_layers < 1:
            # forward() reads the state of the top layer; zero layers cannot work.
            raise ValueError("hidden_layers must be >= 1")
        self.embedding = nn.Embedding(vocab_size, embedding_dim, max_norm=True)
        # Per-class weights. Kept as an attribute; it is not passed to the loss.
        self.loss_class_weights = torch.tensor([0.5, 1, 1, 1, 1, 1, 1, 1, 1],
                                               dtype=torch.float)
        # 1. An input layer
        self.input_layer = nn.Linear(embedding_dim, hidden_dim)
        # 2. One state-to-state layer per recurrent layer
        self.hidden_layer = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(hidden_layers)])
        # 3. An output layer
        self.output_layer = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.lsmax = nn.LogSoftmax(dim=-1)
        # forward() returns log-probabilities, so NLLLoss is the matching criterion
        # (CrossEntropyLoss would apply log-softmax a second time).
        self.loss = nn.NLLLoss()
        # Activations recorded by forward(..., training=False).
        self.save_act = []
        self.hidden_dim = hidden_dim
        # Connections from layer k-1 to layer k (one fewer than the number of layers).
        self.hid2hid = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(hidden_layers-1)])

    def compute_Loss(self, predicted_vector, gold_label, masks):
        """Return the loss over the positions selected by ``masks``.

        ``masks`` is used to index both ``predicted_vector`` and ``gold_label``.
        """
        return self.loss(predicted_vector[masks], gold_label[masks])

    def forward(self, inputs, training = True):
        """Process token ids one time step at a time.

        :parameter inputs: integer tensor of shape (batch, seq_len) holding token ids.
        :parameter training: when False, every hidden state and every output is
            appended to ``self.save_act`` (per step: one entry per layer, then the
            output). When True nothing is recorded — the list used to grow on every
            training step, retaining tensors for the whole run.
        :returns: log-probabilities of shape (batch, seq_len, output_dim).
        """
        # Follow the model's own device instead of a hard-coded "cuda:0".
        model_device = self.embedding.weight.device
        batch_size, seq_len = inputs.shape[0], inputs.shape[1]
        h_i = [torch.zeros(1, self.hidden_dim, dtype=torch.float, device=model_device)
               for _ in range(len(self.hidden_layer))]
        step_outputs = []
        # The sequence length comes from the input itself, not a notebook-level
        # `max_len` global.
        for t in range(seq_len):
            embeddings = self.embedding(inputs[:, t])
            for count, hidden in enumerate(self.hidden_layer):
                if count == 0:
                    h_i[count] = self.relu(hidden(h_i[count]) + self.input_layer(embeddings))
                else:
                    h_i[count] = self.relu(hidden(h_i[count]) + self.hid2hid[count - 1](h_i[count - 1]))
                if not training:
                    self.save_act.append(h_i[count])
            output_i = self.lsmax(self.output_layer(h_i[-1]))
            if not training:
                self.save_act.append(output_i)
            step_outputs.append(output_i)

        if not step_outputs:
            # Zero-length sequences: return an empty, correctly shaped tensor.
            return torch.zeros(batch_size, 0, self.output_layer.out_features,
                               device=model_device)
        # Stack along a new time axis -> (batch, seq_len, output_dim).
        return torch.stack(step_outputs, dim=1)


    def load_model(self, save_path):
        """Load parameters written by ``save_model``. The checkpoint is read onto
        the CPU first, so it also loads on machines without CUDA."""
        self.load_state_dict(torch.load(save_path, map_location="cpu"))

    def save_model(self, save_path):
        """Write this model's ``state_dict`` to ``save_path``."""
        torch.save(self.state_dict(), save_path)

In [None]:
### TODO : Train and evaluate your RNN2 ###
# Arguments: embedding_dim=150, hidden_dim=192, output_dim=9, vocab_size=12414, hidden_layers=3.
rnn_2 = RNN2(150, 192, 9, 12414, 3)

### Initialize model weights
rnn_2.apply(weight_init)

### TODO: train and evaluate the model with the functions and data above ###
# .to(get_device()) replaces .cuda() so the cell does not require a GPU.
# NOTE(review): train_loader / val_loader are not defined in the visible part of this
# notebook, and train_epoch calls compute_Loss with two arguments while
# RNN2.compute_Loss also expects `masks` — confirm before running.
result_model = train_and_evaluate(2, rnn_2.to(get_device()), train_loader, val_loader, 0.2, 0.001)

# print("I'm not completed yet!")

In [None]:
# Write the trained RNN2's state_dict to disk.
rnn_2.save_model('rnn_2_kaggle.pth')
# Build a fresh model with the same constructor arguments and load the saved weights into it.
rnn_2 = RNN2(150, 192, 9, 12414, 3)
rnn_2.load_model('rnn_2_kaggle.pth')
# Place the reloaded model on the device reported by get_device().
rnn_2 = rnn_2.to(get_device())

**Create submission**

In [None]:
import itertools
from pyparsing.helpers import TokenConverter
### TODO : pass the processed test data through the model ###

# NOTE(review): test_loader and processed_test are not defined in the visible part of
# this notebook — confirm they exist before running this cell.
with torch.no_grad():
  # Switch the model that actually produces the predictions to inference mode
  # (this used to call ffnn.eval() while predicting with rnn).
  rnn.eval()
  batch_predictions = []
  for (input_batch, expected_out, batch_mask) in tqdm(test_loader, leave=False, desc="Validation Batches"):
    output = rnn(input_batch.to(get_device()))
    # Index of the highest-scoring class for every token position.
    _, predicted = torch.max(output, -1)
    batch_predictions.append(predicted.reshape(-1).to("cpu"))

  # One concatenation at the end instead of growing a tensor inside the loop.
  if batch_predictions:
    flattened_predicted_all = torch.cat(batch_predictions)
  else:
    flattened_predicted_all = torch.empty(0, dtype=torch.long)

  # mask
  flattened_mask = list(itertools.chain.from_iterable(processed_test['mask']))
  if len(flattened_mask) < len(flattened_predicted_all):
    raise ValueError("mask is shorter than the list of predictions")

  # Keep only the predictions whose mask entry equals 1.
  # The class ids are now Python ints rather than floats; they compare and hash
  # equal, so the dictionary lookups in the next cell behave the same.
  real_predicted = [
      label_id
      for label_id, keep in zip(flattened_predicted_all.tolist(), flattened_mask)
      if keep == 1
  ]


In [None]:
### TODO : extract labels and indices for model predictions of named entities ###

# Done
# Flatten the per-sentence index lists into one list of token indices.
indices = list(chain.from_iterable(test['index']))
# Invert category_map (tag -> number) so predicted numbers map back to tag strings.
num2tag = {number: tag for tag, number in category_map.items()}
res = [num2tag[num] for num in real_predicted]

In [None]:
def format_output_labels(token_labels, token_indices):
    """
    Returns a dictionary that has the labels (LOC, ORG, MISC or PER) as the keys, 
    with the associated value being the list of entities predicted to be of that key label. 
    Each entity is specified by its starting and ending position indicated in [token_indices].

    Eg. if [token_labels] = ["B-ORG", "I-ORG", "O", "O", "B-ORG"]
           [token_indices] = [15, 16, 17, 18, 19]
        then dictionary returned is 
        {'LOC': [], 'MISC': [], 'ORG': [(15, 16), (19, 19)], 'PER': []}

    An empty [token_labels] yields the dictionary with four empty lists (this used
    to raise IndexError when [token_indices] was empty too).

    :parameter token_labels: A list of token labels (eg. B-PER, I-PER, B-LOC, I-LOC, B-ORG, I-ORG, B-MISC, OR I-MISC).
    :type token_labels: List[String]
    :parameter token_indices: A list of token indices (taken from the dataset) 
                              corresponding to the labels in [token_labels].
    :type token_indices: List[int]
    :raises KeyError: if a label names an entity type other than LOC, MISC, ORG or PER.
    """
    label_dict = {"LOC":[], "MISC":[], "ORG":[], "PER":[]}
    if not token_labels:
        return label_dict

    prev_label = 'O'
    start = None  # index where the currently open entity began, if any
    for idx, label in enumerate(token_labels):
        # "B-ORG" / "I-ORG" -> "ORG"; "O" stays "O".
        curr_label = label.split('-')[-1]
        # A new span begins on an explicit B- tag or whenever the entity type changes.
        if label.startswith('B-') or curr_label != prev_label:
            if prev_label != 'O':
                # Close the entity that ended on the previous token.
                label_dict[prev_label].append((start, token_indices[idx-1]))
            start = token_indices[idx] if curr_label != 'O' else None

        prev_label = curr_label

    # Close an entity that runs up to the final token.
    if start is not None and prev_label != 'O':
        label_dict[prev_label].append((start, token_indices[idx]))
    return label_dict

In [None]:
import csv

def create_submission(output_filepath, token_labels, token_inds):
    """
    Write the predicted entity spans to a two-column CSV (Id, Predicted).

    One row is written per entity type returned by format_output_labels; the
    Predicted field holds that type's spans as space-separated "start-end" pairs
    (empty when the type has no spans).

    :parameter output_filepath: The full path (including file name) of the output file, 
                                with extension .csv
    :type output_filepath: [String]
    :parameter token_labels: A list of token labels (eg. PER, LOC, ORG or MISC).
    :type token_labels: List[String]
    :parameter token_inds: A list of token indices (taken from the dataset) 
                           corresponding to the labels in [token_labels].
    :type token_inds: List[int]
    """
    label_dict = format_output_labels(token_labels, token_inds)
    # newline='' hands line-ending control to the csv module, as its documentation
    # requires; without it, platforms that translate '\n' write an extra '\r'.
    with open(output_filepath, mode='w', newline='') as csv_file:
        fieldnames = ['Id', 'Predicted']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for key, spans in label_dict.items():
            p_string = " ".join(str(start) + "-" + str(end) for start, end in spans)
            writer.writerow({'Id': key, 'Predicted': p_string})

In [None]:
# Write the predicted entity spans (tags in `res`, token positions in `indices`) to a CSV under drive/MyDrive.
create_submission('drive/MyDrive/Colab Notebooks/rnn_kaggle.csv', res, indices)