In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
from argparse import ArgumentParser
import pandas as pd

In [2]:
unk = '<UNK>'
# Consult the PyTorch documentation for information on the functions used below:
# https://pytorch.org/docs/stable/torch.html
class FFNN(nn.Module):
    def __init__(self, input_dim, h):
        super(FFNN, self).__init__()
        self.h = h
        self.W1 = nn.Linear(input_dim, h)
        self.activation = nn.ReLU() # The rectified linear unit; one valid choice of activation function
        self.output_dim = 5
        self.W2 = nn.Linear(h, self.output_dim)

        self.softmax = nn.LogSoftmax() # The softmax function that converts vectors into probability distributions; computes log probabilities for computational benefits
        self.loss = nn.NLLLoss() # The cross-entropy/negative log likelihood loss taught in class

    def compute_Loss(self, predicted_vector, gold_label):
        return self.loss(predicted_vector, gold_label)

    def forward(self, input_vector):
        # [to fill] obtain first hidden layer representation
        hidden_layer = self.activation(self.W1(input_vector))
        # [to fill] obtain output layer representation
        output_layer = self.W2(hidden_layer)
        # [to fill] obtain probability dist.
        predicted_vector = self.softmax(output_layer)
        return predicted_vector

In [3]:
# Returns: 
# vocab = A set of strings corresponding to the vocabulary
def make_vocab(data):
    vocab = set()
    for document, _ in data:
        for word in document:
            vocab.add(word)
    return vocab 


# Returns:
# vocab = A set of strings corresponding to the vocabulary including <UNK>
# word2index = A dictionary mapping word/token to its index (a number in 0, ..., V - 1)
# index2word = A dictionary inverting the mapping of word2index
def make_indices(vocab):
    vocab_list = sorted(vocab)
    vocab_list.append(unk)
    word2index = {}
    index2word = {}
    for index, word in enumerate(vocab_list):
        word2index[word] = index 
        index2word[index] = word 
    vocab.add(unk)
    return vocab, word2index, index2word 


# Returns:
# vectorized_data = A list of pairs (vector representation of input, y)
def convert_to_vector_representation(data, word2index):
    vectorized_data = []
    for document, y in data:
        vector = torch.zeros(len(word2index)) 
        for word in document:
            index = word2index.get(word, word2index[unk])
            vector[index] += 1
        vectorized_data.append((vector, y))
    return vectorized_data



def load_data(train_data, val_data):
    with open(train_data) as training_f:
        training = json.load(training_f)
    with open(val_data) as valid_f:
        validation = json.load(valid_f)

    tra = []
    val = []
    for elt in training:
        tra.append((elt["text"].split(),int(elt["stars"]-1)))
    for elt in validation:
        val.append((elt["text"].split(),int(elt["stars"]-1)))

    return tra, val

In [4]:
 # fix random seeds
#parser = ArgumentParser()
#args = parser.parse_args()
random.seed(42)
torch.manual_seed(42)
train_data = './Data_Embedding/training.json'
valid_data = './Data_Embedding/validation.json'
# load data
print("========== Loading data ==========")
train_data, valid_data = load_data(train_data, valid_data) # X_data is a list of pairs (document, y); y in {0,1,2,3,4}
vocab = make_vocab(train_data)
vocab, word2index, index2word = make_indices(vocab)

print("========== Vectorizing data ==========")
train_data = convert_to_vector_representation(train_data, word2index)
valid_data = convert_to_vector_representation(valid_data, word2index)



In [5]:
model = FFNN(input_dim = len(vocab), h = 100)
optimizer = optim.SGD(model.parameters(),lr=0.01, momentum=0.9)

In [6]:
epochs = 5
print("========== Training for {} epochs ==========".format(epochs))
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    loss = None
    correct = 0
    total = 0
    start_time = time.time()
    print("Training started for epoch {}".format(epoch + 1))
    random.shuffle(train_data) # Good practice to shuffle order of training data
    minibatch_size = 16 
    N = len(train_data) 
    for minibatch_index in tqdm(range(N // minibatch_size)):
        optimizer.zero_grad()
        loss = None
        for example_index in range(minibatch_size):
            input_vector, gold_label = train_data[minibatch_index * minibatch_size + example_index]
            predicted_vector = model(input_vector)
            predicted_label = torch.argmax(predicted_vector)
            correct += int(predicted_label == gold_label)
            total += 1
            example_loss = model.compute_Loss(predicted_vector.view(1,-1), torch.tensor([gold_label]))
            if loss is None:
                loss = example_loss
            else:
                loss += example_loss
        loss = loss / minibatch_size
        loss.backward()
        optimizer.step()
    print("Training completed for epoch {}".format(epoch + 1))
    print("Training accuracy for epoch {}: {}".format(epoch + 1, correct / total))
    print("Training time for this epoch: {}".format(time.time() - start_time))
    loss = None
    correct = 0
    total = 0
    start_time = time.time()
    print("Validation started for epoch {}".format(epoch + 1))
    minibatch_size = 16 
    N = len(valid_data) 
    for minibatch_index in tqdm(range(N // minibatch_size)):
        optimizer.zero_grad()
        loss = None
        for example_index in range(minibatch_size):
            input_vector, gold_label = valid_data[minibatch_index * minibatch_size + example_index]
            predicted_vector = model(input_vector)
            predicted_label = torch.argmax(predicted_vector)
            correct += int(predicted_label == gold_label)
            total += 1
            example_loss = model.compute_Loss(predicted_vector.view(1,-1), torch.tensor([gold_label]))
            if loss is None:
                loss = example_loss
            else:
                loss += example_loss
        loss = loss / minibatch_size
    print("Validation completed for epoch {}".format(epoch + 1))
    print("Validation accuracy for epoch {}: {}".format(epoch + 1, correct / total))
    print("Validation time for this epoch: {}".format(time.time() - start_time))

Training started for epoch 1


  return self._call_impl(*args, **kwargs)
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:53<00:00,  5.77it/s]


Training completed for epoch 1
Training accuracy for epoch 1: 0.421
Training time for this epoch: 173.23645496368408
Validation started for epoch 1


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 26.97it/s]


Validation completed for epoch 1
Validation accuracy for epoch 1: 0.52875
Validation time for this epoch: 1.8582541942596436
Training started for epoch 2


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:32<00:00,  3.67it/s]


Training completed for epoch 2
Training accuracy for epoch 2: 0.5056875
Training time for this epoch: 272.6725826263428
Validation started for epoch 2


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 25.51it/s]


Validation completed for epoch 2
Validation accuracy for epoch 2: 0.505
Validation time for this epoch: 1.9642338752746582
Training started for epoch 3


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:00<00:00,  3.33it/s]


Training completed for epoch 3
Training accuracy for epoch 3: 0.541
Training time for this epoch: 300.1188385486603
Validation started for epoch 3


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 25.82it/s]


Validation completed for epoch 3
Validation accuracy for epoch 3: 0.59125
Validation time for this epoch: 1.9412829875946045
Training started for epoch 4


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:21<00:00,  3.11it/s]


Training completed for epoch 4
Training accuracy for epoch 4: 0.5833125
Training time for this epoch: 321.3179190158844
Validation started for epoch 4


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 26.06it/s]


Validation completed for epoch 4
Validation accuracy for epoch 4: 0.55
Validation time for this epoch: 1.923403263092041
Training started for epoch 5


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:31<00:00,  3.02it/s]


Training completed for epoch 5
Training accuracy for epoch 5: 0.6125625
Training time for this epoch: 331.39816546440125
Validation started for epoch 5


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 25.80it/s]

Validation completed for epoch 5
Validation accuracy for epoch 5: 0.58375
Validation time for this epoch: 1.9428677558898926





In [None]:
 # fix random seeds
#parser = ArgumentParser()
#args = parser.parse_args()
random.seed(42)
torch.manual_seed(42)
test_data = './Data_Embedding/test.json'
valid_data = './Data_Embedding/validation.json'
# load data
print("========== Loading data ==========")
test_data, valid_data = load_data(test_data, valid_data) # X_data is a list of pairs (document, y); y in {0,1,2,3,4}
vocab = make_vocab(test_data)
vocab, word2index, index2word = make_indices(vocab)

print("========== Vectorizing data ==========")
test_data = convert_to_vector_representation(test_data, word2index)
valid_data = convert_to_vector_representation(valid_data, word2index)

In [None]:
test_data

In [None]:
test=[item[0] for item in test_data]

In [None]:
test