In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
import string
from argparse import ArgumentParser
import pickle

In [2]:
unk = '<UNK>'
# Consult the PyTorch documentation for information on the functions used below:
# https://pytorch.org/docs/stable/torch.html
class RNN(nn.Module):
    def __init__(self, input_dim, h):  # Add relevant parameters
        super(RNN, self).__init__()
        self.h = h
        self.numOfLayer = 1
        self.rnn = nn.RNN(input_dim, h, self.numOfLayer, nonlinearity='tanh')
        self.W = nn.Linear(h, 5)
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        return self.loss(predicted_vector, gold_label)

    def forward(self, inputs):
        # [to fill] obtain hidden layer representation (https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)
        outputs, hidden = self.rnn(inputs)
        # [to fill] obtain output layer representations
        output_layer = self.W(outputs)
        # [to fill] sum over output 
        sumed_outputs = torch.sum(output_layer, dim=0)
        # [to fill] obtain probability dist.
        predicted_vector = self.softmax(sumed_outputs)
        return predicted_vector


def load_data(train_data, val_data):
    with open(train_data) as training_f:
        training = json.load(training_f)
    with open(val_data) as valid_f:
        validation = json.load(valid_f)

    tra = []
    val = []
    for elt in training:
        tra.append((elt["text"].split(),int(elt["stars"]-1)))
    for elt in validation:
        val.append((elt["text"].split(),int(elt["stars"]-1)))
    return tra, val

In [8]:
train_data = './Data_Embedding/training.json'
valid_data = './Data_Embedding/validation.json'
print("========== Loading data ==========")
train_data, valid_data = load_data(train_data, valid_data) # X_data is a list of pairs (document, y); y in {0,1,2,3,4}
hidden_dim = 100
# Think about the type of function that an RNN describes. To apply it, you will need to convert the text data into vector representations.
# Further, think about where the vectors will come from. There are 3 reasonable choices:
# 1) Randomly assign the input to vectors and learn better embeddings during training; see the PyTorch documentation for guidance
# 2) Assign the input to vectors using pretrained word embeddings. We recommend any of {Word2Vec, GloVe, FastText}. Then, you do not train/update these embeddings.
# 3) You do the same as 2) but you train (this is called fine-tuning) the pretrained embeddings further.
# Option 3 will be the most time consuming, so we do not recommend starting with this

print("========== Vectorizing data ==========")
model = RNN(50, hidden_dim)  # Fill in parameters
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.01)
word_embedding = pickle.load(open('./Data_Embedding/word_embedding.pkl', 'rb'))

stopping_condition = False
epoch = 0

last_train_accuracy = 0
last_validation_accuracy = 0

while not stopping_condition:
    random.shuffle(train_data)
    model.train()
    # You will need further code to operationalize training, ffnn.py may be helpful
    print("Training started for epoch {}".format(epoch + 1))
    train_data = train_data
    correct = 0
    total = 0
    minibatch_size = 16
    N = len(train_data)

    loss_total = 0
    loss_count = 0
    for minibatch_index in tqdm(range(N // minibatch_size)):
        optimizer.zero_grad()
        loss = None
        for example_index in range(minibatch_size):
            input_words, gold_label = train_data[minibatch_index * minibatch_size + example_index]
            input_words = " ".join(input_words)

            # Remove punctuation
            input_words = input_words.translate(input_words.maketrans("", "", string.punctuation)).split()

            # Look up word embedding dictionary
            vectors = [word_embedding[i.lower()] if i.lower() in word_embedding.keys() else word_embedding['unk'] for i in input_words ]

            # Transform the input into required shape
            vectors = torch.tensor(vectors).view(len(vectors), 1, -1)
            output = model(vectors)

            # Get loss
            example_loss = model.compute_Loss(output.view(1,-1), torch.tensor([gold_label]))

            # Get predicted label
            predicted_label = torch.argmax(output)

            correct += int(predicted_label == gold_label)
            # print(predicted_label, gold_label)
            total += 1
            if loss is None:
                loss = example_loss
            else:
                loss += example_loss

        loss = loss / minibatch_size
        loss_total += loss.data
        loss_count += 1
        loss.backward()
        optimizer.step()
    print(loss_total/loss_count)
    print("Training completed for epoch {}".format(epoch + 1))
    print("Training accuracy for epoch {}: {}".format(epoch + 1, correct / total))
    trainning_accuracy = correct/total

    model.eval()
    correct = 0
    total = 0
    random.shuffle(valid_data)
    print("Validation started for epoch {}".format(epoch + 1))
    valid_data = valid_data

    for input_words, gold_label in tqdm(valid_data):
        input_words = " ".join(input_words)
        input_words = input_words.translate(input_words.maketrans("", "", string.punctuation)).split()
        vectors = [word_embedding[i.lower()] if i.lower() in word_embedding.keys() else word_embedding['unk'] for i
                   in input_words]

        vectors = torch.tensor(vectors).view(len(vectors), 1, -1)
        output = model(vectors)
        predicted_label = torch.argmax(output)
        correct += int(predicted_label == gold_label)
        total += 1
        # print(predicted_label, gold_label)
    print("Validation completed for epoch {}".format(epoch + 1))
    print("Validation accuracy for epoch {}: {}".format(epoch + 1, correct / total))
    validation_accuracy = correct/total

    if validation_accuracy < last_validation_accuracy and trainning_accuracy > last_train_accuracy:
        stopping_condition=True
        print("Training done to avoid overfitting!")
        print("Best validation accuracy is:", last_validation_accuracy)
    else:
        last_validation_accuracy = validation_accuracy
        last_train_accuracy = trainning_accuracy

    epoch += 1

Training started for epoch 1


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:14<00:00,  3.94it/s]


tensor(2.2633)
Training completed for epoch 1
Training accuracy for epoch 1: 0.2845625
Validation started for epoch 1


100%|███████████████████████████████████████████████████████████████████████████████| 800/800 [00:05<00:00, 155.79it/s]


Validation completed for epoch 1
Validation accuracy for epoch 1: 0.16
Training started for epoch 2


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:22<00:00,  3.80it/s]


tensor(1.6419)
Training completed for epoch 2
Training accuracy for epoch 2: 0.3295
Validation started for epoch 2


100%|███████████████████████████████████████████████████████████████████████████████| 800/800 [00:05<00:00, 155.81it/s]


Validation completed for epoch 2
Validation accuracy for epoch 2: 0.38125
Training started for epoch 3


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:22<00:00,  3.81it/s]


tensor(1.6288)
Training completed for epoch 3
Training accuracy for epoch 3: 0.342375
Validation started for epoch 3


100%|███████████████████████████████████████████████████████████████████████████████| 800/800 [00:05<00:00, 157.86it/s]


Validation completed for epoch 3
Validation accuracy for epoch 3: 0.435
Training started for epoch 4


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:23<00:00,  3.79it/s]


tensor(1.6636)
Training completed for epoch 4
Training accuracy for epoch 4: 0.33775
Validation started for epoch 4


100%|███████████████████████████████████████████████████████████████████████████████| 800/800 [00:05<00:00, 158.47it/s]


Validation completed for epoch 4
Validation accuracy for epoch 4: 0.3
Training started for epoch 5


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:23<00:00,  3.80it/s]


tensor(1.6510)
Training completed for epoch 5
Training accuracy for epoch 5: 0.338125
Validation started for epoch 5


100%|███████████████████████████████████████████████████████████████████████████████| 800/800 [00:05<00:00, 157.99it/s]

Validation completed for epoch 5
Validation accuracy for epoch 5: 0.22875
Training done to avoid overfitting!
Best validation accuracy is: 0.3



