In [1]:
'''
First, we import all required packages.
'''
import random
from random import shuffle
random.seed(11)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Set seed for consistent 'random' initializations
torch.manual_seed(11)

<torch._C.Generator at 0x115928050>

In [2]:
'''
Next, we read in the labeled data from a CSV file. Each line contains one example.
'''
def load_data(fname = 'BYOSC_data.csv', n_train = 300):
    """Read labeled examples from a CSV file and split them into train/dev sets.

    Every non-blank line is expected to hold six double-quoted fields separated
    by commas, e.g. `"4","id","date","query","user","tweet text"`. The first
    field is the sentiment label (a value in {0, 2, 4}) and the last field is
    the tweet text.

    Args:
        fname: Path of the CSV file to read.
        n_train: Number of (shuffled) examples placed in the training set.

    Returns:
        A `(train_set, dev_set)` tuple. Both are lists of `(text, label)`
        pairs of strings; `dev_set` holds whatever is left after the first
        `n_train` shuffled examples.

    Raises:
        ValueError: If a non-blank line has fewer than six fields.
    """
    data = []
    # Decode as utf-8; bytes that cannot be decoded are dropped (the rest of
    # the line is kept). The `with` block guarantees the file is closed.
    with open(fname, encoding="utf8", errors="ignore") as csv_file:
        for line in csv_file:
            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # Remove the opening and closing quote of the record, then split on
            # the `","` separators. Stripping the line terminator first means a
            # final line without a trailing newline is parsed correctly, and
            # `maxsplit=5` keeps a tweet that itself contains `","` in one piece.
            y, _, _, _, _, x = record[1:-1].strip().split('","', 5)
            data.append((x, y))

    # Use python random.shuffle to randomly sort the data samples
    shuffle(data)

    # The first `n_train` shuffled samples are used for training; the rest is
    # held out to evaluate and to tune hyperparameters.
    train_set = data[:n_train]
    dev_set = data[n_train:]

    return train_set, dev_set

In [3]:
'''
Methods needed to convert the data into input that can be fed into the neural network.
'''
def make_feature_vectors(data, w2i):
    """Convert `(text, label)` pairs into `(token-id tensor, label tensor)` pairs.

    Args:
        data: Iterable of `(x, y)` pairs, where `x` is the sentence text and
            `y` is a label that `int()` can parse (0, 2 or 4 in this notebook).
        w2i: Dictionary mapping each known word to its integer id. It must
            contain the key '<UNK>', whose id is used for unknown words.

    Returns:
        A list with one `(tokens, label)` tuple per input pair. `tokens` is a
        1-D integer tensor with one id per space-separated token, and `label`
        is a one-element tensor holding the label divided by two.
    """
    def encode(text):
        # Simple tokenization: split on single spaces. Words that are not in
        # the vocabulary are replaced by the id of the '<UNK>' entry.
        return torch.tensor([w2i.get(token, w2i['<UNK>']) for token in text.split(' ')])

    # Halving the label maps the y values 0, 2, 4 onto the class ids 0, 1, 2.
    return [
        (encode(text), torch.tensor([int(int(label) / 2)]))
        for (text, label) in data
    ]

def get_vocab(data):
    """Build the word -> unique integer dictionary used to encode sentences.

    Id 0 is reserved for the '<UNK>' (unknown word) token. Every other token,
    obtained by splitting each text on single spaces, receives the next free
    id the first time it is seen.

    Args:
        data: Iterable of `(text, label)` pairs; only the text is used.

    Returns:
        A dict mapping each token to its integer id.
    """
    vocab = {'<UNK>': 0}

    for text, _label in data:
        for token in text.split(' '):
            # setdefault only inserts when the token is new, so the ids stay
            # consecutive: len(vocab) is always the next unused id.
            vocab.setdefault(token, len(vocab))

    return vocab

In [4]:
'''
Now, we define our neural network model.
'''
# Inherit from the pytorch nn.Module so our operations are tracked on the
# computation graph (CG) and autodifferentiated
#https://pytorch.org/docs/stable/generated/torch.nn.Module.html
class MyNNModel(nn.Module):
    """Sentence classifier: embedding -> LSTM -> linear scoring layer.

    The model scores one sentence at a time. A sentence is given as a 1-D
    tensor of word ids and the result is one raw (unnormalized) score per
    output class.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, output_dim):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct word ids the embedding can look up.
            emb_dim: Size of each word vector.
            hidden_dim: Size of the LSTM hidden state.
            output_dim: Number of classes to score.
        """
        super().__init__()
        # Embedding layer: maps every word id to a learned vector.
        # https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # The LSTM reads the word vectors in order and encodes the sequence.
        # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        # A nice blog on LSTM: https://colah.github.io/posts/2015-08-Understanding-LSTMs/
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        # Linear layer: a matrix multiply + bias term that projects the
        # sequence encoding onto `output_dim` scores, one per label.
        # https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, input):
        """Map a tensor of word ids to raw class scores.

        Every pytorch module implements `forward`; calling the module itself
        (`model(x)`) runs it.

        Args:
            input: Tensor of word ids for one sentence (1-D in this notebook).

        Returns:
            The output of the linear layer: `output_dim` raw scores.
        """
        # Look up one vector per word id.
        embedded = self.embedding(input)
        # nn.LSTM is created with its default layout, which is
        # (sequence length, batch size, features), so a batch axis of size 1
        # is inserted at position 1. The LSTM returns
        # `(output, (h_n, c_n))`; we keep h_n, the final hidden state.
        _, (final_hidden, _) = self.lstm(embedded.unsqueeze(1))
        # Drop the leading size-1 axes (layers, batch) to get a flat vector.
        sentence_encoding = final_hidden.squeeze(0).squeeze(0)
        # Score the encoded sentence for each of the output classes.
        return self.linear(sentence_encoding)

In [5]:
'''
An evaluation method so we can see how well the model is doing.
'''
def eval(model, data):
    """Print and return the classification accuracy of `model` on `data`.

    Note: this function shadows Python's builtin `eval` inside the notebook;
    the name is kept because the other cells call it.

    Args:
        model: Module that maps one input `x` to a 1-D tensor of class scores.
        data: Iterable of `(x, y)` pairs, where `y` holds a single class index.

    Returns:
        The fraction of examples whose highest-scoring class equals `y`, as a
        float. For empty `data` the accuracy is undefined and `nan` is
        reported instead of raising ZeroDivisionError.
    """
    # Switch the model to eval mode. This changes the behaviour of layers such
    # as dropout and batch norm; it does NOT turn off gradient tracking.
    # https://stackoverflow.com/questions/60018578/what-does-model-eval-do-in-pytorch
    model.eval()

    total = right = 0
    # no_grad() is what actually disables gradient tracking, so evaluation
    # does not build a computation graph for every example.
    with torch.no_grad():
        for (x, y) in data:
            # Perform the softmax function to normalize the scores for each
            # class into a probability distribution
            probs = F.softmax(model(x), dim=0)
            # Take the index of the class with the highest probability
            y_hat = torch.argmax(probs)
            # Track number correct (compare as plain Python integers)
            if int(y_hat) == int(y):
                right += 1
            total += 1

    accuracy = right / total if total else float('nan')
    print("Accuracy: " + str(accuracy))
    return accuracy

In [9]:
'''
Finally, let's put everything together and train the model!
'''
# Hyperparameters we chose for this run.
EMB_DIM = 64          # size of each word embedding
HIDDEN_DIM = 32       # size of the LSTM hidden state
NUM_CLASSES = 3       # labels 0, 2, 4 are mapped to classes 0, 1, 2
LEARNING_RATE = 0.01  # a good learning rate to start out with
NUM_EPOCHS = 3        # number of passes over the training data

# Load the raw (text, label) pairs. The vocabulary is built from the training
# split only, so dev words not seen in training are encoded as '<UNK>'.
train, dev = load_data()
vocab = get_vocab(train)
train, dev = [make_feature_vectors(split, vocab) for split in (train, dev)]

# The model needs our vocab size, the embedding and hidden sizes we chose,
# and the number of output classes
model = MyNNModel(len(vocab), EMB_DIM, HIDDEN_DIM, NUM_CLASSES)

# Use the cross entropy loss function
# https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
loss_function = nn.CrossEntropyLoss()
# SGD optimization. The optimizer updates every parameter it is given,
# so we hand it the parameters of our model.
# https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Sanity check before any training: accuracy on the train data first,
# then on the held-out dev set.
for split in (train, dev):
    eval(model, split)
print()

for epoch in range(NUM_EPOCHS):
    # Put the model back in train mode (eval() switched it to eval mode)
    model.train()
    # Each minibatch is of size 1
    # This means gradients are computed for one example at a time
    for (x, y) in train:
        # Zero out the gradients left over from the previous update
        # https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
        model.zero_grad()
        # Compute raw scores from our model.
        # This runs the forward() method
        raw_scores = model(x)
        # Compute a loss value using cross entropy: the -log of the softmax
        # of our score for the correct value y. unsqueeze(0) adds the batch
        # axis (of size 1) that the loss function expects.
        loss = loss_function(raw_scores.unsqueeze(0), y)
        # Call backward, to perform backpropagation and compute gradients
        # for each parameter (starting with loss wrt output layer, and so on)
        # https://pytorch.org/docs/stable/generated/torch.Tensor.backward.html
        loss.backward()
        # Apply the computed gradients in order to optimize:
        # we update each parameter in the model.
        optimizer.step()

    # Eval on train/dev after each epoch
    for split in (train, dev):
        eval(model, split)
    print()

Accuracy: 0.32666666666666666
Accuracy: 0.32323232323232326

Accuracy: 0.43333333333333335
Accuracy: 0.3383838383838384

Accuracy: 0.47
Accuracy: 0.35858585858585856

Accuracy: 0.53
Accuracy: 0.35858585858585856

