# Assignment 5

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import spacy
from heapq import nlargest
from operator import itemgetter

torch.manual_seed(42)

<torch._C.Generator at 0x7efecc181e10>

In [2]:
training_data = []
test_data = []
word_to_ix = {}
word_num = {}
#pos_to_ix = {}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
VOCAB_LIMIT = 5000

labels = {"anger": 0, "joy": 1, "optimism": 2, "sadness": 3}

In [3]:
with open('/home/ruwen/tweeteval/datasets/emotion/train_text.txt') as f:
    t_lines = f.read().splitlines()

with open('/home/ruwen/tweeteval/datasets/emotion/test_text.txt') as f:
    test_lines = f.read().splitlines()

with open('/home/ruwen/tweeteval/datasets/emotion/train_labels.txt') as f:
    train_labels = f.read().splitlines()

train_labels = [int(numeric_string) for numeric_string in train_labels]

with open('/home/ruwen/tweeteval/datasets/emotion/test_labels.txt') as f:
    test_labels = f.read().splitlines()

test_labels = [int(numeric_string) for numeric_string in test_labels]

print("Training lines: " + str(len(t_lines)))
print("Training labels: " + str(len(train_labels)))
print("Test lines: " + str(len(test_lines)))
print("Test labels: " + str(len(test_labels)))

Training lines: 3257
Training labels: 3257
Test lines: 1421
Test labels: 1421


In [4]:
def tokenize(t_list, lines, labels):
    nlp = spacy.load("en_core_web_sm")
    
    for i, line in enumerate(lines):
        doc = nlp(line)
    
        t_list.append(([], []))
        
        for token in doc:
            if token.text not in word_to_ix:  # word has not been assigned an index yet
                word_to_ix[token.text] = len(word_to_ix)  # Assign each word with a unique index

            if token.text not in word_num:
                word_num[token.text] = 1
            else:
                word_num[token.text] += 1
            
            t_list[-1][0].append(token.text)
            t_list[-1][1].append(labels[i])
    
        #print("line finished")

In [5]:
tokenize(training_data, t_lines, train_labels)

In [6]:
tokenize(test_data, test_lines, test_labels)

In [7]:
top_words = nlargest(VOCAB_LIMIT, word_num, key=word_num.get)

In [8]:
#dict(sorted(word_num.items(), key=itemgetter(1), reverse=True)[:50])

### LSTM Tagger

In [9]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [10]:
def prepare_sequence_top(seq):
    idxs = []
    for w in seq:
        if w in top_words:
            idxs.append(top_words.index(w))
        else:
            idxs.append(5000)
    return torch.tensor(idxs, dtype=torch.long)

In [11]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [12]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, 5001, 4)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [13]:
def train(epochs):
    for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
        for sentence, tags in training_data:
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()
    
            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
            sentence_in = prepare_sequence_top(sentence)
            targets = torch.tensor(tags, dtype=torch.long)
            #targets = prepare_sequence(tags, pos_to_ix)
    
            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)
    
            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores[-1], targets[-1])
            loss.backward()
            optimizer.step()
        print("Epoch " +  str(epoch + 1) + " loss: " + str(loss.item()))

In [14]:
train(5)

Epoch 1 loss: 0.458427757024765
Epoch 2 loss: 0.16824369132518768
Epoch 3 loss: 0.01595539227128029
Epoch 4 loss: 0.9161590933799744
Epoch 5 loss: 0.00021789084712509066


In [15]:
def test():
    correct = 0
    total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for sentence, tags in test_data:

            sentence_in = prepare_sequence_top(sentence)
            targets = torch.tensor(tags, dtype=torch.long)
        
            tag_scores = model(sentence_in)
            #print(tag_scores.data)
            #print(tag_scores.data[-1])
            #print(torch.max(tag_scores.data, 1)[1][-1])
            #print(tags)
                
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(tag_scores.data, 1)
            #print(predicted[-1])
            #print(tags[-1])
            correct += (predicted[-1] == tags[-1]).sum().item()
            #print(correct)
            
            
            total += 1
    
    print(f'Accuracy of the network on the test data: {100 * correct // total} %')

In [16]:
test()

Accuracy of the network on the test data: 50 %


### GRU Tagger

In [17]:
class GRUTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(GRUTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.gru = nn.GRU(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.gru(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [21]:
model = GRUTagger(EMBEDDING_DIM, HIDDEN_DIM, 5001, 4)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0005)

In [22]:
train(16)

Epoch 1 loss: 1.066338062286377
Epoch 2 loss: 1.042771816253662
Epoch 3 loss: 1.0505871772766113
Epoch 4 loss: 1.072179913520813
Epoch 5 loss: 1.1017217636108398
Epoch 6 loss: 1.1343740224838257
Epoch 7 loss: 1.1660730838775635
Epoch 8 loss: 1.1940438747406006
Epoch 9 loss: 1.216921091079712
Epoch 10 loss: 1.2344927787780762
Epoch 11 loss: 1.247281551361084
Epoch 12 loss: 1.256147861480713
Epoch 13 loss: 1.262015461921692
Epoch 14 loss: 1.265702724456787
Epoch 15 loss: 1.2678502798080444
Epoch 16 loss: 1.2689030170440674


In [23]:
test()

Accuracy of the network on the test data: 49 %
