In [1]:
import pandas as pd
import spacy
import time

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1984)

<torch._C.Generator at 0x153516c7c10>

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)

cuda:0


In [4]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
VOCAB_LIMIT = 5000

In [5]:
dataset_dir = './data/tweetval_emotion/'

df_train_text = pd.read_csv('./data/tweetval_emotion/train_text.txt', header=None, delimiter='§', names=['text'])
df_train_labels = pd.read_csv(dataset_dir+'train_labels.txt', header=None, names=['label'])
df_train = df_train_text.join(df_train_labels)

df_test_text = pd.read_csv(dataset_dir+'test_text.txt', header=None, delimiter='§', names=['text'])
df_test_labels = pd.read_csv(dataset_dir+'test_labels.txt', header=None, names=['label'])
df_test = df_test_text.join(df_test_labels)

df_labels = pd.read_csv(dataset_dir+'mapping.txt', header=None, delimiter='\t')

  df_train_text = pd.read_csv('./data/tweetval_emotion/train_text.txt', header=None, delimiter='§', names=['text'])
  df_test_text = pd.read_csv(dataset_dir+'test_text.txt', header=None, delimiter='§', names=['text'])


# Tokenize Tweets

In [6]:
#spacy.prefer_gpu()
spacy.require_cpu()

nlp = spacy.load("en_core_web_sm")

tokenized_words = {}

for _, row in df_train.iterrows():
    doc = nlp(row['text'])
    for token in doc:
        if token.text.lower() not in tokenized_words:
            tokenized_words[token.text.lower()] = 1
        else:
            tokenized_words[token.text.lower()] += 1
        

df_tokens = pd.DataFrame(tokenized_words.items(), columns=["Word", "Count"])

In [7]:
df_tokens = df_tokens.sort_values(by=['Count'], ascending=False).reset_index(drop=True)
df_tokens = df_tokens.head(VOCAB_LIMIT)
df_tokens

Unnamed: 0,Word,Count
0,#,3239
1,@user,2019
2,.,1908
3,i,1578
4,the,1514
...,...,...
4995,hated,1
4996,burden,1
4997,spinning,1
4998,porridge,1


In [8]:
def prepare_sequence(seq):
    idxs = []
    doc = nlp(seq)
    for token in doc:
        if token.text.lower() in df_tokens['Word'].values:
            idxs.append(df_tokens.index[df_tokens['Word'] == token.text.lower()][0])
        else:
            idxs.append(VOCAB_LIMIT)
    return torch.tensor(idxs, dtype=torch.long)

# Train and Test

In [9]:
def train(model, loss_function, optimizer, epochs=10):
    time_start = time.time()
    model = model.to(device)

    for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
        for _, row in df_train.iterrows():
            sentence = row["text"]
            tag = row["label"]
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
            sentence_in = prepare_sequence(sentence).to(device)
            target = torch.tensor([tag], dtype=torch.long).to(device)
            
            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores[-1], target[-1])
            loss.backward()
            optimizer.step()

        print(f'Epoch: {epoch}\tLoss: {loss}')

    time_elapsed = time.time() - time_start
    return model, time_elapsed

In [10]:
def test(model):
    correct = 0
    total = 0

    model = model.to(device)
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for _, row in df_test.iterrows():
            sentence = row["text"]
            tag = row["label"]

            sentence_in = prepare_sequence(sentence).to(device)
        
            tag_scores = model(sentence_in)
                
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(tag_scores.data, 1)
            
            if predicted[-1] == tag:
                correct += 1            
            
            total += 1
    
    print(f'Accuracy of the network on the test data: {100 * correct // total} %')

# LSTM Tagger

In [11]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [12]:
model_lstm = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_LIMIT+1, df_labels.shape[0])
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model_lstm.parameters(), lr=0.1)

model_lstm, lstm_time = train(model_lstm, loss_function, optimizer)

print(f'\nTime needed for training: {lstm_time}')

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


Epoch: 0	Loss: 0.8460325598716736
Epoch: 1	Loss: 0.5453154444694519
Epoch: 2	Loss: 0.10229708254337311
Epoch: 3	Loss: 0.04478735849261284
Epoch: 4	Loss: 0.07790172845125198
Epoch: 5	Loss: 0.0005368936690501869
Epoch: 6	Loss: 6.23445157543756e-05
Epoch: 7	Loss: 0.0011539950501173735
Epoch: 8	Loss: 0.00014006110723130405
Epoch: 9	Loss: 8.141662692651153e-05

Time needed for training: 1069.9049994945526


In [13]:
test(model_lstm)

Accuracy of the network on the test data: 53 %


# GRU Tagger

In [14]:
class GRUTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(GRUTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.gru = nn.GRU(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.gru(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [15]:
model_gru = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_LIMIT+1, df_labels.shape[0])
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model_gru.parameters(), lr=0.1)

model_gru, gru_time = train(model_gru, loss_function, optimizer)

print(f'\nTime needed for training: {gru_time}')

Epoch: 0	Loss: 1.1119444370269775
Epoch: 1	Loss: 0.9962902665138245
Epoch: 2	Loss: 0.023863712325692177
Epoch: 3	Loss: 0.1947767734527588
Epoch: 4	Loss: 0.0014156806282699108
Epoch: 5	Loss: 3.620680570602417
Epoch: 6	Loss: -0.0
Epoch: 7	Loss: -0.0
Epoch: 8	Loss: -0.0
Epoch: 9	Loss: -0.0

Time needed for training: 1086.3864908218384


In [16]:
test(model_gru)

Accuracy of the network on the test data: 53 %
