In [2]:
import numpy as np
import time
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")

train_tweets = pd.read_parquet('data/train-00000-of-00001.parquet', engine='pyarrow')
val_tweets = pd.read_parquet('data/validation-00000-of-00001.parquet', engine='pyarrow')
test_tweets = pd.read_parquet('data/test-00000-of-00001.parquet', engine='pyarrow')

classes = ('anger', 'joy', 'optimism', 'sadness')

epochs = 8

train_tweets

Unnamed: 0,text,label
0,“Worry is a down payment on a problem you may ...,2
1,My roommate: it's okay that we can't spell bec...,0
2,No but that's so cute. Atsu was probably shy a...,1
3,Rooneys fucking untouchable isn't he? Been fuc...,0
4,it's pretty depressing when u hit pan on ur fa...,3
...,...,...
3252,I get discouraged because I try for 5 fucking ...,3
3253,The @user are in contention and hosting @user ...,3
3254,@user @user @user @user @user as a fellow UP g...,0
3255,You have a #problem? Yes! Can you do #somethin...,0


In [3]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x20a3e54a8b0>

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)

cuda:0


In [5]:
from collections import Counter

def prepare_sequence(seq, to_ix, N):
    idxs = [to_ix[w.lemma_] if w.lemma_ in to_ix else N for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

word_to_ix = {}
word_count = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for idx, row in train_tweets.iterrows():
    word_spacy = nlp(row["text"])
    for word in word_spacy:
            if word.lemma_ not in word_to_ix:  # word has not been assigned an index yet
                word_to_ix[word.lemma_] = len(word_to_ix)  # Assign each word with a unique index
                word_count[word.lemma_] = 1
            else:
                word_count[word.lemma_] += 1
print(word_to_ix)

N = 5000
word_counter = Counter(word_count)
most_common_words = word_counter.most_common(N)
most_common_words = [word for word, _ in most_common_words]

new_count = 0
clean_word_to_ix = {}

for word in most_common_words:
    if word not in clean_word_to_ix:
        clean_word_to_ix[word] = len(clean_word_to_ix)

print(clean_word_to_ix)

EMBEDDING_DIM = 128
HIDDEN_DIM = 256

{'#': 0, 'be': 1, 'I': 2, '.': 3, 'the': 4, 'to': 5, '@user': 6, ',': 7, 'a': 8, 'and': 9, '@us': 10, 'not': 11, '!': 12, 'you': 13, 'of': 14, 'it': 15, 'do': 16, 'in': 17, 'have': 18, 'that': 19, 'my': 20, '?': 21, 'for': 22, ' ': 23, "'": 24, 'on': 25, 'get': 26, 'this': 27, 'with': 28, 'so': 29, 'just': 30, 'but': 31, 'they': 32, 'he': 33, 'at': 34, 'can': 35, 'your': 36, 'like': 37, 'go': 38, 'all': 39, 'we': 40, ';': 41, '...': 42, 'when': 43, 'will': 44, '-': 45, 'if': 46, 'what': 47, 'about': 48, '&': 49, 'out': 50, 'up': 51, 'no': 52, 'by': 53, 'make': 54, 'amp': 55, 'from': 56, 'she': 57, 'how': 58, 'would': 59, 'think': 60, 'as': 61, 'people': 62, "'s": 63, 'one': 64, 'or': 65, 'know': 66, 'an': 67, ':': 68, 'see': 69, 'there': 70, 'who': 71, 'say': 72, 'feel': 73, 'his': 74, 'now': 75, 'day': 76, 'watch': 77, 'time': 78, 'look': 79, 'want': 80, 'good': 81, 'bad': 82, 'sad': 83, 'love': 84, 'why': 85, 'take': 86, 'some': 87, 'really': 88, 'life': 89, 'u': 90, 'even': 91, '..'

In [6]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores[-1]

In [7]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(classes))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

model.to(device)

lstm_start = time.time()
for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    print(epoch)
    for idx, row in train_tweets.iterrows():
        sentence = row["text"]
        tag = row["label"]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(nlp(sentence), clean_word_to_ix, N).to(device)
        target = torch.tensor([tag], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, target[-1])
        loss.backward()
        optimizer.step()

    print("Epoch", epoch, loss.item())

lstm_end = time.time()
lstm_time = lstm_end - lstm_start

  from .autonotebook import tqdm as notebook_tqdm


0
Epoch 0 0.2773605287075043
1
Epoch 1 0.24571368098258972
2
Epoch 2 0.03512917086482048
3
Epoch 3 0.0303784366697073
4
Epoch 4 0.012581617571413517
5
Epoch 5 0.0013174673076719046
6
Epoch 6 0.002305827336385846
7
Epoch 7 0.30915120244026184


In [8]:
from sklearn.metrics import accuracy_score

def test():
    with torch.no_grad():
        pred = []

        for idx, row in test_tweets.iterrows():
            inputs = prepare_sequence(nlp(row["text"]), word_to_ix, N).to(device)
            tag_scores = model(inputs)
            _, predicted_class = torch.max(tag_scores, -1)
            pred.append(predicted_class.item())

        test_tweets["pred"] = pred

    print("Accuracy:", accuracy_score(test_tweets["label"], test_tweets["pred"]))

In [9]:
test()

Accuracy: 0.306826178747361


In [10]:
class GRUTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(GRUTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.gru = nn.GRU(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.gru(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores[-1]

In [25]:
model = GRUTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(classes))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0005)

model.to(device)

gru_start = time.time()
for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    print(epoch)
    for idx, row in train_tweets.iterrows():
        sentence = row["text"]
        tag = row["label"]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(nlp(sentence), clean_word_to_ix, N).to(device)
        target = torch.tensor([tag], dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, target[-1])
        loss.backward()
        optimizer.step()

    print("Epoch", epoch, loss.item())

gru_end = time.time()
gru_time = gru_end - gru_start

0
Epoch 0 1.031199336051941
1
Epoch 1 0.8988023400306702
2
Epoch 2 0.8376652002334595
3
Epoch 3 0.8070014119148254
4
Epoch 4 0.79155433177948
5
Epoch 5 0.7842944860458374
6
Epoch 6 0.781444251537323
7
Epoch 7 0.7808328866958618


In [26]:
test()

Accuracy: 0.35608726249120337


In [13]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    label = []
    tokens = []
    lengths = []
    max_tokens = 0
    for b in batch:
        token = prepare_sequence(nlp(b[0]), clean_word_to_ix, N)
        label.append(b[1])
        tokens.append(token)
        lengths.append(len(token))
        if len(token) > max_tokens:
            max_tokens = len(token)

    batch_pad = pad_sequence(tokens, batch_first=True, padding_value=N+1)

    return batch_pad.to(device), torch.tensor(label).to(device), torch.tensor(lengths)

class DataFrameDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.values.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return sample

In [14]:
trainloader = DataLoader(
    DataFrameDataset(dataframe=train_tweets), shuffle=True, batch_size=8, collate_fn=collate_fn)
valloader = DataLoader(
    DataFrameDataset(dataframe=val_tweets), shuffle=True, batch_size=8, collate_fn=collate_fn)
testloader = DataLoader(
    DataFrameDataset(dataframe=test_tweets), shuffle=True, batch_size=8, collate_fn=collate_fn)

In [15]:
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

class LSTMTaggerBatch(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTaggerBatch, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x_padded, lengths):
        embeds = self.word_embeddings(x_padded)
        x_packed = pack_padded_sequence(embeds, lengths, batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.lstm(x_packed)
        output_padded, _ = pad_packed_sequence(lstm_out, batch_first=True)
        tag_space = self.hidden2tag(output_padded)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores[:,-1,:]

In [16]:
model = LSTMTaggerBatch(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix) + 2, len(classes))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

model.to(device)

lstm_start = time.time()
for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    print(epoch)
    for idx, (x_padded, y_padded, lengths) in enumerate(trainloader):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 3. Run our forward pass.
        tag_scores = model(x_padded, lengths)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, y_padded)
        loss.backward()
        optimizer.step()

    print("Epoch", epoch, loss.item())

lstm_end = time.time()
lstm_batch_time = lstm_end - lstm_start

0
Epoch 0 4.268695831298828
1
Epoch 1 0.6103395223617554
2
Epoch 2 0.22013314068317413
3
Epoch 3 0.03253134340047836
4
Epoch 4 3.7883260250091553
5
Epoch 5 0.5633448958396912
6
Epoch 6 1.6748111248016357
7
Epoch 7 0.029020799323916435


In [17]:
def test_batch():
    with torch.no_grad():
        pred = []

        for idx, (x_padded, y_padded, lengths) in enumerate(testloader):
            tag_scores = model(x_padded, lengths)
            _, predicted_class = torch.max(tag_scores, -1)
            pred.append(predicted_class.cpu().numpy().tolist())

        pred = [item for sub_list in pred for item in sub_list]
        test_tweets["pred"] = np.array(pred).flatten().tolist()

    print("Accuracy:", accuracy_score(test_tweets["label"], test_tweets["pred"]))

In [18]:
test_batch()

Accuracy: 0.36312456016889516


In [19]:
class GRUTaggerBatch(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(GRUTaggerBatch, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x_padded, lengths):
        embeds = self.word_embeddings(x_padded)
        x_packed = pack_padded_sequence(embeds, lengths, batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.lstm(x_packed)
        output_padded, _ = pad_packed_sequence(lstm_out, batch_first=True)
        tag_space = self.hidden2tag(output_padded)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores[:,-1,:]

In [23]:
model = GRUTaggerBatch(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix) + 2, len(classes))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0005)

model.to(device)

gru_start = time.time()
for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    print(epoch)
    for idx, (x_padded, y_padded, lengths) in enumerate(trainloader):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        x_padded.to(device)
        y_padded.to(device)
        lengths.to(device)

        # Step 3. Run our forward pass.
        tag_scores = model(x_padded, lengths)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, y_padded)
        loss.backward()
        optimizer.step()

    print("Epoch", epoch, loss.item())

gru_end = time.time()
gru_batch_time = gru_end - gru_start

0
Epoch 0 3.314232587814331
1
Epoch 1 3.073875904083252
2
Epoch 2 3.074087381362915
3
Epoch 3 3.1164000034332275
4
Epoch 4 3.271702527999878
5
Epoch 5 2.2494609355926514
6
Epoch 6 2.3475234508514404
7
Epoch 7 3.151235580444336


In [24]:
test_batch()

Accuracy: 0.3729767769176636


In [27]:
print("LSTM", lstm_time)
print("GRU", gru_time)
print("LSTM Batch", lstm_batch_time)
print("GRU Batch", gru_batch_time)

LSTM 254.67780327796936
GRU 247.30408453941345
LSTM Batch 173.13447833061218
GRU Batch 178.2487211227417
