In [1]:
import numpy as np
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")

train_tweets = pd.read_parquet('data/train-00000-of-00001.parquet', engine='pyarrow')
val_tweets = pd.read_parquet('data/validation-00000-of-00001.parquet', engine='pyarrow')
test_tweets = pd.read_parquet('data/test-00000-of-00001.parquet', engine='pyarrow')

classes = ('anger', 'joy', 'optimism', 'sadness')

epochs = 50

train_tweets

Unnamed: 0,text,label
0,“Worry is a down payment on a problem you may ...,2
1,My roommate: it's okay that we can't spell bec...,0
2,No but that's so cute. Atsu was probably shy a...,1
3,Rooneys fucking untouchable isn't he? Been fuc...,0
4,it's pretty depressing when u hit pan on ur fa...,3
...,...,...
3252,I get discouraged because I try for 5 fucking ...,3
3253,The @user are in contention and hosting @user ...,3
3254,@user @user @user @user @user as a fellow UP g...,0
3255,You have a #problem? Yes! Can you do #somethin...,0


In [2]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x12fb37ea750>

In [11]:
from collections import Counter

def prepare_sequence(seq, to_ix, N):
    idxs = [to_ix[w.text] if w.text in to_ix else N for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

word_to_ix = {}
word_count = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for idx, row in train_tweets.iterrows():
    word_spacy = nlp(row["text"])
    for word in word_spacy:
        if word.text not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word.text] = len(word_to_ix)  # Assign each word with a unique index
            word_count[word.text] = 1
        else:
            word_count[word.text] += 1
print(word_to_ix)

N = 5000
word_counter = Counter(word_count)
most_common_words = word_counter.most_common(N)
most_common_words = [word for word, _ in most_common_words]

new_count = 0
clean_word_to_ix = {}

for word in most_common_words:
    if word not in clean_word_to_ix:
        clean_word_to_ix[word] = len(clean_word_to_ix)

print(clean_word_to_ix)

EMBEDDING_DIM = 128
HIDDEN_DIM = 256



In [303]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores[-1]

In [184]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(classes))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    print(epoch)
    for idx, row in train_tweets.iterrows():
        sentence = row["text"]
        tag = row["label"]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(nlp(sentence), clean_word_to_ix, N)
        target = torch.tensor([tag], dtype=torch.long)
        print(sentence_in)
        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, target[-1])
        loss.backward()
        optimizer.step()

0
tensor([ 415, 2400,   10,    7,  181, 4064,   23,    7,  488,   12,  416,  145,
          27,   22,    2, 1392, 4065, 4066,    2,   21,    0, 1776,    0, 1004,
           0,  248])
tensor([ 146, 1393,   62,   14,   15,  568,   17,   63,   80,   16, 2401,  121,
          63,   27, 2402,    2,    0,  176,    0, 2403])
tensor([ 191,   37,   17,   15,   29,  880,    2, 4067,   32,  569,  350,   45,
        4068,  164,   37, 4069, 2404,   76,   47, 4070])
tensor([4071,  137, 4072,   10,   16,   50,   19, 1394,  137,  398,  170,    6,
        4073,   85,  799, 4074])
tensor([  14,   15,  351,  192,   54,   91,  570, 4075,   23,  322, 1777, 4076])
tensor([   1,   37,   38, 1778,   32,  620,   56,   53,    4,  489,   29, 4077,
          48,    5,   24,  881,    2,   81,   84,    5,  521,  126,   17,   38,
        2405,    2])
tensor([2406,   17, 4078, 1779,   56,  571,    8, 1395, 1005, 4079,    5,  572,
           8, 1172,  621,    2,    0, 1005])
tensor([4080,    8,  490,  109,   25,    7,

KeyboardInterrupt: 

In [129]:
from sklearn.metrics import accuracy_score

def test():
    with torch.no_grad():
        pred = []

        for idx, row in test_tweets.iterrows():
            inputs = prepare_sequence(nlp(row["text"]), word_to_ix, N)
            tag_scores = model(inputs)
            _, predicted_class = torch.max(tag_scores, -1)
            pred.append(predicted_class.item())

        test_tweets["pred"] = pred

    print("Accuracy:", accuracy_score(test_tweets["label"], test_tweets["pred"]))

In [None]:
test()

In [47]:
class GRUTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(GRUTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.gru = nn.GRU(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.gru(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores[-1]

In [124]:
model = GRUTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(classes))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    print(epoch)
    for idx, row in train_tweets.iterrows():
        sentence = row["text"]
        tag = row["label"]
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(nlp(sentence), clean_word_to_ix, N)
        target = torch.tensor([tag], dtype=torch.long)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, target[-1])
        loss.backward()
        optimizer.step()

In [50]:
test()

Accuracy: 0.2836030964109782


In [288]:
from torch.utils.data import Dataset, DataLoader

def collate_fn(batch):
    new_batch = []
    label = []
    tokens = []
    max_tokens = 0
    for b in batch:
        token = prepare_sequence(nlp(b[0]), clean_word_to_ix, N)
        label.append(b[1])
        tokens.append(token)
        if len(token) > max_tokens:
            max_tokens = len(token)

    for t in tokens:
        num_fill = max_tokens - len(t)
        filled_tensor = torch.full((num_fill,), N+1)
        new_batch.append(torch.cat((t, filled_tensor), dim=0))

    return torch.stack(new_batch), torch.tensor(label)

class DataFrameDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.values.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return sample

In [272]:
trainloader = DataLoader(
    DataFrameDataset(dataframe=train_tweets), shuffle=True, batch_size=8, collate_fn=collate_fn)
valloader = DataLoader(
    DataFrameDataset(dataframe=val_tweets), shuffle=True, batch_size=8, collate_fn=collate_fn)
testloader = DataLoader(
    DataFrameDataset(dataframe=test_tweets), shuffle=True, batch_size=8, collate_fn=collate_fn)

In [310]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores[-1]

In [311]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix) + 2, len(classes))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(1):  # again, normally you would NOT do 300 epochs, it is toy data
    for idx, (sentence, tag) in enumerate(trainloader):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        print(sentence.shape)
        # Step 3. Run our forward pass.
        tag_scores = model(sentence)
        print(tag_scores)
        print(tag)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, tag[-1])
        loss.backward()
        optimizer.step()

torch.Size([8, 28])
input :torch.Size([8, 28])


AttributeError: 'LSTMTagger' object has no attribute 'num_layers'

In [130]:
test()

Accuracy: 0.3877551020408163
