In [10]:
# ========= Imports and Data Loading ========= #
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset

import numpy as np
import pandas as pd

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import nltk
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
class Preprocessing():
    def __init__(self, train_path):
        super().__init__()
        
        self.train_df = self.load_data(train_path)
        self.value_counts = self.train_df['stance'].value_counts()
        
        self.train_set = list(self.train_df.to_records(index=False))
        self.tokenizate()
        
    def load_data(self, train_path):
        return pd.read_csv(train_path)
    
    def remove_links_mentions(self, tweet):
        link_re_pattern = "https?:\/\/t.co/[\w]+"
        mention_re_pattern = "@\w+"
        tweet = re.sub(link_re_pattern, "", tweet)
        tweet = re.sub(mention_re_pattern, "", tweet)
        return tweet.lower()
    
    def tokenizate(self):
        self.train_set = [(category, label, word_tokenize(self.remove_links_mentions(tweet))) for (tweet, category, label) in self.train_set]
    
    def get_datasets(self):
        return self.train_set    

In [13]:
class TweetsDataset(Dataset):
    def __init__(self, train_set):
        super().__init__()
        self.train_set = train_set
                
        self.index2word = self.construct_index2word()
        self.voc_size = len(self.index2word)
        self.word2index = {token: idx for idx, token in enumerate(self.index2word)}

        self.max_seq_length = self.get_max_seq()
        self.train_encoded = [(category, label, self.encode_and_pad(tweet, self.max_seq_length)) for category, label, tweet in self.train_set]
        
        self.data = (np.array([tweet for category, label, tweet in self.train_encoded]))
        self.label = (np.array([label for category, label, tweet in self.train_encoded]))

        self.train_ds = TensorDataset(torch.from_numpy(self.data), torch.from_numpy(self.label))
        
#         Add Categories in train_ds
        
    def construct_index2word(self):
        index2word = ["<PAD>", "<SOS>", "<EOS>"]

        for ds in [self.train_set]:
            for category, label, tweet in ds:
                for token in tweet:
                    if token not in index2word:
                        index2word.append(token)
        return index2word
    
    def get_max_seq(self):
        max_seq_length = 0
        for tweet, category, label in self.train_set:
            if len(tweet) > max_seq_length:
                max_seq_length = len(tweet)
        return max_seq_length
    
    def encode_and_pad(self, tweet, length):
        sos = [self.word2index["<SOS>"]]
        eos = [self.word2index["<EOS>"]]
        pad = [self.word2index["<PAD>"]]

        if len(tweet) < length - 2: # -2 for SOS and EOS
            n_pads = length - 2 - len(tweet)
            encoded = [self.word2index[w] for w in tweet]
            return sos + encoded + eos + pad * n_pads 
        else: # tweet is longer than possible; truncating
            encoded = [self.word2index[w] for w in tweet]
            truncated = encoded[:length - 2]
            return sos + truncated + eos
    
    def get_data(self):
        return self.train_ds

In [14]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        
        self.embedding_size = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        input = self.embedding(torch.tensor(input))
        out, _ = self.rnn(input)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [15]:
class RnnTweetsClassifier():
    def __init__(self, dataset, embedding_dim, hidden_size, num_layers, num_classes, epoch_size, learning_rate):
        self.dataset = dataset
        self.embedding_dim = embedding_dim
        self.epoch_size = epoch_size
        self.learning_rate = learning_rate
        self.model = RNN(dataset.voc_size, embedding_dim, hidden_size, num_layers, num_classes)
        self.data_loader = DataLoader(dataset=dataset.get_data(), batch_size=256, shuffle=True)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), self.learning_rate)
         
    def train(self):
        for epoch in range(self.epoch_size):
            total_model_acc = 0
            for data in self.data_loader:
                train_data = data[:][0]
                labels = data[:][1]
                # reshape the row data
                output = self.model(train_data)
                # calculate the accuracy
                total_model_acc += (torch.argmax(output, dim=1) == (labels + 1)).sum()
                loss = self.criterion(output, (labels + 1))
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            total_model_acc = total_model_acc / len(self.dataset.get_data())
            print(f"epoch num: {epoch} has accuracy: {total_model_acc}")

    def predict(self, testLoader):
        with torch.no_grad():
            n_correct = 0
            n_samples = 0
            for tweets in testLoader:
                test_data = tweets[:][0]
                labels = tweets[:][1]
                output = self.model(test_data)
                n_correct += (torch.argmax(output, dim=1) == (labels + 1)).sum()
                n_samples += labels.size(0)
            print(f"Model accuracy: {100 * n_correct / n_samples}")
            print("Model F1 Score: ",f1_score(labels, torch.argmax(output,dim=1), average="macro"))

In [26]:
# =========== Training =========== #
train_path = "./dataset/train.csv"
preprocessing = Preprocessing(train_path)
train = preprocessing.get_datasets()

dataset = TweetsDataset(train)
tweet_classifier = RnnTweetsClassifier(dataset, 50, 50, 1, 3, 7, 0.001)
tweet_classifier.train()

  input = self.embedding(torch.tensor(input))


epoch num: 0 has accuracy: 0.7734687924385071
epoch num: 1 has accuracy: 0.7925014495849609
epoch num: 2 has accuracy: 0.7925014495849609
epoch num: 3 has accuracy: 0.7925014495849609
epoch num: 4 has accuracy: 0.7935031652450562
epoch num: 5 has accuracy: 0.7993703484535217
epoch num: 6 has accuracy: 0.8112478256225586


In [27]:
# =========== Test =========== #
dev_path = "./dataset/dev.csv"
preprocessing = Preprocessing(dev_path)
dev = preprocessing.get_datasets()
testset = TweetsDataset(dev)
testloader = DataLoader(dataset=testset.get_data(), batch_size=256, shuffle=True)
tweet_classifier.predict(testloader)

Model accuracy: 71.19999694824219
Model F1 Score:  0.07432432432432433


  input = self.embedding(torch.tensor(input))
