In [1]:
import pandas as pd
import spacy
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv('final_data.csv')
tok = spacy.load('en_core_web_sm')
def tokenize (text):
    text = str(text)
    text_ = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [4]:
#count number of occurences of each word
counts = Counter()
for index, row in df.iterrows():
    counts.update(tokenize(row['text']))

In [5]:
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 31225
num_words after: 15282


In [6]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [7]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [8]:
df['encoded'] = df['text'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0.1,Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label,tokenized,pos_tagged,lemmatized,encoded
0,0,1,@@24942188,hopeless,ph,we are living in times of absolute insanity ...,0,0,"['living', 'times', 'absolute', 'insanity', 'p...","[('living', 'v'), ('times', 'n'), ('absolute',...","['live', 'time', 'absolute', 'insanity', 'pret...","[[2, 3, 4, 5, 6, 7, 8, 9, 10, 3, 11, 12, 13, 1..."
1,1,2,@@21968160,migrant,gh,in libya today there are countless number of ...,0,0,"['libya', 'today', 'countless', 'number', 'gha...","[('libya', 'n'), ('today', 'n'), ('countless',...","['libya', 'today', 'countless', 'number', 'gha...","[[6, 82, 83, 3, 84, 4, 85, 86, 8, 87, 37, 88, ..."
2,2,3,@@16584954,immigrant,ie,white house press secretary sean spicer said t...,0,0,"['white', 'house', 'press', 'secretary', 'sean...","[('white', 'a'), ('house', 'n'), ('press', 'n'...","['white', 'house', 'press', 'secretary', 'sean...","[[107, 108, 109, 110, 111, 112, 113, 39, 28, 1..."
3,3,4,@@7811231,disabled,nz,council customers only signs would be displaye...,0,0,"['council', 'customers', 'signs', 'would', 'di...","[('council', 'n'), ('customers', 'n'), ('signs...","['council', 'customer', 'sign', 'would', 'disp...","[[127, 128, 129, 130, 117, 118, 131, 3, 91, 8,..."
4,4,5,@@1494111,refugee,ca,just like we received migrants fleeing el salv...,0,0,"['like', 'received', 'migrants', 'fleeing', 'e...","[('like', 'n'), ('received', 'v'), ('migrants'...","['like', 'receive', 'migrant', 'flee', 'el', '...","[[141, 142, 2, 143, 144, 145, 146, 147, 37, 14..."


In [9]:
Counter(df['orig_label'])

Counter({0: 8529, 1: 947, 2: 144, 3: 458, 4: 391})

In [10]:
X = list(df['encoded'])
y = list(df['orig_label'])
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [12]:
class PCLDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [13]:
train_ds = PCLDataset(X_train, y_train)
valid_ds = PCLDataset(X_valid, y_valid)

In [20]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [21]:
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [22]:
class StackedRNN(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim,n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, ht = self.rnn(x_pack)
        out = self.linear(ht[-1])
        return out

In [23]:
model = StackedRNN(vocab_size, 200, 100,4)

In [24]:
train_model(model, epochs=3, lr=0.01)

train loss 1.206, val loss 0.962, val accuracy 0.819, and val rmse 1.063
train loss 0.920, val loss 0.748, val accuracy 0.819, and val rmse 1.063
train loss 0.760, val loss 0.718, val accuracy 0.819, and val rmse 1.063
