## Assignment 6

Develop RNN model in pytorch to solve the following problem:

1. Detect sarcasm Data from https://www.kaggle.com/sherinclaudia/sarcastic-comments-on-reddit
   Your quality metric = accuracy
   Randomly select 20% of your data for test set. You can use it only for final perfomance estimation.

Remember, you can use GPU resourses in kaggle kernels.

In [1]:
import os
import pandas as pd
import numpy as np
import gensim
from sklearn.model_selection import train_test_split


import torch as tt
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset

SEED = 42
np.random.seed(SEED)

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [38]:
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')

In [45]:
data_1 = pd.read_csv('/Users/alinashaymardanova/Downloads/train-balanced-sarcasm.csv')
data_2 = pd.concat([data_1['comment'], data_1['label']], axis=1)

In [46]:
data_2.isnull().values.any()

True

In [5]:
train, test = train_test_split(df2, test_size=0.2, shuffle=True)

# train.to_csv('train.csv', encoding='utf-8', index=False)
# test.to_csv('test.csv', encoding='utf-8', index=False)

In [2]:
lemmatizer = WordNetLemmatizer()
def tokenizer(text): 
    return [lemmatizer.lemmatize(x) for x in word_tokenize(text)
            if x and x not in stopwords]

In [6]:
TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english')

LABEL = LabelField(dtype=tt.int64, use_vocab=True, 
                   preprocessing=lambda x: classes[x])


train = TabularDataset('train.csv', format='csv', 
                         fields=[('text', TEXT), ('label', LABEL)],
                         skip_header=True)

test = TabularDataset('test.csv', format='csv', 
                         fields=[('text', TEXT), ('label', LABEL)],
                         skip_header=True)

In [7]:
TEXT.build_vocab(train, min_freq=5)
len(TEXT.vocab.itos)

36250

In [9]:
# model = gensim.models.KeyedVectors.load_word2vec_format('/Users/alinashaymardanova/Downloads/wiki-news-300d-1M.vec')
# weight = tt.FloatTensor(model.vectors)

In [11]:
LABEL.build_vocab(train)

In [16]:
train, valid = train.split(0.8, stratified=True)

In [21]:
class MyModel(nn.Module):
    
    def __init__(self, weight, hidden_size):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(weight, freeze=True)
        
        self.rnn = nn.LSTM(input_size=300,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True)
        
        
        self.fc = nn.Sequential(nn.Linear(hidden_size * 2 * 2, 128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 3))

    
    def forward(self, batch):
        
        x, x_lengths = batch.text
        
        x = x.to((tt.device("cuda:0" if tt.cuda.is_available() else "cpu")))
        batch.label = batch.label.to((tt.device("cuda:0" if tt.cuda.is_available() else "cpu")))
        
        x = self.embedding(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
        
        _, (hidden, cell) = self.rnn(x)
        
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        
        
        return x

In [22]:
tt.cuda.empty_cache()

batch_size = 512

model = MyModel(weight, embed_size=300, hidden_size=512)

model = model.to((tt.device("cuda:0" if tt.cuda.is_available() else "cpu")))

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,)

optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [23]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches,
                             desc='epoch %d' % (curr_epoch),
                             leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()
        
        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches

In [24]:
def nn_train(model, train_iterator, valid_iterator,
             criterion, optimizer, n_epochs=100,
             scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator,
                                  optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator,
                                 criterion)

        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss,
                  'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'],
                                                                   best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)
            

In [30]:
nn_train(model, train_iterator, valid_iterator, 
         criterion, optimizer, scheduler=scheduler, 
         n_epochs=10, early_stopping=2)

A Jupyter Widget


validation loss 0.59745


A Jupyter Widget


validation loss 0.57877


A Jupyter Widget


validation loss 0.57128


A Jupyter Widget


validation loss 0.56947


A Jupyter Widget


validation loss 0.57175


A Jupyter Widget


validation loss 0.56874


A Jupyter Widget


validation loss 0.57112


A Jupyter Widget


validation loss 0.58108
Early stopping! best epoch: 5 val 0.56874


In [41]:
def test_accuracy(model, test_iter):
    model.eval()
    epoch_accuracy = 0
    
    with tt.no_grad():
        for batch in test_iterator:
            pred = model(batch)
            max_vals, max_indices = tt.max(pred.data, 1)
            acc = (max_indices == batch.label).sum().data.cpu().numpy() / max_indices.size()[0]
            epoch_accuracy += acc.item()
        
    return epoch_accuracy / len(test_iterator)

In [42]:
test_accuracy = test_accuracy(model, test_iterator)

In [44]:
test_accuracy

0.7097474405381191