In [1]:
import operator
from functools import reduce
from collections import Counter

import pandas as pd
import torch.optim as optim
import youtokentome as yttm
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm.notebook import tqdm

from src.dataset import BinaryDataset
from src.model import Classifier
from src.utils import *
from src.vars import *

### prepare tokenizer

In [2]:
train_data = pd.read_csv(train_file, sep='\t', index_col='id')[['tweet', label_column]]
train_data.head(2)

Unnamed: 0_level_0,tweet,subtask_a
id,Unnamed: 1_level_1,Unnamed: 2_level_1
86426,@USER She should ask a few native Americans wh...,OFF
90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF


In [3]:
Counter(train_data[label_column])

Counter({'OFF': 4400, 'NOT': 8840})

In [4]:
# This file needed for tokenizer initialization
tmp_file = 'tmp.tsv'
prep = train_data[data_column]
prep.to_csv(tmp_file, sep='\t', header=None, index=None)

  after removing the cwd from sys.path.


In [5]:
# coverage != 1.0, to ignore rare tokens
tokenizer = yttm.BPE.train(data=tmp_file, coverage=0.99, vocab_size=1024, model=tokenizer_path, 
                           pad_id=pad_token_id, unk_id=unk_token_id, eos_id=sep_token_id)

In [6]:
# for win
! del tmp.tsv
# for unix
! rm tmp.tsv

"rm" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.


### params

In [7]:
num_of_epochs = 10
batch_size = 32
epsilon = 1e-4
learning_rate = 0.001
word_embedding_dim = 64
hidden_dim = 32
topk = 1
fc1 = 32

### data

In [8]:
tokenizer = yttm.BPE(model=tokenizer_path)

x = train_data[data_column].apply(lambda r: tokenizer.encode(r))
y = train_data[label_column]

# Categorical to int
uniq = set(y)
mapping = dict([(k,v) for v,k in enumerate(uniq)])
y = y.apply(lambda r: mapping[r])

X_train, X_test, y_train, y_test = train_test_split(x.values, y.values, test_size=0.20, random_state=42)

In [9]:
train_dataset = BinaryDataset(X_train,y_train)
val_dataset = BinaryDataset(X_test, y_test)

data_loader = {'train': create_dataloader(train_dataset, batch_size, pad_token_id),
               'val': create_dataloader(val_dataset, batch_size, pad_token_id)}

In [10]:
# Choose device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Init stats
stats = {'train': {'acc': [], 'loss': []},
         'val': {'acc': [], 'loss': []}}
best_acc = 0

word_vocab_size = tokenizer.vocab_size()
print('Word vocab size:', word_vocab_size)

# Init model.
model = Classifier(word_vocab_size=word_vocab_size,
                   word_embedding_dim=word_embedding_dim,
                   hidden_dim=hidden_dim,
                   target_size=2,
                   fc1=fc1,
                   padding_idx=pad_token_id,
                   topk=topk)
model.to(device)

loss_function = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5,
                                 patience=0, verbose=True, threshold_mode='abs',
                                 threshold=1e-2)

Word vocab size: 1024


### training

In [11]:
def run_model(model, device, optimizer, loss_function,
              data_loader, is_train_phase, desc=None, verbose=True):
    """Run the given data through the model.
    :param model: model to run
    :param optimizer: optimizer for the model
    :param loss_function: function to calculate the loss
    :param data_loader: loader for the data
    :param is_train_phase: if true, model runs in train mode and propagate gradient, otherwise in eval mode
    :param desc: description for the tqdm visualization
    :param verbose: verbose state
    :return: tuple of accuracies and losses
    """
    # Setup gradient
    if is_train_phase:
        model.train()
    else:
        model.eval()
    losses = []
    accuracies = []
    iterable = tqdm(data_loader, desc=desc) if verbose else data_loader
    for sentences, targets in iterable:
        sentences = sentences.to(device)
        targets = targets.to(device).flatten()
        optimizer.zero_grad()

        with torch.set_grad_enabled(is_train_phase):
            y_hat = model(sentences)
            loss = loss_function(y_hat, targets)
            acc = calc_accuracy(y_hat, targets).cpu().detach().numpy()

            # backprop only in train phase
            if is_train_phase:
                loss.backward()
                optimizer.step()

            # store loss
            losses.append(loss.item())
            accuracies.append(acc)

            # Update metrics in description
            if verbose:
                iterable.set_description(desc +
                                         f' - acc: {np.mean(accuracies):.4f}; ' +
                                         f'loss: {np.mean(losses):.4f}')

    return accuracies, losses

In [12]:
# Train model
for epoch in range(num_of_epochs):
    tqdm.write(f'------------ Epoch {epoch} ------------')
    for phase in ['train', 'val']:
        desc = f"{phase.title()}: Epoch #{epoch}"
        epoch_accs, epoch_losses = run_model(model, device, optimizer,
                                             loss_function, data_loader[phase],
                                             phase == 'train', desc)

        acc, loss = np.mean(epoch_accs), np.mean(epoch_losses)
        stats[phase]['acc'].append(acc)
        stats[phase]['loss'].append(loss)
    # Update learning rate.
    lr_scheduler.step(stats['val']['acc'][-1])
    # Check best model
    if stats['val']['acc'][-1] > best_acc:
        best_acc = stats['val']['acc'][-1]
        tqdm.write('Biggest val accuracy')
        tqdm.write('Saving model...')
        try:
            torch.save(model, model_file)
            tqdm.write('Saved successfully')
        except FileNotFoundError:
            tqdm.write('Error during saving!')
    # Check loss change for early stopping
    loss_change = abs(reduce(operator.sub, stats['train']['loss'][-2:]))
    if epsilon and loss_change < epsilon:
        print(f'Early stopping: loss change ({loss_change}) is less than {epsilon}')

print('Finished...')

------------ Epoch 0 ------------


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #0', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #0', max=83.0, style=ProgressStyle(description…


Biggest val accuracy
Saving model...
Saved successfully
------------ Epoch 1 ------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #1', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #1', max=83.0, style=ProgressStyle(description…


Biggest val accuracy
Saving model...
Saved successfully
------------ Epoch 2 ------------


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #2', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #2', max=83.0, style=ProgressStyle(description…


Biggest val accuracy
Saving model...
Saved successfully
------------ Epoch 3 ------------


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #3', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #3', max=83.0, style=ProgressStyle(description…


Biggest val accuracy
Saving model...
Saved successfully
------------ Epoch 4 ------------


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #4', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #4', max=83.0, style=ProgressStyle(description…


Epoch     4: reducing learning rate of group 0 to 5.0000e-04.
Biggest val accuracy
Saving model...
Saved successfully
------------ Epoch 5 ------------


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #5', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #5', max=83.0, style=ProgressStyle(description…


Epoch     5: reducing learning rate of group 0 to 2.5000e-04.
Biggest val accuracy
Saving model...
Saved successfully
------------ Epoch 6 ------------


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #6', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #6', max=83.0, style=ProgressStyle(description…


Epoch     6: reducing learning rate of group 0 to 1.2500e-04.
------------ Epoch 7 ------------


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #7', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #7', max=83.0, style=ProgressStyle(description…


Epoch     7: reducing learning rate of group 0 to 6.2500e-05.
------------ Epoch 8 ------------


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #8', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #8', max=83.0, style=ProgressStyle(description…


Epoch     8: reducing learning rate of group 0 to 3.1250e-05.
------------ Epoch 9 ------------


HBox(children=(FloatProgress(value=0.0, description='Train: Epoch #9', max=331.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Val: Epoch #9', max=83.0, style=ProgressStyle(description…


Epoch     9: reducing learning rate of group 0 to 1.5625e-05.
Finished...


### Test data

In [13]:
testX = pd.read_csv(test_file, sep='\t', index_col='id')[data_column].apply(lambda r: tokenizer.encode(r))
testY = pd.read_csv(test_answer, index_col=0, header=None)[1].apply(lambda r: mapping[r]).values

In [14]:
model.to('cpu')
y_pred = [model(torch.LongTensor(x).unsqueeze(0)) for x in testX]
y_pred = np.array([torch.max(x, dim=1)[1].item() for x in y_pred])

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("Accuracy score: {:.4f}".format(accuracy_score(testY, y_pred)))
print("Precision score: {:.4f}".format(precision_score(testY, y_pred)))
print("Recall score: {:.4f}".format(recall_score(testY, y_pred)))

Accuracy score: 0.7314
Precision score: 0.8269
Recall score: 0.7935
