# Recurrent Predictors

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib import pyplot as plt
from sklearn import metrics
from tqdm import tqdm

from rnn_models import LSTMTagger
from utils import report_model, prepare_sequence

Keeping initial random state fixed for reproducibility:

In [None]:
# Pin both random generators. NumPy's global RNG drives the train/validation
# draw further down; the torch seed presumably governs LSTMTagger weight
# initialisation (confirm against rnn_models).
TORCH_SEED = 1
NUMPY_SEED = 0
torch.manual_seed(TORCH_SEED)
np.random.seed(NUMPY_SEED)

Loading datasets

In [None]:
# Read the train and test tables from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Passed below as the first and third LSTMTagger constructor arguments.
# NOTE(review): presumably an upper bound on the per-step HIT value fed to the
# model — confirm against rnn_models.LSTMTagger.
max_hits = 55

Instantiating report table

In [None]:
# Empty results table: rows are keyed by (name, window_size), leaving
# 'accuracy' and 'AUC' as the metric columns.
clf_table = pd.DataFrame(columns=['name', 'window_size', 'accuracy', 'AUC'])
clf_table = clf_table.set_index(['name', 'window_size'])

Converting the datasets into model-ready sequences

In [None]:
# prepare_sequence yields an (inputs, targets) pair per dataframe; the targets
# returned for the test set are discarded and the labels are read from the
# dataframe instead.
X_train_all, y_train_all = prepare_sequence(train_df)
X_test, _ = prepare_sequence(test_df)

# Only test rows that carry a 'disengage' label are scored; 'num_hit' is used
# as the per-row weight.
test_idx = test_df['disengage'].notna()
y_test, sample_weight = test_df.loc[test_idx, 'disengage'], test_df.loc[test_idx, 'num_hit']

Splitting the training data into training and validation sets

In [None]:
# Fraction of the prepared training sequences kept for fitting; the remainder
# becomes the validation set.
train_dev_ratio = 0.8

# Boolean mask over sequence positions: True -> training, False -> validation.
# The positions are drawn without replacement from NumPy's global RNG, so the
# split is reproducible only when the seed has been set beforehand.
n_sequences = len(X_train_all)
train_idx = np.isin(np.arange(n_sequences),
                    np.random.choice(n_sequences, int(n_sequences * train_dev_ratio), replace=False))

# Partition inputs and targets with the same mask so the pairs stay aligned.
X_train = [X_train_all[i] for i, keep in enumerate(train_idx) if keep]
y_train = [y_train_all[i] for i, keep in enumerate(train_idx) if keep]
X_val = [X_train_all[i] for i, keep in enumerate(train_idx) if not keep]
y_val = [y_train_all[i] for i, keep in enumerate(train_idx) if not keep]

Running the model

In [None]:
# NOTE(review): EMBEDDING_DIM is not referenced in any cell shown in this
# notebook; the LSTMTagger calls pass max_hits and HIDDEN_DIM only — confirm
# whether that is intended.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6  # second positional argument of every LSTMTagger built below
NUM_EPOCHS = 5000  # number of full passes over X_train in the training loop

Model definition

In [None]:
# Three identically configured networks, created in this order: the one being
# trained, then two holders that receive copies of its weights at the best
# checkpoints seen during training.
model, model_best_loss, model_best_auc = (
    LSTMTagger(max_hits, HIDDEN_DIM, max_hits, 2) for _ in range(3)
)

# Negative log-likelihood loss, optimised with plain SGD over the parameters
# of `model` only.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [None]:
# Per-epoch loss history. Pre-filled with NaN (instead of the uninitialised
# memory that np.ndarray(n) returns) so an interrupted run plots gaps rather
# than garbage values.
loss_arr = np.full(NUM_EPOCHS, np.nan)

# Validation AUC is evaluated every `auc_interval` epochs, one slot per
# evaluation.
auc_interval = 100
auc_arr = np.full(int(np.ceil(NUM_EPOCHS / auc_interval)), np.nan)

# Next free slot in auc_arr, and the best validation AUC / lowest loss seen so
# far (used to decide when to snapshot the model).
pos = 0
last_auc = 0
last_loss = np.inf

In [None]:
# Per-sequence SGD: one optimizer step for every (inputs, targets) pair, for
# NUM_EPOCHS passes over the training split.
for epoch in tqdm(range(NUM_EPOCHS), desc='Training'):
    for sentence_in, targets in zip(X_train, y_train):
        model.zero_grad()
        tag_scores = model(sentence_in)

        loss = loss_function(tag_scores, targets)
        # Overwritten on every step, so the value kept for the epoch is the
        # loss of the last sequence seen, not an epoch average.
        loss_arr[epoch] = loss.item()
        loss.backward()
        optimizer.step()

    # Validation and checkpointing only happen every `auc_interval` epochs.
    if epoch % auc_interval == 0:
        with torch.no_grad():
            # AUC is computed from hard argmax labels over all validation
            # sequences, concatenated into one flat vector.
            pred = np.hstack([model(sequence).argmax(axis=1).numpy() for sequence in X_val])
            fpr, tpr, thresholds = metrics.roc_curve(np.hstack(y_val), pred, pos_label=1)
            auc = metrics.auc(fpr, tpr)
            auc_arr[pos] = auc

            # saving best models
            if auc > last_auc:
                last_auc = auc
                model_best_auc.load_state_dict(model.state_dict())
            if loss_arr[epoch] < last_loss:
                last_loss = loss_arr[epoch]
                # The lowest-loss snapshot goes to its own holder; writing it
                # to model_best_auc would clobber the best-AUC checkpoint and
                # leave model_best_loss at its random initial weights.
                model_best_loss.load_state_dict(model.state_dict())
            print(f'Epoch {epoch:5d}: loss({loss_arr[epoch]:12.10f}), auc({auc_arr[pos]:12.10f})')
        pos += 1

Plotting training curves

In [None]:
# Side-by-side training curves: recorded loss per epoch (left) and validation
# AUC at each evaluation point (right).
fig, axs = plt.subplots(1, 2)
axs[0].plot(loss_arr, 'b')
axs[0].set_title('Loss')
# set_xlabel is a method: it must be called. Assigning a string to it replaces
# the method on the Axes object and draws no label.
axs[0].set_xlabel('Epoch')

# auc_arr holds one value per `auc_interval` epochs, so rescale its index back
# to epoch numbers for the x axis.
axs[1].plot(np.arange(len(auc_arr)) * auc_interval, auc_arr, 'r')
axs[1].set_title('AUC')
axs[1].set_xlabel('Epoch')
plt.show()

Running prediction and writing reports

In [None]:
# Score the test set with the final weights and with both saved checkpoints,
# in that order, adding one report entry per model.
evaluated_models = (
    ('RNN', model),
    ('RNN - best validation AUC', model_best_auc),
    ('RNN - best loss', model_best_loss),
)
with torch.no_grad():
    for report_name, net in evaluated_models:
        # Hard labels for every test sequence, flattened into one vector, then
        # restricted to the rows that actually carry a label.
        raw_pred = np.hstack([net(sequence).argmax(axis=1).numpy() for sequence in X_test])
        pred = raw_pred[test_idx]
        report_model(y_test, pred, sample_weight=sample_weight, name=report_name, clf_table=clf_table)

Running baseline methods
- Every HIT implies that the user will disengage soon

In [None]:
# Constant baseline: predict class 0 for every labelled test row.
zeros_pred = np.zeros(y_test.values.shape)
fpr, tpr, _ = metrics.roc_curve(y_test, zeros_pred, sample_weight=sample_weight, pos_label=1)
auc = metrics.auc(fpr, tpr)
# Accuracy has to be scored on the baseline itself, not on the RNN's `pred`
# left over from the previous cell.
acc = metrics.accuracy_score(y_test, zeros_pred)
clf_table.loc[('All zeroes', '--'), ['accuracy', 'AUC']] = acc, auc

- Every HIT implies that the user will not disengage soon

In [None]:
# Constant baseline: predict class 1 for every labelled test row.
ones_pred = np.ones(y_test.values.shape)
fpr, tpr, _ = metrics.roc_curve(y_test, ones_pred, sample_weight=sample_weight, pos_label=1)
auc = metrics.auc(fpr, tpr)
# Accuracy has to be scored on the baseline itself, not on the RNN's `pred`
# left over from an earlier cell.
acc = metrics.accuracy_score(y_test, ones_pred)
clf_table.loc[('All ones', '--'), ['accuracy', 'AUC']] = acc, auc

In [None]:
# Final report: one row per (name, window_size) entry with its accuracy and AUC.
print(clf_table)

