In [None]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
sys.path.append("..")

from fastai.tabular import *
from fastai.callbacks.tracker import EarlyStoppingCallback, SaveModelCallback
from torch.nn import CrossEntropyLoss as CEloss
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt

import src.utils as u
import src.fai_utils as fu

# Widen pandas' display limits so wide feature tables are shown without truncation.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 100

# Fix the random seed through the project helper before any data loading or training.
# NOTE(review): which RNGs it seeds (python / numpy / torch / CUDA) is defined in
# src.utils and not visible here — confirm it covers every library used below.
u.random_seed(42)

# Load data

In [None]:
data_path = Path('../data')

# read_data hands back the train/test frames together with the column-name lists.
train_full, test, all_cols, cont_cols, cat_cols = u.read_data(data_path)

# Replace missing values with 0 in every continuous column of both frames.
for frame in (test, train_full):
    for col in cont_cols:
        frame[col] = frame[col].fillna(0)

# Model training

In [None]:
# Boolean mask over train_full: rows dated 2018-10-01 or later form the validation set.
val_ids = (train_full.datetime >= pd.Timestamp('2018-10-01')).values

# fastai v1's split_by_idx is documented to take integer row indices. Passing the
# boolean mask directly makes it treat the mask *values* (0 and 1) as the indices to
# exclude from training, so the held-out rows would remain in the training split.
# Convert the mask to positional indices first.
valid_idx = np.flatnonzero(val_ids)

procs = [FillMissing, Categorify, Normalize]

# Must be defined before the pipeline below, which references it in add_test();
# previously it was created after use, raising NameError on a fresh kernel.
test_tab = TabularList.from_df(df=test, cat_names=cat_cols, cont_names=cont_cols)

data = (TabularList.from_df(
            train_full, procs=procs, cat_names=cat_cols, cont_names=cont_cols)
                 .split_by_idx(valid_idx)
                 .label_from_df(cols='y')
                 .add_test(test_tab)
                 .databunch(bs=100_000)
       )

In [None]:
# Fully connected tabular model with four hidden layers.
# - metrics: project F1 metric, configured with th_start=0, th_stop=1, steps=20.
# - callbacks: live loss graph, plus early stopping once the monitored 'f1' value has
#   not improved by at least 0.001 for 5 epochs.
# - loss: cross-entropy with class 1 weighted 10x relative to class 0.
learn = tabular_learner(data, layers=[1024, 512, 256, 128],
                        metrics=fu.F1(th_start=0, th_stop=1, steps=20),
                        callback_fns=[ShowGraph,
                                      partial(EarlyStoppingCallback,
                                              monitor='f1',
                                              min_delta=0.001,
                                              patience=5)
                                      ],
                        loss_func=CEloss(
                            # Put the class weights on the DataBunch's device rather
                            # than hard-coding .cuda(): identical on a GPU machine,
                            # but no longer crashes when only a CPU is available.
                            weight=tensor([1, 10]).float().to(data.device)
                        ),
                        opt_func=torch.optim.Adam
                        )

In [None]:
# Run the learning-rate finder, then plot what the recorder captured so a learning
# rate for the training cell below can be picked by eye.
learn.lr_find()
learn.recorder.plot()
plt.show()

In [None]:
# Save the weights under 'best_model' each time the monitored 'f1' value improves.
checkpoint_cb = SaveModelCallback(learn, every='improvement',
                                  monitor='f1', name='best_model')

# One-cycle training for 10 epochs with max_lr=slice(5e-3).
learn.fit_one_cycle(10, max_lr=slice(5e-3), callbacks=[checkpoint_cb])

# Show the recorded loss curves and the learning-rate schedule.
learn.recorder.plot_losses()
learn.recorder.plot_lr()
plt.show()

# Submit

In [None]:
# get_preds returns the predictions as its first element; column 1 of that is kept
# (presumably the positive-class probability — confirm against the label encoding).
probas_val = learn.get_preds(DatasetType.Valid)[0][:, 1]
probas_test = learn.get_preds(DatasetType.Test)[0][:, 1]

In [None]:
# `val` was never defined anywhere in this notebook, so this cell raised NameError on
# a fresh kernel. Rebuild it as the validation rows of train_full using the same
# boolean mask that defines the validation split.
# NOTE(review): assumes u.estimate expects the hold-out DataFrame (with its `y`
# column) — confirm against src.utils.
val = train_full[val_ids]

th = u.estimate(learn, val, y_true=val.y)  # th: 0.3838, f1 score: 0.1213

In [None]:
# Binarise the test probabilities at 95% of the estimated threshold.
pred_test = np.array(probas_test) > th * .95  # we choose th to predict 8118 events

# Count positives with the vectorised array sum instead of the builtin sum(), which
# iterates element by element in Python; cast to a plain int for clean formatting.
n_pred = int(pred_test.sum())
print(f'Predicted events: {n_pred}')

In [None]:
# Assemble the submission table: the test key column next to its 0/1 prediction.
submit = pd.DataFrame({
    'datetime x segment_id': test['datetime x segment_id'].values,
    'prediction': pred_test.astype(int),
})

# The number of predicted events is embedded in the output file name.
submit.to_csv(f'../results/submit_fai_{n_pred}.csv', index=False)