In [None]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
sys.path.append("..")

from fastai.tabular import FillMissing, Categorify, Normalize, TabularList, tabular_learner
from fastai.callbacks.tracker import EarlyStoppingCallback, SaveModelCallback
from fastai.basic_data import DatasetType
from torch.nn import CrossEntropyLoss as CEloss
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

import src.train_utils as u

# Widen the pandas display limits so wide frames are not truncated in cell output.
display_options = {
    'display.max_columns': 999,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Seed via the project helper before any data loading or training happens.
# NOTE(review): which RNGs `u.random_seed` covers is not visible from this notebook.
u.random_seed(42)

# Load data

In [None]:
# Pickled train / test frames, relative to the notebook directory.
train_path = '../data/train_1002.pkl'
test_path = '../data/test_1002.pkl'

# `u.read_data` hands back five values: both frames plus three column-name collections.
loaded = u.read_data(train_path=train_path, test_path=test_path)
train_full, test, all_cols, cont_cols, cat_cols = loaded

In [None]:
# Build the data object that is passed to `u.train_fai_model` in the training loop below.
data_bunch = u.create_fai_databunch(
    train=train_full,
    test=test,
    cat_cols=cat_cols,
    cont_cols=cont_cols,
)

## Model training

We managed to make the solution 100% reproducible, but the reproducibility did not survive a kernel restart.
We tried all the advice given on the fastai
[forum](https://forums.fast.ai/t/solved-reproducibility-where-is-the-randomness-coming-in/31628/5)
and in the
[docs](https://docs.fast.ai/dev/test.html#getting-reproducible-results), but unfortunately none of it helped.
It seems that we cannot avoid this issue while training on a GPU.

So we decided to train models in a loop and choose the best one.
To reproduce our score, we need a model that scores roughly above 0.122 on the validation data
and a threshold for the test data that results in predicting approx. 7000 - 8500 events.
We managed to train such a model within several attempts, but it may take a lot of time...


In [None]:
# Maximum number of independent training runs to try.
n_attempts = 15

# Stop retraining as soon as the best validation score exceeds this value.
# NOTE(review): the markdown above quotes ~0.122 as the target score, while this
# cut-off is 12.15 — confirm which scale `u.estimate` reports. If scores are in
# the 0-1 range this early stop never fires and all `n_attempts` runs are executed.
score_target = 12.15

best_learn, best_th, best_score = None, None, 0

for i in tqdm(range(n_attempts)):
    learn = u.train_fai_model(data=data_bunch)
    # Evaluate the freshly trained model over 51 threshold steps between 0 and 1.
    th, score = u.estimate(learn, th_start=0, th_stop=1, steps=51)
    plt.show()

    if score > best_score:
        best_learn, best_th, best_score = learn, th, score

    # Drop the loop-local name on every iteration (not only when the model was
    # kept): the best model stays reachable through `best_learn`, while a rejected
    # one is no longer kept alive by `learn` during the next training run.
    del learn

    if best_score > score_target:
        break

# Fail with an explicit message instead of an AttributeError on `None.save(...)`
# when no attempt produced a score above the initial 0 (e.g. NaN or zero scores).
if best_learn is None:
    raise RuntimeError(
        f'None of the {n_attempts} training attempts scored above {best_score}; '
        'no model to save.'
    )

best_learn.save(f'best_model_{round(best_score, 6)}')

# Submit

In [None]:
# `get_preds` returns several items; only the first one (the predictions) is needed.
test_outputs = best_learn.get_preds(DatasetType.Test)
# Keep column 1 of the prediction matrix as a plain numpy array.
probas_test = np.array(test_outputs[0][:, 1])

In [None]:
# Number of test rows to flag as events.
n_pred = 7500

# Rank rows by descending probability and mark the first `n_pred` of them.
top_idx = np.argsort(-1 * probas_test)[:n_pred]
pred_test = np.zeros(len(test), dtype=bool)
pred_test[top_idx] = True

# Alternative kept for reference: a fixed probability cut-off instead of a fixed count.
# pred_test = probas_test > 0.58

print(f'Predicted events: {sum(pred_test)}')

In [None]:
# Two-column submission frame: the row identifier and the 0/1 prediction.
submit_columns = {
    'datetime x segment_id': test['datetime x segment_id'].values,
    'prediction': pred_test.astype(int),
}
submit = pd.DataFrame(data=submit_columns)

# The number of predicted events is embedded in the output file name.
submit.to_csv(f'../results/submit_fai_{sum(pred_test)}.csv', index=False)