Please note that you have to set the Python hash seed before launching this notebook in order to reproduce our results. The start command should look like this:

**env PYTHONHASHSEED=42 jupyter notebook**

In [None]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
sys.path.append("..")

from fastai.tabular import FillMissing, Categorify, Normalize, TabularList, tabular_learner
from fastai.callbacks.tracker import EarlyStoppingCallback, SaveModelCallback
from fastai.basic_data import DatasetType
from torch.nn import CrossEntropyLoss as CEloss
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

import src.train_utils as u
from src import visualisation as v

# Raise pandas' display limits so wide tables are shown in full in cell output.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 100

# Project helper called with the same value (42) as PYTHONHASHSEED in the
# launch command; presumably seeds the random generators — see src.train_utils.
u.random_seed(42)

# Load data

In [None]:
# Pickled train/test tables; the paths are relative to the working directory.
train_path, test_path = '../data/train_1002.pkl', '../data/test_1002.pkl'

# The project reader hands back both tables together with three column-name
# collections (all / continuous / categorical, judging by how they are used below).
(train_full, test,
 all_cols, cont_cols, cat_cols) = u.read_data(train_path=train_path,
                                              test_path=test_path)

In [None]:
# Boundary date of the validation split; the same value is reused later to
# rebuild the validation frame and as the start of the plotting window.
val_start = '2018-10-01'

data_bunch = u.create_fai_databunch(
    train=train_full,
    test=test,
    cat_cols=cat_cols,
    cont_cols=cont_cols,
    val_start=val_start,
)

## Model training

In [None]:
# Train through the project helper on the data bunch built above.
learn = u.train_fai_model(data=data_bunch)

# estimate() is given the range 0..1 with steps=51 and returns a threshold
# plus its score (presumably the best one over that grid — see src.train_utils).
# Score should be 0.121518
th, score = u.estimate(
    learn,
    th_start=0,
    th_stop=1,
    steps=51,
)

# The checkpoint name embeds the score rounded to 6 decimals.
checkpoint_name = f'best_model_{round(score, 6)}'
learn.save(checkpoint_name)

In [None]:
# The trained model is also available for download:
# https://drive.google.com/file/d/1lvk2p2YeOYFW2Rqk4pRvmo8Fhq7iN14g/view?usp=sharing
# learn.load('path/to/downloaded/model')

# Prediction visualisation

In [None]:
# Only the first item returned by get_preds is needed; the rest is discarded.
val_outputs, *_ = learn.get_preds(DatasetType.Valid)

# Column 1 of the output (presumably the positive-class probability) as a numpy array.
probas_val = np.array(val_outputs[:, 1])

# Boolean flag per validation row: probability above the threshold from estimate().
predict_val = probas_val > th


In [None]:
# Rebuild the validation rows (datetime >= val_start) from the full training table.
# .copy() makes `val` an independent frame, so the column assignments below do not
# hit pandas' chained-assignment path (SettingWithCopyWarning) on a slice of train_full.
val = train_full[train_full.datetime >= pd.Timestamp(val_start)].copy()
val = val.reset_index(drop=True)

# Attach the thresholded predictions and the column aliases expected by the
# visualisation helpers.
# NOTE(review): this assumes predict_val is in the same row order as this
# filtered frame — confirm against create_fai_databunch.
val['event'] = predict_val
val['time'] = val.datetime
val['sid'] = val.segment_id

# Keep only the rows flagged as events; copy again because add_more_time is
# called for its side effect on the frame (its return value is not used).
val = val[val.event].copy()
v.add_more_time(val)

val = val.reset_index(drop=True)

In [None]:
# Rows that read_ones extracts from the raw training CSV (presumably the
# positive / event rows — confirm in src.visualisation).
ones_csv = '../data/train.csv'
ones = v.read_ones(ones_csv)

# Same time-column enrichment that is applied to the predicted events.
v.add_more_time(ones)

In [None]:
# Two dates and a segment id passed positionally to plot_sid_events
# (presumably the start/end of the plotted window — confirm in src.visualisation).
time = (val_start, '2018-11-30')
sid = 'SE831U7'


# gt — `ones` is read from ../data/train.csv.
# NOTE(review): the original labels ("pred" here, "gt" below) looked swapped; confirm.
v.plot_sid_events(ones, sid, *time)


# pred — `val` holds the validation rows the model flagged (event == True)
v.plot_sid_events(val, sid, *time)

# Submit

In [None]:
# Only the first item returned by get_preds is needed; the rest is discarded.
test_outputs, *_ = learn.get_preds(DatasetType.Test)

# Column 1 of the output (presumably the positive-class probability) as a numpy array.
probas_test = np.array(test_outputs[:, 1])

In [None]:
n_pred = 7500  # This submit will give you 0.124107710792282 on the leaderboard (similar to qEbz8JUU)

# Flag the n_pred test rows with the highest predicted probability:
# argsort of the negated scores lists row positions in descending-score order.
pred_test = np.zeros(len(test), bool)
top_rows = np.argsort(-1 * probas_test)[:n_pred]
pred_test[top_rows] = True

# ndarray.sum() counts the True flags in native code; the builtin sum()
# would walk the array element by element in Python.
print(f'Predicted events: {pred_test.sum()}')

In [None]:
# Two-column submission table: the row key copied from the test table and the
# boolean flags converted to 0/1.
submit = pd.DataFrame(
    data={'datetime x segment_id': test['datetime x segment_id'].values,
          'prediction': pred_test.astype(int)}
)

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

# The file name carries the number of predicted events.
submit.to_csv(results_dir / f'submit_fai_{pred_test.sum()}.csv', index=False)