In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from fastai.tabular import *
from fastai.callbacks.tracker import EarlyStoppingCallback, SaveModelCallback
from pathlib import Path
import src.utils as u
import src.fai_utils as fu
from tqdm.auto import tqdm
from collections import Counter
from sklearn.metrics import f1_score, precision_score, recall_score
from torch.nn import CrossEntropyLoss as CEloss

# Widen pandas' display limits so wide feature tables are shown in full.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 999)

# Fix the seed through the project helper (presumably seeds the RNGs in use;
# see src/utils.py) so that reruns are comparable.
u.random_seed(42)

# Input data lives one level above the notebook directory.
data_path = Path('..') / 'data'

In [None]:
# Load the train/test frames together with the feature-name lists
# (all / continuous / categorical) through the project helper.
(
    train_full,
    test,
    all_cols,
    cont_cols,
    cat_cols,
) = u.read_data(data_path)

In [None]:
# Replace missing values with 0 in every continuous feature of both frames.
# The frames are modified in place, one column at a time.
for frame in (test, train_full):
    for col in cont_cols:
        frame[col] = frame[col].fillna(0)

In [None]:
# Features excluded from the model. The name says "high_corr", but the list
# holds two groups: highly correlated features and low-importance features.
high_corr_to_drop = [
    # --- highly correlated with other features ---
    'longitude',
    'latitude',
    'LANES',
    'lane_width',
    'P0',
    'dist_to_center',
    'traffic1',
    'avg_speed_neigh',
    'traffic_total_neigh',
    'average_ttime_neigh',
    'acc_cnt_last_quarter_sid_neigh',
    'mean_traffic',
    'acc_cnt_last_halfyear_sid_neigh',
    # --- low feature importance ---
    'cloud_cover',
    'cloud1_cover',
    'acc_cnt_last_halfyear_sid_wd_h_neigh',
    'delta_rel_diff_traffic_last_hour',
    'rel_diff_traffic',
    'acc_cnt_last_halfyear_vds',
    'delta_traffic_total_next_hour',
    'temp',
    'acc_cnt_last_halfyear_vds_wd',
    'orientation',
    'cloud_3',
    'wind_speed',
    'wind_dir_angle',
    'max_gust',
    'weather_cond',
    'blinding',
    'acc_cnt_last_quarter_vds_wd',
    'cloud_height',
    'delta_traffic_total_last_hour',
    'wind_dir',
    'delta_rel_diff_traffic_next_hour',
    'acc_cnt_last_quarter_sid_wd_h_neigh',
    'precip_time',
    'humidity',
    'angle_wind_road',
    'cloud_1',
    'mean_avg_speed',
    'acc_cnt_last_halfyear_vds_wd_h',
    'cloud_2',
    'sinuosity',
    'wind_dir_defined',
    'vds_count',
    'snow',
    'CLASS',
    'mist',
    'vms_count',
    'PAVETYPE',
    'delta_average_ttime_last_day',
    'average_ttime_na',
    'precip_mm',
    'SURFTYPE',
    'smoke',
    'fog',
    'drizzle',
    'cloud_cover_fog',
    'delta_average_ttime_next_day',
    'WIDTH',
    'visibility',
    'rain',
    'main_route',
]

# Remove the dropped features from every column list while KEEPING the
# original column order. A plain set difference yields an order that depends
# on string hashing, which changes between kernel restarts; the feature
# (and embedding) order would then differ from run to run despite the fixed
# seed. `dict.fromkeys` de-duplicates like `set` did, but preserves order.
_dropped = set(high_corr_to_drop)
all_cols = [c for c in dict.fromkeys(all_cols) if c not in _dropped]
cont_cols = [c for c in dict.fromkeys(cont_cols) if c not in _dropped]
cat_cols = [c for c in dict.fromkeys(cat_cols) if c not in _dropped]

# Fast ai

# Single-fold validation

In [None]:
# Hold out everything from 2018-10-01 onwards as the validation split.
# `split_by_idx` expects integer row positions, not a boolean mask: fastai
# derives the training indices as a set difference against this array, so a
# mask (values 0/1) would only remove rows 0 and 1 from the training set and
# leak the validation rows into training. Convert the mask to positions.
val_ids = np.flatnonzero(
    (train_full.datetime >= pd.Timestamp('2018-10-01')).values
)

In [None]:
# Preprocessing steps handed to the training TabularList below.
procs = [FillMissing, Categorify, Normalize]

# Test items; attached to the training data later via `.add_test(...)`.
test_tab = TabularList.from_df(
    df=test,
    cat_names=cat_cols,
    cont_names=cont_cols,
)

In [None]:
# Hyper-parameters shared by the cells below.
p = dict(
    bs=200_000,                     # batch size passed to .databunch()
    n_epochs=10,                    # epochs for fit_one_cycle
    layers=[1024, 512, 256, 128],   # hidden layer sizes of the tabular model
    weights=[1, 10],                # per-class weights for the cross-entropy loss
    n_steps_f1=20,                  # `steps` argument of the project F1 metric
    emb_drop=0.5,                   # dropout applied to the embeddings
)

In [None]:
# Assemble the DataBunch: training frame -> validation split -> labels from
# column 'y' -> attach the test items -> batch with the configured size.
data = (
    TabularList
    .from_df(train_full, procs=procs, cat_names=cat_cols, cont_names=cont_cols)
    .split_by_idx(val_ids)
    .label_from_df(cols='y')
    .add_test(test_tab)
    .databunch(bs=p['bs'])
)

In [None]:
# Tabular learner with a class-weighted cross-entropy loss.
# The loss weights are placed on the DataBunch's device instead of a
# hard-coded `.cuda()`, so the cell also runs on a CPU-only machine while
# still matching the device the model and batches live on.
learn = tabular_learner(
    data,
    layers=p['layers'],
    emb_drop=p['emb_drop'],
    metrics=fu.F1(0, 1, steps=p['n_steps_f1']),
    callback_fns=[
        ShowGraph,
        # Stop when the monitored 'f1' value has not improved by at least
        # 0.001 for 6 consecutive epochs.
        partial(EarlyStoppingCallback,
                monitor='f1',
                min_delta=0.001,
                patience=6),
    ],
    loss_func=CEloss(
        weight=tensor(p['weights']).float().to(data.device)
    ),
    opt_func=torch.optim.Adam,
)

In [None]:
# Run the learning-rate range test and plot the recorded curve; the plot is
# used to choose `max_lr` for the training cell below.
learn.lr_find()
learn.recorder.plot()

In [None]:
# One-cycle training. A checkpoint named 'best_model' is written whenever the
# monitored 'f1' value improves.
learn.fit_one_cycle(
    p['n_epochs'],
    max_lr=slice(5e-3),
    callbacks=[
        SaveModelCallback(
            learn,
            every='improvement',
            monitor='f1',
            name='best_model',
        ),
    ],
)

# Training diagnostics: loss curves and the learning-rate schedule.
learn.recorder.plot_losses()
learn.recorder.plot_lr()
plt.show()

# Cross-validation

In [None]:
# Probability of class 1 for every validation row.
# The predictions are fetched inside this cell so that it works on a fresh
# kernel (Restart & Run All) and can be re-run safely: slicing an existing
# `probas_val` in place fails when the name is not defined yet, and fails
# again on a second run because the result is already 1-D.
probas_val, *_ = learn.get_preds(DatasetType.Valid)
probas_val = probas_val[:, 1]

In [None]:
def get_folds(train):
    # months = ['01', '03', '06', '09', '12']
    months = ['09', '12']
    
    folds = []
    for i_fold in range(len(months) - 1):
        tstart = pd.Timestamp('2018-' + months[i_fold] + '-01')
        tend = pd.Timestamp('2018-' + months[i_fold + 1] + '-01')

        tt = train.datetime
        ids_val = ((tstart <=  tt) & (tt < tend)).to_numpy().nonzero()[0]
        folds.append(ids_val)
    
    return folds
        
    
# Positional validation indices per time-based fold of the training frame.
# NOTE(review): `folds` is not consumed by any of the cells visible below.
folds = get_folds(train_full)

# Submit

In [None]:
# Class-1 probabilities for the validation and the test items.
# `get_preds` returns the predictions first; only column 1 is kept.
probas_val = learn.get_preds(DatasetType.Valid)[0][:, 1]
probas_test = learn.get_preds(DatasetType.Test)[0][:, 1]

In [None]:
# Materialise the validation rows explicitly: `val` is not defined by any
# earlier cell, so this cell would raise NameError on a fresh kernel.
# `.iloc` accepts both a boolean mask and positional indices in `val_ids`.
# NOTE(review): assumes `u.estimate` expects the validation DataFrame here
# and returns the decision threshold used below; confirm against src/utils.py.
val = train_full.iloc[val_ids]
th = u.estimate(learn, val, y_true=val.y)

In [None]:
# Binarise the test probabilities with the estimated threshold scaled by 0.95.
pred = np.array(probas_test) > th * .95

# Number of positive predictions; also used in the submission file name.
n_pred = pred.sum()
print(n_pred)

In [None]:
# Submission table: the test row key plus the 0/1 prediction.
submit = pd.DataFrame({
    'datetime x segment_id': test['datetime x segment_id'].values,
    'prediction': pred.astype(int),
})

In [None]:
# Write the submission; the positive-prediction count is part of the name.
# Create the output directory first so the cell does not fail on a fresh
# checkout where ../results does not exist yet.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)
submit.to_csv(results_dir / f'submit_fai_{n_pred}.csv', index=False)