In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import catboost
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier, Pool, cv
from pathlib import Path
import src.utils as u
import src.cb_utils as m
from tqdm.auto import tqdm
from collections import Counter
from sklearn.metrics import f1_score, precision_score, recall_score

# Raise pandas' display limits so wide / long frames are shown with fewer cut-offs.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 100)

# Project helper called with a fixed seed of 42.
# NOTE(review): presumably seeds the RNGs used downstream — confirm in src.utils.
u.random_seed(42)

# Data directory, given relative to the notebook's working directory.
data_path = Path('../data')

In [2]:
# Load the data through the project helper. It returns the full training frame, the
# test frame and three column-name lists (all / continuous / categorical) that the
# rest of the notebook uses to select features.
train_full, test, all_cols, cont_cols, cat_cols = u.read_data(data_path)

In [None]:
# corr = train_full[all_cols].corr()

# high_corr = []
# fields = corr.columns

# for f1 in fields:
#     for f2 in fields:
#         if (f1 != f2) and (corr[f1][f2] > 0.8) and ({f1, f2} not in high_corr):
#             print(f1, f2, corr[f1][f2])
#             high_corr.append({f1, f2})

In [3]:
# Columns removed from every feature list (the variable name marks them as the
# highly correlated ones).
high_corr_to_drop = ['longitude', 'latitude', 'LANES', 'lane_width', 'P0',
                     'mean_avg_speed', 'traffic_total', 'std_traffic', 'traffic1']

# Filter while keeping the original column order. A plain set difference yields an
# order that changes between kernel restarts (string hashing is randomised per
# process), which makes the feature order fed to the model non-reproducible.
# dict.fromkeys() keeps the de-duplication the set-based version provided.
_dropped = frozenset(high_corr_to_drop)

all_cols = list(dict.fromkeys(c for c in all_cols if c not in _dropped))
cont_cols = list(dict.fromkeys(c for c in cont_cols if c not in _dropped))
cat_cols = list(dict.fromkeys(c for c in cat_cols if c not in _dropped))

In [6]:
# Share of training rows with a missing `traffic_total` value.
int(train_full['traffic_total'].isna().sum()) / len(train_full)

0.011583073220570936

# Cross-validation

In [None]:
# Training rows come from two time windows; the window between them
# (2018-01-01 .. 2018-04-01) is held out below as `val`.
# reset_index() gives `train` a fresh 0..n-1 index and keeps the old index as a column.
train = pd.concat([
    u.select_by_time(train_full, start, end, 'datetime')
    for start, end in (('2016-01-01', '2018-01-01'), ('2018-04-01', '2019-01-01'))
]).reset_index()

val = u.select_by_time(train_full, '2018-01-01', '2018-04-01', 'datetime')


def get_folds(df, periods=None):
    """Build time-based validation folds as arrays of positional row indices.

    Each period is a half-open window ``[start, end)``: a row belongs to a fold
    when ``start <= df.datetime < end``.

    Args:
        df: frame with a ``datetime`` column comparable to ``pd.Timestamp``.
        periods: optional iterable of ``(start, end)`` pairs accepted by
            ``pd.Timestamp``. Defaults to three consecutive quarters covering
            2018-04-01 .. 2019-01-01.

    Returns:
        A list with one 1-D integer array per period, holding the positions
        (0-based row numbers, not index labels) of the rows inside that period.
    """
    if periods is None:
        # The windows must share their boundaries: with an exclusive upper bound,
        # ending a window on '2018-06-30' / '2018-09-30' would leave every row
        # from those two days outside all folds.
        periods = [('2018-04-01', '2018-07-01'),
                   ('2018-07-01', '2018-10-01'),
                   ('2018-10-01', '2019-01-01')]

    timestamps = df.datetime

    folds = []
    for start, end in periods:
        in_period = (pd.Timestamp(start) <= timestamps) & (timestamps < pd.Timestamp(end))
        folds.append(np.flatnonzero(in_period.to_numpy()))

    return folds
        
    
# Positional row indices of `train` for each validation period; passed as
# `val_ids` to the cross-validation helper further down.
folds = get_folds(train)

In [None]:
# CatBoost pools for the hold-out window and the test set, restricted to the
# selected feature columns.
# NOTE(review): this requires `test` to carry a 'y' column, while the single-fold
# section later builds its test pool without a label — confirm which is right.
val_pool = Pool(val[all_cols], val['y'], cat_features=cat_cols)
test_pool = Pool(test[all_cols], test['y'], cat_features=cat_cols)

In [None]:
# Run the project's cross-validation helper with the time-based folds.
# The hold-out window `val` is passed as the helper's `test` argument, so the
# returned `probas` are compared against `val.y` in the cells below.
# NOTE(review): presumably one model is fitted per entry of `val_ids` and `ths`
# holds one decision threshold per model — confirm in src.cb_utils.
probas, ths, val_scores = m.fit_predict_cv(train=train,
                                           test=val,
                                           val_ids=folds, 
                                           cat_cols=cat_cols,
                                           all_cols=all_cols,
                                           draw_plot=True)

In [None]:
# For every (threshold, probabilities) pair returned by the CV helper, print:
#   1. the first element returned by u.f1_flexible for a 20-step threshold scan over [0, 1],
#   2. the F1 score on `val` at the pair's own threshold,
#   3. how many rows are predicted positive at that threshold,
# followed by an empty line.
for th, proba in zip(ths, probas):
    print(
        u.f1_flexible(proba, th_start=0, th_stop=1, steps=20, gts=val.y)[0],
        f1_score(val.y, proba > th),
        (proba > th).sum(),
        '',
        sep='\n',
    )

In [None]:
# Combine the per-model probabilities and thresholds with the project's voting
# helper, then display the F1 of the combined prediction on `val`.
y_pred_vote = m.vote_predict(probas, ths)
f1_score(val.y, y_pred_vote)

In [None]:
# Load previously saved models from one specific results directory.
# NOTE(review): this is a hard-coded absolute, machine-specific path, so the cell only
# runs on the author's machine. The rest of the notebook uses paths relative to '..'
# (e.g. '../data'); presumably this could be '../results/cb/<run dir>/' — confirm.
models = m.load_models('/home/AlekseySh/code/comp/results/cb/2020-02-02 02:03:13.874913/')

In [None]:
# The bare name `predict_multi_models` is never defined or imported in this notebook,
# so the call raised NameError on a fresh kernel. The neighbouring model helpers
# (load_models, vote_predict) are reached through `m`, so it is called the same way.
# NOTE(review): assumes the helper lives in src.cb_utils — confirm.
pred = m.predict_multi_models(models, ths, val_pool)

In [None]:
# Evaluate the second loaded model (index 1) on the hold-out pool against its own
# labels; whatever u.estimate returns is shown as the cell output.
u.estimate(models[1], val_pool, val_pool.get_label())

# Single-fold validation

In [None]:
# Column groups passed to u.add_statistic; each tuple is one grouping key.
field_combs = [
    ('segment_id',), ('hour',), ('weekday',), ('month',),
    ('segment_id', 'hour'), ('segment_id', 'weekday'), ('segment_id', 'month'),
    ('weekday', 'month'), ('hour', 'weekday'),
    ('segment_id', 'weekday', 'hour'), ('segment_id', 'weekday', 'month'),
]

# Source rows for the statistics. `sid` is copied to `segment_id` so the column name
# matches the one used in `field_combs`.
ones = u.read_ones('../data/train.csv')
ones['segment_id'] = ones['sid']
u.add_more_time(ones)

# Each split gets statistics from a window that ends where the split's own period
# starts (train: 2018-01-01, val: 2018-10-01) or, for test, at 2019-01-01.
# NOTE(review): presumably this keeps later rows out of the features — confirm the
# exact boundary handling in src.utils.add_statistic.
train = u.select_by_time(train_full, '2018-01-01', '2018-10-01', 'datetime')
u.add_statistic(data=train, stat_data=ones,
                field_combs=field_combs, prefix='hist_',
                tstart='2016-01-01', tend='2018-01-01',
               )

val = u.select_by_time(train_full, '2018-10-01', '2019-01-01', 'datetime')
u.add_statistic(data=val, stat_data=ones,
                field_combs=field_combs, prefix='hist_',
                tstart='2016-01-01', tend='2018-10-01',
               )

u.add_statistic(data=test, stat_data=ones,
                field_combs=field_combs, prefix='hist_',
                tstart='2016-01-01', tend='2019-01-01',
               )


# Every column carrying the 'hist_' prefix is treated as a new continuous feature.
hist_cols = [col for col in train.columns.values if col.startswith('hist_')]

# Append only the columns that are not registered yet. A blind `+=` duplicated the
# 'hist_' entries every time this cell was re-run, which then produced duplicated
# columns in `train[all_cols]`.
all_cols += [col for col in hist_cols if col not in all_cols]
cont_cols += [col for col in hist_cols if col not in cont_cols]


In [None]:
# CatBoost pools for the single-fold experiment. The train and validation pools
# carry the 'y' label; the test pool is features only.
train_pool = Pool(train[all_cols], train['y'], cat_features=cat_cols)
val_pool = Pool(val[all_cols], val['y'], cat_features=cat_cols)
test_pool = Pool(test[all_cols], cat_features=cat_cols)

In [None]:
# Other CatBoost options, written with their command-line names
# (NOTE(review): presumably a list of candidates still to be tuned):
#   --bootstrap-type, --bagging-temperature, --random-strength, --depth,
#   --grow-policy, --max-leaves, --rsm, --boost-from-average, --model-shrink-rate

# Keyword arguments for CatBoostClassifier.
params = dict(
    iterations=10000,
    learning_rate=0.05,
    scale_pos_weight=10,
    has_time=False,
    one_hot_max_size=100,
    depth=4,

    loss_function='Logloss',
    task_type='GPU',
    use_best_model=True,
    eval_metric='F1',  # alternative noted by the author: m.FlexibleF1(0, 1, 20)
)

In [None]:
cls = CatBoostClassifier(**params)

# Train on the training pool and evaluate on `val_pool`, with a live plot and
# per-iteration logging.
# NOTE(review): early_stopping_rounds equals the `iterations` budget set in `params`
# (10000), so early stopping looks effectively disabled — confirm this is intended.
cls.fit(X=train_pool, eval_set=val_pool, early_stopping_rounds=10000, verbose=1, plot=True)

In [None]:
# Evaluate the fitted classifier on the validation pool with the project helper.
# Its return value is kept as `th` and used below as the probability cut-off.
# NOTE(review): presumably `steps=20` is the number of candidate thresholds scanned — confirm.
th = u.estimate(cls, val_pool, y_true=val_pool.get_label(), steps=20)

In [None]:
# Second column of predict_proba (index 1) for the test and validation pools.
proba_test, proba_val = (
    cls.predict_proba(pool)[:, 1] for pool in (test_pool, val_pool)
)

In [None]:
# F1 on validation at the selected threshold.
pred_val = proba_val > th
print(f1_score(val.y, pred_val))

# heuristic: force a negative prediction for rows with hour <= 4 or hour >= 22,
# then print the F1 again to see the effect.
pred_val[((val.hour <= 4) | (val.hour >= 22)).to_numpy()] = False
print(f1_score(val.y, pred_val))

In [None]:
# cls = CatBoostClassifier()
# cls.load_model('../results/model_cb.pt')

# cls.save_model('../results/cb/stack/model_1159.pt')

# Submit

In [None]:
# Binarise the test probabilities with the validation threshold scaled by 1.18,
# then count the predicted positives (the count is also used in the submission file name).
pred = proba_test > th * 1.18

n_pred = pred.sum()
print(n_pred)

In [None]:
# Submission frame: the row identifier from `test` plus the 0/1 prediction,
# written without the index to a file named after the number of predicted positives.
submit = pd.DataFrame({
    'datetime x segment_id': test['datetime x segment_id'].values,
    'prediction': pred.astype(int),
})
submit.to_csv(f'../results/submit_cb_{n_pred}.csv', index=False)