In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import catboost
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt
import src.train_utils as u 
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier, Pool
from pathlib import Path
from tqdm.auto import tqdm

# Widen pandas display limits so wide feature tables are shown without truncation.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 100)

# Fix the seed via the project helper so runs are repeatable.
# NOTE(review): presumably seeds numpy / random — confirm in src.train_utils.
u.random_seed(42)

In [None]:
class FlexibleF1(object):
    """Custom CatBoost eval metric: the best F1 score over a grid of thresholds.

    Raw model outputs (logits) are converted to probabilities, binarised at
    every threshold in ``th_grid`` and scored with ``f1_score``; the maximum
    over the grid is reported.

    The metric is only computed on every second ``evaluate`` call. The first,
    third, ... calls are treated as "train" calls and return a dummy 0.0 to
    save time.
    NOTE(review): this assumes CatBoost calls ``evaluate`` alternately for the
    train set and then the eval set on each iteration — confirm for the
    CatBoost version and fit settings in use.

    Args:
        th_start: Lowest probability threshold in the grid.
        th_stop: Highest probability threshold in the grid (inclusive).
        steps: Number of evenly spaced thresholds between the two bounds.
    """

    def __init__(self,
                 th_start: float = 0.0,
                 th_stop: float = 1.0,
                 steps: int = 20
                 ):
        self.th_grid = np.linspace(start=th_start,
                                   stop=th_stop,
                                   num=steps
                                   )
        # Flipped at the start of every evaluate() call; True marks a call
        # that is skipped (treated as the train-set evaluation).
        self.train_call = False

    @staticmethod
    def is_max_optimal() -> bool:
        """Tell CatBoost that larger metric values are better."""
        return True

    @staticmethod
    def get_final_error(error, _):
        """Return the accumulated error unchanged (the weight is ignored)."""
        return error

    @staticmethod
    def _sigmoid(raw):
        """Numerically stable logistic function.

        ``exp(x) / (1 + exp(x))`` overflows to ``inf / inf = nan`` for large
        ``x``; ``exp(-log(1 + exp(-x)))`` is the same quantity and stays
        finite over the whole float range.
        """
        raw = np.asarray(raw, dtype=float)
        return np.exp(-np.logaddexp(0.0, -raw))

    def evaluate(self, approxes, target, _) -> tuple:
        """Compute the metric for one CatBoost evaluation call.

        Args:
            approxes: Sequence with exactly one indexed container of raw
                predictions (one value per object).
            target: Indexed container of true labels, same length as
                ``approxes[0]``.
            _: Object weights; ignored.

        Returns:
            ``(score, weight)`` where ``weight`` is always 1.0. ``score`` is
            0.0 on skipped (train) calls and the best F1 over ``th_grid``
            otherwise.
        """
        # `not` instead of `~`: bitwise NOT on a bool yields -1 / -2 style
        # integers and is deprecated for bools in recent Python versions.
        self.train_call = not self.train_call
        if self.train_call:
            return 0.0, 1.0

        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        probs = self._sigmoid(approxes[0])
        y_true = np.array(target)  # converted once, reused for every threshold
        scores = [f1_score(y_pred=probs > th, y_true=y_true)
                  for th in self.th_grid]
        return max(scores), 1.0
        
        
def select_by_time(df: pd.DataFrame,
                   tstart: str,
                   tend: str,
                   time_col: str = 'time'
                   ) -> pd.DataFrame:
    """Return the rows of ``df`` whose timestamp lies in ``[tstart, tend)``.

    Args:
        df: Source frame; it is not modified.
        tstart: Inclusive lower bound, anything ``pd.Timestamp`` accepts.
        tend: Exclusive upper bound, anything ``pd.Timestamp`` accepts.
        time_col: Name of the datetime column to filter on.

    Returns:
        A new frame with the selected rows in their original order and a
        fresh 0..n-1 index.
    """
    lower, upper = pd.Timestamp(tstart), pd.Timestamp(tend)
    in_window = (lower <= df[time_col]) & (df[time_col] < upper)
    # Chained reset_index returns a new frame instead of mutating a filtered
    # slice in place.
    return df.loc[in_window].reset_index(drop=True)


In [None]:
# Locations of the train / test datasets, relative to the notebook directory.
train_path = '../data/train_1002.pkl'
test_path = '../data/test_1002.pkl'

# The project helper returns both frames plus three column-name lists.
# NOTE(review): presumably all_cols = cont_cols + cat_cols — confirm in src.train_utils.
# NOTE(review): the inputs are .pkl files; only load pickles from a trusted source.
train_full, test, all_cols, cont_cols, cat_cols = u.read_data(
    train_path=train_path, test_path=test_path)

In [None]:
# Time-based split on the 'datetime' column (upper bounds are exclusive):
# two years for training, the following quarter for validation.
train = select_by_time(train_full, '2016-10-01', '2018-10-01', 'datetime')
val = select_by_time(train_full, '2018-10-01', '2019-01-01', 'datetime')

# Wrap the frames in CatBoost pools; the test pool has no label.
train_pool = Pool(data=train[all_cols], label=train['y'], cat_features=cat_cols)
val_pool =  Pool(data=val[all_cols], label=val['y'], cat_features=cat_cols)
test_pool = Pool(data=test[all_cols], cat_features=cat_cols)

In [None]:
# Keyword arguments passed to CatBoostClassifier.
params = {
    'iterations': 500,
    'learning_rate': 0.1,
    # Up-weights the positive class.
    'scale_pos_weight': 5,
    # NOTE(review): has_time makes CatBoost use row order as time order —
    # confirm the pools are sorted by datetime.
    'has_time': True,
    'one_hot_max_size': 1000,
    'depth': 5,
    
    'loss_function': 'Logloss',
    'task_type': 'CPU',
    'use_best_model': True,
    # Custom metric object: best F1 over 20 thresholds in [0, 1].
    'eval_metric': FlexibleF1(0, 1, 20)
}

In [None]:
# Build the classifier from the params dict defined earlier in the notebook.
cls = CatBoostClassifier(**params)

# Train on the train pool and monitor the custom metric on the validation pool.
# Training stops once the eval metric has not improved for 5 consecutive iterations.
cls.fit(
    train_pool,
    eval_set=val_pool,
    plot=True,
    verbose=1,
    early_stopping_rounds=5,
)

# cls.save_model('../results/cb_model.pt')

# Submit

In [None]:
# Probability of the positive class (column 1) for every test row.
proba_test = cls.predict_proba(test_pool)[:, 1]

# Decision threshold for the submission, named so the cut-off is visible
# and tunable in one place.
SUBMIT_THRESHOLD = 0.285
pred = proba_test > SUBMIT_THRESHOLD

# Vectorised count of predicted positives; the builtin sum() would iterate
# the array element by element and return a NumPy scalar.
n_pred = int(pred.sum())
print(n_pred)

In [None]:
# Assemble the submission table: the test row identifier next to its 0/1 label.
submission_columns = {
    'datetime x segment_id': test['datetime x segment_id'].values,
    'prediction': pred.astype(int),
}
submit = pd.DataFrame(data=submission_columns)

# The number of predicted positives is embedded in the file name.
submit.to_csv(f'../results/submit_cb_{n_pred}.csv', index=False)