In [6]:
%reload_ext autoreload
%autoreload 2

import glob
import os, gc
import numpy as numpy
# Bind the conventional alias explicitly: later cells call `np.square`, and the
# line above only binds the name `numpy`.
import numpy as np
import pandas as pd
import scipy as sp
import datatable as dt
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from numba import njit
# NOTE: the star import stays *before* the IPython/CatBoost imports so that any
# same-named symbol it exports cannot shadow `clear_output` or `Pool`.
from utils import *

from IPython.display import clear_output

from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool

In [7]:
# --- Experiment configuration -------------------------------------------------
# Number of cross-validation folds used by every seed repetition.
N_FOLD = 10
# MIN_SIZE is derived as 600 // N_MINS.
# NOTE(review): presumably 600 is the length of one time bucket in seconds — confirm.
N_MINS = 5
MIN_SIZE = 600 // N_MINS

# SOL_NAME labels the output locations; DATA_NAME selects the feature file to load.
SOL_NAME = '601-CAT'
DATA_NAME = '601'

# Prepare the output directories for this solution.
# `mkdir` is not defined in this notebook (presumably provided by `utils`).
for out_dir in (f'./models/{SOL_NAME}/', f'./results/{SOL_NAME}/'):
    mkdir(out_dir)

In [8]:
# Collect the stock and time identifiers via the project helpers
# (neither helper is defined in this notebook; presumably they come from `utils`).
list_stock_id, list_time_id = get_stock_id(), get_time_id()

# Functions

In [9]:
def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation callback that scores predictions with ``rmspe``.

    Args:
        y_pred: Predictions to score.
        lgb_train: Dataset-like object exposing ``get_label()``; its labels are
            used as the ground truth.

    Returns:
        A ``(name, value, is_higher_better)`` tuple: the metric name ``'RMSPE'``,
        the value returned by ``rmspe(labels, y_pred)``, and ``False``.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

# Loading data

In [10]:
# Feature table for this experiment, read with datatable and converted to pandas.
train_csv = f'./dataset/train_{DATA_NAME}_LGB.csv'
df_train = dt.fread(train_csv).to_pandas()

# Raw training file, passed through gen_row_id; later cells store the
# out-of-fold predictions in this frame next to its `target` column.
df_result = gen_row_id(dt.fread('./dataset/train.csv').to_pandas())

In [11]:
# Model input columns. `stock_id` is placed first on purpose: the CatBoost Pools
# built later declare column 0 as the categorical feature.
# Identifier and label columns are excluded. `fold` is excluded as well so that
# re-running this cell after the fold column has been added to df_train cannot
# leak the fold assignment into the feature set.
fea_cols = ['stock_id'] + [
    col for col in df_train.columns
    if col not in {'time_id', 'target', 'stock_id', 'row_id', 'fold'}
]

In [12]:
# Pass the training frame through gen_row_id, then assign an initial N_FOLD split
# (the evaluation loop re-assigns folds for every seed).
df_train = add_time_fold(gen_row_id(df_train), N_FOLD)

# Evaluation

In [23]:
# Seed handed to CatBoost itself (kept fixed across all repetitions).
seed0 = 2021

# CatBoostRegressor hyper-parameters.
params = dict(
    n_estimators=10000,
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=3,
    subsample=0.8,
    # Finer quantization (1024 borders) for the float features at indices 9-12.
    per_float_feature_quantization=[
        '9:border_count=1024',
        '10:border_count=1024',
        '11:border_count=1024',
        '12:border_count=1024',
    ],
    loss_function='RMSE',
    eval_metric='RMSE',
    task_type='CPU',
    random_seed=seed0,
)

# Seeds used to re-draw the fold assignment for each repetition.
list_seeds = [0, 11, 42, 777, 2045]

In [24]:
# Repeated K-fold evaluation.
# For every seed the fold assignment is re-drawn with add_time_fold; for every fold
# a CatBoostRegressor is fitted on the remaining folds, scored with rmspe on the
# held-out fold, pickled to ./models/{SOL_NAME}/, and its predictions are written
# into df_result[f'pred_{i_seed}'].
#
# NOTE(review): the held-out fold is also the early-stopping eval_set (with
# use_best_model=True), so the per-fold RMSPE recorded here is selected on the
# same rows it is measured on and is therefore optimistic.
# NOTE(review): predictions are written into df_result using df_train's index
# labels; this assumes both frames share the same row index — confirm.

# cat_features=[0] below is positional, so the first feature must be stock_id.
assert fea_cols[0] == 'stock_id', 'cat_features=[0] expects stock_id as the first feature column'

list_rmspe = []
for i_seed, seed in enumerate(list_seeds):
    df_train = add_time_fold(df_train, N_FOLD, seed=seed)
    list_rmspe.append([])
    for i_fold in range(N_FOLD):
        gc.collect()

        # Build the fold mask once and derive both partitions from it.
        is_test = df_train.fold == i_fold
        df_tr = df_train.loc[~is_test]
        df_te = df_train.loc[is_test]

        X_train = df_tr[fea_cols]
        y_train = df_tr['target'].values
        X_test = df_te[fea_cols]
        y_test = df_te['target'].values
        idx_test = df_te.index
        print(f'Fold {i_seed+1}/{len(list_seeds)} | {i_fold+1}/{N_FOLD}', X_train.shape, X_test.shape)

        # Inverse-square target weights turn the RMSE objective into a squared
        # relative-error objective.
        cat_tr = Pool(X_train, y_train, weight=1 / np.square(y_train), cat_features=[0])
        cat_val = Pool(X_test, y_test, weight=1 / np.square(y_test), cat_features=[0])
        raw_model = CatBoostRegressor(**params)
        model = raw_model.fit(
            cat_tr,
            eval_set=cat_val,
            early_stopping_rounds=250,
            use_best_model=True,
            verbose=250,
        )

        y_pred = model.predict(X_test)
        curr_rmspe = rmspe(y_test, y_pred)
        list_rmspe[-1].append(curr_rmspe)

        # Persist the fitted model for this (seed, fold) pair.
        ckp_path = f'./models/{SOL_NAME}/CAT_{i_seed}_{i_fold}.pkl'
        save_pickle(model, ckp_path)

        # Store this fold's out-of-fold predictions for the current seed.
        df_result.loc[idx_test, f'pred_{i_seed}'] = y_pred

        # Replace the training log with the running table of scores.
        clear_output()
        print(list_rmspe)

[[0.21657539750282262, 0.22326839614765498, 0.2136221346145354, 0.2135817166983782, 0.21798364723619848, 0.2180001495917959, 0.2128552271481703, 0.21503017851011744, 0.21492552762714606, 0.2356506474761117], [0.22194325103489745, 0.21912713622065605, 0.2179995863351541, 0.21239106777861444, 0.22279472289707428, 0.21164679998943034, 0.21909735859641594, 0.22822785752018226, 0.21869325040251192, 0.20950200202562252], [0.21531170768979194, 0.2174223037182661, 0.21201824081549211, 0.21634508707444713, 0.2212535065344276, 0.21732277049831916, 0.21771141548999184, 0.2154469854195095, 0.21610991062834398, 0.23066778038477212], [0.22253817758562888, 0.2165887602615825, 0.2157443100458601, 0.23749408145277215, 0.2128488203299294, 0.21825037970175368, 0.22123312843294562, 0.21512744142395504, 0.21378475109873757, 0.2133007984019488], [0.21640063004847893, 0.2184607529082726, 0.21071113499825644, 0.21238785647986794, 0.21696866219588717, 0.2152171005936213, 0.21978202735310332, 0.2164053451604146

In [25]:
# Write the out-of-fold predictions to disk so they can be used for ensembling.
# NOTE(review): this writes ./results/{SOL_NAME}.csv, not into the
# ./results/{SOL_NAME}/ directory created in the configuration cell — confirm intended.
df_result.to_csv(path_or_buf=f'./results/{SOL_NAME}.csv', index=False)

In [26]:
# Report the RMSPE of each seed's predictions, then the RMSPE of their
# row-wise mean across seeds.
pred_cols = [f'pred_{i}' for i in range(len(list_seeds))]
for i, pred_col in enumerate(pred_cols):
    print(i, rmspe(df_result['target'], df_result[pred_col]))
blended_pred = df_result[pred_cols].mean(axis=1)
print('All: ', rmspe(df_result['target'], blended_pred))

0 0.21824666782582222
1 0.21820875286336297
2 0.21801319460292157
3 0.21880261161260908
4 0.2179025805184648
All:  0.21685685005692384
