In [1]:
!pip install scikit-learn==0.24.1
!pip install catboost==0.25.1 --quiet

Collecting scikit-learn==0.24.1
  Downloading scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.2 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.1 threadpoolctl-2.2.0
[K     |████████████████████████████████| 67.3 MB 1.1 MB/s 
[?25h

In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive/KaggleCompetition/Optiver/Leo-Optiver/

Mounted at /content/drive
/content/drive/My Drive/KaggleCompetition/Optiver/Leo-Optiver


In [3]:
%reload_ext autoreload
%autoreload 2

import glob
import os, gc
import numpy as numpy
import pandas as pd
import scipy as sp
# import datatable as dt
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from numba import njit
from utils import *

from IPython.display import clear_output

from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool

# TF
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [4]:
N_FOLD = 5
N_MINS = 5
MIN_SIZE = 600 // N_MINS

SOL_NAME = '601-cat'
DATA_NAME = '601'
mkdir(f'./models/{SOL_NAME}/')
mkdir(f'./results/{SOL_NAME}/')

In [5]:
# get ids
list_stock_id = get_stock_id()
list_time_id = get_time_id()

# Functions

In [6]:
def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe(y_true, y_pred), False

# Loading data

In [7]:
# train
# df_train = dt.fread(f'./dataset/public_train_{DATA_NAME}_LGB.csv').to_pandas()
df_train = pd.read_csv(f'./dataset/public_train_{DATA_NAME}_LGB.csv')
# result
# df_result = dt.fread('./dataset/train.csv').to_pandas()
df_result = pd.read_csv('./dataset/train.csv')
df_result = gen_row_id(df_result)

In [8]:
fea_cols = ['stock_id'] + [f for f in df_train if f not in ['time_id', 'target', 'stock_id', 'row_id']]

In [9]:
df_train = gen_row_id(df_train)
df_train = add_time_fold(df_train, N_FOLD)

# Evaluation

In [10]:
seed0 = 2021
params = {
    'n_estimators': 10000, 
    'learning_rate': 0.03, 
    'depth': 6, 
    'l2_leaf_reg': 3, 
    'subsample': 0.8,
    'per_float_feature_quantization': ['9:border_count=1024', '10:border_count=1024', '11:border_count=1024', '12:border_count=1024'], 
    'loss_function': 'RMSE', 
    'eval_metric': 'RMSE', 
    'task_type': 'CPU', 
    'random_seed': seed0, 
    }
list_rmspe = [1 for _ in range(N_FOLD)]

In [11]:
# df_train[fea_cols]

In [None]:
n_trials = 1
for _ in range(n_trials):
    for i_fold in range(N_FOLD):
        gc.collect()
        df_tr = df_train.loc[df_train.fold!=i_fold]
        df_te = df_train.loc[df_train.fold==i_fold]

        X_train = df_tr[fea_cols]
        y_train = df_tr['target'].values
        X_test = df_te[fea_cols]
        y_test = df_te['target'].values
        idx_test = df_train.loc[df_train.fold==i_fold].index
        print(f'Fold {i_fold+1}/{N_FOLD}', X_train.shape, X_test.shape)

        cat_tr = Pool(X_train, y_train, weight = 1 / np.square(y_train), cat_features = [0])
        cat_val = Pool(X_test, y_test, weight = 1 / np.square(y_test), cat_features = [0])
        raw_model = CatBoostRegressor(**params)
        model = raw_model.fit(cat_tr, eval_set = cat_val, early_stopping_rounds = 250, use_best_model = True, verbose = 250)

        y_pred = model.predict(X_test)
        curr_rmspe = rmspe(y_test, y_pred)
        if curr_rmspe < list_rmspe[i_fold]:
            ckp_path = f'./models/{SOL_NAME}/cat_601_{i_fold}.pkl'
            save_pickle(model, ckp_path)
            list_rmspe[i_fold] = curr_rmspe
            # generate and save preds
            df_result.loc[idx_test, 'pred'] = y_pred
        # clear_output()
        print(list_rmspe)
        calc_metric(df_result.fillna(0))
        # 0.2169
    # break

Fold 1/5 (343145, 245) (85787, 245)
0:	learn: 0.0011815	test: 0.0011866	best: 0.0011866 (0)	total: 121ms	remaining: 20m 5s
250:	learn: 0.0005166	test: 0.0005206	best: 0.0005206 (250)	total: 13.5s	remaining: 8m 44s
500:	learn: 0.0004913	test: 0.0004955	best: 0.0004955 (500)	total: 26.7s	remaining: 8m 26s
750:	learn: 0.0004626	test: 0.0004739	best: 0.0004739 (747)	total: 39.3s	remaining: 8m 4s
1000:	learn: 0.0004458	test: 0.0004699	best: 0.0004697 (990)	total: 51.9s	remaining: 7m 46s
1250:	learn: 0.0004343	test: 0.0004702	best: 0.0004694 (1113)	total: 1m 4s	remaining: 7m 31s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.0004694085218
bestIteration = 1113

Shrink model to first 1114 iterations.
Done!
[0.21845786613754242, 1, 1, 1, 1]
   R2: -1.2258
RMSPE: 0.8997
Fold 2/5 (343145, 245) (85787, 245)
0:	learn: 0.0011883	test: 0.0011592	best: 0.0011592 (0)	total: 59.3ms	remaining: 9m 52s
250:	learn: 0.0005162	test: 0.0005110	best: 0.0005110 (250)	total: 13.4s	remaining:

In [None]:
# saving results for ensembling
df_result.to_csv(f'./results/{SOL_NAME}.csv', index=False)