In [1]:
!pip install scikit-learn==0.24.1
!pip install lightgbm==3.2.0 --quiet
!pip install deep-forest

Collecting scikit-learn==0.24.1
  Downloading scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 120.8 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.1 threadpoolctl-2.2.0
[K     |████████████████████████████████| 2.0 MB 5.4 MB/s 
[?25hCollecting deep-forest
  Downloading deep_forest-0.1.5-cp37-cp37m-manylinux2010_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 5.3 MB/s 
Installing collected packages: deep-forest
Successfully installed deep-forest-0.1.5


In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive/KaggleCompetition/Optiver/Leo-Optiver/

Mounted at /content/drive
/content/drive/My Drive/KaggleCompetition/Optiver/Leo-Optiver


In [3]:
%reload_ext autoreload
%autoreload 2

import glob
import os, gc
import numpy as numpy
import pandas as pd
import scipy as sp
# import datatable as dt
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from numba import njit
from utils import *

from IPython.display import clear_output

import lightgbm as lgb
from deepforest import CascadeForestRegressor as CFR

# TF
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [4]:
N_FOLD = 5
N_MINS = 5
MIN_SIZE = 600 // N_MINS

SOL_NAME = '601-cfr'
DATA_NAME = '601'
mkdir(f'./models/{SOL_NAME}/')
mkdir(f'./results/{SOL_NAME}/')

In [5]:
# get ids
list_stock_id = get_stock_id()
list_time_id = get_time_id()

# Functions

In [6]:
def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe(y_true, y_pred), False

# Loading data

In [7]:
# train
# df_train = dt.fread(f'./dataset/public_train_{DATA_NAME}_LGB.csv').to_pandas()
df_train = pd.read_csv(f'./dataset/public_train_{DATA_NAME}_LGB.csv')
# result
# df_result = dt.fread('./dataset/train.csv').to_pandas()
df_result = pd.read_csv('./dataset/train.csv')
df_result = gen_row_id(df_result)

In [8]:
fea_cols = ['stock_id'] + [f for f in df_train if f not in ['time_id', 'target', 'stock_id', 'row_id']]

In [9]:
df_train = gen_row_id(df_train)
df_train = add_time_fold(df_train, N_FOLD)

# Evaluation

In [10]:
seed0 = 2021
params = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'max_bin':100,
    'min_data_in_leaf':500,
    'learning_rate': 0.05,
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.5,
    'lambda_l1': 0.5,
    'lambda_l2': 1.0,
    'categorical_column':[0],
    'seed':seed0,
    'feature_fraction_seed': seed0,
    'bagging_seed': seed0,
    'drop_seed': seed0,
    'data_random_seed': seed0,
    'n_jobs':-1,
    'verbose': -1}
list_rmspe = [1 for _ in range(N_FOLD)]

In [11]:
# df_train[fea_cols]

In [14]:
n_trials = 1
for _ in range(n_trials):
    for i_fold in range(N_FOLD):
        gc.collect()
        df_tr = df_train.loc[df_train.fold!=i_fold]
        df_te = df_train.loc[df_train.fold==i_fold]

        X_train = df_tr[fea_cols].values
        y_train = df_tr['target'].values
        X_test = df_te[fea_cols].values
        y_test = df_te['target'].values
        idx_test = df_train.loc[df_train.fold==i_fold].index
        print(f'Fold {i_fold+1}/{N_FOLD}', X_train.shape, X_test.shape)

        model = CFR(
            use_predictor = True, 
            predictor = 'lightgbm',  
            predictor_kwargs = params, 
            n_jobs = -1,  
            random_state = seed0, 
            verbose = -1, 
            )
        model.fit(X_train, y_train, sample_weight = 1 / np.square(y_train))

        y_pred = model.predict(X_test)
        curr_rmspe = rmspe(y_test, y_pred)
        if curr_rmspe < list_rmspe[i_fold]:
            ckp_path = f'./models/{SOL_NAME}/cfr_601_{i_fold}.pkl'
            save_pickle(model, ckp_path)
            list_rmspe[i_fold] = curr_rmspe
            # generate and save preds
            df_result.loc[idx_test, 'pred'] = y_pred
        # clear_output()
        print(list_rmspe)
        calc_metric(df_result.fillna(0))
        # 0.2169
    # break

Fold 1/5 (343145, 245) (85787, 245)
Done!
[0.2135781983999914, 1, 1, 1, 1]
   R2: -1.2260
RMSPE: 0.8995
Fold 2/5 (343145, 245) (85787, 245)
Done!
[0.2135781983999914, 0.2163855527296646, 1, 1, 1]
   R2: -0.7646
RMSPE: 0.7864
Fold 3/5 (343147, 245) (85785, 245)
Done!
[0.2135781983999914, 0.2163855527296646, 0.2112647115480818, 1, 1]
   R2: -0.2210
RMSPE: 0.6538
Fold 4/5 (343150, 245) (85782, 245)
Done!
[0.2135781983999914, 0.2163855527296646, 0.2112647115480818, 0.2128087011327794, 1]
   R2: 0.3046
RMSPE: 0.4863
Fold 5/5 (343141, 245) (85791, 245)
Done!
[0.2135781983999914, 0.2163855527296646, 0.2112647115480818, 0.2128087011327794, 0.22681868119736065]
   R2: 0.8235
RMSPE: 0.2162


In [15]:
# saving results for ensembling
df_result.to_csv(f'./results/{SOL_NAME}.csv', index=False)