In [7]:
%reload_ext autoreload
%autoreload 2

import glob
import os, gc
import numpy as numpy
import pandas as pd
import scipy as sp
import datatable as dt
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from numba import njit
from utils import *

from IPython.display import clear_output

import lightgbm as lgb
from deepforest import CascadeForestRegressor as CFR


In [8]:
# Cross-validation / windowing constants.
N_FOLD, N_MINS = 10, 5
# NOTE(review): 600 is presumably the length of one time bucket in seconds,
# making MIN_SIZE the length of one N_MINS-th slice of it -- confirm.
MIN_SIZE = 600 // N_MINS

# DATA_NAME selects the feature CSV to read; SOL_NAME names the checkpoint
# directory and the results file of this solution.
DATA_NAME = '601'
SOL_NAME = '601-CFR'

# Prepare the checkpoint directory for this solution.
mkdir(f'./models/{SOL_NAME}/')

In [9]:
# Stock and time identifiers, fetched through the project helpers
# (presumably provided by `from utils import *`).
list_stock_id, list_time_id = get_stock_id(), get_time_id()

# Functions

In [10]:
def feval_rmspe(y_pred, lgb_train):
    """Score predictions with `rmspe` against the labels of `lgb_train`.

    Args:
        y_pred: predicted values.
        lgb_train: object exposing `get_label()` (presumably a LightGBM
            `Dataset` -- confirm against the caller).

    Returns:
        The tuple `('RMSPE', score, False)`. This looks like the
        (name, value, is_higher_better) triple of a LightGBM custom `feval`.

    NOTE(review): no visible cell in this notebook calls this function.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

# Loading data

In [11]:
# Training features: read with datatable, then converted to a pandas frame.
df_train = (
    dt.fread(f'./dataset/train_{DATA_NAME}_LGB.csv')
    .to_pandas()
)
# Frame that later receives the per-seed predictions; it is passed through
# gen_row_id right after loading.
df_result = gen_row_id(
    dt.fread('./dataset/train.csv').to_pandas()
)

In [12]:
# Feature columns: 'stock_id' is forced to position 0 (the params cell declares
# column 0 as categorical), followed by every other column of df_train that is
# not an identifier or the target.
fea_cols = ['stock_id'] + [
    col
    for col in df_train.columns
    if col not in ('time_id', 'target', 'stock_id', 'row_id')
]

In [13]:
# Pass the training frame through gen_row_id, as was done for df_result.
df_train = gen_row_id(df_train)
# Fold assignment is not done here: the evaluation loop calls
# add_time_fold(df_train, N_FOLD, seed) once per seed instead.
# df_train = add_time_fold(df_train, N_FOLD)

# Evaluation

In [14]:
# Seed baked into the LightGBM parameters below. The dict stores the value 7,
# so rebinding `seed` later does not change `params`.
seed = 7

# LightGBM settings; the evaluation loop hands them to the cascade forest
# through `predictor_kwargs`.
params = {
    # objective / booster
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    # tree shape and binning
    'max_depth': -1,
    'max_bin': 100,
    'min_data_in_leaf': 500,
    'learning_rate': 0.05,
    # row / column subsampling
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.5,
    # regularisation
    'lambda_l1': 0.5,
    'lambda_l2': 1.0,
    # column 0 of the feature matrix is categorical
    'categorical_column': [0],
    # every seed-like option shares the same value
    **dict.fromkeys(
        (
            'seed',
            'feature_fraction_seed',
            'bagging_seed',
            'drop_seed',
            'data_random_seed',
        ),
        seed,
    ),
    'n_jobs': -1,
    'verbose': -1,
}

# One full N_FOLD cross-validation pass is run per entry.
list_seeds = [0, 11, 42, 777, 2045]

In [15]:
# Repeated K-fold evaluation: for every seed the fold split is redrawn, one
# cascade forest is fitted per fold, its RMSPE is recorded, the model is
# checkpointed and the fold's predictions are written into df_result.
# list_rmspe[i_seed][i_fold] holds the score of each fitted model.
#
# Note: `params` was built with seed = 7, so the loop variable `seed` only
# drives the fold split and the forest's `random_state`, not the LightGBM
# seeds inside `params`.
list_rmspe = []
for i_seed, seed in enumerate(list_seeds):
    # Redraw the fold assignment for this seed.
    df_train = add_time_fold(df_train, N_FOLD, seed)
    list_rmspe.append([])
    for i_fold in range(N_FOLD):
        gc.collect()

        # Build the hold-out mask once and reuse it for both partitions.
        is_test = df_train.fold == i_fold
        df_tr = df_train.loc[~is_test]
        df_te = df_train.loc[is_test]

        X_train = df_tr[fea_cols].values
        y_train = df_tr['target'].values
        X_test = df_te[fea_cols].values
        y_test = df_te['target'].values
        idx_test = df_te.index
        print(f'Fold {i_seed+1}/{len(list_seeds)} | {i_fold+1}/{N_FOLD}', X_train.shape, X_test.shape)

        model = CFR(
            use_predictor=True,
            predictor='lightgbm',
            predictor_kwargs=params,
            n_jobs=-1,
            random_state=seed,
            verbose=-1,
        )
        # Weighting each sample by 1 / y^2 turns the squared error into a
        # squared relative error. Assumes no target is zero -- TODO confirm.
        # The notebook imports NumPy as `numpy` (there is no `np` alias in the
        # import cell), so the bound name is used here.
        model.fit(X_train, y_train, sample_weight=1 / numpy.square(y_train))

        y_pred = model.predict(X_test)
        curr_rmspe = rmspe(y_test, y_pred)
        list_rmspe[-1].append(curr_rmspe)

        ckp_path = f'./models/{SOL_NAME}/CFR_{i_seed}_{i_fold}.pkl'
        save_pickle(model, ckp_path)

        # Store this fold's predictions, one column per seed.
        # NOTE(review): assumes df_result and df_train share the same row
        # index (same rows, same order) -- confirm.
        df_result.loc[idx_test, f'pred_{i_seed}'] = y_pred

        clear_output()
        print(list_rmspe)

[[0.2116915702456817, 0.22146642494000243]]
Fold 1/5 | 3/10 (386039, 245) (42893, 245)


KeyboardInterrupt: 

In [None]:
# saving results for ensembling
# Only ./models/<SOL_NAME>/ is created earlier in the notebook, so make sure
# the output directory exists before writing; otherwise the write can fail
# after the whole training run has finished.
os.makedirs('./results', exist_ok=True)
df_result.to_csv(f'./results/{SOL_NAME}.csv', index=False)