In [8]:
%reload_ext autoreload
%autoreload 2

import glob
import os, gc
import numpy as numpy
# The training cell calls `np.square`; bind `np` explicitly instead of relying on
# whatever `from utils import *` happens to export.
import numpy as np
import pandas as pd
import scipy as sp
import datatable as dt
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from numba import njit
# NOTE: star import kept in its original position; names imported below it still
# take precedence over anything `utils` exports.
from utils import *

from IPython.display import clear_output

import lightgbm as lgb

# TF
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [9]:
# --- Cross-validation / windowing settings -----------------------------------
N_FOLD = 5                  # fold count passed to add_time_fold and iterated by the training loop
N_MINS = 5
MIN_SIZE = 600 // N_MINS    # == 120; presumably rows per N_MINS-sized window — TODO confirm (unused in the cells shown)

# --- Naming -------------------------------------------------------------------
DATA_NAME = '601'           # selects the feature file ./dataset/public_train_{DATA_NAME}_LGB.csv
SOL_NAME = '601-lgb'        # labels the checkpoint folder and the exported results file

# Folder that receives the per-fold model checkpoints.
mkdir(f'./models/{SOL_NAME}/')

In [10]:
# Stock ids and time ids, obtained from helpers that are not defined in this
# notebook (presumably exported by `from utils import *`).
list_stock_id, list_time_id = get_stock_id(), get_time_id()

# Functions

In [11]:
def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation callback that reports RMSPE.

    Parameters
    ----------
    y_pred : predictions handed over by LightGBM for the dataset being scored.
    lgb_train : the dataset object being scored; only ``get_label()`` is used.

    Returns
    -------
    tuple
        ``('RMSPE', score, False)`` — metric name, metric value, and the
        "higher is better" flag (``False``: lower RMSPE is better).
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

# Loading data

In [12]:
# Feature table used for training (file chosen by DATA_NAME).
df_train = (
    dt.fread(f'./dataset/public_train_{DATA_NAME}_LGB.csv')
    .to_pandas()
)

# Frame that will collect the predictions ('pred' is filled in by the training
# loop); it is run through gen_row_id right after loading.
df_result = gen_row_id(dt.fread('./dataset/train.csv').to_pandas())

In [13]:
# Model inputs: 'stock_id' is forced into position 0 (params['categorical_column']
# points at column 0), followed by every remaining column of df_train that is
# not an identifier or the target.
# NOTE(review): this cell runs before gen_row_id / add_time_fold are applied to
# df_train, so columns those helpers add (presumably 'fold') never become features.
fea_cols = [
    'stock_id',
    *(col for col in df_train
      if col not in ['time_id', 'target', 'stock_id', 'row_id']),
]

In [14]:
# Run df_train through gen_row_id, then add_time_fold with N_FOLD folds; the
# training loop later selects rows through `df_train.fold`.
# Note: df_train is rebound from itself, so re-running this cell re-applies both helpers.
df_train = add_time_fold(gen_row_id(df_train), N_FOLD)

# Evaluation

In [15]:
# Single seed shared by every LightGBM random stream configured below.
seed0 = 2021

# LightGBM training parameters, passed unchanged to lgb.train for every fold.
params = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'max_bin': 100,
    'min_data_in_leaf': 500,
    'learning_rate': 0.05,
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.5,
    'lambda_l1': 0.5,
    'lambda_l2': 1.0,
    # Column 0 of the feature matrix is 'stock_id' (see how fea_cols is built).
    'categorical_column': [0],
    # All seed-like parameters receive the same value.
    **dict.fromkeys(
        ('seed', 'feature_fraction_seed', 'bagging_seed', 'drop_seed', 'data_random_seed'),
        seed0,
    ),
    'n_jobs': -1,
    'verbose': -1,
}

# Best RMSPE seen so far for each fold. Starting at 1 means a fold's model is
# only kept by the training loop once it scores below 1.
list_rmspe = [1] * N_FOLD

In [16]:
# Time-fold cross-validation of the LightGBM model.
#
# For each fold: train on all other folds, validate on the held-out fold, and,
# if the hold-out RMSPE beats the best value stored in list_rmspe for that fold,
# save the model and write its predictions into df_result['pred'].
# list_rmspe lives in an earlier cell, so re-running this cell alone keeps the
# best result per fold across runs.
n_trials = 1
for _ in range(n_trials):
    for i_fold in range(N_FOLD):
        gc.collect()

        # Build the hold-out mask once and reuse it for both splits and the index.
        is_holdout = df_train.fold == i_fold
        df_tr = df_train.loc[~is_holdout]
        df_te = df_train.loc[is_holdout]

        X_train = df_tr[fea_cols].values
        y_train = df_tr['target'].values
        X_test = df_te[fea_cols].values
        y_test = df_te['target'].values
        idx_test = df_te.index
        print(f'Fold {i_fold+1}/{N_FOLD}', X_train.shape, X_test.shape)

        # Weighting each row by 1/y^2 turns the weighted squared error
        # w * (y - y_hat)^2 into the squared percentage error ((y - y_hat) / y)^2,
        # so the 'rmse' objective optimises RMSPE.
        # NOTE(review): assumes every target is non-zero — TODO confirm.
        train_dataset = lgb.Dataset(X_train, y_train, weight=1/np.square(y_train))
        test_dataset = lgb.Dataset(X_test, y_test, weight=1/np.square(y_test))
        model = lgb.train(
            params=params,
            num_boost_round=1000,
            train_set=train_dataset,
            valid_sets=[train_dataset, test_dataset],
            verbose_eval=250,
            early_stopping_rounds=250,
            feval=feval_rmspe,
        )

        y_pred = model.predict(X_test)
        curr_rmspe = rmspe(y_test, y_pred)

        # Keep the model and its hold-out predictions only when it improves on
        # the best score recorded so far for this fold.
        if curr_rmspe < list_rmspe[i_fold]:
            ckp_path = f'./models/{SOL_NAME}/lgb_{i_fold}.pkl'
            save_pickle(model, ckp_path)
            list_rmspe[i_fold] = curr_rmspe
            # NOTE(review): relies on df_result sharing df_train's row index
            # (label-based assignment) — TODO confirm both files have the same row order.
            df_result.loc[idx_test, 'pred'] = y_pred

        print(list_rmspe)
        # Folds not predicted yet have NaN in 'pred'; they are scored as 0 here,
        # so the running metric is only meaningful after the last fold.
        calc_metric(df_result.fillna(0))
        # 0.2169  (author's note; presumably the overall RMSPE of a completed run)

Fold 1/5 (343145, 245) (85787, 245)
Training until validation scores don't improve for 250 rounds
[250]	training's rmse: 0.000427405	training's RMSPE: 0.197532	valid_1's rmse: 0.000469495	valid_1's RMSPE: 0.218498
Early stopping, best iteration is:
[130]	training's rmse: 0.000447763	training's RMSPE: 0.206941	valid_1's rmse: 0.000468885	valid_1's RMSPE: 0.218214
Done!
[0.21821441457630464, 1, 1, 1, 1]
   R2: -1.2291
RMSPE: 0.8997
Fold 2/5 (343145, 245) (85787, 245)
Training until validation scores don't improve for 250 rounds
[250]	training's rmse: 0.00042535	training's RMSPE: 0.19631	valid_1's rmse: 0.000465093	valid_1's RMSPE: 0.217629
[500]	training's rmse: 0.000402993	training's RMSPE: 0.185992	valid_1's rmse: 0.000462181	valid_1's RMSPE: 0.216267
[750]	training's rmse: 0.000389685	training's RMSPE: 0.17985	valid_1's rmse: 0.00046148	valid_1's RMSPE: 0.215938
[1000]	training's rmse: 0.000380364	training's RMSPE: 0.175548	valid_1's rmse: 0.000460991	valid_1's RMSPE: 0.215709
Did not

In [18]:
# Export df_result (including the 'pred' column filled by the training loop).
# The output folder is created first: nothing earlier in the notebook creates
# ./results, and to_csv fails when the parent directory does not exist.
os.makedirs('./results', exist_ok=True)
df_result.to_csv(f'./results/{SOL_NAME}.csv', index=False)