In [12]:
%reload_ext autoreload
%autoreload 2

import glob
import os, gc
import numpy as numpy
import pandas as pd
import scipy as sp
import datatable as dt
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from numba import njit
from utils import *

from IPython.display import clear_output

import lightgbm as lgb

# TF
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [13]:
N_FOLD = 5
N_MINS = 5
MIN_SIZE = 600 // N_MINS

SOL_NAME = '501-lgb'
DATA_NAME = '501'
mkdir(f'./models/{SOL_NAME}/')

In [14]:
# get ids
list_stock_id = get_stock_id()
list_time_id = get_time_id()

# Functions

In [15]:
def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe(y_true, y_pred), False

# Loading data

In [16]:
# train
df_train = dt.fread(f'./dataset/train_{DATA_NAME}_LGB.csv').to_pandas()
fea_cols = [f for f in df_train.columns if f.startswith('B_') or f.startswith('T_') or f.startswith('Z_')]
# result
df_result = dt.fread('./dataset/train.csv').to_pandas()
df_result = gen_row_id(df_result)

In [17]:
fea_cols_TA = [f for f in fea_cols if 'min_' not in f]
df_time_mean = df_train.groupby('time_id')[fea_cols_TA].mean()
df_time_mean.columns = [f'{c}_TA_mean' for c in df_time_mean.columns]
df_time_mean = df_time_mean.reset_index()
df_train = df_train.merge(df_time_mean, on='time_id', how='left')

del df_time_mean
gc.collect()

57

In [18]:
fea_cols = ['stock_id'] + [f for f in df_train if f not in ['time_id', 'target', 'pred_NN', 'stock_id', 'row_id']]

In [19]:
df_train = gen_row_id(df_train)
df_train = add_time_fold(df_train, N_FOLD)

# Evaluation

In [20]:
def add_time_stats(df_train):
    time_cols = [f for f in df_train.columns if f.endswith('_time') and 'RV' in f]
    df_gp_stock = df_train.groupby('stock_id')
    #
    df_stats = df_gp_stock[time_cols].mean().reset_index()
    df_stats.columns = ['stock_id'] + [f'{f}_mean' for f in time_cols]
    df_train = df_train.merge(df_stats, on=['stock_id'], how='left')
    #
    df_stats = df_gp_stock[time_cols].std().reset_index()
    df_stats.columns = ['stock_id'] + [f'{f}_std' for f in time_cols]
    df_train = df_train.merge(df_stats, on=['stock_id'], how='left')
    #
    # df_stats = df_gp_stock[time_cols].skew().reset_index()
    # df_stats.columns = ['stock_id'] + [f'{f}_skew' for f in time_cols]
    # df_train = df_train.merge(df_stats, on=['stock_id'], how='left')
    #
    df_stats = df_gp_stock[time_cols].min().reset_index()
    df_stats.columns = ['stock_id'] + [f'{f}_min' for f in time_cols]
    df_train = df_train.merge(df_stats, on=['stock_id'], how='left')
    #
    df_stats = df_gp_stock[time_cols].max().reset_index()
    df_stats.columns = ['stock_id'] + [f'{f}_max' for f in time_cols]
    df_train = df_train.merge(df_stats, on=['stock_id'], how='left')
    #
    # df_stats = df_gp_stock[time_cols].quantile(0.25).reset_index()
    # df_stats.columns = ['stock_id'] + [f'{f}_q1' for f in time_cols]
    # df_train = df_train.merge(df_stats, on=['stock_id'], how='left')
    # #
    # df_stats = df_gp_stock[time_cols].quantile(0.50).reset_index()
    # df_stats.columns = ['stock_id'] + [f'{f}_q2' for f in time_cols]
    # df_train = df_train.merge(df_stats, on=['stock_id'], how='left')
    # #
    # df_stats = df_gp_stock[time_cols].quantile(0.75).reset_index()
    # df_stats.columns = ['stock_id'] + [f'{f}_q3' for f in time_cols]
    # df_train = df_train.merge(df_stats, on=['stock_id'], how='left')
    return df_train

In [21]:
seed0=2021
params = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'max_bin':100,
    'min_data_in_leaf':500,
    'learning_rate': 0.05,
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.5,
    'lambda_l1': 0.5,
    'lambda_l2': 1.0,
    'categorical_column':[0],
    'seed':seed0,
    'feature_fraction_seed': seed0,
    'bagging_seed': seed0,
    'drop_seed': seed0,
    'data_random_seed': seed0,
    'n_jobs':-1,
    'verbose': -1}
list_rmspe = [1 for _ in range(N_FOLD)]

In [23]:
n_trials = 1
for _ in range(n_trials):
    for i_fold in range(N_FOLD):
        gc.collect()
        df_tr = df_train.loc[df_train.fold!=i_fold]
        df_te = df_train.loc[df_train.fold==i_fold]
        # df_tr = add_time_stats(df_tr)
        # df_te = add_time_stats(df_te)

        fea_cols = ['stock_id'] + [f for f in df_tr.columns if f.startswith('B_') or f.startswith('T_') or f.startswith('Z_')]

        X_train = df_tr[fea_cols].values
        y_train = df_tr['target'].values
        X_test = df_te[fea_cols].values
        y_test = df_te['target'].values
        idx_test = df_train.loc[df_train.fold==i_fold].index
        print(f'Fold {i_fold+1}/{N_FOLD}', X_train.shape, X_test.shape)


        train_dataset = lgb.Dataset(X_train, y_train, weight=1/np.square(y_train))
        test_dataset = lgb.Dataset(X_test, y_test, weight=1/np.square(y_test))
        model = lgb.train(params = params,
                          num_boost_round=1000,
                          train_set = train_dataset, 
                          valid_sets = [train_dataset, test_dataset], 
                          verbose_eval = 250,
                          early_stopping_rounds=250,
                          feval = feval_rmspe)

        y_pred = model.predict(X_test)
        curr_rmspe = rmspe(y_test, y_pred)
        if curr_rmspe < list_rmspe[i_fold]:
            ckp_path = f'./models/{SOL_NAME}/lgb_{i_fold}.pkl'
            save_pickle(model, ckp_path)
            list_rmspe[i_fold] = curr_rmspe
            # generate and save preds
            df_result.loc[idx_test, 'pred'] = y_pred
        # clear_output()
        print(list_rmspe)
        calc_metric(df_result.fillna(0))
        # 0.2169
    # break

Fold 1/5 (343145, 483) (85787, 483)
Training until validation scores don't improve for 250 rounds
[250]	training's rmse: 0.00042914	training's RMSPE: 0.198334	valid_1's rmse: 0.000461219	valid_1's RMSPE: 0.214646
[500]	training's rmse: 0.000405874	training's RMSPE: 0.187581	valid_1's rmse: 0.000460849	valid_1's RMSPE: 0.214474
Early stopping, best iteration is:
[343]	training's rmse: 0.000418189	training's RMSPE: 0.193273	valid_1's rmse: 0.00046044	valid_1's RMSPE: 0.214284
Done!
[0.21428398420781644, 0.217304076091105, 1, 1, 1]
   R2: -0.7658
RMSPE: 0.7865
Fold 2/5 (343145, 483) (85787, 483)
Training until validation scores don't improve for 250 rounds
[250]	training's rmse: 0.000427165	training's RMSPE: 0.197148	valid_1's rmse: 0.000466604	valid_1's RMSPE: 0.218336
[500]	training's rmse: 0.000404432	training's RMSPE: 0.186656	valid_1's rmse: 0.000463736	valid_1's RMSPE: 0.216994
[750]	training's rmse: 0.000390877	training's RMSPE: 0.1804	valid_1's rmse: 0.000463629	valid_1's RMSPE: 0

In [24]:
df_result.to_csv(f'./results/{SOL_NAME}.csv', index=False)