In [1]:
%reload_ext autoreload
%autoreload 2

import glob
import os, gc
import numpy as numpy
# Explicit alias: every cell below refers to numpy as `np`; without this line
# the name is only available if `from utils import *` happens to export it.
import numpy as np
import pandas as pd
import scipy as sp
import datatable as dt
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from numba import njit
from utils import *

from IPython.display import clear_output

from sklearn.preprocessing import MinMaxScaler


from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

In [2]:
# Run configuration.
# N_FOLD is the number of cross-validation folds used by the training loop.
N_FOLD = 5
# N_MINS / MIN_SIZE are not referenced in the visible cells of this notebook.
# NOTE(review): presumably a 600-unit window split into N_MINS sub-windows
# during feature generation — confirm before relying on it.
N_MINS = 5
MIN_SIZE = 600 // N_MINS

# SOL_NAME names the model directory and the results file written below;
# DATA_NAME selects which ./dataset/train_{DATA_NAME}_NN.csv feature file is read.
SOL_NAME = '601-TabNet'
DATA_NAME = '601'
# `mkdir` comes from the `utils` star import; it is called here so the
# per-solution model directory exists before scalers and models are saved.
mkdir(f'./models/{SOL_NAME}/')

In [3]:
# CONSTANT
# MEAN and STD standardise log(target): transform_target computes
# (log(target + EPS) - MEAN) / STD and inverse_target undoes it.
# NOTE(review): presumably the mean / std of log(target) over the training
# set — confirm where these literals were computed.
MEAN = -5.762330803300896
STD = 0.6339307835941186
# EPS is added to the target before the log and subtracted again after the exp.
EPS = 1e-9

In [4]:
# get ids
# Both helpers come from the `utils` star import. Neither list is referenced
# in the visible cells of this notebook.
# NOTE(review): presumably the stock_id / time_id values of the dataset — confirm.
list_stock_id = get_stock_id()
list_time_id = get_time_id()

# Functions

In [5]:
def transform_target(target):
    """Map a raw target onto the standardised log scale used for training.

    Computes ``(log(target + EPS) - MEAN) / STD`` element-wise, using the
    module-level constants. ``inverse_target`` is the inverse mapping.
    """
    log_target = np.log(target + EPS)
    return (log_target - MEAN) / STD

def inverse_target(target):
    """Map a standardised log-scale value back to the raw target scale.

    Computes ``exp(MEAN + STD * target) - EPS`` element-wise, i.e. the inverse
    of ``transform_target``.
    """
    log_target = MEAN + STD * target
    return np.exp(log_target) - EPS

def np_rmspe(y_true, y_pred):
    """Root mean squared percentage error, measured on the raw target scale.

    Both arguments are expected on the transformed (standardised log) scale;
    they are passed through ``inverse_target`` before the error is computed
    as ``sqrt(mean(((true - pred) / true) ** 2))``.
    """
    actual = inverse_target(y_true)
    predicted = inverse_target(y_pred)
    relative_error = (actual - predicted) / actual
    return np.sqrt(np.mean(np.square(relative_error)))


class RMSPE(Metric):
    """TabNet evaluation metric: RMSPE on the raw (un-transformed) target scale.

    Passed to ``TabNetRegressor.fit`` through ``eval_metric``. The score itself
    is delegated to ``np_rmspe`` so the early-stopping metric and the
    per-fold score reported by the training loop share one formula.
    """

    def __init__(self):
        # Name under which the metric is reported by pytorch-tabnet.
        self._name = "rmspe"
        # Lower is better, so the metric must not be maximised.
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """Return the RMSPE of ``y_pred`` against ``y_true``.

        Both arrays are on the transformed target scale; ``np_rmspe`` maps
        them back with ``inverse_target`` before computing the error.
        """
        return np_rmspe(y_true, y_pred)

def RMSPELoss(y_pred, y_true):
    """Torch training loss: RMSPE on the raw target scale.

    Note the argument order is ``(y_pred, y_true)``, the reverse of
    ``np_rmspe``. Both tensors are on the transformed scale and are mapped
    back with ``exp(MEAN + STD * x) - EPS`` before the error is computed.
    """
    actual = torch.exp(MEAN + STD * y_true) - EPS
    predicted = torch.exp(MEAN + STD * y_pred) - EPS
    relative_error = (actual - predicted) / actual
    return torch.sqrt(torch.mean(relative_error ** 2)).clone()

# Loading data

In [6]:
# train: feature table for the selected data version
train_path = f'./dataset/train_{DATA_NAME}_NN.csv'
df_train = dt.fread(train_path).to_pandas()
# Every column that is not an identifier, the target or a previous prediction
# is treated as a model feature.
non_feature_cols = ['time_id', 'target', 'pred_NN', 'stock_id', 'row_id']
fea_cols = [col for col in df_train.columns if col not in non_feature_cols]

# result: original training table, later filled with out-of-fold predictions
df_result = gen_row_id(dt.fread('./dataset/train.csv').to_pandas())

In [7]:
# Move the target onto the standardised log scale and attach row ids.
# NOTE: this cell is not idempotent — running it a second time without
# reloading df_train would apply transform_target twice.
df_train = gen_row_id(
    df_train.assign(target=transform_target(df_train['target']))
)

# Evaluation

In [8]:
# Keyword arguments forwarded unchanged to TabNetRegressor(**tabnet_params).
tabnet_params = {
    # Column 0 of the feature matrix is stock_id (see the training loop); it
    # is the only categorical input and gets a 1-dimensional embedding.
    # NOTE(review): cat_dims assumes stock_id values lie in [0, 127) — confirm.
    "cat_idxs": [0],
    "cat_dims": [127],
    "cat_emb_dim": 1,
    # Network shape.
    "n_d": 16,
    "n_a": 16,
    "n_steps": 2,
    "gamma": 2,
    "n_independent": 2,
    "n_shared": 2,
    "lambda_sparse": 0,
    "mask_type": "entmax",
    # Optimisation: Adam with a cosine-annealing warm-restart schedule.
    "optimizer_fn": Adam,
    "optimizer_params": {"lr": 2e-2},
    "scheduler_fn": CosineAnnealingWarmRestarts,
    "scheduler_params": {
        "T_0": 200,
        "T_mult": 1,
        "eta_min": 1e-4,
        "last_epoch": -1,
        "verbose": False,
    },
    "seed": 42,
    "verbose": 10,
    # device_name='cpu'
}

# One cross-validation split is generated per seed in the training loop.
list_seeds = [0, 11, 42]

In [9]:
# Repeated K-fold cross-validation: one fold assignment per entry of
# list_seeds and N_FOLD TabNet models per assignment.
# list_rmspe[i_seed][i_fold] holds the hold-out RMSPE of each fitted model.
# NOTE(review): `seed` only drives add_time_fold; every model is built with
# the fixed seed stored in tabnet_params — confirm this is intended.
# NOTE(review): the hold-out fold is also the early-stopping eval_set, so the
# reported scores are measured on data that took part in model selection.
list_rmspe = []
# stock_id goes first so it sits at column 0, the categorical index declared
# in tabnet_params; the list is loop-invariant, so build it once.
model_cols = ['stock_id'] + fea_cols
for i_seed, seed in enumerate(list_seeds):
    df_train = add_time_fold(df_train, N_FOLD, seed=seed)
    list_rmspe.append([])
    for i_fold in range(N_FOLD):
        gc.collect()
        # Compute the fold mask once and reuse it for both partitions.
        is_test = df_train.fold == i_fold
        df_tr = df_train.loc[~is_test]
        df_te = df_train.loc[is_test]

        X_train = df_tr[model_cols].values
        y_train = df_tr[['target']].values
        X_test = df_te[model_cols].values
        y_test = df_te[['target']].values
        # Index labels of the hold-out rows, used to write predictions back.
        # NOTE(review): this relies on df_result sharing df_train's index — confirm.
        idx_test = df_te.index
        print(f'Fold {i_seed+1}/{len(list_seeds)} | {i_fold+1}/{N_FOLD}', X_train.shape, X_test.shape)

        # Scale every feature except the categorical stock_id column; the
        # scaler is fitted on the training part only and saved next to the model.
        scaler = MinMaxScaler(feature_range=(-1, 1))
        X_train[:, 1:] = scaler.fit_transform(X_train[:, 1:])
        X_test[:, 1:] = scaler.transform(X_test[:, 1:])
        save_pickle(scaler, f'./models/{SOL_NAME}/minmax_scaler_{i_seed}_{i_fold}.pkl')

        # Where the fitted model for this (seed, fold) pair is saved.
        ckp_path = f'./models/{SOL_NAME}/model_{i_seed}_{i_fold}'

        model = TabNetRegressor(**tabnet_params)
        model.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            max_epochs=10000,
            patience=50,
            batch_size=1024*20,
            virtual_batch_size=128*20,
            num_workers=8,
            drop_last=False,
            eval_metric=[RMSPE],
            loss_fn=RMSPELoss
        )

        y_pred = model.predict(X_test)
        curr_rmspe = np_rmspe(y_test, y_pred)
        list_rmspe[-1].append(curr_rmspe)
        model.save_model(ckp_path)
        # generate and save preds: predictions come back as an (n, 1) array,
        # so flatten them before assigning into the single pred_{i_seed} column.
        df_result.loc[idx_test, f'pred_{i_seed}'] = inverse_target(y_pred).ravel()
        clear_output()
        print(list_rmspe)

[[0.20932573216359418, 0.21450710797142347, 0.21467728477515727, 0.21188039971174139, 0.21610412162811835], [0.21667292805020122, 0.21064368083877436, 0.21399648932337212, 0.21419504264008452, 0.22428668088719647], [0.21969074679208403, 0.21413597533061055, 0.2092519252376234, 0.2103655782333957, 0.21265650254589885]]


In [10]:
# Persist the out-of-fold predictions. The output folder is created first so
# a fresh checkout does not lose a full training run to a missing ./results/
# directory (only ./models/{SOL_NAME}/ is created earlier in the notebook).
os.makedirs('./results', exist_ok=True)
df_result.to_csv(f'./results/{SOL_NAME}.csv', index=False)

In [11]:
# Out-of-fold RMSPE for each seed, then for the mean of all seeds' predictions.
pred_cols = [f'pred_{i}' for i in range(len(list_seeds))]
for i, pred_col in enumerate(pred_cols):
    print(i, rmspe(df_result['target'], df_result[pred_col]))
blended_pred = df_result[pred_cols].mean(axis=1)
print('All: ', rmspe(df_result['target'], blended_pred))

0 0.21331257396122144
1 0.21600768895616343
2 0.2132514947044603
All:  0.20999046417613576
