In [22]:
%reload_ext autoreload
%autoreload 2

import glob
import os, gc
import numpy as numpy
import pandas as pd
import datatable as dt
import scipy as sp
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.metrics import r2_score
from numba import njit
from utils import *

from numba_functions import *
from IPython.display import clear_output

In [23]:
# CONSTANT
MEAN = -5.762330803300896
STD = 0.6339307835941186
EPS = 1e-9

In [24]:
def transform_target(target):
    return (np.log(target + EPS) - MEAN) / STD

def inverse_target(target):
    return np.exp(MEAN + STD * target) - EPS

In [30]:
class OptimizeMSPE:
    def __init__(self, transform=False):
        self.coef_ = 0
        self.transform_ = transform
        
    
    def _mspe(self, coef, X, y):
        # create predictions by taking row wise sum
        if self.transform_:
            X = transform_target(X)

        y_hat = np.sum(X * coef, axis=1)

        if self.transform_:
            y_hat = inverse_target(y_hat)

        mspe_score = np.mean(np.square((y - y_hat) / y))

        return mspe_score
    
    def fit(self, X, y):
        from functools import partial
        loss_partial = partial(self._mspe, X=X, y=y)
        initial_coef = np.random.dirichlet(np.ones(X.shape[1]), size=1)
        # initial_coef = np.zeros(X.shape[1])

        self.result_ = sp.optimize.minimize(loss_partial, x0=initial_coef, 
            method='SLSQP',
            jac='3-point',
            options=dict(
                ftol=1e-10,
                disp=True,
            )
        )
        self.coef_ = self.result_.x
        print('RMSPE: ', np.sqrt(loss_partial(self.coef_)))

    def predict(self, X):
        if self.transform_:
            X = transform_target(X)
        y_pred = np.sum(X * self.coef_, axis=1)
        if self.transform_:
            y_pred = inverse_target(y_pred)
        return y_pred

# Load results csv

In [26]:
list_result_names = [path.lstrip('./results/').rstrip('.csv') for path in glob.glob('./results/*.csv')]
list_result_names.remove('OptimizeRV')
list_result_names

['501-WaveNet',
 '501-MLP',
 '601-cfr',
 '501-TabNet',
 '601-LGB',
 '501-1dCNN',
 '601-TabNet',
 '501-UNet',
 '601-MLP',
 '601-1dCNN',
 '601-CAT']

In [27]:
df_result = pd.read_csv('./dataset/train.csv')
for result_name in list_result_names:
    df_pred = pd.read_csv(f'./results/{result_name}.csv')
    if 'pred' not in df_pred:
        df_pred['pred'] = df_pred[[f for f in df_pred if f.startswith('pred_')]].mean(axis=1)
    df_pred.rename(columns={'pred': f'pred_{result_name}'}, inplace=True)
    df_result = df_result.merge(df_pred[['stock_id', 'time_id', f'pred_{result_name}']], on=['stock_id', 'time_id'], how='inner', validate='one_to_one')

# OptimizeRV
df_pred = pd.read_csv('results/OptimizeRV.csv')
df_pred.rename(columns={'rv_new': f'pred_OptimizeRV'}, inplace=True)
df_result = df_result.merge(df_pred[['stock_id', 'time_id', 'pred_OptimizeRV']], on=['stock_id', 'time_id'], how='inner', validate='one_to_one')

print(df_result.isna().any().any())
df_result.head()

False


Unnamed: 0,stock_id,time_id,target,pred_501-WaveNet,pred_501-MLP,pred_601-cfr,pred_501-TabNet,pred_601-LGB,pred_501-1dCNN,pred_601-TabNet,pred_501-UNet,pred_601-MLP,pred_601-1dCNN,pred_601-CAT,pred_OptimizeRV
0,0,5,0.004136,0.003771,0.003921,0.003813,0.003946,0.004056,0.003857,0.003786,0.003921,0.003678,0.00356,0.004046,0.003399
1,0,11,0.001445,0.001823,0.001635,0.001626,0.001657,0.001511,0.001565,0.001469,0.00164,0.001507,0.00148,0.001551,0.001169
2,0,16,0.002168,0.002323,0.002322,0.002177,0.002361,0.0023,0.002207,0.002306,0.002317,0.002437,0.002363,0.00246,0.00223
3,0,31,0.002195,0.00213,0.002206,0.001973,0.002187,0.001978,0.002103,0.001819,0.002164,0.001906,0.001979,0.002035,0.002524
4,0,62,0.001747,0.001697,0.001648,0.001682,0.001636,0.001615,0.001641,0.001711,0.001649,0.001621,0.001598,0.001654,0.001553


In [46]:
removed_row_ids = ['31-25504', '31-27174']
df_query = df_result.loc[~df_result.row_id.isin(removed_row_ids)]
print(removed_row_ids)
print(rmspe(df_result['target'], df_result['pred_501-MLP']))
print(rmspe(df_query['target'], df_query['pred_501-MLP']))

['31-25504', '31-27174']
0.20939128074512814
0.20794027092230224


# Ensemble Together

In [28]:
pred_cols = [f for f in df_result if f.startswith('pred_')]
pred_cols_disp = [c.lower().replace('-', '_') for c in pred_cols]
print('pred_cols =', pred_cols_disp)
opt = OptimizeMSPE(transform=False)
opt.fit(df_result[pred_cols].values, df_result['target'].values)
print('coef_ = [', ', '.join(map(str, opt.coef_)), ']')

opt = OptimizeMSPE(transform=True)
opt.fit(df_result[pred_cols].values, df_result['target'].values)
print('coef_ = [', ', '.join(map(str, opt.coef_)), ']')

pred_cols = ['pred_501_wavenet', 'pred_501_mlp', 'pred_601_cfr', 'pred_501_tabnet', 'pred_601_lgb', 'pred_501_1dcnn', 'pred_601_tabnet', 'pred_501_unet', 'pred_601_mlp', 'pred_601_1dcnn', 'pred_601_cat', 'pred_optimizerv']
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.042144095691129695
            Iterations: 54
            Function evaluations: 1353
            Gradient evaluations: 54
RMSPE:  0.20529027178882514
coef_ = [ 0.04185692187336371, 0.22359353665652743, -0.043730174113198975, 0.29970337570898237, 0.150631400441942, -0.0014968965221374782, 0.06126529401030603, -0.059603937841766576, 0.34195366349473927, 0.17287214655559283, -0.1658692387341289, -0.02339794028936296 ]
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.042136183775607766
            Iterations: 53
            Function evaluations: 1326
            Gradient evaluations: 53
RMSPE:  0.20527100081503907
coef_ = [ 0.037752622247

# Seperate 501 and 601

In [31]:
# 501
pred_cols = [f for f in df_result if f.startswith('pred_') and '501' in f]
pred_cols_disp = [c.lower().replace('-', '_') for c in pred_cols]
print('pred_cols =', pred_cols_disp)
opt = OptimizeMSPE(transform=False)
opt.fit(df_result[pred_cols].values, df_result['target'].values)
print('coef_ = [', ', '.join(map(str, opt.coef_)), ']')

opt = OptimizeMSPE(transform=True)
opt.fit(df_result[pred_cols].values, df_result['target'].values)
print('coef_ = [', ', '.join(map(str, opt.coef_)), ']')

df_result['fpred_501'] = opt.predict(df_result[pred_cols].values)

pred_cols = ['pred_501_wavenet', 'pred_501_mlp', 'pred_501_tabnet', 'pred_501_1dcnn', 'pred_501_unet']
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.043252330247570685
            Iterations: 17
            Function evaluations: 188
            Gradient evaluations: 17
RMSPE:  0.20797194581859035
coef_ = [ 0.06431492974981139, 0.395374708848404, 0.46401136450128855, 0.10897283383711862, -0.035292540351464854 ]
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.04322896511507816
            Iterations: 36
            Function evaluations: 397
            Gradient evaluations: 36
RMSPE:  0.20791576446984045
coef_ = [ 0.06325215538145833, 0.3766310446758944, 0.4487462778244628, 0.11239907244301549, 0.0048943329613022005 ]


In [32]:
# 601
pred_cols = [f for f in df_result if f.startswith('pred_') and '601' in f]
pred_cols_disp = [c.lower().replace('-', '_') for c in pred_cols]
print('pred_cols =', pred_cols_disp)
opt = OptimizeMSPE(transform=False)
opt.fit(df_result[pred_cols].values, df_result['target'].values)
print('coef_ = [', ', '.join(map(str, opt.coef_)), ']')

opt = OptimizeMSPE(transform=True)
opt.fit(df_result[pred_cols].values, df_result['target'].values)
print('coef_ = [', ', '.join(map(str, opt.coef_)), ']')
df_result['fpred_601'] = opt.predict(df_result[pred_cols].values)

pred_cols = ['pred_601_cfr', 'pred_601_lgb', 'pred_601_tabnet', 'pred_601_mlp', 'pred_601_1dcnn', 'pred_601_cat']
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.042938995495149894
            Iterations: 26
            Function evaluations: 339
            Gradient evaluations: 26
RMSPE:  0.20721726640207833
coef_ = [ 0.018812468318657804, 0.16918620047399174, 0.22850562679269684, 0.4484921565680106, 0.2523474092559502, -0.11911314015836186 ]
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.042905255823901874
            Iterations: 35
            Function evaluations: 456
            Gradient evaluations: 35
RMSPE:  0.20713583906195923
coef_ = [ 0.03567145436118382, 0.171988013958782, 0.22237535010846676, 0.4199231435976319, 0.26636665446742797, -0.10963075179106067 ]


In [34]:
# 501 + 601
pred_cols = [f for f in df_result if f.startswith('fpred_')]
pred_cols_disp = [c.lower().replace('-', '_') for c in pred_cols]
print('pred_cols =', pred_cols_disp)
opt = OptimizeMSPE(transform=True)
opt.fit(df_result[pred_cols].values, df_result['target'].values)
print('coef_ = [', ', '.join(map(str, opt.coef_)), ']')
df_result['_fpred_all'] = opt.predict(df_result[pred_cols].values)

pred_cols = ['fpred_501', 'fpred_601']
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.04223201481093635
            Iterations: 12
            Function evaluations: 60
            Gradient evaluations: 12
RMSPE:  0.2055042938990238
coef_ = [ 0.4496154243643552, 0.5526525394467906 ]


# hmean

In [35]:
pred_cols = [f for f in df_result if f.startswith('fpred_')]
pred_cols_disp = [c.lower().replace('-', '_') for c in pred_cols]
print('pred_cols =', pred_cols_disp)
pred_hmean = sp.stats.hmean(df_result[pred_cols].values, axis=1)
print('RMSPE: ', rmspe(df_result['target'], pred_hmean))

pred_cols = ['fpred_501', 'fpred_601']
RMSPE:  0.20553242104805416


In [None]:
# df_result['bias'] = 1
# pred_cols = [f for f in df_result if f.startswith('pred_')]
# print(pred_cols)
# opt = OptimizeRMSPE()
# opt.fit(df_result[['bias']+pred_cols], df_result['target'])
# print('coef_ = ', opt.coef_)