In [1]:
import sys
import warnings

sys.path.append('..')
warnings.filterwarnings('ignore')

from tqdm.auto import tqdm
import pandas as pd
import numpy as np

from spinesTS.preprocessing import split_series

from spinesTS.data import *
from spinesTS.nn import *
from spinesTS.ml_model import *
from spinesTS.metrics import *
from spinesTS.pipeline import *
from spinesTS.features_generator import *

In [2]:
# Handle to the datasets bundled with spinesTS. dataloader() iterates over it and
# benchmark() uses len() on it. Judging by the cell output, constructing it also
# prints the catalogue of dataset names and their columns.
series_data = BuiltInSeriesData()

+---+----------------------+----------------------------------------------+
|   | ds name              | columns                                      |
+---+----------------------+----------------------------------------------+
| 0 | ETTh1                | date, HUFL, HULL, MUFL, MULL, LUFL, LULL, OT |
| 1 | ETTh2                | date, HUFL, HULL, MUFL, MULL, LUFL, LULL, OT |
| 2 | ETTm1                | date, HUFL, HULL, MUFL, MULL, LUFL, LULL, OT |
| 3 | ETTm2                | date, HUFL, HULL, MUFL, MULL, LUFL, LULL, OT |
| 4 | Electric_Production  | date, value                                  |
| 5 | Messages_Sent        | date, ta, tb, tc                             |
| 6 | Messages_Sent_Hour   | date, hour, ta, tb, tc                       |
| 7 | Supermarket_Incoming | date, goods_cnt                              |
| 8 | Web_Sales            | date, type_a, type_b, sales_cnt              |
+---+----------------------+----------------------------------------------+


In [3]:
# Benchmark configuration shared by dataloader() and benchmark().

# Input size handed to split_series / GBRTPreprocessing (and to the commented-out
# neural models) — presumably the look-back window length; TODO confirm.
in_features = 128
# Output size handed to the same calls — presumably the forecast horizon; TODO confirm.
out_features = 24
# Forwarded to split_series as its train_size argument.
train_size = 0.8
# Name of the date column kept next to the target column in dataloader().
date_col = 'date'
# Only referenced by the commented-out neural models in benchmark().
device='cpu'
# Only referenced by the commented-out neural models in benchmark().
learning_rate = 0.001
# Passed as epochs to fit() in the neural-model branch of benchmark().
dnn_epochs = 3000


def dataloader(model_name, diff_n=1):
    """Yield train/test splits for every built-in dataset.

    Parameters
    ----------
    model_name : str
        Name of the model the splits are for. Names starting with
        ``'WideGBRT'`` are prepared with ``GBRTPreprocessing``; every other
        name gets a plain ``split_series`` split of the target column.
    diff_n : int, default 1
        Forwarded to ``GBRTPreprocessing`` as ``differential_n``; unused for
        non-WideGBRT models.

    Yields
    ------
    tuple
        ``(x_train, x_test, y_train, y_test, dataset_name)`` per dataset.

    Raises
    ------
    ValueError
        If a dataset contains none of the known target columns.
    """
    # Ordered candidates: the first one present in a dataset is its target.
    # An ordered tuple (instead of a set intersection) keeps the choice
    # deterministic should a dataset ever contain more than one candidate.
    target_candidates = ('OT', 'value', 'ta', 'goods_cnt', 'sales_cnt')

    for ds in series_data:
        available = set(ds.columns)
        y_name = next((c for c in target_candidates if c in available), None)
        if y_name is None:
            raise ValueError(
                f"dataset {ds.dataset_name!r} has none of the expected target "
                f"columns {target_candidates}"
            )

        dataset_name = ds.dataset_name
        # Keep only the date and target columns; a separate name avoids
        # rebinding the loop variable.
        frame = ds[[date_col, y_name]]

        if model_name.startswith('WideGBRT'):
            gbrt_processor = GBRTPreprocessing(
                in_features, out_features, y_name,
                date_col=date_col, differential_n=diff_n,
            )
            gbrt_processor.fit(frame)
            x_train, x_test, y_train, y_test = gbrt_processor.transform(frame, mode='train')
        else:
            x_train, x_test, y_train, y_test = split_series(
                frame[y_name], frame[y_name], in_features, out_features,
                train_size=train_size,
            )

        yield x_train, x_test, y_train, y_test, dataset_name


# Module-level results table. benchmark() declares it global and adds one row
# (algo, dataset, r2, mae, mape) per fitted model/dataset combination.
res = pd.DataFrame(columns=['algo', 'dataset', 'r2', 'mae', 'mape'])


def benchmark():
    """Fit every configured model on every built-in dataset and record test metrics.

    For each (model, dataset) pair the model is fitted on the training split,
    predictions are made on the test split, and r2 / MAE / MAPE are added as
    one row to the module-level ``res`` DataFrame. After every row the table
    is also written to ``./res.json`` so partial results survive an
    interrupted run.

    Returns
    -------
    pandas.DataFrame
        The global ``res`` table with columns
        ``algo``, ``dataset``, ``r2``, ``mae``, ``mape``.

    Notes
    -----
    ``res`` is module-level state: calling this function again without
    re-creating ``res`` adds a second set of rows for the same pairs.
    """
    from catboost import CatBoostRegressor

    global res

    models = {
        # 'StackingRNN':StackingRNN(in_features, out_features, device=device, learning_rate=learning_rate),
        # 'GAUNet':GAUNet(in_features, out_features, device=device, learning_rate=learning_rate),
        # 'Time2VecNet':Time2VecNet(in_features, out_features, device=device, learning_rate=learning_rate),
        'MultiStepRegressor':Pipeline([('MSR', MultiStepRegressor(CatBoostRegressor(use_best_model=True)))]),
        'MultiOutputRegressor':Pipeline([('MOR', MultiOutputRegressor(CatBoostRegressor(use_best_model=True)))]),
        'AddedFeatures_MSR':Pipeline([
                ('fe', ContinuousFeatureGenerator()),
                ('multi_reg', MultiStepRegressor(CatBoostRegressor(use_best_model=True)))
            ]),
        'AddedFeatures_MOR':Pipeline([
                ('fe', ContinuousFeatureGenerator()),
                ('multi_reg', MultiOutputRegressor(CatBoostRegressor(use_best_model=True)))
            ]),
        'WideGBRT-diff_n-0':WideGBRT(model=CatBoostRegressor(use_best_model=True)),
        'WideGBRT':WideGBRT(model=CatBoostRegressor(use_best_model=True))
    }

    dnn_models = ('StackingRNN', 'GAUNet', 'Time2VecNet')
    gbrt_models = (
        'MultiStepRegressor', 'MultiOutputRegressor',
        'AddedFeatures_MSR', 'AddedFeatures_MOR', 'WideGBRT-diff_n-0', 'WideGBRT'
    )

    iters = tqdm(list(models.items()), desc='modeling...')
    for (model_name, model) in iters:
        # Only the explicit "diff_n-0" variant disables differencing.
        diff_n = 0 if model_name == 'WideGBRT-diff_n-0' else 1

        splits = tqdm(
            dataloader(model_name, diff_n),
            desc="dataset fitting...",
            total=len(series_data),
        )
        for (x_train, x_test, y_train, y_test, dataset_name) in splits:
            # NOTE(review): the test split doubles as the eval set used for
            # early stopping / best-model selection and is then scored below,
            # so the reported metrics are likely optimistic.
            if model_name in dnn_models:
                model.fit(
                    x_train, y_train, eval_set=(x_test, y_test), batch_size=32,
                    min_delta=0, patience=100, epochs=dnn_epochs, verbose=False, lr_scheduler=None
                )
            elif model_name in gbrt_models:
                model.fit(x_train, y_train, eval_set=[x_test, y_test], use_best_model=True, verbose=0)

            y_pred = model.predict(x_test)

            # NOTE(review): r2 is computed on the transposed arrays while MAE
            # and MAPE are not — confirm this asymmetry is intended.
            record = {
                'algo': model_name,
                'dataset': dataset_name,
                'r2': r2_score(y_test.T, y_pred.T),
                'mae': mean_absolute_error(y_test, y_pred),
                'mape': mean_absolute_percentage_error(y_test, y_pred),
            }

            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported replacement. Skipping the concat while `res` is still
            # empty keeps the metric columns numeric instead of inheriting the
            # empty frame's object dtype.
            row = pd.DataFrame([record])
            res = row if res.empty else pd.concat([res, row], ignore_index=True)

            # Checkpoint after every run.
            res.to_json('./res.json', orient='index')

    return res

In [4]:
# Run the full benchmark. The returned frame is the global `res` table and is
# what the pivot cells below read.
benchmarks = benchmark()

modeling...:   0%|          | 0/6 [00:00<?, ?it/s]

dataset fitting...:   0%|          | 0/9 [00:00<?, ?it/s]

dataset fitting...:   0%|          | 0/9 [00:00<?, ?it/s]

dataset fitting...:   0%|          | 0/9 [00:00<?, ?it/s]

dataset fitting...:   0%|          | 0/9 [00:00<?, ?it/s]

dataset fitting...:   0%|          | 0/9 [00:00<?, ?it/s]

dataset fitting...:   0%|          | 0/9 [00:00<?, ?it/s]

In [5]:
# r2 per algorithm (rows) and dataset (columns). Each (algo, dataset) pair is
# expected to appear once, so 'first' just picks that value; unlike an identity
# lambda it is a real reducer and does not depend on the group having one row.
pd.pivot_table(benchmarks, values='r2', index='algo', columns='dataset', aggfunc='first')

dataset,ETTh1,ETTh2,ETTm1,ETTm2,Electric_Production,Messages_Sent,Messages_Sent_Hour,Supermarket_Incoming,Web_Sales
algo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AddedFeatures_MOR,-2.94139,-5.212844,-3.503168,-7.237333,0.822574,-4.655299,0.790442,-0.070715,-4.645118
AddedFeatures_MSR,-29.867876,-55.74597,-67.19795,-113.783446,0.647795,-4.758954,0.43916,-0.404258,-13.214841
MultiOutputRegressor,-2.77494,-4.612363,-3.071225,-6.8066,0.829147,-4.365157,0.794496,-0.048492,-3.508137
MultiStepRegressor,-2.785439,-5.037074,-3.439186,-6.654741,0.858135,-4.592366,0.806895,-0.112169,-2.962707
WideGBRT,-32.424934,-6.623825,-24.228082,-97.496126,0.82946,-2.149676,0.666656,-0.10764,-6.886536
WideGBRT-diff_n-0,-3.098563,-4.107369,-3.977586,-8.319113,0.837893,-3.911544,0.775425,-0.010521,-4.279417


In [6]:
# MAE per algorithm (rows) and dataset (columns). Each (algo, dataset) pair is
# expected to appear once, so 'first' just picks that value; unlike an identity
# lambda it is a real reducer and does not depend on the group having one row.
pd.pivot_table(benchmarks, values='mae', index='algo', columns='dataset', aggfunc='first')

dataset,ETTh1,ETTh2,ETTm1,ETTm2,Electric_Production,Messages_Sent,Messages_Sent_Hour,Supermarket_Incoming,Web_Sales
algo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AddedFeatures_MOR,1.343257,2.989515,1.319814,1.593525,3.029269,46086.04065,1390.646741,6100.619386,18860.280422
AddedFeatures_MSR,3.504008,10.114462,4.071238,5.157644,4.140304,46316.192546,2108.875861,6874.287724,29450.78091
MultiOutputRegressor,1.328996,2.887455,1.25883,1.495535,3.001412,44003.222255,1341.554892,5978.278031,17687.561302
MultiStepRegressor,1.392797,2.987418,1.31646,1.538308,2.867165,43702.632932,1223.98356,6211.023407,17930.627796
WideGBRT,3.812195,4.402928,2.770403,4.39327,2.865794,35010.562615,1884.639419,6336.971963,21139.411379
WideGBRT-diff_n-0,1.366112,2.973768,1.351436,1.685808,2.853178,42048.487559,1496.521811,5942.366473,17142.469991


In [7]:
# MAPE per algorithm (rows) and dataset (columns). Each (algo, dataset) pair is
# expected to appear once, so 'first' just picks that value; unlike an identity
# lambda it is a real reducer and does not depend on the group having one row.
pd.pivot_table(benchmarks, values='mape', index='algo', columns='dataset', aggfunc='first')

dataset,ETTh1,ETTh2,ETTm1,ETTm2,Electric_Production,Messages_Sent,Messages_Sent_Hour,Supermarket_Incoming,Web_Sales
algo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AddedFeatures_MOR,0.274992,0.326388,0.181986,0.08475,0.028507,0.38548,1.241258,0.211385,0.216717
AddedFeatures_MSR,0.500931,0.688521,0.444602,0.260352,0.038635,0.386846,3.674187,0.228879,0.317591
MultiOutputRegressor,0.276919,0.267299,0.176592,0.07832,0.028303,0.368071,0.979341,0.203326,0.204857
MultiStepRegressor,0.285303,0.245182,0.175122,0.082706,0.027436,0.366389,0.617494,0.208331,0.192283
WideGBRT,0.702062,0.634496,0.448437,0.242724,0.027059,0.288841,2.152916,0.222305,0.269977
WideGBRT-diff_n-0,0.284276,0.32116,0.183804,0.088127,0.026809,0.352011,1.683465,0.20438,0.208213
