In [1]:
import sys
import warnings

sys.path.append('..')
warnings.filterwarnings('ignore')

from tqdm.auto import tqdm
import pandas as pd
import numpy as np

from spinesTS.preprocessing import split_series

from spinesTS.data import *
from spinesTS.nn import *
from spinesTS.ml_model import *
from spinesTS.metrics import *
from spinesTS.pipeline import *
from spinesTS.features_generator import *

In [2]:
# Collection of the built-in datasets; the benchmark code below iterates over it
# and takes its len(). Constructing it appears to print the dataset/column table
# shown in this cell's output.
series_data = BuiltInSeriesData()

+---+----------------------+----------------------------------------------+
|   | ds name              | columns                                      |
+---+----------------------+----------------------------------------------+
| 0 | ETTh1                | date, HUFL, HULL, MUFL, MULL, LUFL, LULL, OT |
| 1 | ETTh2                | date, HUFL, HULL, MUFL, MULL, LUFL, LULL, OT |
| 2 | ETTm1                | date, HUFL, HULL, MUFL, MULL, LUFL, LULL, OT |
| 3 | ETTm2                | date, HUFL, HULL, MUFL, MULL, LUFL, LULL, OT |
| 4 | Electric_Production  | date, value                                  |
| 5 | Messages_Sent        | date, ta, tb, tc                             |
| 6 | Messages_Sent_Hour   | date, hour, ta, tb, tc                       |
| 7 | Supermarket_Incoming | date, goods_cnt                              |
| 8 | Web_Sales            | date, type_a, type_b, sales_cnt              |
+---+----------------------+----------------------------------------------+


In [19]:
# --- windowing / split settings (passed to split_series and GBRTPreprocessing) ---
in_features = 128
out_features = 24
train_size = 0.8

# --- dataset schema ---
date_col = 'date'

# --- neural-network training settings ---
device = 'cpu'
learning_rate = 1e-3
dnn_epochs = 3_000


def dataloader(model_name, diff_n=1):
    """Yield train/test splits for every dataset in ``series_data``.

    Parameters
    ----------
    model_name : str
        Name of the model the splits are prepared for. Names starting with
        ``'WideGBRT'`` are routed through ``GBRTPreprocessing``; every other
        name uses ``split_series`` on the target column alone.
    diff_n : int, default 1
        Forwarded as ``differential_n`` to ``GBRTPreprocessing``; unused on
        the ``split_series`` path.

    Yields
    ------
    tuple
        ``(x_train, x_test, y_train, y_test, dataset_name)`` for each dataset.
    """
    target_candidates = {'OT', 'value', 'ta', 'goods_cnt', 'sales_cnt'}

    for dataset in series_data:
        # Pick the target column: whichever candidate name this dataset has.
        matching_targets = target_candidates & set(dataset.columns)
        y_name = list(matching_targets)[0]
        dataset_name = dataset.dataset_name
        frame = dataset[[date_col, y_name]]

        if model_name.startswith('WideGBRT'):
            processor = GBRTPreprocessing(
                in_features, out_features, y_name,
                date_col=date_col, differential_n=diff_n,
            )
            processor.fit(frame)
            x_train, x_test, y_train, y_test = processor.transform(frame, mode='train')
        else:
            target = frame[y_name]
            x_train, x_test, y_train, y_test = split_series(
                target, frame[y_name], in_features, out_features, train_size=train_size
            )

        yield x_train, x_test, y_train, y_test, dataset_name

def benchmark():
    """Fit every candidate model on every built-in dataset and collect scores.

    For each (model, dataset) pair the model is fitted on the training split,
    predictions are made on the test split, and R2 / MAE / MAPE are recorded.

    Returns
    -------
    pandas.DataFrame
        One row per (model, dataset) with columns
        ``'algo'``, ``'dataset'``, ``'r2'``, ``'mae'``, ``'mape'``.

    Raises
    ------
    ValueError
        If a registered model name has no associated fitting strategy.

    Notes
    -----
    NOTE(review): the test split is also passed as ``eval_set`` while fitting,
    so any best-model / patience-based selection sees the same data the
    reported metrics are computed on -- confirm this is intended.
    NOTE(review): each model object is reused across datasets; whether
    ``fit`` fully resets its state is not visible from this notebook.
    """
    from catboost import CatBoostRegressor

    nn_model_names = ('StackingRNN', 'GAUNet', 'Time2VecNet')
    gbrt_style_model_names = (
        'MultiStepRegressor', 'MultiOutputRegressor',
        'AddedFeatures_MSR', 'AddedFeatures_MOR',
    )
    result_columns = ['algo', 'dataset', 'r2', 'mae', 'mape']

    models = {
        'StackingRNN':StackingRNN(in_features, out_features, device=device, learning_rate=learning_rate),
        'GAUNet':GAUNet(in_features, out_features, device=device, learning_rate=learning_rate),
        'Time2VecNet':Time2VecNet(in_features, out_features, device=device, learning_rate=learning_rate),
        'MultiStepRegressor':Pipeline([('MSR', MultiStepRegressor(CatBoostRegressor(use_best_model=True)))]),
        'MultiOutputRegressor':Pipeline([('MOR', MultiOutputRegressor(CatBoostRegressor(use_best_model=True)))]),
        'AddedFeatures_MSR':Pipeline([
                ('fe', ContinuousFeatureGenerator()),
                ('multi_reg', MultiStepRegressor(CatBoostRegressor(use_best_model=True)))
            ]),
        'AddedFeatures_MOR':Pipeline([
                ('fe', ContinuousFeatureGenerator()),
                ('multi_reg', MultiOutputRegressor(CatBoostRegressor(use_best_model=True)))
            ]),
        'WideGBRT-diff_n-0':WideGBRT(model=CatBoostRegressor(use_best_model=True)),
        'WideGBRT-diff_n-1':WideGBRT(model=CatBoostRegressor(use_best_model=True))
    }

    # Collect plain dicts and build the frame once at the end: DataFrame.append
    # was removed in pandas 2.0 and re-copies the whole frame on every call.
    records = []

    iters = tqdm(list(models.items()), desc='modeling...')
    for (model_name, model) in iters:
        # Only the '-diff_n-0' WideGBRT variant disables differencing; diff_n is
        # ignored by dataloader for non-WideGBRT models.
        diff_n = 0 if model_name == 'WideGBRT-diff_n-0' else 1

        for (x_train, x_test, y_train, y_test, dataset_name) in tqdm(
            dataloader(model_name, diff_n), desc="dataset fitting...", total=len(series_data)
        ):
            if model_name in nn_model_names:
                model.fit(
                    x_train, y_train, eval_set=(x_test, y_test), batch_size=32,
                    min_delta=0, patience=100, epochs=dnn_epochs, verbose=False, lr_scheduler=None
                )
            elif model_name in gbrt_style_model_names or model_name.startswith('WideGBRT'):
                # The WideGBRT keys carry a '-diff_n-<k>' suffix, so they must be
                # matched by prefix; an exact 'WideGBRT' comparison never matches
                # and would leave those models unfitted before predict().
                # Assumes WideGBRT.fit accepts the same keyword arguments as the
                # pipelines -- TODO confirm against spinesTS.
                model.fit(x_train, y_train, eval_set=([x_test, y_test]), use_best_model=True, verbose=0)
            else:
                raise ValueError(f"No fitting strategy defined for model {model_name!r}")

            y_pred = model.predict(x_test)

            records.append({
                'algo': model_name,
                'dataset': dataset_name,
                'r2': r2_score(y_test.T, y_pred.T),
                'mae': mean_absolute_error(y_test, y_pred),
                'mape': mean_absolute_percentage_error(y_test, y_pred),
            })

    return pd.DataFrame(records, columns=result_columns)

In [None]:
# Run the full benchmark: every registered model is fitted on every built-in
# dataset (NN models are given up to `dnn_epochs` epochs each), so this cell
# may take a long time to finish.
benchmarks = benchmark()

modeling...:   0%|          | 0/9 [00:00<?, ?it/s]

dataset fitting...:   0%|          | 0/9 [00:00<?, ?it/s]

dataset fitting...:   0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
# R2 score per algorithm (rows) and dataset (columns). The identity aggfunc
# relies on each (algo, dataset) pair occurring exactly once in `benchmarks`.
pd.pivot_table(benchmarks, values='r2', index='algo', columns='dataset', aggfunc=lambda s: s)

In [None]:
# Mean absolute error per algorithm (rows) and dataset (columns). The identity
# aggfunc relies on each (algo, dataset) pair occurring exactly once in `benchmarks`.
pd.pivot_table(benchmarks, values='mae', index='algo', columns='dataset', aggfunc=lambda s: s)

In [None]:
# Mean absolute percentage error per algorithm (rows) and dataset (columns). The
# identity aggfunc relies on each (algo, dataset) pair occurring exactly once in `benchmarks`.
pd.pivot_table(benchmarks, values='mape', index='algo', columns='dataset', aggfunc=lambda s: s)