## Imports

In [None]:
!pip install pycaret-ts-alpha
!conda install -c conda-forge brotlipy
!pip install jupyterlab
!pip install nodejs
!conda install bottleneck
!pip install pymannkendall

In [1]:
import pandas as pd
from pycaret.time_series import *

import warnings
from sktime.forecasting.model_selection import ExpandingWindowSplitter
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.transformations.series.detrend import Deseasonalizer, Detrender
from sktime.transformations.series.difference import Differencer
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.arima import AutoARIMA, ARIMA
from sktime.forecasting.compose import MultiplexForecaster, AutoEnsembleForecaster, ColumnEnsembleForecaster, DirRecTabularRegressionForecaster, DirRecTimeSeriesRegressionForecaster, DirectTabularRegressionForecaster, DirectTimeSeriesRegressionForecaster, EnsembleForecaster, StackingForecaster
from sktime.forecasting.ets import AutoETS
from sktime.forecasting.bats import BATS
from sktime.forecasting.croston import Croston
from sktime.forecasting.tbats import TBATS
from sktime.forecasting.theta import ThetaForecaster
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.forecasting.compose import RecursiveTabularRegressionForecaster, DirectTabularRegressionForecaster, DirRecTabularRegressionForecaster
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, Lars, LassoLars, BayesianRidge, HuberRegressor, PassiveAggressiveRegressor, OrthogonalMatchingPursuit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
import lightgbm as lgbm

import sys
sys.path.append('../src/')
from functions import *

from datetime import datetime

## Start with train - valid

In [222]:
# exp.variables

In [223]:
exp.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
naive,Naive Forecaster,sktime.forecasting.naive.NaiveForecaster,True
grand_means,Grand Means Forecaster,sktime.forecasting.naive.NaiveForecaster,True
snaive,Seasonal Naive Forecaster,sktime.forecasting.naive.NaiveForecaster,True
polytrend,Polynomial Trend Forecaster,sktime.forecasting.trend.PolynomialTrendForeca...,True
arima,ARIMA,sktime.forecasting.arima.ARIMA,True
auto_arima,Auto ARIMA,sktime.forecasting.arima.AutoARIMA,True
exp_smooth,Exponential Smoothing,sktime.forecasting.exp_smoothing.ExponentialSm...,True
ets,ETS,sktime.forecasting.ets.AutoETS,True
theta,Theta Forecaster,sktime.forecasting.theta.ThetaForecaster,True
tbats,TBATS,sktime.forecasting.tbats.TBATS,False


In [42]:
forecaster = RecursiveTabularRegressionForecaster(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                                    max_iter=1000, normalize=False,
                                    positive=False, precompute=False,
                                    random_state=42, selection='cyclic',
                                    tol=0.0001, warm_start=False))

fh = 1
preprocess = False
min_max_scaler = TabularToSeriesAdaptor(MinMaxScaler(feature_range=(1, 2)))

dataset_name = 'Satellite'
data, frequency_yearly_period, freq_sktime = read_file(dataset_name, data_path='H:/My Drive/PhD/ECOSCOPE/time-series-forecasting-waves/data/')

target = 'sla'

data = data[target]

# split data
train, test, valid, train_without_valid, train_test_split_date, train_valid_split_date = train_valid_test_split(dataset_name, data)
# add a dummy last row to the full dataset (in this case it is the train), because pycaret keeps the last fh for test
index_of_dummy_row = train.index[-1] + pd.DateOffset(months=1 if freq_sktime == 'M' else 0, days= 1 if freq_sktime == 'D' else 0)
value_of_dummy_row = 10

# cv from sktime
cv = ExpandingWindowSplitter(step_length=1, fh=fh, initial_window=train_without_valid.shape[0])

exp = setup(train.append(pd.Series([value_of_dummy_row], index=[index_of_dummy_row])), 
            fold_strategy=cv, 
            log_experiment=True, 
            session_id=42, 
            verbose=False)

s = time.time()
# instead of 'theta', I can use ThetaForecaster(deseasonalize=True, initial_level=None, sp=12)
model = exp.create_model('lightgbm_cds_dt', round=6, fold=cv, verbose=True)
print(model)

Unnamed: 0,cutoff,MAE,RMSE,MAPE,SMAPE,MASE
0,2014-12,0.020965,0.020965,0.645218,0.487837,0.814914
1,2015-01,0.025262,0.025262,11.672077,1.707433,0.981965
2,2015-02,0.007752,0.007752,0.346127,0.295063,0.301135
3,2015-03,0.001275,0.001275,0.075819,0.07305,0.049559
4,2015-04,0.01441,0.01441,0.311947,0.369594,0.561859
5,2015-05,0.046266,0.046266,0.541921,0.743336,1.802966
6,2015-06,0.001648,0.001648,0.022504,0.022253,0.064102
7,2015-07,0.038952,0.038952,0.324291,0.38705,1.517941
8,2015-08,0.006867,0.006867,0.045911,0.046989,0.266787
9,2015-09,0.064578,0.064578,0.551884,0.432531,2.507599


BaseCdsDtForecaster(degree=1, deseasonal_model='additive',
                    regressor=LGBMRegressor(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leaves=31, objective=None,
                                            random_state=42, reg_alpha=0.0,
                                            reg_lambda=0.0, silent='warn',
                                            subsample=1.0,
                                            

In [28]:
# for multi step consider DirectTabularRegressionForecaster, RecursiveTabularRegressionForecaster, DirRecTabularRegressionForecaster
forecasters = [
    # NaiveForecaster(sp=1, strategy='last', window_length=None),
    # NaiveForecaster(sp=12, strategy='last', window_length=None),
    # PolynomialTrendForecaster(degree=1, regressor=None, with_intercept=True),
    # AutoARIMA(),
    # ExponentialSmoothing(damped_trend=False, initial_level=None, initial_seasonal=None, initial_trend=None, initialization_method='estimated', seasonal=None, sp=frequency_yearly_period, trend='add', use_boxcox=None),
    # AutoETS(),
    # ThetaForecaster(deseasonalize=True, initial_level=None, sp=frequency_yearly_period),
    # TBATS(box_cox_bounds=(0, 1), context=None, multiprocessing_start_method='spawn', n_jobs=None, show_warnings=True, sp=frequency_yearly_period, use_arma_errors=True, use_box_cox=None, use_damped_trend=None, use_trend=None),
    # BATS(box_cox_bounds=(0, 1), context=None, multiprocessing_start_method='spawn', n_jobs=None, show_warnings=True, sp=frequency_yearly_period, use_arma_errors=True, use_box_cox=None, use_damped_trend=None, use_trend=None)
    DirectTabularRegressionForecaster(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False, positive=False)),
    DirectTabularRegressionForecaster(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=42, selection='cyclic', tol=0.0001, warm_start=False)),
    DirectTabularRegressionForecaster(estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=42, solver='auto', tol=0.001)),
    DirectTabularRegressionForecaster(estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=42, selection='cyclic', tol=0.0001, warm_start=False)),
    DirectTabularRegressionForecaster(estimator=Lars(copy_X=True, eps=2.220446049250313e-16, fit_intercept=True, fit_path=True, jitter=None, n_nonzero_coefs=500, normalize=True, precompute='auto', random_state=42, verbose=False)),
    DirectTabularRegressionForecaster(estimator=LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True, fit_path=True, jitter=None, max_iter=500, normalize=True, positive=False, precompute='auto', random_state=42, verbose=False)),
    DirectTabularRegressionForecaster(estimator=BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None, compute_score=False, copy_X=True, fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300, normalize=False, tol=0.001, verbose=False)),
    DirectTabularRegressionForecaster(estimator=HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100, tol=1e-05, warm_start=False)),
    DirectTabularRegressionForecaster(estimator=PassiveAggressiveRegressor(C=1.0, average=False, early_stopping=False, epsilon=0.1, fit_intercept=True, loss='epsilon_insensitive', max_iter=1000, n_iter_no_change=5, random_state=42, shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)),
    DirectTabularRegressionForecaster(estimator=OrthogonalMatchingPursuit(fit_intercept=True, n_nonzero_coefs=None, normalize=True, precompute='auto', tol=None)),
    DirectTabularRegressionForecaster(estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=-1, n_neighbors=5, p=2, weights='uniform')),
    DirectTabularRegressionForecaster(estimator=DecisionTreeRegressor(ccp_alpha=0.0,  criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=42, splitter='best')),
    DirectTabularRegressionForecaster(estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False)),
    DirectTabularRegressionForecaster(estimator=ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False)),
    DirectTabularRegressionForecaster(estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, random_state=42, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)),
    DirectTabularRegressionForecaster(estimator=AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear', n_estimators=50, random_state=42)),
    DirectTabularRegressionForecaster(estimator=lgbm.sklearn.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent='warn', subsample=1.0, subsample_for_bin=200000, subsample_freq=0))
]

## Train-Valid -- No Preprocess -- No Tuning

In [29]:
scores = pd.DataFrame()
fh = 1
preprocess = False
min_max_scaler = TabularToSeriesAdaptor(MinMaxScaler(feature_range=(1, 2)))

dataset_name = 'Satellite'
data, frequency_yearly_period, freq_sktime = read_file(dataset_name, data_path='H:/My Drive/PhD/ECOSCOPE/time-series-forecasting-waves/data/')


for target in data.columns[:1]:
    print('#'*70, target, '#'*70)
    data_ = data[target]
    
    # split data
    train, test, valid, train_without_valid, train_test_split_date, train_valid_split_date = train_valid_test_split(dataset_name, data_)
    # add a dummy last row to the full dataset (in this case it is the train), because pycaret keeps the last fh for test
    index_of_dummy_row = train.index[-1] + pd.DateOffset(months=1 if freq_sktime == 'M' else 0, days= 1 if freq_sktime == 'D' else 0)
    value_of_dummy_row = 10
    
    # cv from sktime
    cv = ExpandingWindowSplitter(step_length=1, fh=fh, initial_window=train_without_valid.shape[0])

    for forecaster in forecasters[:1]:
        print('='*40, forecaster, '='*40)
        model = TransformedTargetForecaster(steps=[
            ("minmaxscaler", min_max_scaler),
            ("forecaster", forecaster),
        ])

        exp = setup(train.append(pd.Series([value_of_dummy_row], index=[index_of_dummy_row])), 
                fold_strategy=cv, 
                log_experiment=True, 
                session_id=42,
                verbose=False)

        s = time.time()
        # instead of 'theta', I can use ThetaForecaster(deseasonalize=True, initial_level=None, sp=12)
        model = exp.create_model(load_model('ada'), round=6, fold=cv, verbose=False)
        runtime = time.time()-s

        scores = scores.append({
            'Dataset': dataset_name,
            'Variable': target,
            'Method': model.get_params()['forecaster'], 
            'Forecasting Horizon': fh, 
            'Preprocess': preprocess,
            'Runtime': runtime, 
            'MAE': exp.variables['master_model_container'][0]['scores'].loc['Mean']['MAE'],
            'RMSE': exp.variables['master_model_container'][0]['scores'].loc['Mean']['RMSE'],
            'sMAPE': exp.variables['master_model_container'][0]['scores'].loc['Mean']['SMAPE'],
            'MASE': exp.variables['master_model_container'][0]['scores'].loc['Mean']['MASE'],
            'MAE std': exp.variables['master_model_container'][0]['scores'].loc['SD']['MAE'],
            'RMSE std': exp.variables['master_model_container'][0]['scores'].loc['SD']['RMSE'],   
            'sMAPE std': exp.variables['master_model_container'][0]['scores'].loc['SD']['SMAPE'],   
            'MASE std': exp.variables['master_model_container'][0]['scores'].loc['SD']['MASE'],   
        }, ignore_index=True)

Column 'date' was removed from Satellite.
###################################################################### sla ######################################################################
train datetime margins              : 1993-01-01 00:00:00 - 2017-12-01 00:00:00.     Total samples: 300 (89.3%)
test datetime margins               : 2018-01-01 00:00:00 - 2020-12-01 00:00:00.     Total samples: 36 (10.7%)
valid datetime margins              : 2015-01-01 00:00:00 - 2017-12-01 00:00:00.     Total samples: 36 (10.7%)
train_without_valid datetime margins: 1993-01-01 00:00:00 - 2014-12-01 00:00:00.     Total samples: 264 (78.6%)
                                                          alpha_2=1e-06,
                                                          alpha_init=None,
                                                          compute_score=False,
                                                          copy_X=True,
                                                          fit_interc

In [30]:
scores

Unnamed: 0,Dataset,Variable,Method,Forecasting Horizon,Preprocess,Runtime,MAE,RMSE,sMAPE,MASE,MAE std,RMSE std,sMAPE std,MASE std
0,Satellite,sla,DirectTabularRegressionForecaster(estimator=Ad...,1.0,0.0,8.983873,0.020218,0.020218,0.361233,0.789341,0.015034,0.015034,0.302839,0.587741


In [30]:
exp.predict_model(estimator=model)

Unnamed: 0,Model,MAE,RMSE,MAPE,SMAPE,MASE
0,Theta Forecaster,9.895,9.895,0.9895,1.9584,389.2997


2018-01    0.105
Freq: M, Name: Time Series, dtype: float64

## Train-Valid -- No Preprocess -- No Tuning With Predictions (MUCH SLOWER)

In [5]:
scores = pd.DataFrame()

fh = 1
preprocess = False
min_max_scaler = TabularToSeriesAdaptor(MinMaxScaler(feature_range=(1, 2)))

dataset_name = 'Satellite'
data, frequency_yearly_period, freq_sktime = read_file(dataset_name, data_path='H:/My Drive/PhD/ECOSCOPE/time-series-forecasting-waves/data/')

for target in data.columns:
    print('#'*70, target, '#'*70)

    # split data
    train, test, valid, train_without_valid, train_test_split_date, train_valid_split_date = train_valid_test_split(dataset_name, data)
    # cv from sktime
    cv = ExpandingWindowSplitter(step_length=1, fh=fh, initial_window=train_without_valid.shape[0])

    # for train - valid:
    data_ = train[target]
    train_length_init = len(train_without_valid) + 1
    # for train - test:
    # data_ = data[target]
    # train_length_init = len(train)+1

    for forecaster in forecasters:
        print('='*40, forecaster, '='*40)
        model = TransformedTargetForecaster(steps=[
            ("minmaxscaler", min_max_scaler),
            ("forecaster", forecaster),
        ])

        cv_scores = pd.DataFrame()
        y_preds, y_true = pd.Series(), pd.Series()
        
        s = time.time()
        for i in range(len(test)):
            y_train = data_.iloc[:train_length_init+i]
            y_test = data_.loc[[c for c in data_.index if c not in y_train.index]]

            exp = setup(y_train, 
                        fold_strategy='expanding',
                        fold = 2,
                        fh = 1,
                        log_experiment=True, 
                        session_id=42,
                        verbose=False)

            model = exp.create_model(model, round=6, verbose=False)
            r = exp.predict_model(estimator=model, round=6, verbose=False)
            y_preds = y_preds.append(r)
            y_true = y_true.append(exp.variables['y_test'])
            cv_scores = cv_scores.append({
                'test dates': str(exp.variables['y_test'].index.values[0]),
                'MAE': exp.variables['display_container'][2]['MAE'].values[0],
                'RMSE': exp.variables['display_container'][2]['RMSE'].values[0],
                'SMAPE': exp.variables['display_container'][2]['SMAPE'].values[0],
                'MASE': exp.variables['display_container'][2]['MASE'].values[0],
            }, ignore_index=True)

        runtime = time.time()-s

        cv_scores.loc['MEAN'] = np.mean(cv_scores, axis=0)
        cv_scores.loc['SD'] = np.std(cv_scores, axis=0)

        scores = scores.append({
            'Dataset': dataset_name,
            'Variable': target,
            'Method': model.get_params()['forecaster'], 
            'Forecasting Horizon': fh, 
            'Preprocess': preprocess,
            'Runtime': runtime, 
            'MAE': cv_scores.loc['MEAN']['MAE'],
            'RMSE': cv_scores.loc['MEAN']['RMSE'],
            'sMAPE': cv_scores.loc['MEAN']['SMAPE'],
            'MASE': cv_scores.loc['MEAN']['MASE'],
            'MAE std': cv_scores.loc['SD']['MAE'],
            'RMSE std': cv_scores.loc['SD']['RMSE'],   
            'sMAPE std': cv_scores.loc['SD']['SMAPE'],   
            'MASE std': cv_scores.loc['SD']['MASE'],   
            'y_pred': y_preds,
            'y_true': y_true,
        }, ignore_index=True)

Column 'date' was removed from Satellite.
###################################################################### sla ######################################################################
train datetime margins              : 1993-01-01 00:00:00 - 2017-12-01 00:00:00.     Total samples: 300 (89.3%)
test datetime margins               : 2018-01-01 00:00:00 - 2020-12-01 00:00:00.     Total samples: 36 (10.7%)
valid datetime margins              : 2015-01-01 00:00:00 - 2017-12-01 00:00:00.     Total samples: 36 (10.7%)
train_without_valid datetime margins: 1993-01-01 00:00:00 - 2014-12-01 00:00:00.     Total samples: 264 (78.6%)
                     initial_seasonal=None, initial_trend=None,
                     initialization_method='estimated', seasonal=None, sp=12,
        bounds=None, callback=None, damped_trend=False, dates=None, disp=False,
        error='add', freq=None, full_output=True, ignore_inf_ic=True,
        information_criterion='aic', initial_level=None, initial_seasonal=

In [6]:
scores

Unnamed: 0,Dataset,Variable,Method,Forecasting Horizon,Preprocess,Runtime,MAE,RMSE,sMAPE,MASE,MAE std,RMSE std,sMAPE std,MASE std,y_pred,y_true
0,Satellite,sla,"NaiveForecaster(sp=1, strategy='last', window_...",1.0,0.0,53.938783,0.023155,0.023155,0.421625,0.739155,0.014968,0.014968,0.390923,0.476583,2015-01 0.057829 2015-02 0.032494 2015-0...,2015-01 0.032494 2015-02 0.002164 2015-0...
1,Satellite,sla,"NaiveForecaster(sp=12, strategy='last', window...",1.0,0.0,59.290284,0.028984,0.028984,0.534894,0.926631,0.018525,0.018525,0.476252,0.593588,2015-01 0.076945 2015-02 0.042900 2015-0...,2015-01 0.032494 2015-02 0.002164 2015-0...
2,Satellite,sla,"PolynomialTrendForecaster(degree=1, regressor=...",1.0,0.0,61.378739,0.031539,0.031539,0.471149,1.006811,0.019377,0.019377,0.376448,0.617632,2015-01 0.071550 2015-02 0.071245 2015-0...,2015-01 0.032494 2015-02 0.002164 2015-0...
3,Satellite,sla,"ExponentialSmoothing(damped_trend=False, initi...",1.0,0.0,66.035356,0.023146,0.023146,0.421035,0.738869,0.015047,0.015047,0.391291,0.479104,2015-01 0.058052 2015-02 0.032619 2015-0...,2015-01 0.032494 2015-02 0.002164 2015-0...
4,Satellite,sla,"AutoETS(additive_only=False, allow_multiplicat...",1.0,0.0,76.573243,0.023155,0.023155,0.421607,0.739151,0.014968,0.014968,0.390884,0.476595,2015-01 0.057830 2015-02 0.032496 2015-0...,2015-01 0.032494 2015-02 0.002164 2015-0...
5,Satellite,sla,"ThetaForecaster(deseasonalize=True, initial_le...",1.0,0.0,67.45465,0.023907,0.023907,0.498701,0.763136,0.019597,0.019597,0.509473,0.624244,2015-01 0.041529 2015-02 0.010035 2015-0...,2015-01 0.032494 2015-02 0.002164 2015-0...
6,Satellite,ugosa,"NaiveForecaster(sp=1, strategy='last', window_...",1.0,0.0,65.292451,0.035348,0.035348,1.079333,0.758861,0.032113,0.032113,0.721571,0.692324,2015-01 -0.022487 2015-02 -0.079619 2015-0...,2015-01 -0.079619 2015-02 -0.024529 2015-0...
7,Satellite,ugosa,"NaiveForecaster(sp=12, strategy='last', window...",1.0,0.0,61.64339,0.058523,0.058523,1.634267,1.252798,0.040912,0.040912,0.588385,0.87488,2015-01 0.021645 2015-02 0.030314 2015-0...,2015-01 -0.079619 2015-02 -0.024529 2015-0...
8,Satellite,ugosa,"PolynomialTrendForecaster(degree=1, regressor=...",1.0,0.0,59.141025,0.036173,0.036173,1.445002,0.772842,0.025349,0.025349,0.642141,0.54045,2015-01 0.016047 2015-02 0.014660 2015-0...,2015-01 -0.079619 2015-02 -0.024529 2015-0...
9,Satellite,ugosa,"ExponentialSmoothing(damped_trend=False, initi...",1.0,0.0,66.445366,0.035187,0.035187,1.298833,0.753296,0.028024,0.028024,0.631082,0.598882,2015-01 -0.000280 2015-02 -0.008675 2015-0...,2015-01 -0.079619 2015-02 -0.024529 2015-0...


In [7]:
scores.to_csv(f'../../results/scores/train-valid_no-tune_no-pre-process_{dataset_name}.csv', index=False)

## Tuning

In [10]:
algorithms = {
    'decision_tree': {
        'params': {
            'forecaster__estimator__ccp_alpha': [0, 0.01, 0.1],
            'forecaster__estimator__max_depth': [1, 2, 3, 4, 5, 10, None],
            'forecaster__estimator__max_leaf_nodes': [2, 3, 8, 16, 100, None],
            'forecaster__estimator__min_impurity_decrease': [0, 0.01, 0.1],
            'forecaster__estimator__min_samples_leaf': [1, 2, 3, 4],
            'forecaster__estimator__min_samples_split': [2, 3]
        },
        'estimator': 
            DecisionTreeRegressor(ccp_alpha=0.0,  criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, 
                                  min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=42, splitter='best')
},
    'random_forest': {
        'params': {
            'forecaster__estimator__ccp_alpha': [0, 0.01, 0.1],
            'forecaster__estimator__max_depth': [1, 2, 3, 4, 5, 10, None],
            'forecaster__estimator__max_leaf_nodes': [2, 3, 8, 16, 100, None],
            'forecaster__estimator__min_impurity_decrease': [0, 0.01, 0.1],
            'forecaster__estimator__min_samples_leaf': [1, 2, 3],
            'forecaster__estimator__min_samples_split': [2, 3],
            'forecaster__estimator__n_estimators': [10, 100, 200],
        },
        'estimator':
            RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, 
                                  min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, 
                                  n_estimators=100, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False)
},
    'extra_trees_regressor': {
        'params': {
            'forecaster__estimator__ccp_alpha': [0, 0.01, 0.1],
            'forecaster__estimator__max_depth': [1, 2, 3, 4, 5, 10, None],
            'forecaster__estimator__max_leaf_nodes': [3, 8, 16, 100, None],
            'forecaster__estimator__min_impurity_decrease': [0, 0.01, 0.1],
            'forecaster__estimator__min_samples_leaf': [1, 2],
            'forecaster__estimator__min_samples_split': [2, 3],
            'forecaster__estimator__n_estimators': [10, 50, 100, 200],
            'forecaster__estimator__warm_start': [True, False],
        },
        'estimator':
            ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, 
                                min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, 
                                n_estimators=100, n_jobs=-5, oob_score=False, random_state=42, verbose=0, warm_start=False)
},
    'gradient_boosting': {
        'params': {
            'forecaster__estimator__alpha': [0.5, 0.9],
            'forecaster__estimator__ccp_alpha': [0, 0.01, 0.1],
            'forecaster__estimator__max_depth': [2, 3, 5, 10, None],
            'forecaster__estimator__min_impurity_decrease': [0, 0.01, 0.1],
            'forecaster__estimator__min_samples_leaf': [1, 2],
            'forecaster__estimator__min_samples_split': [2, 3],
            'forecaster__estimator__n_estimators': [10, 100, 200],
            'forecaster__estimator__learning_rate': [0.1, 0.01],
        },
        'estimator':
            GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=3, max_features=None, 
                                      max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, 
                                      min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, random_state=42, subsample=1.0, tol=0.0001, 
                                      validation_fraction=0.1, verbose=0, warm_start=False)
},
    'adaboost': {
        'params': {
            'forecaster__estimator__loss': ['linear', 'square', 'exponential'],
            'forecaster__estimator__n_estimators': [10, 50, 100, 200],
            'forecaster__estimator__learning_rate': [0.1, 0.05, 0.01],
        },
        'estimator':
            AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear', n_estimators=50, random_state=42)
},
    'lgb_regressor': {
        'params': {
            'forecaster__estimator__max_depth': [1, 2, 3, 4, 5, 10, -1],
            'forecaster__estimator__num_leaves': [2, 3, 10, 20, 31, 100],
            'forecaster__estimator__min_child_samples': [5, 10, 20, 50],
            'forecaster__estimator__min_child_weight': [0.001, 0.005],
            'forecaster__estimator__n_estimators': [10, 50, 100, 200],
        },
        'estimator':
            lgbm.sklearn.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, 
                                       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, 
                                       random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent='warn', subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
},
    'knn': {
        'params': {
            'forecaster__estimator__n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
            'forecaster__estimator__p': [1, 2, 3],
        },
        'estimator':
            KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=-1, n_neighbors=5, p=2, weights='uniform')
},
    'passive_aggressive': {
        'params': {
            'forecaster__estimator__C': [0.1, 0.25, 0.5, 0.75, 1],
            'forecaster__estimator__early_stopping': [True, False],
            'forecaster__estimator__epsilon': [0.01, 0.05, 0.1, 0.2],
            'forecaster__estimator__max_iter': [500, 1000, 2000],
            'forecaster__estimator__n_iter_no_change': [1, 2, 3, 4, 5, 7],
            'forecaster__estimator__validation_fraction': [0.1, 0.2],
            'forecaster__estimator__tol': [None, 0.001, 0.002],
        },
        'estimator':
            PassiveAggressiveRegressor(C=1.0, average=False, early_stopping=False, epsilon=0.1, fit_intercept=True, loss='epsilon_insensitive', max_iter=1000, 
                                       n_iter_no_change=5, random_state=42, shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)
},
    'huber': {
        'params': {
            'forecaster__estimator__alpha': [0.00005, 0.0001, 0.0005, 0.001],
            'forecaster__estimator__epsilon': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2],
            'forecaster__estimator__max_iter': [50, 100, 200, 500],
            'forecaster__estimator__tol': [1e-05, 1e-06, 5e-05, 5e-04],
            'forecaster__estimator__warm_start': [True, False],
        },
        'estimator':
            HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100, tol=1e-05, warm_start=False)
},
    'bayesian_ridge': {
        'params': {
            'forecaster__estimator__alpha_1': [1e-05, 5e-05, 1e-06, 5e-06],
            'forecaster__estimator__alpha_2': [1e-05, 5e-05, 1e-06, 5e-06],
            'forecaster__estimator__lambda_1': [1e-05, 5e-05, 1e-06, 5e-06],
            'forecaster__estimator__lambda_2': [1e-05, 5e-05, 1e-06, 5e-06],
            'forecaster__estimator__compute_score': [True, False],
            'forecaster__estimator__n_iter': [100, 200, 300, 400],
            'forecaster__estimator__tol': [0.0005, 0.001, 0.005, 0.01, 0.05],
        },
        'estimator': 
            BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None, compute_score=False, copy_X=True, fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300, normalize=False, tol=0.001, verbose=False)
},
    'lasso_lars': {
        'params': {
            'forecaster__estimator__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2],
            'forecaster__estimator__max_iter': [100, 200, 500, 1000]
        },
        'estimator':
            LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True, fit_path=True, jitter=None, max_iter=500, normalize=True, positive=False, precompute='auto', random_state=42, verbose=False)
},
    'lars': {
        'params': {
            'forecaster__estimator__n_nonzero_coefs': [1, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1500, 2000],
        },
        'estimator': 
            Lars(copy_X=True, eps=2.220446049250313e-16, fit_intercept=True, fit_path=True, jitter=None, n_nonzero_coefs=500, normalize=True, precompute='auto', random_state=42, verbose=False)
},
    'elastic_net': {
        'params': {
            'forecaster__estimator__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2],
            'forecaster__estimator__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
            'forecaster__estimator__max_iter': [100, 200, 500, 1000],
            'forecaster__estimator__tol': [0.05, 0.001, 0.005, 0.0001, 0.0005],
            'forecaster__estimator__warm_start': [True, False],
        },
        'estimator':
            ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=42, selection='cyclic', tol=0.0001, warm_start=False)
},
    'ridge': {
        'params': {
            'forecaster__estimator__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2],
            'forecaster__estimator__max_iter': [100, 200, 500, 1000],
            'forecaster__estimator__tol': [0.05, 0.001, 0.005, 0.0001, 0.0005],
        },
        'estimator': 
            Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=42, solver='auto', tol=0.001)
},
    'lasso': {
        'params': {
            'forecaster__estimator__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2],
            'forecaster__estimator__max_iter': [100, 200, 500, 1000, 2000, 2500, 3000],
            'forecaster__estimator__tol': [0.05, 0.001, 0.005, 0.0001, 0.0005],
        },
        'estimator':
            Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=42, selection='cyclic', tol=0.0001, warm_start=False)
}
                 }
# OrthogonalMatchingPursuit no tuning
# LinearRegression no tuning

In [16]:
for algorithm_name, value in algorithms.items():
    print(algorithm_name)
    print(value['estimator'])
    print(value['params'])
    print()

decision_tree
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, random_state=42,
                      splitter='best')
{'forecaster__estimator__ccp_alpha': [0, 0.01, 0.1], 'forecaster__estimator__max_depth': [1, 2, 3, 4, 5, 10, None], 'forecaster__estimator__max_leaf_nodes': [2, 3, 8, 16, 100, None], 'forecaster__estimator__min_impurity_decrease': [0, 0.01, 0.1], 'forecaster__estimator__min_samples_leaf': [1, 2, 3, 4], 'forecaster__estimator__min_samples_split': [2, 3]}

random_forest
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                     

In [11]:
scores = pd.DataFrame()
fh = 1
preprocess = False
min_max_scaler = TabularToSeriesAdaptor(MinMaxScaler(feature_range=(1, 2)))

dataset_name = 'Satellite'
data, frequency_yearly_period, freq_sktime = read_file(dataset_name, data_path='')

for algorithm_name, value in algorithms.items():
    if algorithm_name not in ['decision_tree', 'random_forest']:
        print('='*80, algorithm_name, '='*80)
        for target in data.columns:
            print('#'*70, target, '#'*70)
            data_ = data[target]

            # split data
            train, test, valid, train_without_valid, train_test_split_date, train_valid_split_date = train_valid_test_split(dataset_name, data_)

            # add a dummy last row to the full dataset (in this case it is the train), because pycaret keeps the last fh for test
            index_of_dummy_row = train.index[-1] + pd.DateOffset(months=1 if freq_sktime == 'M' else 0, days= 1 if freq_sktime == 'D' else 0)
            value_of_dummy_row = 10

            # cv from sktime
            cv = ExpandingWindowSplitter(step_length=1, fh=fh, initial_window=train_without_valid.shape[0])

            forecaster = DirectTabularRegressionForecaster(estimator=value['estimator'])
            model = TransformedTargetForecaster(steps=[
                ("minmaxscaler", min_max_scaler),
                ("forecaster", forecaster),
            ])

            exp = setup(train.append(pd.Series([value_of_dummy_row], index=[index_of_dummy_row])), 
                    fold_strategy=cv, 
                    log_experiment=True, 
                    session_id=42,
                    verbose=False)
            model = exp.create_model(model, round=6, fold=cv, verbose=False)



            tuned_model = exp.tune_model(model, fold=cv, search_algorithm='grid', optimize='RMSE', round=6,
                                         custom_grid = value['params'], n_jobs=-10
                                        )
            # print(model)
            # print(tuned_model)
            save_model(tuned_model, f'../results/tuned_models/{dataset_name}/fh.{fh}_{target}_{algorithm_name}')

    # check available parameters with model.get_params().keys()

Unnamed: 0,cutoff,MAE,RMSE,MAPE,SMAPE,MASE,RMSSE
0,2014-12,0.089992,0.089992,1.130278,2.0,1.886049,1.504311
1,2015-01,0.012852,0.012852,0.523962,0.41519,0.269151,0.214871
2,2015-02,0.032112,0.032112,1.478258,2.0,0.6721,0.537022
3,2015-03,0.000834,0.000834,0.054652,0.053199,0.017518,0.013974
4,2015-04,0.038612,0.038612,1.150261,2.0,0.813651,0.648142
5,2015-05,0.010327,0.010327,0.329068,0.393874,0.218123,0.173649
6,2015-06,0.013982,0.013982,25.054113,1.852148,0.296362,0.235536
7,2015-07,0.00403,0.00403,1.221224,2.0,0.085532,0.067982
8,2015-08,0.103509,0.103509,1.104093,2.0,2.204478,1.749283
9,2015-09,0.053053,0.053053,0.529641,0.720423,1.125481,0.893815


Transformation Pipeline and Model Successfully Saved
###################################################################### vgosa ######################################################################
train datetime margins              : 1993-01-01 00:00:00 - 2017-12-01 00:00:00.     Total samples: 300 (89.3%)
test datetime margins               : 2018-01-01 00:00:00 - 2020-12-01 00:00:00.     Total samples: 36 (10.7%)
valid datetime margins              : 2015-01-01 00:00:00 - 2017-12-01 00:00:00.     Total samples: 36 (10.7%)
train_without_valid datetime margins: 1993-01-01 00:00:00 - 2014-12-01 00:00:00.     Total samples: 264 (78.6%)


IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,cutoff,MAE,RMSE,MAPE,SMAPE,MASE,RMSSE


Fitting 36 folds for each of 1280 candidates, totalling 46080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 612 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1312 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 2212 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 3312 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 4612 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 6112 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 7812 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 9712 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 11812 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done 14112 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 16612 tasks      | elapsed:   52.9s


## With Preprocess

In [None]:
forecaster =  ThetaForecaster(deseasonalize=True, initial_level=None, sp=12)
min_max_scaler = TabularToSeriesAdaptor(MinMaxScaler(feature_range=(1, 2)))
pipe = TransformedTargetForecaster(steps=[
    ("detrender", Detrender()),
    ("deseasonalizer", Differencer(lags=1)),
    ("minmaxscaler", min_max_scaler),
    ("forecaster", forecaster),
])