In [None]:
!pip install skforecast tqdm

In [None]:
import pandas as pd
from tqdm import tqdm
import lightgbm
import sklearn
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
import skforecast
from sklearn.feature_selection import RFECV
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import bayesian_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.model_selection import select_features
import matplotlib.pyplot as plt
import numpy as np
import os
from os import path
import shutil
import re
import traceback
%matplotlib inline

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
pipeline_categorical = make_pipeline(
                           OrdinalEncoder(
                               dtype=int,
                               handle_unknown="use_encoded_value",
                               unknown_value=-1,
                               encoded_missing_value=-1
                           ),
                           FunctionTransformer(
                               func=lambda x: x.astype('category'),
                               feature_names_out= 'one-to-one'
                           )
                       )

transformer_exog = make_column_transformer(
                       (
                           pipeline_categorical,
                           make_column_selector(dtype_exclude=np.number)
                       ),
                       remainder="passthrough",
                       verbose_feature_names_out=False,
                   ).set_output(transform="pandas")

# Lags grid
lags_grid = tuple([12, 24, [1, 2, 3, 4, 7, 9, 24]])

# Regressor hyperparameters search space for lightgbm
def search_space(trial):
    search_space  = {
        'n_estimators'    : trial.suggest_int('n_estimators', 400, 1200, step=100),
        'max_depth'       : trial.suggest_int('max_depth', 3, 10, step=1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 25, 500),
        'learning_rate'   : trial.suggest_float('learning_rate', 0.01, 0.5),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1, step=0.1),
        'max_bin'         : trial.suggest_int('max_bin', 50, 250, step=25),
        'reg_alpha'       : trial.suggest_float('reg_alpha', 0, 1, step=0.1),
        'reg_lambda'      : trial.suggest_float('reg_lambda', 0, 1, step=0.1),
        'lags'            : trial.suggest_categorical('lags', lags_grid)
    }
    return search_space

In [None]:
def search_hyperparameters(data, end_train, end_valid, exog_features,transformer_exog):

  # instantiate a forcaster transformer with categorical features
    forecaster = ForecasterAutoreg(
        regressor = LGBMRegressor(random_state=15926, verbose=-1),
        lags = 24,
        transformer_exog = transformer_exog,
        fit_kwargs = {"categorical_feature": "auto"}
    )

    # search for best parameters
    results_search, frozen_trial = bayesian_search_forecaster(
    forecaster         = forecaster,
    y                  = data.loc[:end_valid, 'gw-level'],
    exog               = data.loc[:end_valid, exog_features],
    search_space       = search_space,
    steps              = 30,
    refit              = False,
    metric             = 'mean_absolute_percentage_error',
    initial_train_size = len(data.loc[:end_train]),
    fixed_train_size   = False,
    n_trials           = 20,
    random_state       = 123,
    return_best        = True,
    n_jobs             = 'auto',
    verbose            = False,
    show_progress      = True
    )

    best_params = results_search['params'].iat[0]

    return best_params

def train_and_predict(data, best_params, actual_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog):

  # train for evaluation of the model
  forecaster = ForecasterAutoreg(
  regressor          = LGBMRegressor(**best_params, random_state=15926, verbose=-1),
  lags               = 24,
  transformer_exog   = transformer_exog,
  fit_kwargs         = {"categorical_feature": "auto"}
  )
  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_train, 'gw-level'],
    exog = data.loc[:end_train, exog_features]
  )

  # make predictions and evalute the model
  predictions = forecaster.predict(
      exog     = data.loc[df_idx[train_num+1]:, exog_features],
      steps    = 26
  )
  df_preds = pd.DataFrame(predictions)
  preds = df_preds["pred"].values
  # evaluate on symmetric mean absolute percentage error
  smape_value = smape(actual_data, preds)

  # train for future predictions
  forecaster = ForecasterAutoreg(
  regressor          = LGBMRegressor(**best_params, random_state=15926, verbose=-1),
  lags               = 24,
  transformer_exog   = transformer_exog, #one_hot_encoder,
  fit_kwargs         = {"categorical_feature": "auto"}
  )
  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_valid, 'gw-level'],
    exog = data.loc[:end_valid, exog_features]
  )

  # make predictions into the future
  predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 26
  )
  df_preds = pd.DataFrame(predictions)

  # free resources since it's going to run on iterations
  del forecaster

  return df_preds, smape_value


In [None]:
# Define the function to return the SMAPE value
def smape(A, F):
    tmp = 2 * np.abs(F - A) / (np.abs(A) + np.abs(F))
    len_ = np.count_nonzero(~np.isnan(tmp))
    if len_ == 0 and np.nansum(tmp) == 0: # Deals with a special case
        return 100
    return round(100 / len_ * np.nansum(tmp), 3)

In [None]:
def populate_test_data(data_dir):

  preds_dict = {}
  smape_dict = {}


  # collect all files in the directory
  filenames = os.listdir(data_dir)
  filenames = filenames[:20]

  try:

    for filename in tqdm(filenames):

        hrbnz01 = filename.split(".")[0].split("-")[-1]
        filepath = path.join(data_dir, filename)
        df_exog = pd.read_csv(filepath)
        df_exog["season"] = df_exog["season"].astype("category")
        df_exog["weather"] = df_exog["weather"].astype("category")
        df_exog["date"] = pd.to_datetime(df_exog["date"])
        df_exog.set_index("date", inplace=True)
        df_exog.index = pd.date_range(start=df_exog.index.min(), end=df_exog.index.max(), freq='MS')

        # get the estimate end train and end validation dates
        data = df_exog.copy()
        exog_data = data.drop("gw-level", axis=1)
        exog_features = exog_data.columns
        df_idx = data.index
        train_num = int(len(data) * 0.8)
        valid_num = len(data.loc[:"2021-11-01"])
        end_train = df_idx[train_num]
        end_valid = df_idx[valid_num]
        end_evaluation = df_idx[train_num+26]
        evaluate_data = data.loc[df_idx[train_num+1]: end_evaluation, "gw-level"].values


        # tune for best hyperparamters and evaluate on MAPE metric
        best_params = search_hyperparameters(data, end_train, end_valid, exog_features, transformer_exog)

        # train and make predict into 26 months in the future of the test template
        df_predictions, smape = train_and_predict(data,best_params, evaluate_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog)
        df_predictions["pred"] =  df_predictions["pred"].round(2)
        preds_dict[hrbnz01] = df_predictions['pred'].values
        smape_dict[hrbnz01] = smape


  except Exception as ex:
    print("[Error]")
    print(traceback.format_exc())

  df_smape = pd.DataFrame(smape_dict, index=[0])
  df_final_preds = pd.DataFrame(preds_dict, index=range(26))
  print("> Done")

  return df_final_preds, df_smape

In [None]:
processed_data_dir = "/content/drive/MyDrive/processed_data_part1"
#test_template = "/content/drive/MyDrive/gw_test_empty.csv"

df_submission_lgbm, df_smape_lgbm = populate_test_data(processed_data_dir)
# df_submission.to_csv("df_submission.csv", index=False)
# df_smape.to_csv("smape_score.csv", index=False)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  5%|▌         | 1/20 [00:04<01:21,  4.26s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 9, 'min_data_in_leaf': 408, 'learning_rate': 0.05151409102341216, 'feature_fraction': 1.0, 'max_bin': 250, 'reg_alpha': 0.4, 'reg_lambda': 0.1}
  Backtesting metric: 0.0028209927694297756



  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 900, 'max_depth': 8, 'min_data_in_leaf': 25, 'learning_rate': 0.09808714723656196, 'feature_fraction': 0.6, 'max_bin': 150, 'reg_alpha': 0.2, 'reg_lambda': 0.8}
  Backtesting metric: 0.00014637267861973758



 10%|█         | 2/20 [00:06<00:54,  3.02s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 15%|█▌        | 3/20 [00:09<00:52,  3.11s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 7, 'min_data_in_leaf': 342, 'learning_rate': 0.06189515767783878, 'feature_fraction': 0.5, 'max_bin': 100, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.9}
  Backtesting metric: 0.0017201921810284914



  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 7, 'min_data_in_leaf': 342, 'learning_rate': 0.06189515767783878, 'feature_fraction': 0.5, 'max_bin': 100, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.9}
  Backtesting metric: 0.0005276728042629041



 20%|██        | 4/20 [00:13<00:55,  3.48s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 900, 'max_depth': 8, 'min_data_in_leaf': 25, 'learning_rate': 0.09808714723656196, 'feature_fraction': 0.6, 'max_bin': 150, 'reg_alpha': 0.2, 'reg_lambda': 0.8}
  Backtesting metric: 0.0010515876019023943



 25%|██▌       | 5/20 [00:16<00:46,  3.09s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 30%|███       | 6/20 [00:18<00:37,  2.71s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 800, 'max_depth': 8, 'min_data_in_leaf': 85, 'learning_rate': 0.4297920688485764, 'feature_fraction': 1.0, 'max_bin': 125, 'reg_alpha': 0.9, 'reg_lambda': 1.0}
  Backtesting metric: 0.0018978145044446662



  0%|          | 0/20 [00:00<?, ?it/s]

 35%|███▌      | 7/20 [00:22<00:42,  3.24s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 400, 'max_depth': 8, 'min_data_in_leaf': 194, 'learning_rate': 0.13332771976648977, 'feature_fraction': 0.6, 'max_bin': 150, 'reg_alpha': 1.0, 'reg_lambda': 0.4}
  Backtesting metric: 0.0013688040347614356



  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1200, 'max_depth': 10, 'min_data_in_leaf': 47, 'learning_rate': 0.013949960267970019, 'feature_fraction': 1.0, 'max_bin': 50, 'reg_alpha': 0.0, 'reg_lambda': 0.2}
  Backtesting metric: 0.00037188609407542365



 40%|████      | 8/20 [00:26<00:43,  3.67s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1000, 'max_depth': 6, 'min_data_in_leaf': 53, 'learning_rate': 0.2050416851119114, 'feature_fraction': 0.9, 'max_bin': 75, 'reg_alpha': 0.1, 'reg_lambda': 0.5}
  Backtesting metric: 0.0010445037800371787



 45%|████▌     | 9/20 [00:29<00:38,  3.46s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 900, 'max_depth': 3, 'min_data_in_leaf': 25, 'learning_rate': 0.09266753030469671, 'feature_fraction': 0.6, 'max_bin': 225, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.0}
  Backtesting metric: 0.0009230126427751498



 50%|█████     | 10/20 [00:37<00:45,  4.59s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1200, 'max_depth': 10, 'min_data_in_leaf': 31, 'learning_rate': 0.017627004844245255, 'feature_fraction': 1.0, 'max_bin': 50, 'reg_alpha': 0.0, 'reg_lambda': 0.2}
  Backtesting metric: 0.00027614021957600963



 55%|█████▌    | 11/20 [00:43<00:45,  5.08s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 60%|██████    | 12/20 [00:46<00:37,  4.66s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1200, 'max_depth': 9, 'min_data_in_leaf': 151, 'learning_rate': 0.26589150150237945, 'feature_fraction': 0.6, 'max_bin': 150, 'reg_alpha': 0.9, 'reg_lambda': 0.0}
  Backtesting metric: 0.0003446468848262791



  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 800, 'max_depth': 8, 'min_data_in_leaf': 28, 'learning_rate': 0.130475871043031, 'feature_fraction': 1.0, 'max_bin': 75, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.0}
  Backtesting metric: 0.00037625451383366384



 65%|██████▌   | 13/20 [00:51<00:33,  4.76s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1100, 'max_depth': 3, 'min_data_in_leaf': 69, 'learning_rate': 0.22476099797953217, 'feature_fraction': 0.8, 'max_bin': 100, 'reg_alpha': 0.2, 'reg_lambda': 0.0}
  Backtesting metric: 0.0004874252520617758



 70%|███████   | 14/20 [00:54<00:25,  4.21s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 75%|███████▌  | 15/20 [00:57<00:19,  3.86s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 7, 'min_data_in_leaf': 342, 'learning_rate': 0.06189515767783878, 'feature_fraction': 0.5, 'max_bin': 100, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.9}
  Backtesting metric: 0.0006862424344243744



  0%|          | 0/20 [00:00<?, ?it/s]

 80%|████████  | 16/20 [01:01<00:14,  3.71s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 9, 'min_data_in_leaf': 408, 'learning_rate': 0.05151409102341216, 'feature_fraction': 1.0, 'max_bin': 250, 'reg_alpha': 0.4, 'reg_lambda': 0.1}
  Backtesting metric: 0.0022477802634600502



  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 800, 'max_depth': 8, 'min_data_in_leaf': 28, 'learning_rate': 0.130475871043031, 'feature_fraction': 1.0, 'max_bin': 75, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.0}
  Backtesting metric: 0.0015243874936501037



 85%|████████▌ | 17/20 [01:03<00:10,  3.35s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 90%|█████████ | 18/20 [01:06<00:06,  3.24s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 3, 'min_data_in_leaf': 425, 'learning_rate': 0.23265642115398555, 'feature_fraction': 0.8, 'max_bin': 125, 'reg_alpha': 0.4, 'reg_lambda': 1.0}
  Backtesting metric: 0.00027939354713483046



  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 9, 'min_data_in_leaf': 198, 'learning_rate': 0.3190142353753449, 'feature_fraction': 0.6, 'max_bin': 150, 'reg_alpha': 1.0, 'reg_lambda': 0.2}
  Backtesting metric: 0.0010031936310860347



 95%|█████████▌| 19/20 [01:10<00:03,  3.26s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1100, 'max_depth': 3, 'min_data_in_leaf': 87, 'learning_rate': 0.24203460874368538, 'feature_fraction': 0.9, 'max_bin': 225, 'reg_alpha': 0.8, 'reg_lambda': 0.4}
  Backtesting metric: 0.0028786271903392022



100%|██████████| 20/20 [01:13<00:00,  3.69s/it]

> Done





In [None]:
df_smape_lgbm.index = pd.Index(['smape'])
print(f" average smape: {round(df_smape_lgbm.mean(axis=1)[0],4)}")
df_smape_lgbm


 average smape: 0.1377


Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
smape,0.313,0.026,0.13,0.072,0.082,0.235,0.126,0.03,0.171,0.088,0.049,0.064,0.07,0.046,0.058,0.447,0.193,0.042,0.093,0.419


In [None]:
df_submission_lgbm.index = pd.date_range(start=pd.Timestamp("2021-12-01") + pd.DateOffset(months=1), periods=26, freq='MS')
df_submission_lgbm

Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
2022-01-01,153.89,399.97,238.26,200.74,224.31,230.86,242.54,510.75,328.43,492.98,638.33,424.73,658.55,251.62,283.86,113.63,120.67,181.15,144.05,140.06
2022-02-01,153.89,400.0,238.26,200.71,224.38,230.76,242.46,510.81,328.39,493.27,638.32,424.62,658.54,251.68,283.86,113.63,120.68,181.15,144.07,140.29
2022-03-01,153.99,399.97,238.26,200.72,224.33,230.61,242.53,510.88,328.38,493.44,638.42,424.62,658.59,251.64,283.86,113.63,120.74,181.15,144.08,140.68
2022-04-01,154.02,399.99,238.26,200.68,224.22,230.51,242.84,511.05,328.16,493.66,638.72,424.64,658.59,251.61,283.86,113.63,120.83,181.15,144.08,140.48
2022-05-01,154.0,400.0,238.26,200.66,224.4,230.8,243.14,511.18,328.39,493.41,638.95,424.6,658.53,251.63,283.86,113.63,120.87,181.15,144.08,140.52
2022-06-01,153.95,400.08,238.26,200.62,224.5,230.9,243.35,511.21,328.42,492.76,639.07,424.59,658.55,251.69,283.86,113.63,120.92,181.15,144.07,140.4
2022-07-01,153.86,400.16,238.26,200.64,224.6,230.99,243.3,511.08,328.54,492.57,639.08,424.6,658.67,251.68,283.86,113.63,120.92,181.15,144.09,139.75
2022-08-01,153.86,400.16,238.26,200.63,224.56,230.98,243.28,511.01,328.89,492.93,639.02,424.6,658.81,251.65,283.86,113.63,120.82,181.15,144.1,140.42
2022-09-01,153.85,400.16,238.26,200.65,224.5,230.91,243.18,510.93,329.08,493.2,639.02,424.61,658.95,251.74,283.86,113.63,120.71,181.15,144.09,140.08
2022-10-01,153.76,400.16,238.26,200.7,224.51,231.07,242.53,510.87,329.16,493.27,638.91,424.6,658.96,251.74,283.86,113.63,120.64,181.15,144.11,139.69


## Trying XGBoost Model for Comparison

In [None]:
# Regressor hyperparameters search space for xgboost
def search_space(trial):

  # Lags grid
  lags_grid = tuple([12, [1, 2, 3, 4, 7, 9, 12]])

  search_space  = {
      'n_estimators'    : trial.suggest_int('n_estimators', 400, 1200, step=100),
      'max_depth'       : trial.suggest_int('max_depth', 3, 10, step=1),
      'learning_rate'   : trial.suggest_float('learning_rate', 0.01, 1),
      'subsample'       : trial.suggest_float('subsample', 0.1, 1),
      'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
      'gamma'           : trial.suggest_float('gamma', 0, 1),
      'reg_alpha'       : trial.suggest_float('reg_alpha', 0, 1),
      'reg_lambda'      : trial.suggest_float('reg_lambda', 0, 1),
      'lags'            : trial.suggest_categorical('lags', lags_grid)
  }
  return search_space

In [None]:
def search_hyperparameters(data, end_train, end_valid, exog_features,transformer_exog):

  # instantiate a forcaster transformer with categorical features
  forecaster = ForecasterAutoreg(
  regressor = XGBRegressor(
                  tree_method = 'hist',
                  enable_categorical = True,
                  random_state = 123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # search for best parameters
  results_search, frozen_trial = bayesian_search_forecaster(
  forecaster         = forecaster,
  y                  = data.loc[:end_valid, 'gw-level'],
  exog               = data.loc[:end_valid, exog_features],
  search_space       = search_space,
  steps              = 30,
  refit              = False,
  metric             = 'mean_absolute_percentage_error',
  initial_train_size = len(data.loc[:end_train]),
  fixed_train_size   = False,
  n_trials           = 20,
  random_state       = 123,
  return_best        = True,
  n_jobs             = 'auto',
  verbose            = False,
  show_progress      = True
  )

  best_params = results_search['params'].iat[0]

  return best_params

def train_and_predict(data, best_params, actual_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog, hrbnz01):

  # train for evaluation of the model
  forecaster = ForecasterAutoreg(
  regressor = XGBRegressor(**best_params,
                  tree_method = 'hist',
                  enable_categorical = True,
                  random_state = 123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_train, 'gw-level'],
    exog = data.loc[:end_train, exog_features]
  )

  # make predictions and evalute the model
  predictions = forecaster.predict(
      exog     = data.loc[df_idx[train_num+1]:, exog_features],
      steps    = 26
  )
  df_preds = pd.DataFrame(predictions)
  preds = df_preds["pred"].values
  # evaluate on symmetric mean absolute percentage error
  smape_value = smape(actual_data, preds)

  # train for future predictions
  forecaster = ForecasterAutoreg(
  regressor = XGBRegressor(**best_params,
                  tree_method = 'hist',
                  enable_categorical = True,
                  random_state = 123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )
  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_valid, 'gw-level'],
    exog = data.loc[:end_valid, exog_features]
  )

  # make predictions into the future
  predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 26
  )
  df_preds = pd.DataFrame(predictions)

  # free resources since it's going to run on iterations
  del forecaster

  return df_preds, smape_value

In [None]:
processed_data_dir = "/content/drive/MyDrive/processed_data_part1"
#test_template = "/content/drive/MyDrive/gw_test_empty.csv"

df_submission_xgb, df_smape_xgb = populate_test_data(processed_data_dir)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 5, 'learning_rate': 0.03885337722243354, 'subsample': 0.3604262920151898, 'colsample_bytree': 0.9501979211086962, 'gamma': 0.994411218105991, 'reg_alpha': 0.003677635938920898, 'reg_lambda': 0.47886494132474805}
  Backtesting metric: 0.0024727797496551145



  5%|▌         | 1/20 [00:09<03:06,  9.80s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1200, 'max_depth': 4, 'learning_rate': 0.9635955935176932, 'subsample': 0.9931518759512354, 'colsample_bytree': 0.9845292840882569, 'gamma': 0.004802254183477721, 'reg_alpha': 0.82769454819645, 'reg_lambda': 0.40138295165893406}
  Backtesting metric: 0.00018873325345216215



 10%|█         | 2/20 [00:16<02:18,  7.69s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.5588394172205944, 'subsample': 0.4500555167108301, 'colsample_bytree': 0.9326192406525875, 'gamma': 0.8416699969127163, 'reg_alpha': 0.35739756668317624, 'reg_lambda': 0.04359146379904055}
  Backtesting metric: 0.001364044338959121



 15%|█▌        | 3/20 [00:24<02:16,  8.01s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.6286544670746439, 'subsample': 0.7072201458890424, 'colsample_bytree': 0.8581081938582316, 'gamma': 0.08319498833243877, 'reg_alpha': 0.7636828414433382, 'reg_lambda': 0.243666374536874}
  Backtesting metric: 0.0004934320216603084



 20%|██        | 4/20 [00:34<02:21,  8.82s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.6070295271268181, 'subsample': 0.5905612058198184, 'colsample_bytree': 0.4084874503968776, 'gamma': 0.3041207890271841, 'reg_alpha': 0.4170222110247016, 'reg_lambda': 0.6813007657927966}
  Backtesting metric: 0.0010156338625526625



 25%|██▌       | 5/20 [00:39<01:51,  7.45s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.4365541356963474, 'subsample': 0.5443165878852756, 'colsample_bytree': 0.4832472612662452, 'gamma': 0.3122612229724653, 'reg_alpha': 0.4263513069628082, 'reg_lambda': 0.8933891631171348}
  Backtesting metric: 0.0016981307398602208



 30%|███       | 6/20 [00:46<01:43,  7.40s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 400, 'max_depth': 10, 'learning_rate': 0.6309764823307561, 'subsample': 0.7510747223709593, 'colsample_bytree': 0.11451628602551515, 'gamma': 0.5944318794450425, 'reg_alpha': 0.5567851923942887, 'reg_lambda': 0.15895964414472274}
  Backtesting metric: 0.0012291758486180772



 35%|███▌      | 7/20 [00:52<01:30,  6.92s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 900, 'max_depth': 3, 'learning_rate': 0.3241126270021177, 'subsample': 0.4733435907582686, 'colsample_bytree': 0.8796782420950293, 'gamma': 0.2504553653965067, 'reg_alpha': 0.48303426426270435, 'reg_lambda': 0.985559785610705}
  Backtesting metric: 0.00035032241990334096



 40%|████      | 8/20 [01:02<01:33,  7.76s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.716445645908776, 'subsample': 0.8900162292032923, 'colsample_bytree': 0.19539649195672654, 'gamma': 0.5347841567472275, 'reg_alpha': 0.5933458686904657, 'reg_lambda': 0.3697633333698499}
  Backtesting metric: 0.0008641986667916418



 45%|████▌     | 9/20 [01:09<01:22,  7.52s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.36235571706028497, 'subsample': 0.7862930324068904, 'colsample_bytree': 0.633859224905999, 'gamma': 0.6917017987001771, 'reg_alpha': 0.15112745234808023, 'reg_lambda': 0.39887629272615654}
  Backtesting metric: 0.0008747064436237145



 50%|█████     | 10/20 [01:23<01:35,  9.58s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 900, 'max_depth': 3, 'learning_rate': 0.3241126270021177, 'subsample': 0.4733435907582686, 'colsample_bytree': 0.8796782420950293, 'gamma': 0.2504553653965067, 'reg_alpha': 0.48303426426270435, 'reg_lambda': 0.985559785610705}
  Backtesting metric: 0.0002808856282664418



 55%|█████▌    | 11/20 [01:36<01:37, 10.79s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.9334748525988541, 'subsample': 0.3711269205958685, 'colsample_bytree': 0.8674225282971953, 'gamma': 0.20640951987163517, 'reg_alpha': 0.7475181148430462, 'reg_lambda': 0.5964471436097882}
  Backtesting metric: 0.0003647271239190308



 60%|██████    | 12/20 [01:45<01:21, 10.15s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 700, 'max_depth': 4, 'learning_rate': 0.062164887147611814, 'subsample': 0.29984426160555844, 'colsample_bytree': 0.2630057481431337, 'gamma': 0.18003233813661032, 'reg_alpha': 0.8276293806194424, 'reg_lambda': 0.8153289322828513}
  Backtesting metric: 0.000388545093524128



 65%|██████▌   | 13/20 [01:52<01:04,  9.25s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1100, 'max_depth': 4, 'learning_rate': 0.07293115310394749, 'subsample': 0.8532355988729861, 'colsample_bytree': 0.30794140712521173, 'gamma': 0.7139907623831118, 'reg_alpha': 0.13667601432350507, 'reg_lambda': 0.6677112387051197}
  Backtesting metric: 0.0004762312715448514



 70%|███████   | 14/20 [02:00<00:53,  8.87s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1200, 'max_depth': 5, 'learning_rate': 0.1045412986135949, 'subsample': 0.10444733224469205, 'colsample_bytree': 0.6712837902590467, 'gamma': 0.5308709646907458, 'reg_alpha': 0.22251405875025332, 'reg_lambda': 0.549107141488361}
  Backtesting metric: 0.0005903052494191676



 75%|███████▌  | 15/20 [02:07<00:40,  8.16s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.2345829390285611, 'subsample': 0.5961832921746021, 'colsample_bytree': 0.7475220728070068, 'gamma': 0.42310646012446096, 'reg_alpha': 0.9807641983846155, 'reg_lambda': 0.6848297385848633}
  Backtesting metric: 0.0018780882690507



 80%|████████  | 16/20 [02:16<00:33,  8.41s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1100, 'max_depth': 7, 'learning_rate': 0.45738137714834964, 'subsample': 0.9143173969435872, 'colsample_bytree': 0.3492488766779608, 'gamma': 0.5216458048577044, 'reg_alpha': 0.8155811080668112, 'reg_lambda': 0.5437497637303044}
  Backtesting metric: 0.0013576942335224284



 85%|████████▌ | 17/20 [02:25<00:26,  8.76s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1200, 'max_depth': 4, 'learning_rate': 0.9711475622597028, 'subsample': 0.27916938006965974, 'colsample_bytree': 0.28920578495729643, 'gamma': 0.958252915298607, 'reg_alpha': 0.0030847484326094795, 'reg_lambda': 0.431957294952924}
  Backtesting metric: 0.00027497876529941406



 90%|█████████ | 18/20 [02:34<00:17,  8.77s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.5748812332498305, 'subsample': 0.38540893687529965, 'colsample_bytree': 0.6471654261637358, 'gamma': 0.8151513169462702, 'reg_alpha': 0.6056107547390692, 'reg_lambda': 0.6714023329308223}
  Backtesting metric: 0.000970008530755651



 95%|█████████▌| 19/20 [02:40<00:07,  7.82s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.115434619925389, 'subsample': 0.7636067026603856, 'colsample_bytree': 0.3513624942452013, 'gamma': 0.49302435382809684, 'reg_alpha': 0.1860900286451497, 'reg_lambda': 0.7240813648907406}
  Backtesting metric: 0.003065588118081405



100%|██████████| 20/20 [02:49<00:00,  8.47s/it]

> Done





In [None]:
df_smape_xgb.index = pd.Index(['smape'])
print(f" average smape: {round(df_smape_xgb.mean(axis=1)[0],4)}")
df_smape_xgb

 average smape: 0.1213


Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
smape,0.107,0.022,0.138,0.064,0.095,0.243,0.14,0.037,0.096,0.063,0.044,0.067,0.057,0.048,0.07,0.443,0.218,0.038,0.114,0.322


In [None]:
df_submission_xgb.index = pd.date_range(start=pd.Timestamp("2021-12-01") + pd.DateOffset(months=1), periods=26, freq='MS')
df_submission_xgb

Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
2022-01-01,153.93,400.01,238.1,200.75,224.48,230.6,242.35,510.77,328.43,492.36,638.41,424.8,658.66,251.77,283.82,113.31,120.81,181.16,143.99,140.2
2022-02-01,153.92,400.01,238.11,200.79,224.48,230.54,242.82,510.81,328.43,492.85,638.41,424.8,658.65,251.75,283.78,113.35,120.91,181.16,143.99,140.5
2022-03-01,153.92,400.01,238.11,200.84,224.48,230.62,242.48,510.98,328.43,493.47,638.44,424.8,658.69,251.75,283.73,113.41,120.99,181.16,143.99,140.62
2022-04-01,153.92,400.01,238.04,200.89,224.38,230.63,243.01,511.04,328.43,493.32,638.65,424.8,658.75,251.75,283.71,113.43,120.99,181.16,143.99,140.47
2022-05-01,153.91,400.03,238.04,200.84,224.54,230.67,243.43,511.14,328.36,493.19,638.9,424.8,658.75,251.75,283.68,113.41,120.95,181.16,143.99,140.63
2022-06-01,153.9,400.03,238.04,200.71,224.55,230.75,244.02,511.18,328.36,493.35,639.17,424.87,658.79,251.75,283.71,113.41,120.89,181.16,143.99,140.62
2022-07-01,153.83,400.06,238.04,200.65,224.56,230.65,243.81,511.11,328.26,493.01,639.11,424.8,658.89,251.75,283.81,113.39,120.71,181.16,143.99,140.52
2022-08-01,153.72,400.06,238.04,200.59,224.6,230.57,243.71,510.95,328.44,492.81,639.13,424.8,658.95,251.75,283.84,113.37,120.78,181.16,143.99,140.24
2022-09-01,153.64,400.03,238.04,200.63,224.54,230.8,243.67,510.91,328.44,493.05,639.11,424.8,658.94,251.75,283.84,113.37,120.71,181.16,143.99,140.04
2022-10-01,153.6,400.01,238.04,200.77,224.6,230.58,242.76,510.82,328.34,493.54,638.98,424.87,658.91,251.75,283.75,113.37,120.61,181.16,143.99,139.92


## HistGradientBoostingRegressor Model

In [None]:
# one-hot encoding
categorical_features = ["weather", "season"]
transformer_exog = make_column_transformer(
    (
        OrdinalEncoder(
            dtype=int,
            handle_unknown="use_encoded_value",
            unknown_value=-1,
            encoded_missing_value=-1
        ),
        categorical_features
    ),
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")


# Regressor hyperparameters search space
def search_space(trial):

  # Lags grid
  lags_grid = tuple([12, 24, [1, 2, 3, 4, 7, 9, 24]])

  search_space  = {
      'max_iter'          : trial.suggest_int('max_iter', 400, 1200, step=100),
      'max_depth'         : trial.suggest_int('max_depth', 3, 10, step=1),
      'learning_rate'     : trial.suggest_float('learning_rate', 0.01, 1),
      'min_samples_leaf'  : trial.suggest_int('min_samples_leaf', 1, 20, step=1),
      'l2_regularization' : trial.suggest_float('l2_regularization', 0, 1),
      'lags'              : trial.suggest_categorical('lags', lags_grid)
  }
  return search_space

In [None]:
def search_hyperparameters(data, end_train, end_valid, exog_features,transformer_exog):

  # instantiate a forcaster transformer with categorical features
  forecaster = ForecasterAutoreg(
  regressor = HistGradientBoostingRegressor(
                  categorical_features=categorical_features,
                  random_state=123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # search for best parameters
  results_search, frozen_trial = bayesian_search_forecaster(
  forecaster         = forecaster,
  y                  = data.loc[:end_valid, 'gw-level'],
  exog               = data.loc[:end_valid, exog_features],
  search_space       = search_space,
  steps              = 30,
  refit              = False,
  metric             = 'mean_absolute_percentage_error',
  initial_train_size = len(data.loc[:end_train]),
  fixed_train_size   = False,
  n_trials           = 20,
  random_state       = 123,
  return_best        = True,
  n_jobs             = 'auto',
  verbose            = False,
  show_progress      = True
  )

  best_params = results_search['params'].iat[0]

  return best_params

def train_and_predict(data, best_params, actual_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog, hrbnz01):

  # train for evaluation of the model
  forecaster = ForecasterAutoreg(
  regressor = HistGradientBoostingRegressor(**best_params,
                  categorical_features=categorical_features,
                  random_state=123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_train, 'gw-level'],
    exog = data.loc[:end_train, exog_features]
  )

  # make predictions and evalute the model
  predictions = forecaster.predict(
      exog     = data.loc[df_idx[train_num+1]:, exog_features],
      steps    = 26
  )
  df_preds = pd.DataFrame(predictions)
  preds = df_preds["pred"].values
  # evaluate on symmetric mean absolute percentage error
  smape_value = smape(actual_data, preds)

  # train for future predictions
  forecaster = ForecasterAutoreg(
  regressor = HistGradientBoostingRegressor(**best_params,
                  categorical_features=categorical_features,
                  random_state=123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )
  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_valid, 'gw-level'],
    exog = data.loc[:end_valid, exog_features]
  )

  # make predictions into the future
  predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 26
  )
  df_preds = pd.DataFrame(predictions)

  # free resources since it's going to run on iterations
  del forecaster

  return df_preds, smape_value

In [None]:
processed_data_dir = "/content/drive/MyDrive/processed_data_part1"
#test_template = "/content/drive/MyDrive/gw_test_empty.csv"

df_submission_hist, df_smape_hist = populate_test_data(processed_data_dir)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 600, 'max_depth': 5, 'learning_rate': 0.4519706078090782, 'min_samples_leaf': 6, 'l2_regularization': 0.34020419296196963}
  Backtesting metric: 0.002938385499659992



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

  5%|▌         | 1/20 [01:24<26:39, 84.17s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 1200, 'max_depth': 8, 'learning_rate': 0.18276654663052466, 'min_samples_leaf': 17, 'l2_regularization': 0.04698204281398806}
  Backtesting metric: 0.00012850260683190836



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 10%|█         | 2/20 [02:00<16:46, 55.94s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 400, 'max_depth': 9, 'learning_rate': 0.5261866428633912, 'min_samples_leaf': 3, 'l2_regularization': 0.055541818980892166}
  Backtesting metric: 0.0015162278560034773



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 15%|█▌        | 3/20 [02:57<16:03, 56.70s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 600, 'max_depth': 7, 'learning_rate': 0.6537237372640594, 'min_samples_leaf': 1, 'l2_regularization': 0.40843637127904064}
  Backtesting metric: 0.0006388529177577409



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 20%|██        | 4/20 [04:25<18:23, 68.98s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 500, 'max_depth': 4, 'learning_rate': 0.53623586010342, 'min_samples_leaf': 11, 'l2_regularization': 0.6344009585513211}
  Backtesting metric: 0.0009199587756225745



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 25%|██▌       | 5/20 [05:03<14:26, 57.76s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 1000, 'max_depth': 5, 'learning_rate': 0.6384304097812739, 'min_samples_leaf': 1, 'l2_regularization': 0.4114766328851145}
  Backtesting metric: 0.001955080191282731



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 30%|███       | 6/20 [05:46<12:17, 52.67s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 1200, 'max_depth': 8, 'learning_rate': 0.7280316377630004, 'min_samples_leaf': 4, 'l2_regularization': 0.3729114345009181}
  Backtesting metric: 0.0013383574859107247



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 35%|███▌      | 7/20 [07:06<13:21, 61.63s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 400, 'max_depth': 9, 'learning_rate': 0.01486161582041232, 'min_samples_leaf': 20, 'l2_regularization': 0.9866314508139888}
  Backtesting metric: 0.00033828131687488555



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 40%|████      | 8/20 [08:04<12:05, 60.47s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'max_iter': 1200, 'max_depth': 4, 'learning_rate': 0.7656875868906424, 'min_samples_leaf': 1, 'l2_regularization': 0.18414371240706717}
  Backtesting metric: 0.0008632819711655363



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 45%|████▌     | 9/20 [08:50<10:15, 55.93s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 600, 'max_depth': 10, 'learning_rate': 0.6384304097812739, 'min_samples_leaf': 16, 'l2_regularization': 0.802707065496886}
  Backtesting metric: 0.0008860104061300407



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 50%|█████     | 10/20 [10:07<10:23, 62.39s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 1100, 'max_depth': 9, 'learning_rate': 0.18326353204809023, 'min_samples_leaf': 20, 'l2_regularization': 0.07872415857720039}
  Backtesting metric: 0.00028873848312478895



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 55%|█████▌    | 11/20 [11:29<10:17, 68.58s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 700, 'max_depth': 6, 'learning_rate': 0.43157198739286967, 'min_samples_leaf': 7, 'l2_regularization': 0.4263513069628082}
  Backtesting metric: 0.0004120374818869792



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 60%|██████    | 12/20 [12:23<08:32, 64.08s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 700, 'max_depth': 7, 'learning_rate': 0.15177273236747227, 'min_samples_leaf': 17, 'l2_regularization': 0.40843637127904064}
  Backtesting metric: 0.0003201731167706919



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 65%|██████▌   | 13/20 [13:30<07:34, 64.89s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 1000, 'max_depth': 5, 'learning_rate': 0.2345829390285611, 'min_samples_leaf': 12, 'l2_regularization': 0.7194689697855631}
  Backtesting metric: 0.0006170840490590648



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 70%|███████   | 14/20 [14:26<06:12, 62.09s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'max_iter': 900, 'max_depth': 8, 'learning_rate': 0.025967914628066663, 'min_samples_leaf': 12, 'l2_regularization': 0.5567851923942887}
  Backtesting metric: 0.000619938849016912



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 75%|███████▌  | 15/20 [15:24<05:04, 60.89s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 1000, 'max_depth': 5, 'learning_rate': 0.368170769066091, 'min_samples_leaf': 5, 'l2_regularization': 0.29371404638882936}
  Backtesting metric: 0.001955721131894422



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 80%|████████  | 16/20 [16:35<04:16, 64.18s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 1000, 'max_depth': 5, 'learning_rate': 0.2345829390285611, 'min_samples_leaf': 12, 'l2_regularization': 0.7194689697855631}
  Backtesting metric: 0.0013109252830416551



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 85%|████████▌ | 17/20 [17:19<02:54, 58.09s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 1100, 'max_depth': 9, 'learning_rate': 0.12501637561856965, 'min_samples_leaf': 4, 'l2_regularization': 0.012283918831203056}
  Backtesting metric: 0.0003119072691790466



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 90%|█████████ | 18/20 [18:55<02:18, 69.43s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 400, 'max_depth': 4, 'learning_rate': 0.017271592861447777, 'min_samples_leaf': 14, 'l2_regularization': 0.27150582012182334}
  Backtesting metric: 0.0011238902124440986



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

 95%|█████████▌| 19/20 [19:53<01:05, 65.96s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'max_iter': 800, 'max_depth': 6, 'learning_rate': 0.3497462359893607, 'min_samples_leaf': 15, 'l2_regularization': 0.4385722446796244}
  Backtesting metric: 0.002922871102444139



Parameters: { "l2_regularization", "max_iter", "min_samples_leaf" } are not used.

100%|██████████| 20/20 [20:48<00:00, 62.44s/it]

> Done





In [None]:
df_smape_hist.index = pd.Index(['smape'])
print(f" average smape: {round(df_smape_hist.mean(axis=1)[0],4)}")
df_smape_hist

 average smape: 0.12


Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
smape,0.123,0.031,0.116,0.074,0.11,0.198,0.086,0.034,0.171,0.071,0.04,0.072,0.067,0.061,0.076,0.503,0.12,0.038,0.118,0.292


In [None]:
df_submission_hist.index = pd.date_range(start=pd.Timestamp("2021-12-01") + pd.DateOffset(months=1), periods=26, freq='MS')
df_submission_hist

Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
2022-01-01,153.87,399.95,237.82,200.75,224.43,230.72,242.06,510.71,328.21,492.57,638.34,424.77,658.53,251.62,283.87,113.24,120.68,181.14,143.99,139.98
2022-02-01,153.78,399.91,237.71,200.88,224.51,230.69,242.09,510.66,328.28,492.4,638.32,424.68,658.66,251.61,283.85,113.27,120.87,181.15,143.99,140.15
2022-03-01,153.61,399.95,237.66,200.95,224.62,230.63,242.15,510.54,328.79,492.68,638.4,424.7,658.69,251.62,283.76,113.3,120.97,181.16,143.94,140.37
2022-04-01,153.47,399.96,237.64,200.9,224.59,230.67,242.59,510.67,328.74,492.85,638.72,424.6,658.88,251.66,283.78,113.28,120.95,181.15,143.91,140.7
2022-05-01,152.92,399.97,237.66,200.85,224.62,230.72,243.48,510.67,328.8,493.05,638.89,424.64,659.03,251.68,283.85,113.21,120.86,181.17,143.89,140.56
2022-06-01,152.8,399.99,237.66,200.81,224.68,230.76,243.76,510.65,328.73,493.13,639.0,424.63,659.01,251.68,283.94,113.19,120.84,181.19,143.85,140.45
2022-07-01,152.77,400.04,237.63,200.74,224.72,230.79,243.65,510.46,328.71,492.79,639.1,424.76,659.04,251.64,284.22,113.23,120.72,181.21,143.9,140.42
2022-08-01,152.75,400.09,237.53,200.74,224.77,230.87,243.9,510.39,329.1,492.55,639.19,424.81,659.07,251.64,284.06,113.23,120.64,181.2,143.89,140.46
2022-09-01,152.66,400.19,237.58,200.74,224.91,230.84,243.83,510.35,329.27,492.77,639.04,424.82,659.11,251.65,283.89,113.23,120.6,181.21,143.93,140.14
2022-10-01,153.22,400.21,237.65,200.74,224.94,231.5,243.1,510.5,329.41,493.28,638.98,424.8,659.08,251.66,283.82,113.24,120.54,181.21,143.89,140.01


## Catboost Model

In [None]:
# one-hot encoding
transformer_exog = make_column_transformer(
    (
        OneHotEncoder(sparse_output=False, drop='if_binary'),
        make_column_selector(dtype_exclude=np.number),
    ),
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")


# Regressor hyperparameters search space
def search_space(trial):
  lags_grid = tuple([12, 24, [1, 2, 3, 4, 7, 9, 24]])
  search_space  = {
      'n_estimators'  : trial.suggest_int('n_estimators', 100, 1000, step=100),
      'max_depth'     : trial.suggest_int('max_depth', 3, 10, step=1),
      'learning_rate' : trial.suggest_float('learning_rate', 0.01, 1),
      'lags'          : trial.suggest_categorical('lags', lags_grid)
  }
  return search_space

  and should_run_async(code)


In [None]:
def search_hyperparameters(data, end_train, end_valid, exog_features,transformer_exog):

  # instantiate a forcaster transformer with categorical features
  forecaster = ForecasterAutoreg(
  regressor = CatBoostRegressor(
                  random_state=123,
                  silent=True,
                  allow_writing_files=False,
                  boosting_type = 'Plain', # Faster training
                  leaf_estimation_iterations = 3, # Faster training
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )


  # search for best parameters
  results_search, frozen_trial = bayesian_search_forecaster(
  forecaster         = forecaster,
  y                  = data.loc[:end_valid, 'gw-level'],
  exog               = data.loc[:end_valid, exog_features],
  search_space       = search_space,
  steps              = 30,
  refit              = False,
  metric             = 'mean_absolute_percentage_error',
  initial_train_size = len(data.loc[:end_train]),
  fixed_train_size   = False,
  n_trials           = 20,
  random_state       = 123,
  return_best        = True,
  n_jobs             = 'auto',
  verbose            = False,
  show_progress      = True
  )

  best_params = results_search['params'].iat[0]

  return best_params

def train_and_predict(data, best_params, actual_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog, hrbnz01):

  # train for evaluation of the model
  forecaster = ForecasterAutoreg(
  regressor = CatBoostRegressor(**best_params,
                  random_state=123,
                  silent=True,
                  allow_writing_files=False,
                  boosting_type = 'Plain', # Faster training
                  leaf_estimation_iterations = 3, # Faster training
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_train, 'gw-level'],
    exog = data.loc[:end_train, exog_features]
  )

  # make predictions and evalute the model
  predictions = forecaster.predict(
      exog     = data.loc[df_idx[train_num+1]:, exog_features],
      steps    = 26
  )
  df_preds = pd.DataFrame(predictions)
  preds = df_preds["pred"].values
  # evaluate on symmetric mean absolute percentage error
  smape_value = smape(actual_data, preds)

  # train for future predictions
  forecaster = ForecasterAutoreg(
  regressor = CatBoostRegressor(**best_params,
                  random_state=123,
                  silent=True,
                  allow_writing_files=False,
                  boosting_type = 'Plain', # Faster training
                  leaf_estimation_iterations = 3, # Faster training
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_valid, 'gw-level'],
    exog = data.loc[:end_valid, exog_features]
  )

  # make predictions into the future
  predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 26
  )
  df_preds = pd.DataFrame(predictions)

  # free resources since it's going to run on iterations
  del forecaster

  return df_preds, smape_value

  and should_run_async(code)


In [None]:
processed_data_dir = "/content/drive/MyDrive/processed_data_part1"
#test_template = "/content/drive/MyDrive/gw_test_empty.csv"

df_submission_cat, df_smape_cat = populate_test_data(processed_data_dir)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.022161079642891046}
  Backtesting metric: 0.0025810083203441506



  5%|▌         | 1/20 [01:00<19:11, 60.60s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.1294223793304205}
  Backtesting metric: 0.00014122081379716218



 10%|█         | 2/20 [01:52<16:41, 55.64s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.6852962564188632}
  Backtesting metric: 0.0018363686699477107



 15%|█▌        | 3/20 [02:38<14:26, 50.99s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.6307902614946155}
  Backtesting metric: 0.0005978478729128374



 20%|██        | 4/20 [03:53<16:11, 60.70s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.6380569489658079}
  Backtesting metric: 0.0009249799487675019



 25%|██▌       | 5/20 [05:12<16:47, 67.16s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.5760367810196446}
  Backtesting metric: 0.0016484803001947663



 30%|███       | 6/20 [06:22<15:54, 68.20s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.1294223793304205}
  Backtesting metric: 0.001478164670098756



 35%|███▌      | 7/20 [07:58<16:44, 77.27s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.4365541356963474}
  Backtesting metric: 0.00039241269387616233



 40%|████      | 8/20 [08:50<13:48, 69.08s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.4040638127771271}
  Backtesting metric: 0.0006960520968033341



 45%|████▌     | 9/20 [10:11<13:21, 72.85s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.4228519889144546}
  Backtesting metric: 0.0008641164129084013



 50%|█████     | 10/20 [12:38<15:57, 95.76s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.057459816909839465}
  Backtesting metric: 0.0002781965536466226



 55%|█████▌    | 11/20 [13:47<13:08, 87.60s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.9447184180218409}
  Backtesting metric: 0.00039725711758279195



 60%|██████    | 12/20 [15:34<12:28, 93.55s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.2409362971304964}
  Backtesting metric: 0.0003700363162067913



 65%|██████▌   | 13/20 [17:04<10:46, 92.38s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.4365541356963474}
  Backtesting metric: 0.0005277229115595612



 70%|███████   | 14/20 [17:56<08:01, 80.27s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.9447184180218409}
  Backtesting metric: 0.0006677009588725592



 75%|███████▌  | 15/20 [19:33<07:06, 85.37s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.6506396228136392}
  Backtesting metric: 0.002222447465688238



 80%|████████  | 16/20 [21:08<05:52, 88.20s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.8676460663045322}
  Backtesting metric: 0.0012292457705388859



 85%|████████▌ | 17/20 [21:49<03:42, 74.13s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.6380569489658079}
  Backtesting metric: 0.0003194036236794923



 90%|█████████ | 18/20 [22:58<02:24, 72.31s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 900, 'max_depth': 3, 'learning_rate': 0.013799762791417092}
  Backtesting metric: 0.000974349526604734



 95%|█████████▌| 19/20 [23:45<01:04, 64.82s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.1294223793304205}
  Backtesting metric: 0.0031559712752482947



100%|██████████| 20/20 [25:23<00:00, 76.15s/it]

> Done





In [None]:
df_smape_cat.index = pd.Index(['smape'])
print(f" average smape: {round(df_smape_cat.mean(axis=1)[0],4)}")
df_smape_cat

 average smape: 0.1334


Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
smape,0.113,0.033,0.133,0.057,0.122,0.265,0.145,0.039,0.091,0.083,0.039,0.074,0.058,0.07,0.053,0.437,0.383,0.032,0.113,0.327


In [None]:
df_submission_cat.index = pd.date_range(start=pd.Timestamp("2021-12-01") + pd.DateOffset(months=1), periods=26, freq='MS')
df_submission_cat

  and should_run_async(code)


Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
2022-01-01,153.95,399.98,237.86,200.8,224.41,230.92,242.37,510.71,328.32,492.39,638.37,424.66,658.52,251.61,283.84,113.23,120.91,181.15,144.01,140.2
2022-02-01,153.96,399.98,237.67,200.76,224.57,231.22,242.34,510.78,328.5,492.51,638.35,424.75,658.57,251.64,283.71,113.22,120.9,181.12,143.98,140.3
2022-03-01,153.96,399.99,237.74,200.89,224.75,230.99,242.31,510.91,328.54,493.06,638.42,424.71,658.63,251.67,283.93,113.51,121.04,181.18,144.1,140.54
2022-04-01,153.95,400.01,237.66,200.88,224.92,230.9,242.58,511.04,328.29,493.6,638.62,424.65,658.6,251.67,283.79,113.68,121.22,181.15,144.14,140.51
2022-05-01,153.94,400.02,237.59,200.8,224.82,230.8,243.16,511.04,328.06,493.54,638.84,424.73,658.6,251.73,283.93,113.69,121.45,181.14,144.11,140.6
2022-06-01,153.9,400.08,237.56,200.62,224.98,230.92,243.72,511.19,327.9,493.14,639.02,424.63,658.71,251.72,283.96,113.6,121.28,181.15,144.1,140.72
2022-07-01,153.93,400.13,237.41,200.56,225.12,230.91,243.68,511.09,328.11,492.95,639.02,424.71,658.81,251.73,283.96,113.48,121.12,181.25,144.03,140.63
2022-08-01,153.93,400.17,237.31,200.56,224.85,230.88,243.64,510.96,328.08,492.8,638.89,424.76,658.97,251.74,283.82,113.38,121.0,181.24,143.86,140.31
2022-09-01,153.93,400.15,237.08,200.68,224.61,230.69,243.57,510.93,328.49,492.83,638.82,424.73,659.04,251.68,283.78,113.22,120.93,181.22,143.93,140.23
2022-10-01,153.86,400.1,237.37,200.82,224.61,231.02,242.96,510.84,329.0,492.65,638.73,424.77,659.05,251.7,283.78,113.39,120.86,181.2,143.95,139.85


In [None]:
from google.colab import drive
drive.mount('/content/drive')

## This is code for a single location prediction

In [None]:
df_exog = pd.read_csv("/content/drive/MyDrive/processed_data/Burgenland-335588.csv")

# set categorical columns astype category for the transformer model to auto detect them
df_exog["season"] = df_exog["season"].astype("category")
df_exog["weather"] = df_exog["weather"].astype("category")
df_exog["date"] = pd.to_datetime(df_exog["date"])
df_exog.set_index("date", inplace=True)
# set the datatime range index to monthly freq
df_exog.index = pd.date_range(start=df_exog.index.min(), end=df_exog.index.max(), freq='MS')
df_exog

Unnamed: 0,gw-level,month,year,quarter,season,weather,month_sin,month_cos,quarter_sin,quarter_cos,...,poly_quarter_month_sin,poly_quarter_month_cos,poly_quarter_quarter_sin,poly_quarter_quarter_cos,poly_month_sin_month_cos,poly_month_sin_quarter_sin,poly_month_sin_quarter_cos,poly_month_cos_quarter_sin,poly_month_cos_quarter_cos,poly_quarter_sin_quarter_cos
1993-01-01,115.95,1,1993,1,winter,cold,2.588190e-01,0.965926,1.000000e+00,6.123234e-17,...,2.588190e-01,0.965926,1.000000e+00,6.123234e-17,2.500000e-01,2.588190e-01,1.584810e-17,9.659258e-01,5.914590e-17,6.123234e-17
1993-02-01,115.97,2,1993,1,winter,cold,5.000000e-01,0.866025,1.000000e+00,6.123234e-17,...,5.000000e-01,0.866025,1.000000e+00,6.123234e-17,4.330127e-01,5.000000e-01,3.061617e-17,8.660254e-01,5.302876e-17,6.123234e-17
1993-03-01,115.99,3,1993,1,spring,normal,7.071068e-01,0.707107,1.000000e+00,6.123234e-17,...,7.071068e-01,0.707107,1.000000e+00,6.123234e-17,5.000000e-01,7.071068e-01,4.329780e-17,7.071068e-01,4.329780e-17,6.123234e-17
1993-04-01,116.02,4,1993,2,spring,normal,8.660254e-01,0.500000,1.224647e-16,-1.000000e+00,...,1.732051e+00,1.000000,2.449294e-16,-2.000000e+00,4.330127e-01,1.060575e-16,-8.660254e-01,6.123234e-17,-5.000000e-01,-1.224647e-16
1993-05-01,116.02,5,1993,2,spring,normal,9.659258e-01,0.258819,1.224647e-16,-1.000000e+00,...,1.931852e+00,0.517638,2.449294e-16,-2.000000e+00,2.500000e-01,1.182918e-16,-9.659258e-01,3.169619e-17,-2.588190e-01,-1.224647e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-01,,10,2023,4,Fall,normal,5.000000e-01,-0.866025,-2.449294e-16,1.000000e+00,...,2.000000e+00,-3.464102,-9.797174e-16,4.000000e+00,-4.330127e-01,-1.224647e-16,5.000000e-01,2.121150e-16,-8.660254e-01,-2.449294e-16
2023-11-01,,11,2023,4,Fall,normal,2.588190e-01,-0.965926,-2.449294e-16,1.000000e+00,...,1.035276e+00,-3.863703,-9.797174e-16,4.000000e+00,-2.500000e-01,-6.339238e-17,2.588190e-01,2.365836e-16,-9.659258e-01,-2.449294e-16
2023-12-01,,12,2023,4,winter,cold,1.224647e-16,-1.000000,-2.449294e-16,1.000000e+00,...,4.898587e-16,-4.000000,-9.797174e-16,4.000000e+00,-1.224647e-16,-2.999520e-32,1.224647e-16,2.449294e-16,-1.000000e+00,-2.449294e-16
2024-01-01,,1,2024,1,winter,cold,2.588190e-01,0.965926,1.000000e+00,6.123234e-17,...,2.588190e-01,0.965926,1.000000e+00,6.123234e-17,2.500000e-01,2.588190e-01,1.584810e-17,9.659258e-01,5.914590e-17,6.123234e-17


In [None]:
pipeline_categorical = make_pipeline(
                           OrdinalEncoder(
                               dtype=int,
                               handle_unknown="use_encoded_value",
                               unknown_value=-1,
                               encoded_missing_value=-1
                           ),
                           FunctionTransformer(
                               func=lambda x: x.astype('category'),
                               feature_names_out= 'one-to-one'
                           )
                       )

transformer_exog = make_column_transformer(
                       (
                           pipeline_categorical,
                           make_column_selector(dtype_exclude=np.number)
                       ),
                       remainder="passthrough",
                       verbose_feature_names_out=False,
                   ).set_output(transform="pandas")

# Lags grid
lags_grid = tuple([12, 24, [1, 2, 3, 4, 7, 9, 24]])

# Regressor hyperparameters search space
def search_space(trial):
    search_space  = {
        'n_estimators'    : trial.suggest_int('n_estimators', 400, 1200, step=100),
        'max_depth'       : trial.suggest_int('max_depth', 3, 10, step=1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 25, 500),
        'learning_rate'   : trial.suggest_float('learning_rate', 0.01, 0.5),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1, step=0.1),
        'max_bin'         : trial.suggest_int('max_bin', 50, 250, step=25),
        'reg_alpha'       : trial.suggest_float('reg_alpha', 0, 1, step=0.1),
        'reg_lambda'      : trial.suggest_float('reg_lambda', 0, 1, step=0.1),
        'lags'            : trial.suggest_categorical('lags', lags_grid)
    }
    return search_space

In [None]:
def search_hyperparameters(data, one_hot_encoder):

    # get the estimate end train and end validation dates
    exog_data = data.drop("gw-level", axis=1)
    exog_features = exog_data.columns
    df_idx = data.index
    train_num = int(len(data) * 0.8)
    end_train = "2012-12-01"
    end_valid = "2021-12-01"

    # instantiate a forcaster transformer with categorical features
    forecaster = ForecasterAutoreg(
        regressor = LGBMRegressor(random_state=15926, verbose=-1),
        lags = 24,
        transformer_exog = one_hot_encoder,
        fit_kwargs = {"categorical_feature": "auto"}
    )

    # search for best parameters
    results_search, frozen_trial = bayesian_search_forecaster(
    forecaster         = forecaster,
    y                  = data.loc[:end_valid, 'gw-level'],
    exog               = data.loc[:end_valid, exog_features],
    search_space       = search_space,
    steps              = 26,
    refit              = False,
    metric             = 'mean_absolute_percentage_error',
    initial_train_size = len(data.loc[:end_train]),
    fixed_train_size   = False,
    n_trials           = 20,
    random_state       = 123,
    return_best        = True,
    n_jobs             = 'auto',
    verbose            = False,
    show_progress      = True
    )
    best_params = results_search['params'].iat[0]

    return best_params, forecaster

best_params, forecaster =  search_hyperparameters(df_exog, transformer_exog)

  0%|          | 0/20 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 5, 'min_data_in_leaf': 132, 'learning_rate': 0.28014423685061673, 'feature_fraction': 0.9, 'max_bin': 125, 'reg_alpha': 1.0, 'reg_lambda': 0.7000000000000001}
  Backtesting metric: 0.0023255280123888103





In [None]:
data = df_exog.copy()
exog_data = data.drop("gw-level", axis=1)
exog_features = exog_data.columns
df_idx = data.index
train_num = int(len(data) * 0.8)
valid_num = len(data.loc[:"2021-11-01"])
end_train = df_idx[train_num]
end_valid = df_idx[valid_num]
end_evaluation = df_idx[train_num+50]
evaluate_data = data.loc[df_idx[train_num+1]: end_evaluation, "gw-level"].values

  and should_run_async(code)


In [None]:
# train model with best params
forecaster = ForecasterAutoreg(
    regressor          = LGBMRegressor(**best_params, random_state=15926, verbose=-1),
    lags               = 24,
    transformer_exog   = transformer_exog,
    fit_kwargs         = {"categorical_feature": "auto"}
)
forecaster.fit(
    y    = data.loc[:end_train, 'gw-level'],
    exog = data.loc[:end_train, exog_features]
)

  and should_run_async(code)


In [None]:
# make predictions and evalute the model
predictions = forecaster.predict(
    exog     = data.loc[df_idx[train_num+1]:, exog_features],
    steps    = 50
)
pred_df = pd.DataFrame(predictions)

  and should_run_async(code)


In [None]:
preds = pred_df["pred"].values
smape_value = smape(evaluate_data, preds)
pred_df.loc["smape", "pred"] = smape_value
pred_df

Unnamed: 0,pred
2018-01-01 00:00:00,116.167604
2018-02-01 00:00:00,116.142285
2018-03-01 00:00:00,116.107357
2018-04-01 00:00:00,116.122247
2018-05-01 00:00:00,116.117663
2018-06-01 00:00:00,116.08343
2018-07-01 00:00:00,115.986986
2018-08-01 00:00:00,115.986986
2018-09-01 00:00:00,115.994564
2018-10-01 00:00:00,116.03368


In [None]:
# make predictions into the future
predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 24
)
pd.DataFrame(predictions)

ValueError: To make predictions `exog` must start one step ahead of `last_window`.
    `last_window` ends at : 2017-12-01 00:00:00.
    `exog` starts at : 2022-01-01 00:00:00.
     Expected index : 2018-01-01 00:00:00.