In [1]:
!pip install skforecast tqdm catboost

Collecting skforecast
  Downloading skforecast-0.12.1-py3-none-any.whl.metadata (22 kB)
Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting optuna<3.7,>=2.10 (from skforecast)
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna<3.7,>=2.10->skforecast)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna<3.7,>=2.10->skforecast)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna<3.7,>=2.10->skforecast)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading skforecast-0.12.1-py3-none-any.whl (560 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m560.6/560.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/

In [3]:
import pandas as pd
from tqdm import tqdm
import lightgbm
import sklearn
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
import skforecast
from sklearn.feature_selection import RFECV
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import bayesian_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.model_selection import select_features
import matplotlib.pyplot as plt
import numpy as np
import os
from os import path
import shutil
import re
import traceback
%matplotlib inline

In [4]:
pipeline_categorical = make_pipeline(
                           OrdinalEncoder(
                               dtype=int,
                               handle_unknown="use_encoded_value",
                               unknown_value=-1,
                               encoded_missing_value=-1
                           ),
                           FunctionTransformer(
                               func=lambda x: x.astype('category'),
                               feature_names_out= 'one-to-one'
                           )
                       )

transformer_exog = make_column_transformer(
                       (
                           pipeline_categorical,
                           make_column_selector(dtype_exclude=np.number)
                       ),
                       remainder="passthrough",
                       verbose_feature_names_out=False,
                   ).set_output(transform="pandas")

# Lags grid
lags_grid = tuple([12, 24, [1, 2, 3, 4, 7, 9, 24]])

# Regressor hyperparameters search space for lightgbm
def search_space(trial):
    search_space  = {
        'n_estimators'    : trial.suggest_int('n_estimators', 400, 1200, step=100),
        'max_depth'       : trial.suggest_int('max_depth', 3, 10, step=1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 25, 500),
        'learning_rate'   : trial.suggest_float('learning_rate', 0.01, 0.5),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1, step=0.1),
        'max_bin'         : trial.suggest_int('max_bin', 50, 250, step=25),
        'reg_alpha'       : trial.suggest_float('reg_alpha', 0, 1, step=0.1),
        'reg_lambda'      : trial.suggest_float('reg_lambda', 0, 1, step=0.1),
        'lags'            : trial.suggest_categorical('lags', lags_grid)
    }
    return search_space

In [5]:
def search_hyperparameters(data, end_train, end_valid, exog_features,transformer_exog):

  # instantiate a forcaster transformer with categorical features
    forecaster = ForecasterAutoreg(
        regressor = LGBMRegressor(random_state=15926, verbose=-1),
        lags = 24,
        transformer_exog = transformer_exog,
        fit_kwargs = {"categorical_feature": "auto"}
    )

    # search for best parameters
    results_search, frozen_trial = bayesian_search_forecaster(
    forecaster         = forecaster,
    y                  = data.loc[:end_valid, 'gw-level'],
    exog               = data.loc[:end_valid, exog_features],
    search_space       = search_space,
    steps              = 30,
    refit              = False,
    metric             = 'mean_absolute_percentage_error',
    initial_train_size = len(data.loc[:end_train]),
    fixed_train_size   = False,
    n_trials           = 20,
    random_state       = 123,
    return_best        = True,
    n_jobs             = 'auto',
    verbose            = False,
    show_progress      = True
    )

    best_params = results_search['params'].iat[0]

    return best_params

def train_and_predict(data, best_params, actual_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog):

  # train for evaluation of the model
  forecaster = ForecasterAutoreg(
  regressor          = LGBMRegressor(**best_params, random_state=15926, verbose=-1),
  lags               = 24,
  transformer_exog   = transformer_exog,
  fit_kwargs         = {"categorical_feature": "auto"}
  )
  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_train, 'gw-level'],
    exog = data.loc[:end_train, exog_features]
  )

  # make predictions and evalute the model
  predictions = forecaster.predict(
      exog     = data.loc[df_idx[train_num+1]:, exog_features],
      steps    = 26
  )
  df_preds = pd.DataFrame(predictions)
  preds = df_preds["pred"].values
  # evaluate on symmetric mean absolute percentage error
  smape_value = smape(actual_data, preds)

  # train for future predictions
  forecaster = ForecasterAutoreg(
  regressor          = LGBMRegressor(**best_params, random_state=15926, verbose=-1),
  lags               = 24,
  transformer_exog   = transformer_exog, #one_hot_encoder,
  fit_kwargs         = {"categorical_feature": "auto"}
  )
  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_valid, 'gw-level'],
    exog = data.loc[:end_valid, exog_features]
  )

  # make predictions into the future
  predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 26
  )
  df_preds = pd.DataFrame(predictions)

  # free resources since it's going to run on iterations
  del forecaster

  return df_preds, smape_value


In [26]:
# Define the function to return the SMAPE value
def smape(A, F):
    tmp = 2 * np.abs(F - A) / (np.abs(A) + np.abs(F))
    len_ = np.count_nonzero(~np.isnan(tmp))
    if len_ == 0 and np.nansum(tmp) == 0: # Deals with a special case
        return 100
    return round(100 / len_ * np.nansum(tmp), 3)

In [7]:
def populate_test_data(data_dir):
  preds_dict = {}
  smape_dict = {}


  # collect all files in the directory
  filenames = os.listdir(data_dir)
  filenames = filenames[:20]

  try:

    for filename in tqdm(filenames):

        hrbnz01 = filename.split(".")[0].split("-")[-1]
        filepath = path.join(data_dir, filename)
        df_exog = pd.read_csv(filepath)
        df_exog["season"] = df_exog["season"].astype("category")
        df_exog["weather"] = df_exog["weather"].astype("category")
        df_exog["date"] = pd.to_datetime(df_exog["date"])
        df_exog.set_index("date", inplace=True)
        df_exog.index = pd.date_range(start=df_exog.index.min(), end=df_exog.index.max(), freq='MS')

        # get the estimate end train and end validation dates
        data = df_exog.copy()
        exog_data = data.drop("gw-level", axis=1)
        exog_features = exog_data.columns
        df_idx = data.index
        train_num = int(len(data) * 0.8)
        valid_num = len(data.loc[:"2021-11-01"])
        end_train = df_idx[train_num]
        end_valid = df_idx[valid_num]
        end_evaluation = df_idx[train_num+26]
        evaluate_data = data.loc[df_idx[train_num+1]: end_evaluation, "gw-level"].values


        # tune for best hyperparamters and evaluate on MAPE metric
        best_params = search_hyperparameters(data, end_train, end_valid, exog_features, transformer_exog)

        # train and make predict into 26 months in the future of the test template
        df_predictions, smape = train_and_predict(data,best_params, evaluate_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog)
        df_predictions["pred"] =  df_predictions["pred"].round(2)
        preds_dict[hrbnz01] = df_predictions['pred'].values
        smape_dict[hrbnz01] = smape


  except Exception as ex:
    print("[Error]")
    print(traceback.format_exc())

  df_smape = pd.DataFrame(smape_dict, index=[0])
  df_final_preds = pd.DataFrame(preds_dict, index=range(26))
  print("> Done")

  return df_final_preds, df_smape

In [None]:
processed_data_dir = "/content/drive/MyDrive/clean_processed_data_part1"
#test_template = "/content/drive/MyDrive/gw_test_empty.csv"

df_submission_lgbm, df_smape_lgbm = populate_test_data(processed_data_dir)
# df_submission.to_csv("df_submission.csv", index=False)
# df_smape.to_csv("smape_score.csv", index=False)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 5, 'min_data_in_leaf': 132, 'learning_rate': 0.28014423685061673, 'feature_fraction': 0.9, 'max_bin': 125, 'reg_alpha': 1.0, 'reg_lambda': 0.7000000000000001}
  Backtesting metric: 0.0004987574425375068



  5%|▌         | 1/20 [00:06<02:00,  6.34s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 7, 'min_data_in_leaf': 342, 'learning_rate': 0.06189515767783878, 'feature_fraction': 0.5, 'max_bin': 100, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.9}
  Backtesting metric: 0.003563950184699962



 10%|█         | 2/20 [00:11<01:46,  5.89s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 900, 'max_depth': 9, 'min_data_in_leaf': 41, 'learning_rate': 0.35170286615271973, 'feature_fraction': 0.6, 'max_bin': 125, 'reg_alpha': 0.8, 'reg_lambda': 0.0}
  Backtesting metric: 0.0010911238652170625



 15%|█▌        | 3/20 [00:15<01:25,  5.02s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1200, 'max_depth': 10, 'min_data_in_leaf': 31, 'learning_rate': 0.017627004844245255, 'feature_fraction': 1.0, 'max_bin': 50, 'reg_alpha': 0.0, 'reg_lambda': 0.2}
  Backtesting metric: 0.0004938785077244215



 20%|██        | 4/20 [00:24<01:43,  6.44s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 400, 'max_depth': 8, 'min_data_in_leaf': 201, 'learning_rate': 0.40751135490746604, 'feature_fraction': 0.9, 'max_bin': 50, 'reg_alpha': 0.2, 'reg_lambda': 0.0}
  Backtesting metric: 0.0005238337051745096



 25%|██▌       | 5/20 [00:32<01:47,  7.16s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 7, 'min_data_in_leaf': 342, 'learning_rate': 0.06189515767783878, 'feature_fraction': 0.5, 'max_bin': 100, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.9}
  Backtesting metric: 0.0007523005631959096



 30%|███       | 6/20 [00:38<01:32,  6.61s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1200, 'max_depth': 10, 'min_data_in_leaf': 31, 'learning_rate': 0.017627004844245255, 'feature_fraction': 1.0, 'max_bin': 50, 'reg_alpha': 0.0, 'reg_lambda': 0.2}
  Backtesting metric: 0.0008141017519187233



 35%|███▌      | 7/20 [00:45<01:26,  6.68s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 40%|████      | 8/20 [00:49<01:12,  6.01s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 3, 'min_data_in_leaf': 425, 'learning_rate': 0.23265642115398555, 'feature_fraction': 0.8, 'max_bin': 125, 'reg_alpha': 0.4, 'reg_lambda': 1.0}
  Backtesting metric: 0.0005812953764654002



  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1200, 'max_depth': 9, 'min_data_in_leaf': 239, 'learning_rate': 0.3791733226905797, 'feature_fraction': 1.0, 'max_bin': 175, 'reg_alpha': 0.2, 'reg_lambda': 0.2}
  Backtesting metric: 0.0005053180280827726



 45%|████▌     | 9/20 [00:59<01:16,  6.98s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 900, 'max_depth': 4, 'min_data_in_leaf': 27, 'learning_rate': 0.3889561822065921, 'feature_fraction': 0.6, 'max_bin': 200, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.4}
  Backtesting metric: 0.0002875536389365831



 50%|█████     | 10/20 [01:06<01:10,  7.06s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 900, 'max_depth': 3, 'min_data_in_leaf': 71, 'learning_rate': 0.32631417602951285, 'feature_fraction': 0.5, 'max_bin': 250, 'reg_alpha': 1.0, 'reg_lambda': 0.30000000000000004}
  Backtesting metric: 0.00029831970068177986



 55%|█████▌    | 11/20 [01:15<01:10,  7.85s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 900, 'max_depth': 10, 'min_data_in_leaf': 27, 'learning_rate': 0.3732343204321192, 'feature_fraction': 0.6, 'max_bin': 150, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.30000000000000004}
  Backtesting metric: 0.00040254491515575295



 60%|██████    | 12/20 [01:22<00:59,  7.40s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 900, 'max_depth': 9, 'min_data_in_leaf': 26, 'learning_rate': 0.37802946439119056, 'feature_fraction': 0.6, 'max_bin': 175, 'reg_alpha': 0.8, 'reg_lambda': 0.0}
  Backtesting metric: 0.0009125096116958019



 65%|██████▌   | 13/20 [01:28<00:48,  6.99s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

 70%|███████   | 14/20 [01:35<00:42,  7.10s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 9, 'min_data_in_leaf': 408, 'learning_rate': 0.05151409102341216, 'feature_fraction': 1.0, 'max_bin': 250, 'reg_alpha': 0.4, 'reg_lambda': 0.1}
  Backtesting metric: 0.0005816022176811139



  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1200, 'max_depth': 10, 'min_data_in_leaf': 26, 'learning_rate': 0.0927066683643777, 'feature_fraction': 1.0, 'max_bin': 50, 'reg_alpha': 0.2, 'reg_lambda': 0.0}
  Backtesting metric: 0.00015899863954679336



 75%|███████▌  | 15/20 [01:43<00:35,  7.18s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 900, 'max_depth': 3, 'min_data_in_leaf': 31, 'learning_rate': 0.31436208267804505, 'feature_fraction': 0.9, 'max_bin': 100, 'reg_alpha': 1.0, 'reg_lambda': 0.30000000000000004}
  Backtesting metric: 0.0027358283642795744



 80%|████████  | 16/20 [01:51<00:30,  7.53s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 7, 'min_data_in_leaf': 342, 'learning_rate': 0.06189515767783878, 'feature_fraction': 0.5, 'max_bin': 100, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.9}
  Backtesting metric: 0.0008331598072861874



 85%|████████▌ | 17/20 [01:57<00:21,  7.18s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 800, 'max_depth': 8, 'min_data_in_leaf': 28, 'learning_rate': 0.130475871043031, 'feature_fraction': 1.0, 'max_bin': 75, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.0}
  Backtesting metric: 0.0008098681300593174



 90%|█████████ | 18/20 [02:03<00:13,  6.73s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 900, 'max_depth': 8, 'min_data_in_leaf': 25, 'learning_rate': 0.09808714723656196, 'feature_fraction': 0.6, 'max_bin': 150, 'reg_alpha': 0.2, 'reg_lambda': 0.8}
  Backtesting metric: 0.0013529029211651008



 95%|█████████▌| 19/20 [02:06<00:05,  5.75s/it]

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [02:09<00:00,  6.49s/it]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 7, 'min_data_in_leaf': 342, 'learning_rate': 0.06189515767783878, 'feature_fraction': 0.5, 'max_bin': 100, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.9}
  Backtesting metric: 0.00256416147808889

> Done





In [None]:
df_smape_lgbm.index = pd.Index(['smape'])
lgbm_mean = round(df_smape_lgbm.mean(axis=1)[0],4)
print(f" average smape: {lgbm_mean}")
df_smape_lgbm


 average smape: 0.1015


Unnamed: 0,321778,345199,345439,324038,309872,316661,345017,327031,312660,334052,330381,328401,345371,301812,330001,305813,309419,335653,345389,345165
smape,0.058,0.333,0.129,0.06,0.043,0.066,0.127,0.054,0.047,0.026,0.04,0.042,0.148,0.069,0.021,0.222,0.081,0.116,0.106,0.242


In [None]:
df_submission_lgbm.index = pd.date_range(start=pd.Timestamp("2021-12-01") + pd.DateOffset(months=1), periods=26, freq='MS')
df_submission_lgbm

Unnamed: 0,321778,345199,345439,324038,309872,316661,345017,327031,312660,334052,330381,328401,345371,301812,330001,305813,309419,335653,345389,345165
2022-01-01,515.94,117.87,301.95,249.63,501.17,229.71,216.83,178.4,195.08,542.16,559.67,395.34,159.2,204.46,607.9,118.37,428.1,166.88,158.7,129.24
2022-02-01,516.09,117.87,301.88,249.61,500.99,229.71,216.93,178.4,195.09,542.13,559.73,395.3,159.34,204.46,607.96,118.38,428.1,166.89,158.92,129.24
2022-03-01,516.21,117.87,301.82,249.72,501.24,229.71,216.92,178.4,195.17,542.22,559.76,395.35,159.4,204.46,607.99,118.43,428.1,166.93,159.14,129.24
2022-04-01,516.3,117.87,301.8,249.77,501.29,229.71,216.91,178.4,195.13,542.64,559.92,395.4,159.43,204.46,608.13,118.45,428.1,167.0,159.24,129.24
2022-05-01,516.62,117.87,301.77,249.82,501.38,229.71,216.82,178.4,195.1,543.44,560.3,395.59,159.34,204.46,608.37,118.43,428.1,167.0,159.12,129.24
2022-06-01,516.55,117.87,301.88,249.85,501.34,229.71,216.69,178.4,195.15,543.58,560.66,395.86,159.28,204.46,608.51,118.43,428.1,167.0,158.97,129.24
2022-07-01,516.29,117.87,301.89,249.86,501.4,229.71,216.55,178.4,195.11,543.59,560.93,395.98,159.29,204.46,608.44,118.37,428.1,167.01,158.81,129.24
2022-08-01,516.28,117.87,301.86,249.87,501.42,229.71,216.36,178.4,195.2,543.32,560.95,396.0,159.31,204.46,608.25,118.27,428.1,167.02,158.74,129.24
2022-09-01,516.2,117.87,302.01,249.78,501.63,229.71,216.17,178.4,195.17,542.76,560.65,395.91,159.25,204.46,608.2,118.24,428.1,167.02,158.84,129.24
2022-10-01,516.14,117.87,302.25,249.79,501.69,229.71,216.32,178.4,195.14,542.57,560.29,395.75,159.18,204.46,608.13,118.25,428.1,167.02,158.96,129.24


## Trying XGBoost Model for Comparison

In [None]:
# Regressor hyperparameters search space for xgboost
def search_space(trial):

  # Lags grid
  lags_grid = tuple([12, [1, 2, 3, 4, 7, 9, 12]])

  search_space  = {
      'n_estimators'    : trial.suggest_int('n_estimators', 400, 1200, step=100),
      'max_depth'       : trial.suggest_int('max_depth', 3, 10, step=1),
      'learning_rate'   : trial.suggest_float('learning_rate', 0.01, 1),
      'subsample'       : trial.suggest_float('subsample', 0.1, 1),
      'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
      'gamma'           : trial.suggest_float('gamma', 0, 1),
      'reg_alpha'       : trial.suggest_float('reg_alpha', 0, 1),
      'reg_lambda'      : trial.suggest_float('reg_lambda', 0, 1),
      'lags'            : trial.suggest_categorical('lags', lags_grid)
  }
  return search_space

In [None]:
def search_hyperparameters(data, end_train, end_valid, exog_features,transformer_exog):

  # instantiate a forcaster transformer with categorical features
  forecaster = ForecasterAutoreg(
  regressor = XGBRegressor(
                  tree_method = 'hist',
                  enable_categorical = True,
                  random_state = 123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # search for best parameters
  results_search, frozen_trial = bayesian_search_forecaster(
  forecaster         = forecaster,
  y                  = data.loc[:end_valid, 'gw-level'],
  exog               = data.loc[:end_valid, exog_features],
  search_space       = search_space,
  steps              = 30,
  refit              = False,
  metric             = 'mean_absolute_percentage_error',
  initial_train_size = len(data.loc[:end_train]),
  fixed_train_size   = False,
  n_trials           = 20,
  random_state       = 123,
  return_best        = True,
  n_jobs             = 'auto',
  verbose            = False,
  show_progress      = True
  )

  best_params = results_search['params'].iat[0]

  return best_params

def train_and_predict(data, best_params, actual_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog):

  # train for evaluation of the model
  forecaster = ForecasterAutoreg(
  regressor = XGBRegressor(**best_params,
                  tree_method = 'hist',
                  enable_categorical = True,
                  random_state = 123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_train, 'gw-level'],
    exog = data.loc[:end_train, exog_features]
  )

  # make predictions and evalute the model
  predictions = forecaster.predict(
      exog     = data.loc[df_idx[train_num+1]:, exog_features],
      steps    = 26
  )
  df_preds = pd.DataFrame(predictions)
  preds = df_preds["pred"].values
  # evaluate on symmetric mean absolute percentage error
  smape_value = smape(actual_data, preds)

  # train for future predictions
  forecaster = ForecasterAutoreg(
  regressor = XGBRegressor(**best_params,
                  tree_method = 'hist',
                  enable_categorical = True,
                  random_state = 123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )
  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_valid, 'gw-level'],
    exog = data.loc[:end_valid, exog_features]
  )

  # make predictions into the future
  predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 26
  )
  df_preds = pd.DataFrame(predictions)

  # free resources since it's going to run on iterations
  del forecaster

  return df_preds, smape_value

In [None]:
processed_data_dir = "/content/drive/MyDrive/clean_processed_data_part1"
#test_template = "/content/drive/MyDrive/gw_test_empty.csv"

df_submission_xgb, df_smape_xgb = populate_test_data(processed_data_dir)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.2345829390285611, 'subsample': 0.5961832921746021, 'colsample_bytree': 0.7475220728070068, 'gamma': 0.42310646012446096, 'reg_alpha': 0.9807641983846155, 'reg_lambda': 0.6848297385848633}
  Backtesting metric: 0.0004733854496327476



  5%|▌         | 1/20 [00:21<06:48, 21.51s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.6286544670746439, 'subsample': 0.7072201458890424, 'colsample_bytree': 0.8581081938582316, 'gamma': 0.08319498833243877, 'reg_alpha': 0.7636828414433382, 'reg_lambda': 0.243666374536874}
  Backtesting metric: 0.002166111550852808



 10%|█         | 2/20 [00:44<06:39, 22.20s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.8111336067823867, 'subsample': 0.3657988622840833, 'colsample_bytree': 0.475896226925484, 'gamma': 0.21728260640667252, 'reg_alpha': 0.6087467748940835, 'reg_lambda': 0.9615765859202877}
  Backtesting metric: 0.0011201758668707234



 15%|█▌        | 3/20 [01:00<05:34, 19.67s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.1681461925379386, 'subsample': 0.5079501371483902, 'colsample_bytree': 0.2023430562118223, 'gamma': 0.19938448157273703, 'reg_alpha': 0.28737224039527165, 'reg_lambda': 0.9844004597186841}
  Backtesting metric: 0.0005366511037171889



 20%|██        | 4/20 [01:16<04:49, 18.09s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.6286544670746439, 'subsample': 0.7072201458890424, 'colsample_bytree': 0.8581081938582316, 'gamma': 0.08319498833243877, 'reg_alpha': 0.7636828414433382, 'reg_lambda': 0.243666374536874}
  Backtesting metric: 0.0005491924706314159



 25%|██▌       | 5/20 [01:42<05:13, 20.89s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 700, 'max_depth': 8, 'learning_rate': 0.024945806771908607, 'subsample': 0.9998086737999882, 'colsample_bytree': 0.3513624942452013, 'gamma': 0.8302567671777434, 'reg_alpha': 0.7863025601218423, 'reg_lambda': 0.012283918831203056}
  Backtesting metric: 0.0006669280837362962



 30%|███       | 6/20 [02:05<05:05, 21.80s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.6070295271268181, 'subsample': 0.5905612058198184, 'colsample_bytree': 0.4084874503968776, 'gamma': 0.3041207890271841, 'reg_alpha': 0.4170222110247016, 'reg_lambda': 0.6813007657927966}
  Backtesting metric: 0.000738676567785571



 35%|███▌      | 7/20 [02:20<04:13, 19.47s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1200, 'max_depth': 3, 'learning_rate': 0.9635955935176932, 'subsample': 0.9933130108437183, 'colsample_bytree': 0.22594617024727326, 'gamma': 0.9979492141278349, 'reg_alpha': 0.05556023765455381, 'reg_lambda': 0.4467425553167122}
  Backtesting metric: 0.0004851819452114679



 40%|████      | 8/20 [02:44<04:09, 20.78s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.36235571706028497, 'subsample': 0.7862930324068904, 'colsample_bytree': 0.633859224905999, 'gamma': 0.6917017987001771, 'reg_alpha': 0.15112745234808023, 'reg_lambda': 0.39887629272615654}
  Backtesting metric: 0.000478529985350884



 45%|████▌     | 9/20 [03:04<03:45, 20.53s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.2345829390285611, 'subsample': 0.5961832921746021, 'colsample_bytree': 0.7475220728070068, 'gamma': 0.42310646012446096, 'reg_alpha': 0.9807641983846155, 'reg_lambda': 0.6848297385848633}
  Backtesting metric: 0.00029027894392587094



 50%|█████     | 10/20 [03:29<03:41, 22.15s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1200, 'max_depth': 3, 'learning_rate': 0.11642993576410168, 'subsample': 0.9789539318361384, 'colsample_bytree': 0.8312650429575371, 'gamma': 0.05053751551606417, 'reg_alpha': 0.7271110751620196, 'reg_lambda': 0.43690468800009885}
  Backtesting metric: 0.0002841152378178679



 55%|█████▌    | 11/20 [03:52<03:21, 22.39s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.2345829390285611, 'subsample': 0.5961832921746021, 'colsample_bytree': 0.7475220728070068, 'gamma': 0.42310646012446096, 'reg_alpha': 0.9807641983846155, 'reg_lambda': 0.6848297385848633}
  Backtesting metric: 0.00037542550682135597



 60%|██████    | 12/20 [04:11<02:49, 21.20s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.4365541356963474, 'subsample': 0.5443165878852756, 'colsample_bytree': 0.4832472612662452, 'gamma': 0.3122612229724653, 'reg_alpha': 0.4263513069628082, 'reg_lambda': 0.8933891631171348}
  Backtesting metric: 0.0008494343815247348



 65%|██████▌   | 13/20 [04:28<02:18, 19.83s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.36235571706028497, 'subsample': 0.7862930324068904, 'colsample_bytree': 0.633859224905999, 'gamma': 0.6917017987001771, 'reg_alpha': 0.15112745234808023, 'reg_lambda': 0.39887629272615654}
  Backtesting metric: 0.0005668947139861166



 70%|███████   | 14/20 [04:51<02:05, 20.90s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 1200, 'max_depth': 4, 'learning_rate': 0.9824511943947221, 'subsample': 0.9345156271055195, 'colsample_bytree': 0.9892976403836156, 'gamma': 0.006453374219829056, 'reg_alpha': 0.7816977573675389, 'reg_lambda': 0.39158929094355194}
  Backtesting metric: 0.00017282046134609166



 75%|███████▌  | 15/20 [05:18<01:54, 22.87s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 900, 'max_depth': 3, 'learning_rate': 0.3241126270021177, 'subsample': 0.4733435907582686, 'colsample_bytree': 0.8796782420950293, 'gamma': 0.2504553653965067, 'reg_alpha': 0.48303426426270435, 'reg_lambda': 0.985559785610705}
  Backtesting metric: 0.002794607907256432



 80%|████████  | 16/20 [05:41<01:30, 22.74s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 800, 'max_depth': 9, 'learning_rate': 0.7207311136846136, 'subsample': 0.6954861610622913, 'colsample_bytree': 0.8328198548377674, 'gamma': 0.004576333666320992, 'reg_alpha': 0.8204417187475518, 'reg_lambda': 0.014015836934027859}
  Backtesting metric: 0.0007990018790536004



 85%|████████▌ | 17/20 [06:06<01:10, 23.58s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 900, 'max_depth': 3, 'learning_rate': 0.3241126270021177, 'subsample': 0.4733435907582686, 'colsample_bytree': 0.8796782420950293, 'gamma': 0.2504553653965067, 'reg_alpha': 0.48303426426270435, 'reg_lambda': 0.985559785610705}
  Backtesting metric: 0.0006845114858774128



 90%|█████████ | 18/20 [06:24<00:43, 21.76s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.5588394172205944, 'subsample': 0.4500555167108301, 'colsample_bytree': 0.9326192406525875, 'gamma': 0.8416699969127163, 'reg_alpha': 0.35739756668317624, 'reg_lambda': 0.04359146379904055}
  Backtesting metric: 0.0008170603091491386



 95%|█████████▌| 19/20 [06:39<00:19, 19.90s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 12] 
  Parameters: {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.5726605983557926, 'subsample': 0.42011259990938976, 'colsample_bytree': 0.30794140712521173, 'gamma': 0.8255929189207487, 'reg_alpha': 0.6001549324220541, 'reg_lambda': 0.3205159576366789}
  Backtesting metric: 0.0014143519698211



100%|██████████| 20/20 [07:03<00:00, 21.16s/it]

> Done





In [None]:
df_smape_xgb.index = pd.Index(['smape'])
xgb_mean = round(df_smape_xgb.mean(axis=1)[0],4)
print(f" average smape: {xgb_mean}")
df_smape_xgb

 average smape: 0.1477


  and should_run_async(code)


Unnamed: 0,321778,345199,345439,324038,309872,316661,345017,327031,312660,334052,330381,328401,345371,301812,330001,305813,309419,335653,345389,345165
smape,0.053,0.871,0.135,0.051,0.057,0.056,0.102,0.306,0.048,0.028,0.038,0.041,0.232,0.097,0.021,0.197,0.084,0.034,0.313,0.19


In [None]:
df_submission_xgb.index = pd.date_range(start=pd.Timestamp("2021-12-01") + pd.DateOffset(months=1), periods=26, freq='MS')
df_submission_xgb

Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
2022-01-01,153.93,400.01,238.1,200.75,224.48,230.6,242.35,510.77,328.43,492.36,638.41,424.8,658.66,251.77,283.82,113.31,120.81,181.16,143.99,140.2
2022-02-01,153.92,400.01,238.11,200.79,224.48,230.54,242.82,510.81,328.43,492.85,638.41,424.8,658.65,251.75,283.78,113.35,120.91,181.16,143.99,140.5
2022-03-01,153.92,400.01,238.11,200.84,224.48,230.62,242.48,510.98,328.43,493.47,638.44,424.8,658.69,251.75,283.73,113.41,120.99,181.16,143.99,140.62
2022-04-01,153.92,400.01,238.04,200.89,224.38,230.63,243.01,511.04,328.43,493.32,638.65,424.8,658.75,251.75,283.71,113.43,120.99,181.16,143.99,140.47
2022-05-01,153.91,400.03,238.04,200.84,224.54,230.67,243.43,511.14,328.36,493.19,638.9,424.8,658.75,251.75,283.68,113.41,120.95,181.16,143.99,140.63
2022-06-01,153.9,400.03,238.04,200.71,224.55,230.75,244.02,511.18,328.36,493.35,639.17,424.87,658.79,251.75,283.71,113.41,120.89,181.16,143.99,140.62
2022-07-01,153.83,400.06,238.04,200.65,224.56,230.65,243.81,511.11,328.26,493.01,639.11,424.8,658.89,251.75,283.81,113.39,120.71,181.16,143.99,140.52
2022-08-01,153.72,400.06,238.04,200.59,224.6,230.57,243.71,510.95,328.44,492.81,639.13,424.8,658.95,251.75,283.84,113.37,120.78,181.16,143.99,140.24
2022-09-01,153.64,400.03,238.04,200.63,224.54,230.8,243.67,510.91,328.44,493.05,639.11,424.8,658.94,251.75,283.84,113.37,120.71,181.16,143.99,140.04
2022-10-01,153.6,400.01,238.04,200.77,224.6,230.58,242.76,510.82,328.34,493.54,638.98,424.87,658.91,251.75,283.75,113.37,120.61,181.16,143.99,139.92


## HistGradientBoostingRegressor Model

#### I have only trained the new data and predict on HistGradientBoostingRegressor since it outperforms other models

In [None]:
# one-hot encoding
categorical_features = ["weather", "season"]
transformer_exog = make_column_transformer(
    (
        OrdinalEncoder(
            dtype=int,
            handle_unknown="use_encoded_value",
            unknown_value=-1,
            encoded_missing_value=-1
        ),
        categorical_features
    ),
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")


# Regressor hyperparameters search space
def search_space(trial):

  # Lags grid
  lags_grid = tuple([12, 24, [1, 2, 3, 4, 7, 9, 24]])

  search_space  = {
      'max_iter'          : trial.suggest_int('max_iter', 400, 1200, step=100),
      'max_depth'         : trial.suggest_int('max_depth', 3, 10, step=1),
      'learning_rate'     : trial.suggest_float('learning_rate', 0.01, 1),
      'min_samples_leaf'  : trial.suggest_int('min_samples_leaf', 1, 20, step=1),
      'l2_regularization' : trial.suggest_float('l2_regularization', 0, 1),
      'lags'              : trial.suggest_categorical('lags', lags_grid)
  }
  return search_space

In [None]:
def search_hyperparameters(data, end_train, end_valid, exog_features,transformer_exog):

  # instantiate a forcaster transformer with categorical features
  forecaster = ForecasterAutoreg(
  regressor = HistGradientBoostingRegressor(
                  categorical_features=categorical_features,
                  random_state=123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # search for best parameters
  results_search, frozen_trial = bayesian_search_forecaster(
  forecaster         = forecaster,
  y                  = data.loc[:end_valid, 'gw-level'],
  exog               = data.loc[:end_valid, exog_features],
  search_space       = search_space,
  steps              = 30,
  refit              = False,
  metric             = 'mean_absolute_percentage_error',
  initial_train_size = len(data.loc[:end_train]),
  fixed_train_size   = False,
  n_trials           = 20,
  random_state       = 123,
  return_best        = True,
  n_jobs             = 'auto',
  verbose            = False,
  show_progress      = True
  )

  best_params = results_search['params'].iat[0]

  return best_params

def train_and_predict(data, best_params, actual_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog):

  # train for evaluation of the model
  forecaster = ForecasterAutoreg(
  regressor = HistGradientBoostingRegressor(**best_params,
                  categorical_features=categorical_features,
                  random_state=123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_train, 'gw-level'],
    exog = data.loc[:end_train, exog_features]
  )

  # make predictions and evalute the model
  predictions = forecaster.predict(
      exog     = data.loc[df_idx[train_num+1]:, exog_features],
      steps    = 26
  )
  df_preds = pd.DataFrame(predictions)
  preds = df_preds["pred"].values
  # evaluate on symmetric mean absolute percentage error
  smape_value = smape(actual_data, preds)

  # train for future predictions
  forecaster = ForecasterAutoreg(
  regressor = HistGradientBoostingRegressor(**best_params,
                  categorical_features=categorical_features,
                  random_state=123
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )
  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_valid, 'gw-level'],
    exog = data.loc[:end_valid, exog_features]
  )

  # make predictions into the future
  predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 26
  )
  df_preds = pd.DataFrame(predictions)

  # free resources since it's going to run on iterations
  del forecaster

  return df_preds, smape_value

In [None]:
import zipfile

with zipfile.ZipFile('/content/drive/MyDrive/clean_processed_data.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [None]:
processed_data_dir = "/content/drive/MyDrive/clean_processed_data_part1"
#test_template = "/content/drive/MyDrive/gw_test_empty.csv"
#df = pd.read_csv(test_template)

df_submission_hist, df_smape_hist = populate_test_data(processed_data_dir)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 1000, 'max_depth': 5, 'learning_rate': 0.2345829390285611, 'min_samples_leaf': 12, 'l2_regularization': 0.7194689697855631}
  Backtesting metric: 0.000501016334235111



  5%|▌         | 1/20 [01:32<29:26, 92.96s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 600, 'max_depth': 7, 'learning_rate': 0.802171833321352, 'min_samples_leaf': 15, 'l2_regularization': 0.8572382405167807}
  Backtesting metric: 0.0012492665018444306



 10%|█         | 2/20 [02:29<21:26, 71.46s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 500, 'max_depth': 10, 'learning_rate': 0.6384304097812739, 'min_samples_leaf': 20, 'l2_regularization': 0.9446643567874401}
  Backtesting metric: 0.0008026331328263398



 15%|█▌        | 3/20 [03:15<16:57, 59.86s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 1000, 'max_depth': 5, 'learning_rate': 0.2212153375368012, 'min_samples_leaf': 3, 'l2_regularization': 0.22553258665355755}
  Backtesting metric: 0.0004552370250429928



 20%|██        | 4/20 [04:43<18:56, 71.03s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 1200, 'max_depth': 5, 'learning_rate': 0.6808050236946397, 'min_samples_leaf': 3, 'l2_regularization': 0.9800168471250429}
  Backtesting metric: 0.0005740284429844393



 25%|██▌       | 5/20 [06:38<21:42, 86.82s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 1000, 'max_depth': 5, 'learning_rate': 0.2188459764198009, 'min_samples_leaf': 8, 'l2_regularization': 0.22871857519128538}
  Backtesting metric: 0.0008225027827420667



 30%|███       | 6/20 [08:21<21:30, 92.20s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 400, 'max_depth': 5, 'learning_rate': 0.4569682468857075, 'min_samples_leaf': 2, 'l2_regularization': 0.18349470861613965}
  Backtesting metric: 0.0006723914883828241



 35%|███▌      | 7/20 [09:16<17:20, 80.05s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'max_iter': 900, 'max_depth': 8, 'learning_rate': 0.025967914628066663, 'min_samples_leaf': 12, 'l2_regularization': 0.5567851923942887}
  Backtesting metric: 0.0008292340973308502



 40%|████      | 8/20 [10:53<17:07, 85.65s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 1000, 'max_depth': 5, 'learning_rate': 0.2345829390285611, 'min_samples_leaf': 12, 'l2_regularization': 0.7194689697855631}
  Backtesting metric: 0.0006049764421585446



 45%|████▌     | 9/20 [12:33<16:31, 90.16s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 400, 'max_depth': 10, 'learning_rate': 0.014884729909749192, 'min_samples_leaf': 20, 'l2_regularization': 0.015325290080303589}
  Backtesting metric: 0.00031664864196954864



 50%|█████     | 10/20 [13:39<13:47, 82.73s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 700, 'max_depth': 6, 'learning_rate': 0.43157198739286967, 'min_samples_leaf': 7, 'l2_regularization': 0.4263513069628082}
  Backtesting metric: 0.00029354768375133215



 55%|█████▌    | 11/20 [14:55<12:05, 80.64s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'max_iter': 900, 'max_depth': 8, 'learning_rate': 0.025967914628066663, 'min_samples_leaf': 12, 'l2_regularization': 0.5567851923942887}
  Backtesting metric: 0.000393607218227383



 60%|██████    | 12/20 [16:28<11:15, 84.44s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 500, 'max_depth': 4, 'learning_rate': 0.53623586010342, 'min_samples_leaf': 11, 'l2_regularization': 0.6344009585513211}
  Backtesting metric: 0.0009004830988564573



 65%|██████▌   | 13/20 [17:24<08:49, 75.63s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 700, 'max_depth': 6, 'learning_rate': 0.43157198739286967, 'min_samples_leaf': 7, 'l2_regularization': 0.4263513069628082}
  Backtesting metric: 0.0008077101785025181



 70%|███████   | 14/20 [19:04<08:19, 83.19s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'max_iter': 800, 'max_depth': 6, 'learning_rate': 0.3497462359893607, 'min_samples_leaf': 15, 'l2_regularization': 0.4385722446796244}
  Backtesting metric: 0.00013925160652429816



 75%|███████▌  | 15/20 [20:17<06:40, 80.05s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 500, 'max_depth': 4, 'learning_rate': 0.53623586010342, 'min_samples_leaf': 11, 'l2_regularization': 0.6344009585513211}
  Backtesting metric: 0.0024970845058997723



 80%|████████  | 16/20 [21:44<05:28, 82.19s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 700, 'max_depth': 6, 'learning_rate': 0.43157198739286967, 'min_samples_leaf': 7, 'l2_regularization': 0.4263513069628082}
  Backtesting metric: 0.0009526922109313341



 85%|████████▌ | 17/20 [23:12<04:11, 83.70s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'max_iter': 1100, 'max_depth': 8, 'learning_rate': 0.025593714537902525, 'min_samples_leaf': 8, 'l2_regularization': 0.7758131619491934}
  Backtesting metric: 0.00046369699347848707



 90%|█████████ | 18/20 [24:41<02:51, 85.55s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'max_iter': 1000, 'max_depth': 5, 'learning_rate': 0.2682210846314651, 'min_samples_leaf': 14, 'l2_regularization': 0.8152881878604432}
  Backtesting metric: 0.0009979775565897866



 95%|█████████▌| 19/20 [25:33<01:15, 75.47s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'max_iter': 400, 'max_depth': 4, 'learning_rate': 0.47361554254833266, 'min_samples_leaf': 20, 'l2_regularization': 0.9207154485812868}
  Backtesting metric: 0.001259566662602886



100%|██████████| 20/20 [26:45<00:00, 80.28s/it]

> Done





In [None]:
df_smape_hist.index = pd.Index(['smape'])
hist_mean = round(df_smape_hist.mean(axis=1)[0],4)
print(f" average smape: {hist_mean}")
df_smape_hist

 average smape: 0.0982


Unnamed: 0,321778,345199,345439,324038,309872,316661,345017,327031,312660,334052,330381,328401,345371,301812,330001,305813,309419,335653,345389,345165
smape,0.057,0.266,0.163,0.06,0.058,0.068,0.08,0.056,0.058,0.028,0.045,0.041,0.155,0.076,0.017,0.206,0.12,0.126,0.142,0.143


In [None]:
df_submission_hist.index = pd.date_range(start=pd.Timestamp("2021-12-01") + pd.DateOffset(months=1), periods=26, freq='MS')
df_submission_hist

Unnamed: 0,321778,345199,345439,324038,309872,316661,345017,327031,312660,334052,330381,328401,345371,301812,330001,305813,309419,335653,345389,345165
2022-01-01,515.92,117.25,301.93,249.62,501.15,229.83,216.75,178.33,195.17,542.18,559.69,395.41,159.17,204.11,607.91,118.29,427.83,166.83,158.87,129.25
2022-02-01,515.96,117.25,301.89,249.61,501.28,229.79,216.9,178.31,195.16,542.15,559.67,395.46,159.19,204.11,607.94,118.37,427.93,166.85,159.21,129.33
2022-03-01,516.19,117.28,301.97,249.65,501.42,229.79,216.92,178.3,195.18,542.21,559.64,395.49,159.15,204.16,608.03,118.45,428.04,166.88,159.38,129.27
2022-04-01,516.19,117.23,302.04,249.64,501.57,229.68,216.91,178.25,195.21,542.47,559.86,395.44,159.13,204.42,608.14,118.46,427.93,166.93,159.34,129.25
2022-05-01,516.37,117.35,302.04,249.67,501.55,229.66,216.84,178.35,195.27,543.21,560.19,395.65,159.04,204.46,608.36,118.41,427.94,167.01,159.3,129.07
2022-06-01,516.46,117.44,302.19,249.68,501.71,229.76,216.79,178.38,195.19,543.67,560.64,395.92,159.08,204.41,608.53,118.38,428.21,167.03,159.2,128.89
2022-07-01,516.3,117.63,302.46,249.63,501.65,229.69,216.65,178.33,195.24,543.51,560.69,395.95,159.3,204.48,608.4,118.26,428.2,167.1,159.0,128.66
2022-08-01,516.34,117.53,302.56,249.64,501.57,229.61,216.31,178.38,195.23,543.29,560.67,395.87,159.43,204.56,608.21,118.14,428.14,167.13,158.99,128.63
2022-09-01,516.3,117.5,302.46,249.62,501.7,229.55,216.11,178.33,195.2,542.82,560.44,395.78,159.62,204.52,608.08,118.2,427.93,167.19,159.06,128.76
2022-10-01,516.16,117.45,302.5,249.61,501.54,229.52,216.24,178.26,195.19,542.7,560.29,395.75,159.66,204.54,608.09,118.21,427.77,167.21,159.06,128.77


## Catboost Model

In [None]:
# one-hot encoding
transformer_exog = make_column_transformer(
    (
        OneHotEncoder(sparse_output=False, drop='if_binary'),
        make_column_selector(dtype_exclude=np.number),
    ),
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")


# Regressor hyperparameters search space
def search_space(trial):
  lags_grid = tuple([12, 24, [1, 2, 3, 4, 7, 9, 24]])
  search_space  = {
      'n_estimators'  : trial.suggest_int('n_estimators', 100, 1000, step=100),
      'max_depth'     : trial.suggest_int('max_depth', 3, 10, step=1),
      'learning_rate' : trial.suggest_float('learning_rate', 0.01, 1),
      'lags'          : trial.suggest_categorical('lags', lags_grid)
  }
  return search_space

In [None]:
def search_hyperparameters(data, end_train, end_valid, exog_features,transformer_exog):

  # instantiate a forcaster transformer with categorical features
  forecaster = ForecasterAutoreg(
  regressor = CatBoostRegressor(
                  random_state=123,
                  silent=True,
                  allow_writing_files=False,
                  boosting_type = 'Plain', # Faster training
                  leaf_estimation_iterations = 3, # Faster training
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )


  # search for best parameters
  results_search, frozen_trial = bayesian_search_forecaster(
  forecaster         = forecaster,
  y                  = data.loc[:end_valid, 'gw-level'],
  exog               = data.loc[:end_valid, exog_features],
  search_space       = search_space,
  steps              = 30,
  refit              = False,
  metric             = 'mean_absolute_percentage_error',
  initial_train_size = len(data.loc[:end_train]),
  fixed_train_size   = False,
  n_trials           = 20,
  random_state       = 123,
  return_best        = True,
  n_jobs             = 'auto',
  verbose            = False,
  show_progress      = True
  )

  best_params = results_search['params'].iat[0]

  return best_params

def train_and_predict(data, best_params, actual_data, end_valid, end_train, valid_num, train_num, df_idx, exog_features, transformer_exog):

  # train for evaluation of the model
  forecaster = ForecasterAutoreg(
  regressor = CatBoostRegressor(**best_params,
                  random_state=123,
                  silent=True,
                  allow_writing_files=False,
                  boosting_type = 'Plain', # Faster training
                  leaf_estimation_iterations = 3, # Faster training
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_train, 'gw-level'],
    exog = data.loc[:end_train, exog_features]
  )

  # make predictions and evalute the model
  predictions = forecaster.predict(
      exog     = data.loc[df_idx[train_num+1]:, exog_features],
      steps    = 26
  )
  df_preds = pd.DataFrame(predictions)
  preds = df_preds["pred"].values
  # evaluate on symmetric mean absolute percentage error
  smape_value = smape(actual_data, preds)

  # train for future predictions
  forecaster = ForecasterAutoreg(
  regressor = CatBoostRegressor(**best_params,
                  random_state=123,
                  silent=True,
                  allow_writing_files=False,
                  boosting_type = 'Plain', # Faster training
                  leaf_estimation_iterations = 3, # Faster training
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

  # train the model the time series train and validation dataset
  forecaster.fit(
    y    = data.loc[:end_valid, 'gw-level'],
    exog = data.loc[:end_valid, exog_features]
  )

  # make predictions into the future
  predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 26
  )
  df_preds = pd.DataFrame(predictions)

  # free resources since it's going to run on iterations
  del forecaster

  return df_preds, smape_value

In [None]:
processed_data_dir = "/content/drive/MyDrive/clean_processed_data_part1"
#test_template = "/content/drive/MyDrive/gw_test_empty.csv"

df_submission_cat, df_smape_cat = populate_test_data(processed_data_dir)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.2345829390285611}
  Backtesting metric: 0.0005560948323258439



  5%|▌         | 1/20 [02:48<53:12, 168.03s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.9693326783970091}
  Backtesting metric: 0.0019005686050848983



 10%|█         | 2/20 [07:36<1:11:39, 238.89s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 900, 'max_depth': 10, 'learning_rate': 0.22510978034260581}
  Backtesting metric: 0.0010052876285154132



 15%|█▌        | 3/20 [22:36<2:33:11, 540.67s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.368170769066091}
  Backtesting metric: 0.0005206644116548673



 20%|██        | 4/20 [25:15<1:44:03, 390.20s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.368170769066091}
  Backtesting metric: 0.0005892953835552806



 25%|██▌       | 5/20 [31:20<1:35:17, 381.15s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.012677653971639419}
  Backtesting metric: 0.0007037403318859072



 30%|███       | 6/20 [40:29<1:42:10, 437.93s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 700, 'max_depth': 4, 'learning_rate': 0.6282476644194794}
  Backtesting metric: 0.0007176098541020712



 35%|███▌      | 7/20 [42:51<1:13:55, 341.22s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.018888592010598032}
  Backtesting metric: 0.0004892561432333647



 40%|████      | 8/20 [46:13<59:24, 297.03s/it]  

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.6967438670968221}
  Backtesting metric: 0.0005736500204410869



 45%|████▌     | 9/20 [48:05<43:51, 239.24s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.011036251734997974}
  Backtesting metric: 0.00031828207536629206



 50%|█████     | 10/20 [1:04:11<1:17:13, 463.38s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.057459816909839465}
  Backtesting metric: 0.00031970864888050335



 55%|█████▌    | 11/20 [1:07:48<58:13, 388.14s/it]  

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.2430145228918057}
  Backtesting metric: 0.0004066275628902225



 60%|██████    | 12/20 [1:09:20<39:43, 297.99s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 900, 'max_depth': 8, 'learning_rate': 0.2832707144595471}
  Backtesting metric: 0.0006545736786261157



 65%|██████▌   | 13/20 [1:15:53<38:07, 326.73s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 800, 'max_depth': 9, 'learning_rate': 0.7387169517315111}
  Backtesting metric: 0.0008674957936529703



 70%|███████   | 14/20 [1:19:59<30:14, 302.40s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.3030439020955955}
  Backtesting metric: 0.0001752525906341015



 75%|███████▌  | 15/20 [1:27:45<29:19, 351.82s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.6495879842349511}
  Backtesting metric: 0.0029279596927255227



 80%|████████  | 16/20 [1:31:14<20:34, 308.68s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.013799762791417092}
  Backtesting metric: 0.0007925685178916267



 85%|████████▌ | 17/20 [1:33:51<13:09, 263.06s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.6827176903673347}
  Backtesting metric: 0.0013315726677447053



 90%|█████████ | 18/20 [1:35:48<07:18, 219.34s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.11386606446550962}
  Backtesting metric: 0.0006725002785947311



 95%|█████████▌| 19/20 [1:38:50<03:28, 208.05s/it]

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.13301558451355833}
  Backtesting metric: 0.0013504470434576965



100%|██████████| 20/20 [1:41:45<00:00, 305.25s/it]

> Done





In [None]:
df_smape_cat.index = pd.Index(['smape'])
cat_mean = round(df_smape_cat.mean(axis=1)[0],4)
print(f" average smape: {cat_mean}")
df_smape_cat

 average smape: 0.0868


Unnamed: 0,321778,345199,345439,324038,309872,316661,345017,327031,312660,334052,330381,328401,345371,301812,330001,305813,309419,335653,345389,345165
smape,0.059,0.178,0.119,0.059,0.054,0.058,0.08,0.038,0.108,0.031,0.038,0.046,0.115,0.108,0.025,0.158,0.061,0.145,0.106,0.149


In [None]:
df_submission_cat.index = pd.date_range(start=pd.Timestamp("2021-12-01") + pd.DateOffset(months=1), periods=26, freq='MS')
df_submission_cat

  and should_run_async(code)


Unnamed: 0,300111,345710,329078,301838,345116,345512,326934,321646,335208,309625,309948,328104,310029,335778,326868,306167,345314,313569,313387,331124
2022-01-01,153.95,399.98,237.86,200.8,224.41,230.92,242.37,510.71,328.32,492.39,638.37,424.66,658.52,251.61,283.84,113.23,120.91,181.15,144.01,140.2
2022-02-01,153.96,399.98,237.67,200.76,224.57,231.22,242.34,510.78,328.5,492.51,638.35,424.75,658.57,251.64,283.71,113.22,120.9,181.12,143.98,140.3
2022-03-01,153.96,399.99,237.74,200.89,224.75,230.99,242.31,510.91,328.54,493.06,638.42,424.71,658.63,251.67,283.93,113.51,121.04,181.18,144.1,140.54
2022-04-01,153.95,400.01,237.66,200.88,224.92,230.9,242.58,511.04,328.29,493.6,638.62,424.65,658.6,251.67,283.79,113.68,121.22,181.15,144.14,140.51
2022-05-01,153.94,400.02,237.59,200.8,224.82,230.8,243.16,511.04,328.06,493.54,638.84,424.73,658.6,251.73,283.93,113.69,121.45,181.14,144.11,140.6
2022-06-01,153.9,400.08,237.56,200.62,224.98,230.92,243.72,511.19,327.9,493.14,639.02,424.63,658.71,251.72,283.96,113.6,121.28,181.15,144.1,140.72
2022-07-01,153.93,400.13,237.41,200.56,225.12,230.91,243.68,511.09,328.11,492.95,639.02,424.71,658.81,251.73,283.96,113.48,121.12,181.25,144.03,140.63
2022-08-01,153.93,400.17,237.31,200.56,224.85,230.88,243.64,510.96,328.08,492.8,638.89,424.76,658.97,251.74,283.82,113.38,121.0,181.24,143.86,140.31
2022-09-01,153.93,400.15,237.08,200.68,224.61,230.69,243.57,510.93,328.49,492.83,638.82,424.73,659.04,251.68,283.78,113.22,120.93,181.22,143.93,140.23
2022-10-01,153.86,400.1,237.37,200.82,224.61,231.02,242.96,510.84,329.0,492.65,638.73,424.77,659.05,251.7,283.78,113.39,120.86,181.2,143.95,139.85


In [None]:
smapes_all = {
    "model": ["lightgbm", "xgboost", "histgradientboosting", "catboost"],
    "smape": [lgbm_mean, xgb_mean, hist_mean, cat_mean]
}
df_smapes_all = pd.DataFrame(smapes_all, index=range(4))
df_smapes_all

Unnamed: 0,model,smape
0,lightgbm,0.1015
1,xgboost,0.1477
2,histgradientboosting,0.0982
3,catboost,0.0868


## This is code for a single location prediction

In [15]:
df_exog = pd.read_csv("/content/drive/MyDrive/clean_processed_data_part1/processed_Burgenland-305540.csv")

# set categorical columns astype category for the transformer model to auto detect them
df_exog["season"] = df_exog["season"].astype("category")
df_exog["weather"] = df_exog["weather"].astype("category")
df_exog["date"] = pd.to_datetime(df_exog["date"])
df_exog.set_index("date", inplace=True)

# set the datatime range index to monthly freq
df_exog.index = pd.date_range(start=df_exog.index.min(), end=df_exog.index.max(), freq='MS')
df_exog

Unnamed: 0,gw-level,temp,temp_roll_mean_1_year,temp_roll_mean_2_year,temp_roll_max_1_year,temp_roll_min_1_year,month,year,quarter,season,...,poly_quarter_month_sin,poly_quarter_month_cos,poly_quarter_quarter_sin,poly_quarter_quarter_cos,poly_month_sin_month_cos,poly_month_sin_quarter_sin,poly_month_sin_quarter_cos,poly_month_cos_quarter_sin,poly_month_cos_quarter_cos,poly_quarter_sin_quarter_cos
1957-04-01,117.28,11.14,11.14,11.14,11.14,11.14,4,1957,2,spring,...,1.732051e+00,1.000000e+00,2.449294e-16,-2.000000e+00,4.330127e-01,1.060575e-16,-8.660254e-01,6.123234e-17,-5.000000e-01,-1.224647e-16
1957-05-01,117.21,11.14,11.14,11.14,11.14,11.14,5,1957,2,spring,...,1.931852e+00,5.176381e-01,2.449294e-16,-2.000000e+00,2.500000e-01,1.182918e-16,-9.659258e-01,3.169619e-17,-2.588190e-01,-1.224647e-16
1957-06-01,117.10,11.14,11.14,11.14,11.14,11.14,6,1957,2,summer,...,2.000000e+00,1.224647e-16,2.449294e-16,-2.000000e+00,6.123234e-17,1.224647e-16,-1.000000e+00,7.498799e-33,-6.123234e-17,-1.224647e-16
1957-07-01,117.00,11.14,11.14,11.14,11.14,11.14,7,1957,3,summer,...,2.897777e+00,-7.764571e-01,-3.000000e+00,-5.510911e-16,-2.500000e-01,-9.659258e-01,-1.774377e-16,2.588190e-01,4.754429e-17,1.836970e-16
1957-08-01,116.97,11.14,11.14,11.14,11.14,11.14,8,1957,3,summer,...,2.598076e+00,-1.500000e+00,-3.000000e+00,-5.510911e-16,-4.330127e-01,-8.660254e-01,-1.590863e-16,5.000000e-01,9.184851e-17,1.836970e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-01,0.00,14.97,12.43,12.40,14.85,10.78,10,2023,4,Fall,...,2.000000e+00,-3.464102e+00,-9.797174e-16,4.000000e+00,-4.330127e-01,-1.224647e-16,5.000000e-01,2.121150e-16,-8.660254e-01,-2.449294e-16
2023-11-01,0.00,14.87,12.44,12.40,14.97,10.78,11,2023,4,Fall,...,1.035276e+00,-3.863703e+00,-9.797174e-16,4.000000e+00,-2.500000e-01,-6.339238e-17,2.588190e-01,2.365836e-16,-9.659258e-01,-2.449294e-16
2023-12-01,0.00,12.50,12.49,12.44,14.97,10.78,12,2023,4,winter,...,4.898587e-16,-4.000000e+00,-9.797174e-16,4.000000e+00,-1.224647e-16,-2.999520e-32,1.224647e-16,2.449294e-16,-1.000000e+00,-2.449294e-16
2024-01-01,0.00,12.28,12.49,12.44,14.97,10.78,1,2024,1,winter,...,2.588190e-01,9.659258e-01,1.000000e+00,6.123234e-17,2.500000e-01,2.588190e-01,1.584810e-17,9.659258e-01,5.914590e-17,6.123234e-17


In [18]:
# one-hot encoding
transformer_exog = make_column_transformer(
    (
        OneHotEncoder(sparse_output=False, drop='if_binary'),
        make_column_selector(dtype_exclude=np.number),
    ),
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")


# Lags grid
lags_grid = tuple([12, 24, [1, 2, 3, 4, 7, 9, 24]])

# Regressor hyperparameters search space
def search_space(trial):
  lags_grid = tuple([12, 24, [1, 2, 3, 4, 7, 9, 24]])
  search_space  = {
      'n_estimators'  : trial.suggest_int('n_estimators', 100, 1000, step=100),
      'max_depth'     : trial.suggest_int('max_depth', 3, 10, step=1),
      'learning_rate' : trial.suggest_float('learning_rate', 0.01, 1),
      'lags'          : trial.suggest_categorical('lags', lags_grid)
  }
  return search_space

In [19]:
def search_hyperparameters(data, one_hot_encoder):

    # get the estimate end train and end validation dates
    exog_data = data.drop("gw-level", axis=1)
    exog_features = exog_data.columns
    df_idx = data.index
    train_num = int(len(data) * 0.8)
    end_train = "2012-12-01"
    end_valid = "2021-12-01"

    # instantiate a forcaster transformer with categorical features
    forecaster = ForecasterAutoreg(
    regressor = CatBoostRegressor(
                    random_state=123,
                    silent=True,
                    allow_writing_files=False,
                    boosting_type = 'Plain', # Faster training
                    leaf_estimation_iterations = 3, # Faster training
                ),
    lags = 24,
    transformer_exog = one_hot_encoder
    )

    # search for best parameters
    results_search, frozen_trial = bayesian_search_forecaster(
    forecaster         = forecaster,
    y                  = data.loc[:end_valid, 'gw-level'],
    exog               = data.loc[:end_valid, exog_features],
    search_space       = search_space,
    steps              = 26,
    refit              = False,
    metric             = 'mean_absolute_percentage_error',
    initial_train_size = len(data.loc[:end_train]),
    fixed_train_size   = False,
    n_trials           = 20,
    random_state       = 123,
    return_best        = True,
    n_jobs             = 'auto',
    verbose            = False,
    show_progress      = True
    )
    best_params = results_search['params'].iat[0]

    return best_params, forecaster

best_params, forecaster =  search_hyperparameters(df_exog, transformer_exog)

  0%|          | 0/20 [00:00<?, ?it/s]



`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  7  9 24] 
  Parameters: {'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.48612258246951734}
  Backtesting metric: 0.00363004906467534



In [20]:
data = df_exog.copy()
exog_data = data.drop("gw-level", axis=1)
exog_features = exog_data.columns
df_idx = data.index
train_num = int(len(data) * 0.8)
valid_num = len(data.loc[:"2021-11-01"])
end_train = df_idx[train_num]
end_valid = df_idx[valid_num]
end_evaluation = df_idx[train_num+50]
evaluate_data = data.loc[df_idx[train_num+1]: end_evaluation, "gw-level"].values

In [28]:
# train model with best params
forecaster = ForecasterAutoreg(
  regressor = CatBoostRegressor(**best_params,
                  random_state=123,
                  silent=True,
                  allow_writing_files=False,
                  boosting_type = 'Plain', # Faster training
                  leaf_estimation_iterations = 3, # Faster training
              ),
  lags = 24,
  transformer_exog = transformer_exog
  )

forecaster.fit(
    y    = data.loc[:end_valid, 'gw-level'],
    exog = data.loc[:end_valid, exog_features]
)

  and should_run_async(code)


In [25]:
# make predictions and evalute the model
predictions = forecaster.predict(
    exog     = data.loc[df_idx[train_num+1]:, exog_features],
    steps    = 50
)
pred_df = pd.DataFrame(predictions)

In [27]:
preds = pred_df["pred"].values
smape_value = smape(evaluate_data, preds)
pred_df.loc["smape", "pred"] = smape_value
pred_df

Unnamed: 0,pred
2010-11-01 00:00:00,116.743928
2010-12-01 00:00:00,116.769964
2011-01-01 00:00:00,116.643535
2011-02-01 00:00:00,116.531555
2011-03-01 00:00:00,116.508718
2011-04-01 00:00:00,116.500629
2011-05-01 00:00:00,116.513734
2011-06-01 00:00:00,116.540813
2011-07-01 00:00:00,116.615981
2011-08-01 00:00:00,116.686118


In [29]:
# make predictions into the future
predictions = forecaster.predict(
    exog     = data.loc[df_idx[valid_num+1]:, exog_features],
    steps    = 26
)
pd.DataFrame(predictions)

Unnamed: 0,pred
2022-01-01,115.650742
2022-02-01,115.710939
2022-03-01,115.830525
2022-04-01,115.720865
2022-05-01,115.759641
2022-06-01,115.765882
2022-07-01,115.691732
2022-08-01,115.642076
2022-09-01,115.516711
2022-10-01,115.649122


In [31]:
# Extract feature importance
importance = forecaster.get_feature_importances()
importance.sort_values(by='importance', ascending=False).head(20)

Unnamed: 0,feature,importance
1,lag_2,31.465973
0,lag_1,24.828055
106,poly_month_sin_quarter_sin,4.944304
8,lag_9,4.201372
56,poly_temp_roll_mean_1_year_temp_roll_mean_2_year,3.679363
10,lag_11,3.492097
84,poly_temp_roll_min_1_year_year,3.422089
7,lag_8,2.061529
99,poly_year_quarter_sin,1.669685
9,lag_10,1.483241
