In [12]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'c:\\Users\\Joaquín Amat\\Documents\\GitHub\\skforecast'

In [13]:
from typing import Union, Tuple, Optional, Callable
import pandas as pd
import warnings
import logging
from copy import deepcopy
from joblib import Parallel, delayed, cpu_count
from tqdm.auto import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import ParameterSampler

from skforecast.exceptions import LongTrainingWarning
from skforecast.model_selection.model_selection import _get_metric
from skforecast.model_selection.model_selection import _create_backtesting_folds
from skforecast.utils import check_backtesting_input
from skforecast.utils import select_n_jobs_backtesting
from skforecast.ForecasterSarimax import ForecasterSarimax
from pmdarima import ARIMA



def _evaluate_grid_hyperparameters_sarimax_ic(
    forecaster,
    y: pd.Series,
    param_grid: dict,
    metric: Union[str, list],
    exog: Optional[Union[pd.Series, pd.DataFrame]]=None,
    return_best: bool=True,
    show_progress: bool=True,
    suppress_warnings_fit: bool=True
) -> pd.DataFrame:
    """
    Evaluate parameter values for a ForecasterSarimax object using information criteria.
    
    Parameters
    ----------
    forecaster : ForecasterSarimax
        Forecaster model.
    y : pandas Series
        Training time series. 
    param_grid : dict
        Dictionary with parameters names (`str`) as keys and lists of parameter
        settings to try as values.
    metric : str, Callable, list
        Metric used to quantify the goodness of fit of the model. Allowed inputs are:
        {'aic', 'aicc', 'bic', 'hqic'} or a list containing multiple metrics.
    exog : pandas Series, pandas DataFrame, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and should be aligned so that y[i] is
        regressed on exog[i].
    return_best : bool, default `True`
        Refit the `forecaster` using the best found parameters on the whole data.
    show_progress: bool, default `True`
        Whether to show a progress bar.
    suppress_warnings_fit : bool, default `True`
        If True, warnings during model fitting are suppressed.

    Returns
    -------
    results : pandas DataFrame
        Results for each combination of parameters.

            - column params: lower bound of the interval.
            - column metric: metric value estimated for the combination of parameters.
            - additional n columns with param = value.

    """
    print(f"Number of models compared: {len(param_grid)}.")
    if not isinstance(metric, list):
        metric = [metric]

    params_list = []
    metric_results = []    
    
    if show_progress:
        param_grid = tqdm(param_grid, desc='params grid', position=0)
    for params in param_grid:
        params_list.append(params)
        forecaster.set_params(params)
        try:
            forecaster.fit(y, exog=exog, suppress_warnings=suppress_warnings_fit)
            metric_values = {m: forecaster.get_info_criteria(m) for m in metric}  
        except:
            metric_values = {m: None for m in metric}
        metric_results.append(metric_values)

    results = pd.DataFrame({
                 'params': params_list,
                 'metrics': metric_results
              })

    results = pd.concat([results, results['metrics'].apply(pd.Series)], axis=1)
    results = pd.concat([results, results['params'].apply(pd.Series)], axis=1)
    results = results.drop(columns=['metrics'])
    results = results.sort_values(by=metric[0], ascending=True)

    if return_best:
        
        best_params = results['params'].iloc[0]
        best_metric = results[metric[0]].iloc[0]
        
        forecaster.set_params(best_params)
        forecaster.fit(y, exog=exog, suppress_warnings=suppress_warnings_fit)
        
        print(
            f"`Forecaster` refitted using the best-found parameters, and the whole data set: \n"
            f"  Parameters: {best_params}\n"
            f"  Backtesting metric: {best_metric}\n"
        )
            
    return results
    


In [14]:
def grid_search_sarimax(
    forecaster,
    y: pd.Series,
    param_grid: dict,
    steps: int,
    metric: Union[str, Callable, list],
    initial_train_size: int,
    fixed_train_size: bool=True,
    gap: int=0,
    allow_incomplete_fold: bool=True,
    exog: Optional[Union[pd.Series, pd.DataFrame]]=None,
    refit: Optional[Union[bool, int]]=False,
    return_best: bool=True,
    n_jobs: Optional[Union[int, str]]='auto',
    verbose: bool=True,
    show_progress: bool=True,
    suppress_warnings_fit: bool=True
) -> pd.DataFrame:
    """
    Exhaustive search over specified parameter values for a ForecasterSarimax object.
    Validation is done using time series backtesting.
    
    Parameters
    ----------
    forecaster : ForecasterSarimax
        Forecaster model.
    y : pandas Series
        Training time series. 
    param_grid : dict
        Dictionary with parameters names (`str`) as keys and lists of parameter
        settings to try as values.
    steps : int
        Number of steps to predict.
    metric : str, Callable, list
        Metric used to quantify the goodness of fit of the model.
        
            - If `string`: {'mean_squared_error', 'mean_absolute_error',
             'mean_absolute_percentage_error', 'mean_squared_log_error'}
            - If `Callable`: Function with arguments y_true, y_pred that returns 
            a float.
            - If `list`: List containing multiple strings and/or Callables.
    initial_train_size : int 
        Number of samples in the initial train split. The backtest forecaster is
        trained using the first `initial_train_size` observations.
    fixed_train_size : bool, default `True`
        If True, train size doesn't increase but moves by `steps` in each iteration.
    gap : int, default `0`
        Number of samples to be excluded after the end of each training set and 
        before the test set.
    allow_incomplete_fold : bool, default `True`
        Last fold is allowed to have a smaller number of samples than the 
        `test_size`. If `False`, the last fold is excluded.
    exog : pandas Series, pandas DataFrame, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and should be aligned so that y[i] is
        regressed on exog[i].
    refit : bool, int, default `False`
        Whether to re-fit the forecaster in each iteration. If `refit` is an integer, 
        the Forecaster will be trained every that number of iterations.
    return_best : bool, default `True`
        Refit the `forecaster` using the best found parameters on the whole data.
    n_jobs : int, 'auto', default `'auto'`
        The number of jobs to run in parallel. If `-1`, then the number of jobs is 
        set to the number of cores. If 'auto', `n_jobs` is set using the function
        skforecast.utils.select_n_jobs_backtesting.
        **New in version 0.9.0**
    verbose : bool, default `True`
        Print number of folds used for cv or backtesting.
    show_progress: bool, default `True`
        Whether to show a progress bar.
    suppress_warnings_fit: bool, default `True`
        If `True`, warnings during model fitting are suppressed.

    Returns
    -------
    results : pandas DataFrame
        Results for each combination of parameters.

            - column params: parameters configuration for each iteration.
            - column metric: metric value estimated for each iteration.
            - additional n columns with param = value.
    
    """

    param_grid = list(ParameterGrid(param_grid))

    if (
        isinstance(metric, str)
        and metric in ["aic", "bic", "hqic"]
        or isinstance(metric, list)
        and set(metric).issubset(["aic", "bic", "hqic"])
    ):
        results = _evaluate_grid_hyperparameters_sarimax_ic(
            forecaster            = forecaster,
            y                     = y,
            param_grid            = param_grid,
            metric                = metric,
            return_best           = return_best,
            show_progress         = show_progress,
            suppress_warnings_fit = suppress_warnings_fit,
        )

    else:
        results = _evaluate_grid_hyperparameters_sarimax(
            forecaster            = forecaster,
            y                     = y,
            param_grid            = param_grid,
            steps                 = steps,
            metric                = metric,
            initial_train_size    = initial_train_size,
            fixed_train_size      = fixed_train_size,
            gap                   = gap,
            allow_incomplete_fold = allow_incomplete_fold,
            exog                  = exog,
            refit                 = refit,
            return_best           = return_best,
            n_jobs                = n_jobs,
            verbose               = verbose,
            show_progress         = show_progress
        )

    return results

In [15]:
# Download data
# ======================================================================================
url = (
    'https://raw.githubusercontent.com/JoaquinAmatRodrigo/Estadistica-machine-learning-python/'
    'master/data/consumos-combustibles-mensual.csv'
)
data = pd.read_csv(url, sep=',')
data = data[['Fecha', 'Gasolinas']]
data = data.rename(columns={'Fecha':'date', 'Gasolinas':'litters'})
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
data = data.set_index('date')
data = data.loc[:'1990-01-01 00:00:00']
data = data.asfreq('MS')
data = data['litters']
data.head(4)

# Train-test dates
# ======================================================================================
end_train = '1980-01-01 00:00:00'
print(f"Train dates : {data.index.min()} --- {data.loc[:end_train].index.max()}  (n={len(data.loc[:end_train])})")
print(f"Test dates  : {data.loc[end_train:].index.min()} --- {data.loc[:].index.max()}  (n={len(data.loc[end_train:])})")
data_train = data.loc[:end_train]
data_test  = data.loc[end_train:]

forecaster = ForecasterSarimax(
                 regressor=ARIMA(order=(1, 1, 1), maxiter=500), # Placeholder replaced in the grid search
             )

param_grid = {
    'order': [(1, 1, 0), (0, 1, 1), (1, 1, 1), (2, 1, 1)],
    'seasonal_order': [(0, 0, 0, 0), (0, 1, 0, 12), (1, 1, 1, 12)],
    'trend': [None, 'n', 'c']
}


# param_grid = list(ParameterGrid(param_grid))

# results_grid = _evaluate_grid_hyperparameters_sarimax_ic(
#                    forecaster              = forecaster,
#                    y                       = data,
#                    param_grid              = param_grid,
#                    metric                  = ['aic', 'bic'],
#                    return_best             = False,
#                    show_progress           = True,
#                    suppress_warnings_fit   = False,
#                )

results_grid = grid_search_sarimax(
                   forecaster              = forecaster,
                   y                       = data,
                   initial_train_size      = 100, # No es necesario pero da error sin el 
                   steps                   = 100, # No es necesario pero da error sin el 
                   param_grid              = param_grid,
                   metric                  = ['aic', 'bic'],
                   return_best             = False,
                   show_progress           = True,
                   suppress_warnings_fit   = False,
               )

results_grid

Train dates : 1969-01-01 00:00:00 --- 1980-01-01 00:00:00  (n=133)
Test dates  : 1980-01-01 00:00:00 --- 1990-01-01 00:00:00  (n=121)
Number of models compared: 36.


params grid:   0%|          | 0/36 [00:00<?, ?it/s]

Unnamed: 0,params,aic,bic,order,seasonal_order,trend
25,"{'order': (1, 1, 1), 'seasonal_order': (1, 1, ...",5471.444287,5488.847482,"(1, 1, 1)","(1, 1, 1, 12)",n
34,"{'order': (2, 1, 1), 'seasonal_order': (1, 1, ...",5473.180691,5494.064525,"(2, 1, 1)","(1, 1, 1, 12)",n
26,"{'order': (1, 1, 1), 'seasonal_order': (1, 1, ...",5473.324206,5494.20804,"(1, 1, 1)","(1, 1, 1, 12)",c
24,"{'order': (1, 1, 1), 'seasonal_order': (1, 1, ...",5473.324206,5494.20804,"(1, 1, 1)","(1, 1, 1, 12)",
35,"{'order': (2, 1, 1), 'seasonal_order': (1, 1, ...",5475.067119,5499.431592,"(2, 1, 1)","(1, 1, 1, 12)",c
33,"{'order': (2, 1, 1), 'seasonal_order': (1, 1, ...",5475.067119,5499.431592,"(2, 1, 1)","(1, 1, 1, 12)",
7,"{'order': (1, 1, 0), 'seasonal_order': (1, 1, ...",5482.577761,5496.500317,"(1, 1, 0)","(1, 1, 1, 12)",n
6,"{'order': (1, 1, 0), 'seasonal_order': (1, 1, ...",5484.559063,5501.962257,"(1, 1, 0)","(1, 1, 1, 12)",
8,"{'order': (1, 1, 0), 'seasonal_order': (1, 1, ...",5484.559063,5501.962257,"(1, 1, 0)","(1, 1, 1, 12)",c
16,"{'order': (0, 1, 1), 'seasonal_order': (1, 1, ...",5485.157587,5499.080143,"(0, 1, 1)","(1, 1, 1, 12)",n
