# Forecasting Optuna Search CV 
## Modulus Set 3

**Notebook Goal**
- A modeling pipeline that optimizes the hyperparameters of the sktime forecasters that have the [capavility:pred_int tag](https://www.sktime.net/en/stable/examples/01b_forecasting_proba.html) 
- This notebook will focus on the ones where `i mod 4 = 3` wher `i` is the index of the registry table in the above link.
- The work will be based on this documentation: [ForecastingOptunaSearchCV](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.model_selection.ForecastingOptunaSearchCV.html)

In [3]:
# Load the autoreload extension
%load_ext autoreload

# Reload all modules automatically before executing code
%autoreload 2

In [8]:
from sktime.registry import all_estimators
from dotenv import load_dotenv
import sys
import os

# Load environment variables from .env file
load_dotenv()
repo_dir = os.path.abspath(os.path.join(os.getcwd(), "../../"))  # points to the root
src_dir = os.path.join(repo_dir, "src")
sys.path.insert(0, repo_dir)
# Now the import works! :)
from src.data.data_loader import load_data
from src.data.data_cleaner import clean_data, perform_train_test_split

# Load the data
df = load_data()
# Get the list of forecasters with prediction interval capability
models = all_estimators(
    "forecaster", filter_tags={"capability:pred_int": True}, as_dataframe=True
)


  return pd.read_csv(file_path, usecols=keep_columns)


In [9]:
clean_data(df) # in place
print(df.columns)

Index(['id', 'p_num', 'time', 'bg-0:00', 'insulin-0:00', 'carbs-0:00',
       'hr-0:00', 'steps-0:00', 'cals-0:00', 'activity-0:00', 'bg+1:00'],
      dtype='object')


In [17]:
filtered_models = models.iloc[3::4]
filtered_models

Unnamed: 0,name,object
3,AutoETS,<class 'sktime.forecasting.ets.AutoETS'>
7,ColumnEnsembleForecaster,<class 'sktime.forecasting.compose._column_ens...
11,DirRecTabularRegressionForecaster,<class 'sktime.forecasting.compose._reduce.Dir...
15,DynamicFactor,<class 'sktime.forecasting.dynamic_factor.Dyna...
19,ForecastingGridSearchCV,<class 'sktime.forecasting.model_selection._tu...
23,ForecastingSkoptSearchCV,<class 'sktime.forecasting.model_selection._tu...
27,NaiveForecaster,<class 'sktime.forecasting.naive.NaiveForecast...
31,Prophet,<class 'sktime.forecasting.fbprophet.Prophet'>
35,PytorchForecastingNHiTS,<class 'sktime.forecasting.pytorchforecasting....
39,SARIMAX,<class 'sktime.forecasting.sarimax.SARIMAX'>


In [10]:
y_train, y_test, X_train, X_test = perform_train_test_split(df, target_col='bg-0:00')

### Load Search Space

Load the search space configurations (in the YAML file)

In [18]:
import yaml
from sktime.forecasting.model_selection import (
    ForecastingOptunaSearchCV,
)
import optuna
import optuna.distributions
from sktime.split import ExpandingWindowSplitter
from sktime.forecasting.base import ForecastingHorizon
from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError

# Define forecasting horizon
fh = ForecastingHorizon(y_test.index, is_relative=False)

# Cross-validation strategy
cv = ExpandingWindowSplitter(initial_window=int(len(y_train) * 0.5), step_length=1, fh=fh)


In [19]:
def map_yaml_to_optuna(param_dict):
    '''
    Maps the search grid in the yaml file to distributions used
    by OptunaSearch for search_grid
    Args:
        param_dict: the yaml file contents. See search_space.yaml for an example
    '''
    optuna_params = {}
    for param, details in param_dict.items():
        if details["type"] == "int":
            optuna_params[param] = optuna.distributions.IntDistribution(details["low"], details["high"])
        elif details["type"] == "float":
            optuna_params[param] = optuna.distributions.FloatDistribution(details["low"], details["high"])
        elif details["type"] == "categorical":
            optuna_params[param] = optuna.distributions.CategoricalDistribution(details["values"])
    return optuna_params

In [37]:
def perform_optuna_search(param_space):
    '''
    Performs optuna search on the models specified in filtered_models dataframe
    Args:
        param_space: a dictionary of search space. Keys as model values, and values contain a dictionary of the search space. See search_space.yaml for example
    Returns:
        A dictionary. The keys are the model names. The values are as follows:
        {
            "best_forecaster": instance of best one for the model,
            "best_params": best params for the model,
            "best_score": best score for the model,
        }
    '''
    best_forecasters = {}

    for _, row in filtered_models.iterrows():
        model_name = row['name']
        model_class = row['object']

        # Check if model has hyperparameter space defined in YAML
        if model_name not in param_space.keys():
            print(f"No hyperparameter space defined for {model_name}. Skipping.")
            continue

        # Instantiate the model
        try:
            forecaster = model_class()
        except Exception as e:
            print(f"Failed to initialize {model_name}: {e}")
            continue

        # Load the model's hyperparameter space
        optuna_param_distributions = map_yaml_to_optuna(param_space[model_name])

        # Run OptunaSearch
        optuna_search = ForecastingOptunaSearchCV(
            forecaster=forecaster,
            param_grid=optuna_param_distributions,
            cv=cv,
            scoring=MeanAbsolutePercentageError(symmetric=False),
            n_evals=50,
        )

        print(f"Running OptunaSearch for {model_name}...")
        try:
            optuna_search.fit(y_train, X_train)
        except Exception as e:
            print(f"Failed to fit {model_name}: {e}")
            continue

        # Store best results
        best_forecasters[model_name] = {
            "best_forecaster": optuna_search.best_forecaster_,
            "best_params": optuna_search.best_params_,
            "best_score": optuna_search.best_score_,
        }
    
    return best_forecasters

In [35]:
with open("search_space.yaml", "r") as file:
    param_space = yaml.safe_load(file)

# print(param_space['models'].keys())
param_space = param_space['models']
res = perform_optuna_search(param_space)

  warn(
[I 2025-01-28 16:56:26,594] A new study created in memory with name: no-name-ad58d5e7-e4be-4c70-9449-bcb5d2d355e2
  warn(
[I 2025-01-28 16:56:26,633] A new study created in memory with name: no-name-6b9f436f-3382-4918-bcad-926ae41ebace


Running OptunaSearch for AutoETS...
Failed to fit AutoETS: `fh` must be relative, but found absolute `fh`
No hyperparameter space defined for ColumnEnsembleForecaster. Skipping.
No hyperparameter space defined for DirRecTabularRegressionForecaster. Skipping.
No hyperparameter space defined for DynamicFactor. Skipping.
No hyperparameter space defined for ForecastingGridSearchCV. Skipping.
No hyperparameter space defined for ForecastingSkoptSearchCV. Skipping.
Running OptunaSearch for NaiveForecaster...
Failed to fit NaiveForecaster: `fh` must be relative, but found absolute `fh`
No hyperparameter space defined for Prophet. Skipping.
No hyperparameter space defined for PytorchForecastingNHiTS. Skipping.
Running OptunaSearch for SARIMAX...
Failed to fit SARIMAX: ForecastingOptunaSearchCV cannot handle missing data (nans), but y passed contained missing data.
No hyperparameter space defined for StatsForecastAutoARIMA. Skipping.
No hyperparameter space defined for StatsForecastAutoTheta. Sk

  warn(


In [36]:
res

{}