# Forecasting Optuna Search CV 
## Modulus Set 3

**Notebook Goal**
- A modeling pipeline that optimizes the hyperparameters of the sktime forecasters that have the [capability:pred_int tag](https://www.sktime.net/en/stable/examples/01b_forecasting_proba.html)
- This notebook will focus on the ones where `i mod 4 = 3`, where `i` is the forecaster's row index in the registry table in the above link.
- The work will be based on this documentation: [ForecastingOptunaSearchCV](https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.model_selection.ForecastingOptunaSearchCV.html)

In [45]:
# Load the autoreload extension
%load_ext autoreload

# Reload all modules automatically before executing code
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
from sktime.registry import all_estimators
from dotenv import load_dotenv
import sys
import os

# Read the variables defined in the .env file.
load_dotenv()
# Two levels up from the working directory is presumably the repository root.
repo_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
src_dir = os.path.join(repo_dir, "src")
# Put the repository root first on sys.path so `src.*` imports resolve.
sys.path.insert(0, repo_dir)
# Now the import works! :)
from src.data.data_loader import load_data
from src.data.data_cleaner import clean_data, perform_train_test_split

# Raw dataset, as returned by the project's loader.
df = load_data()
# Registry table (one row per estimator) restricted to forecasters that
# carry the prediction-interval capability tag.
models = all_estimators(
    estimator_types="forecaster",
    filter_tags={"capability:pred_int": True},
    as_dataframe=True,
)


  return pd.read_csv(file_path, usecols=keep_columns)


In [47]:
# Rebind df to whatever clean_data returns, then show the remaining columns.
# NOTE(review): an earlier comment here said "in place"; whether clean_data
# also mutates its argument is not visible from this notebook — confirm.
df = clean_data(df)
print(df.columns)

Index(['id', 'p_num', 'time', 'bg-0:00', 'insulin-0:00', 'carbs-0:00',
       'hr-0:00', 'steps-0:00', 'cals-0:00', 'bg+1:00'],
      dtype='object')


In [48]:
# Keep every 4th registry row starting at position 3 (positions 3, 7, 11, ...),
# i.e. the rows whose positional index i satisfies i mod 4 == 3.
filtered_models = models.iloc[3::4]

In [49]:
# Split the cleaned frame into target (y) and exogenous (X) train/test parts,
# forecasting the current blood-glucose column 'bg-0:00'.
# NOTE(review): the cleaned frame also has a 'bg+1:00' column; confirm that
# perform_train_test_split keeps it out of X, otherwise it presumably leaks
# a future value of the target.
y_train, y_test, X_train, X_test = perform_train_test_split(df, target_col='bg-0:00')

### Load Search Space

Load the search space configurations (in the YAML file)

In [81]:
import yaml
from sktime.forecasting.model_selection import (
    ForecastingOptunaSearchCV,
)
import optuna
import optuna.distributions
from sktime.split import ExpandingWindowSplitter
from sktime.forecasting.base import ForecastingHorizon
from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError
import pandas as pd

# Define forecasting horizon
# Builds an absolute horizon from the values 1..8 and converts it to a
# relative one, using the last index of y_train as the cutoff.
# NOTE(review): if y_train has an integer index, the absolute points 1..8 lie
# before the cutoff, so the relative steps would presumably be negative
# (in-sample). If the intent is "the next 8 steps", this likely should be
# built with is_relative=True instead — confirm against y_train.index.
fh_relative = ForecastingHorizon([1,2,3,4,5,6,7,8], is_relative=False).to_relative(cutoff=y_train.index[-1])
# Cross-validation strategy
# Expanding window: the first training window is half of y_train and each
# subsequent split moves forward by one observation.
# NOTE(review): step_length=1 presumably produces a very large number of
# splits on a long series — confirm the resulting search cost is intended.
cv_relative = ExpandingWindowSplitter(initial_window=int(len(y_train) * 0.5), step_length=1, fh=fh_relative)


In [136]:
def _lookup_model_class(model_name):
    """Return the class registered under ``model_name`` in ``models``, or None.

    ``models`` is the module-level registry DataFrame with ``name`` and
    ``object`` columns. Non-string names never match.
    """
    if not isinstance(model_name, str):
        return None
    matches = models.loc[models['name'] == model_name, 'object']
    return None if matches.empty else matches.iloc[0]


def map_yaml_to_optuna(param_dict):
    """Translate one model's YAML search-space entry into Optuna inputs.

    Each entry of ``param_dict`` is handled according to its shape:

    * a non-dict value is a fixed constructor argument;
    * a dict with ``type`` of ``int`` / ``float`` (keys ``low``, ``high``) or
      ``categorical`` (key ``values``) becomes an Optuna distribution;
    * ``type: model_name`` instantiates the registered model named by
      ``model_name`` with the optional ``hyperparameters`` and uses the
      instance as a fixed constructor argument;
    * ``type: list`` (key ``values``) becomes a fixed list argument in which
      every ``[name, model_name, columns]`` item whose ``model_name`` is in
      the registry is replaced by ``(name, model_instance, columns)``;
    * a dict without ``type`` is processed recursively.

    Args:
        param_dict: the YAML contents for a single model (see
            search_space.yaml for an example). ``None`` or an empty mapping
            is treated as "no parameters".

    Returns:
        A tuple ``(optuna_params, default_params)``: the distributions to
        search over, and the fixed keyword arguments for the constructor.
    """
    optuna_params = {}
    default_params = {}

    # A model listed in the YAML with an empty body is loaded as None.
    if not param_dict:
        return optuna_params, default_params

    for param, details in param_dict.items():
        # Plain values are passed to the constructor unchanged.
        if not isinstance(details, dict):
            default_params[param] = details
            continue

        if 'type' not in details:
            # Nested mapping: the recursive call returns a tuple, so unpack
            # it instead of storing the tuple as if it were a distribution.
            nested_optuna, nested_defaults = map_yaml_to_optuna(details)
            # Nested distributions are exposed under "<param>__<sub_param>",
            # the naming already used by keys such as 'naive__strategy'.
            for sub_param, distribution in nested_optuna.items():
                optuna_params[f"{param}__{sub_param}"] = distribution
            # A mapping with fixed values (or an empty one) stays a literal
            # dict-valued argument.
            if nested_defaults or not nested_optuna:
                default_params[param] = nested_defaults
            continue

        param_type = details['type']
        if param_type == 'int':
            optuna_params[param] = optuna.distributions.IntDistribution(details['low'], details['high'])
        elif param_type == 'float':
            optuna_params[param] = optuna.distributions.FloatDistribution(details['low'], details['high'])
        elif param_type == 'categorical':
            optuna_params[param] = optuna.distributions.CategoricalDistribution(details['values'])
        elif param_type == 'model_name':
            # e.g. the forecaster argument of a wrapper model: build the
            # wrapped model here and pass the instance as a fixed argument.
            model_name = details.get('model_name')
            # "hyperparameters:" with no value loads as None; treat as empty.
            hyperparameters = details.get('hyperparameters') or {}
            model_cls = _lookup_model_class(model_name)
            if model_cls is None:
                print(f"Model name '{model_name}' not found in models DataFrame.")
            else:
                print(hyperparameters)  # debug trace
                default_params[param] = model_cls(**hyperparameters)
        elif param_type == "list":
            processed_list = []
            for item in details["values"]:
                # Only 3-part list items can describe (name, model, columns);
                # anything else is kept as-is rather than indexed blindly.
                model_cls = (
                    _lookup_model_class(item[1])
                    if isinstance(item, list) and len(item) >= 3
                    else None
                )
                if model_cls is None:
                    processed_list.append(item)
                else:
                    print("IS LIST")  # debug trace
                    processed_list.append((item[0], model_cls(), item[2]))
            default_params[param] = processed_list
        else:
            # Previously dropped without any message; keep skipping but say so.
            print(f"Unsupported parameter type '{param_type}' for '{param}'. Skipping.")

    return optuna_params, default_params

In [90]:
def perform_optuna_search(param_space, *, n_evals=50):
    """Run an Optuna hyperparameter search for each model in ``filtered_models``.

    Reads the module-level ``filtered_models`` (registry rows with ``name``
    and ``object`` columns), ``cv_relative``, ``y_train`` and ``X_train``.
    Models that have no entry in ``param_space``, that cannot be
    instantiated, or whose search fails are reported with ``print`` and
    skipped, so one failing model does not abort the whole run.

    Args:
        param_space: a dictionary of search spaces. Keys are model names and
            values are that model's search-space dictionary (see
            search_space.yaml for an example). ``None`` is treated as empty.
        n_evals: number of Optuna evaluations per model, forwarded to
            ``ForecastingOptunaSearchCV``. Defaults to 50.

    Returns:
        A dictionary. The keys are the model names. The values are as follows:
        {
            "best_forecaster": instance of best one for the model,
            "best_params": best params for the model,
            "best_score": best score for the model,
        }
    """
    best_forecasters = {}
    # An empty or missing search space means every model is skipped.
    param_space = param_space or {}

    for model_name, model_class in zip(filtered_models['name'], filtered_models['object']):
        # Check if model has hyperparameter space defined in YAML
        if model_name not in param_space:
            print(f"No hyperparameter space defined for {model_name}. Skipping.")
            continue

        # Split the YAML entry into distributions to search and fixed
        # constructor arguments.
        optuna_param_distributions, default_params = map_yaml_to_optuna(param_space[model_name])

        # Instantiate the model; a constructor failure only skips this model.
        try:
            forecaster = model_class(**default_params)
        except Exception as e:
            print(f"Failed to initialize {model_name}: {e}")
            continue

        optuna_search = ForecastingOptunaSearchCV(
            forecaster=forecaster,
            param_grid=optuna_param_distributions,
            cv=cv_relative,
            scoring=MeanAbsolutePercentageError(symmetric=False),
            n_evals=n_evals,
        )

        print(f"Running OptunaSearch for {model_name}...")
        # Same best-effort policy for the search itself.
        try:
            optuna_search.fit(y_train, X_train)
        except Exception as e:
            print(f"Failed to fit {model_name}: {e}")
            continue

        # Store best results
        best_forecasters[model_name] = {
            "best_forecaster": optuna_search.best_forecaster_,
            "best_params": optuna_search.best_params_,
            "best_score": optuna_search.best_score_,
        }

    return best_forecasters

In [117]:
# Narrow the run to a single forecaster. This rebinds filtered_models, so the
# search function picks up this one-row selection instead of the earlier
# every-4th-row slice.
filtered_models = models[models['name'] == "ColumnEnsembleForecaster"]

In [153]:
# The per-model search spaces sit under the top-level "models" key of the
# YAML file.
with open("search_space.yaml", "r") as file:
    param_space = yaml.safe_load(file)['models']

# Tune every forecaster currently selected in filtered_models.
res = perform_optuna_search(param_space)

  warn(
[I 2025-01-29 16:32:39,014] A new study created in memory with name: no-name-e3f99247-2ab2-43de-8ffc-1ebeafda5afc


IS LIST
Running OptunaSearch for ColumnEnsembleForecaster...


In [154]:
# Display the per-model search results (best forecaster, params and score).
res

{'ColumnEnsembleForecaster': {'best_forecaster': ColumnEnsembleForecaster(forecasters=[('naive', NaiveForecaster(), 0)]),
  'best_params': {'naive__strategy': 'last'},
  'best_score': 0.6715778926219356}}