# Optuna study

> Run an Optuna hyperparameter study that executes a training notebook through papermill and logs the results to wandb, for seamless hyperparameter tuning

In [15]:
import sys
sys.path.append('..')
import os
import optuna
from tsai.optuna import *
from tsai.basics import load_object
import papermill as pm
from tsai.optuna import run_optuna_study
from fastcore.basics import *
from optuna.distributions import *
from optuna.samplers import TPESampler
import wandb

In [16]:
def _learner_search_space(prefix):
    """
        Build the search space of one learner.
        Both learners are tuned over the same hyperparameters, so the entries
        are defined once here and only the key prefix changes.
        Inputs:
            prefix: name of the learner used as key prefix (e.g. 'learner1')
        Output:
            dict mapping '<prefix>.<param>' to a freshly created Optuna distribution
    """
    # NOTE(review): DiscreteUniformDistribution / IntUniformDistribution are
    # deprecated since Optuna 3.0 in favour of FloatDistribution / IntDistribution
    # — consider migrating once the installed Optuna version is confirmed.
    return {
        f"{prefix}.arch.attn_dropout": DiscreteUniformDistribution(0.0, 0.5, 0.1),
        f"{prefix}.arch.d_model": IntUniformDistribution(32, 512, 32),
        f"{prefix}.arch.d_ff": IntUniformDistribution(32, 512, 32),
        f"{prefix}.arch.decomposition": CategoricalDistribution([True, False]),
        f"{prefix}.arch.dropout": DiscreteUniformDistribution(0.0, 0.5, 0.1),
        f"{prefix}.arch.individual": CategoricalDistribution([True, False]),
        f"{prefix}.arch.n_layers": IntUniformDistribution(1, 6, 1),
        f"{prefix}.arch.n_heads": CategoricalDistribution([2, 4, 8, 16, 32]),
        f"{prefix}.init_weights": CategoricalDistribution([True, False]), # true = kaiming
        # f"{prefix}.deltaHL": FloatDistribution(1., 15., step=1.),
    }


config = AttrDict(
    study_name = 'general_study_extended', # name of the Optuna study
    study_type = 'bayesian', # 'bayesian' or 'gridsearch' or 'random'
    n_trials = 2, # number of trials
    train_nb = f'{os.getcwd()}/solfsmy_ensemble_train.ipynb', # path to the notebook to be executed
    # Insertion order is kept: lookback first, then learner1.*, then learner2.*
    search_space = {
        "lookback": CategoricalDistribution([18, 24, 36, 128, 192]),
        **_learner_search_space("learner1"),
        **_learner_search_space("learner2"),
    },
    # Add extra parameters that are fixed, but not part of the search space
    extra_params = {
        "n_epoch": 30,
        "bs": 32,
        "is_optuna_study": True
    },
    use_wandb = True, # whether the study (and its training runs) are logged to wandb
    wandb_mode = 'offline' # mode passed to wandb.init for the study run
)

config

```json
{ 'extra_params': {'bs': 32, 'is_optuna_study': True, 'n_epoch': 30},
  'n_trials': 2,
  'search_space': { 'learner1.arch.attn_dropout': DiscreteUniformDistribution(high=0.5, low=0.0, q=0.1),
                    'learner1.arch.d_ff': IntUniformDistribution(high=512, low=32, step=32),
                    'learner1.arch.d_model': IntUniformDistribution(high=512, low=32, step=32),
                    'learner1.arch.decomposition': CategoricalDistribution(choices=(True, False)),
                    'learner1.arch.dropout': DiscreteUniformDistribution(high=0.5, low=0.0, q=0.1),
                    'learner1.arch.individual': CategoricalDistribution(choices=(True, False)),
                    'learner1.arch.n_heads': CategoricalDistribution(choices=(2, 4, 8, 16, 32)),
                    'learner1.arch.n_layers': IntUniformDistribution(high=6, low=1, step=1),
                    'learner1.init_weights': CategoricalDistribution(choices=(True, False)),
                    'learner2.arch.attn_dropout': DiscreteUniformDistribution(high=0.5, low=0.0, q=0.1),
                    'learner2.arch.d_ff': IntUniformDistribution(high=512, low=32, step=32),
                    'learner2.arch.d_model': IntUniformDistribution(high=512, low=32, step=32),
                    'learner2.arch.decomposition': CategoricalDistribution(choices=(True, False)),
                    'learner2.arch.dropout': DiscreteUniformDistribution(high=0.5, low=0.0, q=0.1),
                    'learner2.arch.individual': CategoricalDistribution(choices=(True, False)),
                    'learner2.arch.n_heads': CategoricalDistribution(choices=(2, 4, 8, 16, 32)),
                    'learner2.arch.n_layers': IntUniformDistribution(high=6, low=1, step=1),
                    'learner2.init_weights': CategoricalDistribution(choices=(True, False)),
                    'lookback': CategoricalDistribution(choices=(18, 24, 36, 128, 192))},
  'study_name': 'general_study_extended',
  'study_type': 'bayesian',
  'train_nb': '/workspaces/sw-driver-forecaster/dev_nbs/solfsmy_ensemble_train.ipynb',
  'use_wandb': True,
  'wandb_mode': 'offline'}
```

In [17]:
def create_objective(train_nb, search_space, extra_params=None, use_wandb=False, skip_tags=[]):
    """
        Create objective function to be minimized by Optuna.
        Inputs:
            trial: Optuna trial object
            train_nb: path to the training notebook
            search_vars: keys of the search space to be used
            wandb_group: name of the wandb group to be used
        Output:
            valid_loss: validation loss
    """
    def objective(trial:optuna.Trial):
        # Define the parameters to be passed to the training notebook through papermill
        pm_parameters = {}
        for k,v in search_space.items():
            pm_parameters[f'config.{k}'] = trial._suggest(k, v)



        # Add the extra parameters to the dictionary. The key of every parameter 
        # must be 'config.<param_name>'
        if extra_params is not None:
            for k,v in extra_params.items():
                pm_parameters['config.' + k] = v
                
        # If using wandb, enable that in the training runs, all of them gathered
        # into a group (NOTE: The train nb must have and use these config arguments)
        if use_wandb:
            pm_parameters['config.use_wandb'] = True
            pm_parameters['config.wandb_group'] = config.study_name + '_runs'

        # Call the training notebook using papermill (don't print the output)
        stdout_file = open('tmp/pm_stdout.txt', 'w')
        stderr_file = open('tmp/pm_stderr.txt', 'w')

        pm.execute_notebook(
            train_nb,
            './tmp/pm_output.ipynb',
            parameters = pm_parameters,
            stdout_file = stdout_file,
            stderr_file = stderr_file
        )

        # Close the output files
        stdout_file.close()
        stderr_file.close()

        # Get the output value of interest from the source notebook
        loss = None
        %store -r valid_loss
        return valid_loss

    return objective

In [18]:
# Build the objective and run the study. `use_wandb` and `study_type` are read
# from `config` (instead of being hard-coded here) so that editing the config
# cell is enough to change them.
obj = create_objective(config.train_nb, config.search_space, 
                       extra_params=config.extra_params, use_wandb=config.use_wandb)
study = run_optuna_study(obj, study_type=config.study_type, direction='minimize', path='./tmp',
                 study_name=config.study_name, n_trials=config.n_trials, show_progress_bar=True)

[I 2024-08-09 13:43:03,567] A new study created in memory with name: general_study_extended
  0%|          | 0/2 [00:00<?, ?it/s]Passed unknown parameter: config.lookback
Passed unknown parameter: config.learner1.arch.attn_dropout
Passed unknown parameter: config.learner1.arch.d_model
Passed unknown parameter: config.learner1.arch.d_ff
Passed unknown parameter: config.learner1.arch.decomposition
Passed unknown parameter: config.learner1.arch.dropout
Passed unknown parameter: config.learner1.arch.individual
Passed unknown parameter: config.learner1.arch.n_layers
Passed unknown parameter: config.learner1.arch.n_heads
Passed unknown parameter: config.learner1.init_weights
Passed unknown parameter: config.learner2.arch.attn_dropout
Passed unknown parameter: config.learner2.arch.d_model
Passed unknown parameter: config.learner2.arch.d_ff
Passed unknown parameter: config.learner2.arch.decomposition
Passed unknown parameter: config.learner2.arch.dropout
Passed unknown parameter: config.learne

[I 2024-08-09 13:46:20,830] Trial 0 finished with value: 1.7801625728607178 and parameters: {'lookback': 18, 'learner1.arch.attn_dropout': 0.30000000000000004, 'learner1.arch.d_model': 128, 'learner1.arch.d_ff': 448, 'learner1.arch.decomposition': True, 'learner1.arch.dropout': 0.5, 'learner1.arch.individual': False, 'learner1.arch.n_layers': 5, 'learner1.arch.n_heads': 16, 'learner1.init_weights': True, 'learner2.arch.attn_dropout': 0.4, 'learner2.arch.d_model': 384, 'learner2.arch.d_ff': 512, 'learner2.arch.decomposition': True, 'learner2.arch.dropout': 0.2, 'learner2.arch.individual': True, 'learner2.arch.n_layers': 2, 'learner2.arch.n_heads': 8, 'learner2.init_weights': True}. Best is trial 0 with value: 1.7801625728607178.


Executing: 100%|██████████| 15/15 [02:39<00:00, 10.65s/cell]
Best trial: 1. Best value: 1.45655: 100%|██████████| 2/2 [05:57<00:00, 178.52s/it]

[I 2024-08-09 13:49:00,598] Trial 1 finished with value: 1.4565500617027283 and parameters: {'lookback': 128, 'learner1.arch.attn_dropout': 0.1, 'learner1.arch.d_model': 512, 'learner1.arch.d_ff': 320, 'learner1.arch.decomposition': False, 'learner1.arch.dropout': 0.30000000000000004, 'learner1.arch.individual': True, 'learner1.arch.n_layers': 1, 'learner1.arch.n_heads': 32, 'learner1.init_weights': True, 'learner2.arch.attn_dropout': 0.0, 'learner2.arch.d_model': 384, 'learner2.arch.d_ff': 32, 'learner2.arch.decomposition': False, 'learner2.arch.dropout': 0.1, 'learner2.arch.individual': True, 'learner2.arch.n_layers': 3, 'learner2.arch.n_heads': 4, 'learner2.init_weights': False}. Best is trial 1 with value: 1.4565500617027283.

Optuna study saved to tmp/general_study_extended.pkl
To reload the study run: study = joblib.load('tmp/general_study_extended.pkl')






Study statistics    : 
  Study name        : general_study_extended
  # finished trials : 2
  # pruned trials   : 0
  # complete trials : 2

Best trial          :
  value             : 1.4565500617027283
  best_params = {'lookback': 128, 'learner1.arch.attn_dropout': 0.1, 'learner1.arch.d_model': 512, 'learner1.arch.d_ff': 320, 'learner1.arch.decomposition': False, 'learner1.arch.dropout': 0.30000000000000004, 'learner1.arch.individual': True, 'learner1.arch.n_layers': 1, 'learner1.arch.n_heads': 32, 'learner1.init_weights': True, 'learner2.arch.attn_dropout': 0.0, 'learner2.arch.d_model': 384, 'learner2.arch.d_ff': 32, 'learner2.arch.decomposition': False, 'learner2.arch.dropout': 0.1, 'learner2.arch.individual': True, 'learner2.arch.n_layers': 3, 'learner2.arch.n_heads': 4, 'learner2.init_weights': False}



In [19]:
# Open a wandb run for the study itself only if wandb is enabled in the config;
# `run` stays None otherwise, which the following cells check for.
run = None
if config.use_wandb:
    run = wandb.init(config=config, mode=config.wandb_mode, job_type='optuna-study')

In [20]:
if run is not None:
    # Best hyperparameters together with the best value and its trial number
    best_summary = {
        **study.best_params,
        'best_value': study.best_value,
        'best_trial_number': study.best_trial.number,
    }
    run.log(best_summary)

    # Study file stored in ./tmp under the study name
    run.log_artifact(f'./tmp/{config.study_name}.pkl', type='optuna_study')

    # One Optuna figure per key, created in the order listed here
    viz = optuna.visualization
    plotters = {
        'contour': viz.plot_contour,
        'edf': viz.plot_edf,
        'intermediate_values': viz.plot_intermediate_values,
        'optimization_history': viz.plot_optimization_history,
        'parallel_coordinate' : viz.plot_parallel_coordinate,
        'param_importances': viz.plot_param_importances,
        'slice': viz.plot_slice,
    }
    run.log({key: plot(study) for key, plot in plotters.items()})

[W 2024-08-09 13:49:01,836] Param learner1.init_weights unique value length is less than 2.
[W 2024-08-09 13:49:01,839] Param learner2.arch.d_model unique value length is less than 2.
[W 2024-08-09 13:49:01,840] Param learner2.arch.individual unique value length is less than 2.
[W 2024-08-09 13:49:01,843] Param learner1.init_weights unique value length is less than 2.


[W 2024-08-09 13:49:01,847] Param learner2.arch.d_model unique value length is less than 2.
[W 2024-08-09 13:49:01,851] Param learner2.arch.individual unique value length is less than 2.
[W 2024-08-09 13:49:01,854] Param learner1.init_weights unique value length is less than 2.
[W 2024-08-09 13:49:01,856] Param learner2.arch.d_model unique value length is less than 2.
[W 2024-08-09 13:49:01,857] Param learner2.arch.individual unique value length is less than 2.
[W 2024-08-09 13:49:01,860] Param learner1.init_weights unique value length is less than 2.
[W 2024-08-09 13:49:01,862] Param learner2.arch.d_model unique value length is less than 2.
[W 2024-08-09 13:49:01,865] Param learner2.arch.individual unique value length is less than 2.
[W 2024-08-09 13:49:01,867] Param learner1.init_weights unique value length is less than 2.
[W 2024-08-09 13:49:01,869] Param learner2.arch.d_model unique value length is less than 2.
[W 2024-08-09 13:49:01,871] Param learner2.arch.individual unique value

In [21]:
# Finish the wandb run of the study. `run` is None when wandb is disabled in
# the config, in which case there is nothing to finish.
if run is not None:
    run.finish()

0,1
best_trial_number,▁
best_value,▁
learner1.arch.attn_dropout,▁
learner1.arch.d_ff,▁
learner1.arch.d_model,▁
learner1.arch.decomposition,▁
learner1.arch.dropout,▁
learner1.arch.individual,▁
learner1.arch.n_heads,▁
learner1.arch.n_layers,▁

0,1
best_trial_number,1
best_value,1.45655
learner1.arch.attn_dropout,0.1
learner1.arch.d_ff,320
learner1.arch.d_model,512
learner1.arch.decomposition,False
learner1.arch.dropout,0.3
learner1.arch.individual,True
learner1.arch.n_heads,32
learner1.arch.n_layers,1
