# Step 1: Install dependencies

# Step 2: Import libraries

Libraries used for the Bayesian Optimization Loop

In [1]:
import os
import torch
import numpy as np
import plotly
import plotly.graph_objects as go
import pandas as pd

import gymnasium as gym

import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

from hebo.design_space.design_space import DesignSpace
from hebo.optimizers.hebo import HEBO



# Step 3: Define objective function
The objective is the lower bound of the mean reward of a trained model (mean reward minus one standard deviation over the evaluation episodes)

In [2]:
def get_hyp_values(hyperparams_dataframe):
  '''
  Unpacks the learning rate and gamma from a hyperparameter configuration

          Parameters:
                  hyperparams_dataframe (pandas.DataFrame): A dataframe whose first row holds the learning rate in its first column and gamma in its second column

          Returns:
                  lr, gamma (tuple): The values found in the first and second columns of the first row
  '''
  # Only the first row is read; any further rows are ignored
  first_row = hyperparams_dataframe.values[0]
  return first_row[0], first_row[1]


def create_model(hyperparams,
                 policy='MlpPolicy',
                 env_name='LunarLander-v2'):
  '''
  Builds an untrained PPO model for the given environment and hyperparameter configuration

          Parameters:
                  hyperparams (pd.Dataframe): A dataframe of 1 row with columns learning_rate and gamma to train the model with
                  policy (str): The policy passed to PPO. Default is 'MlpPolicy'
                  env_name (str): The id of the gym environment; a vectorized environment with a single copy is created from it

          Returns:
                  model (stable_baselines3.ppo.ppo.PPO): The model to train
  '''
  learning_rate, discount_factor = get_hyp_values(hyperparams)

  # Only learning_rate and gamma are tuned; every other PPO setting is fixed
  ppo_settings = dict(policy=policy,
                      env=make_vec_env(env_name, n_envs=1),
                      learning_rate=learning_rate,
                      n_steps=1024,
                      batch_size=64,
                      n_epochs=4,
                      gamma=discount_factor,
                      gae_lambda=0.98,
                      ent_coef=0.01,
                      verbose=0)
  return PPO(**ppo_settings)


def train_model(model, timesteps=1000000):
  '''
  Runs the model's learning procedure for the requested number of timesteps

          Parameters:
                  model (stable_baselines3.ppo.ppo.PPO): The model to train (trained in place)
                  timesteps (int): The total number of timesteps passed to model.learn

          Returns:
                  None
  '''
  learn_settings = {"total_timesteps": timesteps}
  model.learn(**learn_settings)
  return None


def evaluate_model(model, 
                   rl_env_name='LunarLander-v2', 
                   n_eval_episodes=25):
  '''
  Evaluates the model for a number of episodes in a specified environment, this environment MUST be the same as the one the model has been trained in.

          Parameters:
                  model (stable_baselines3.ppo.ppo.PPO): The trained model to evaluate
                  rl_env_name (str): The name of the gym environment where the model has been trained
                  n_eval_episodes (int): The number of episodes for which the model will be evaluated to obtain a mean and standard deviation

          Returns:
                  lower_mean_reward (numpy.array): An array of shape (1,1) containing the lower mean reward (mean - std) multiplied by -1 as we are trying to maximize and hebo minimizes
  '''
  eval_env = gym.make(rl_env_name)
  try:
    mean_reward, std_reward = evaluate_policy(model,
                                              eval_env,
                                              n_eval_episodes=n_eval_episodes,
                                              deterministic=True)
  finally:
    # A new environment is created on every call, so release it even when
    # the evaluation fails; otherwise one environment leaks per evaluation
    eval_env.close()

  print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

  # Negated because the optimizer minimizes while we want to maximize the reward
  lower_mean_reward = -1 * (mean_reward - std_reward)
  lower_mean_reward_np = np.array([lower_mean_reward]).reshape(-1,1)
  return lower_mean_reward_np


def target_function(hyperparams, 
                    timesteps=1000000,
                    rl_env_name='LunarLander-v2'):
  '''
  Objective function: creates a model with the given hyperparameters, trains it and evaluates it
          Parameters:
                  hyperparams (pd.Dataframe): A dataframe of 1 row with columns learning_rate and gamma to train the model with
                  timesteps (int): The number of timesteps used to train the model
                  rl_env_name (str): The name of the gym environment used both for training and for evaluation

          Returns:
                  lower_mean_reward (numpy.array): The value returned by evaluate_model, an array of shape (1,1) containing the lower mean reward multiplied by -1 as we are trying to maximize and hebo minimizes
  '''
  trained_model = create_model(hyperparams, env_name=rl_env_name)
  train_model(trained_model, timesteps)
  return evaluate_model(trained_model, rl_env_name)


# Step 4: Define hyperparameters to tune
We will use the DesignSpace object from the HEBO library

In [3]:
# Search space: one entry per hyperparameter to tune.
# 'num' is a float parameter, 'lb' / 'ub' are its lower / upper bounds.
hyperparams = [
    dict(name='learning_rate', type='num', lb=0.0001, ub=0.1),
    dict(name='gamma', type='num', lb=0.8, ub=0.9997),
]

# Design space object built from the list above
hyperparams_space = DesignSpace().parse(hyperparams)

# Step 5: Define functions needed for the Bayesian Optimization Loop

# Step 6: Set experiments' configuration

First let us define functions to save and load checkpoints

In [4]:
import itertools
def create_experiment_df():
  '''
  Builds the empty dataframe in which the experiment history is accumulated
          Parameters:

          Returns:
                  experiment_df (pandas.DataFrame): A dataframe with no rows and one column per field recorded for each iteration
  '''
  return pd.DataFrame(columns=["method",
                               "experiment",
                               "iteration",
                               "learning_rate",
                               "gamma",
                               "reward_lower_bound",
                               "best_learning_rate",
                               "best_gamma",
                               "best_reward_lower_bound"])


def get_filepath(experiment_name):
  '''
  Builds the location of the csv file that stores an experiment's history.
          Parameters:
                  experiment_name (string): The name of the experiment (without extension)

          Returns:
                  filepath (string): The path "./experiments_results/<experiment_name>.csv"
  '''
  return "./experiments_results/{}.csv".format(experiment_name)


def update_experiment_history(method, 
                              experiment_number, 
                              iteration,
                              lr,
                              gamma,
                              reward_lower_bound,
                              best_lr,
                              best_gamma,
                              best_reward_lower_bound,
                              experiment_df,
                              experiment_name):
  '''
  Appends the outcome of one iteration to the experiment history and saves the history as a checkpoint
          Parameters:
                  method (int): 0 if Bayesian Optimization, 1 if Random Search
                  experiment_number (int): The id of the experiment
                  iteration (int): The current iteration of the experiment
                  lr (float): The learning rate value selected for this iteration
                  gamma (float): The gamma value selected for this iteration
                  reward_lower_bound (float): The reward's lower bound obtained by the model trained this iteration
                  best_lr (float): The learning rate value that has induced the best reward lower bound
                  best_gamma (float): The gamma value that has induced the best reward lower bound
                  best_reward_lower_bound (float): The best reward lower bound obtained until this iteration
                  experiment_df (pandas.DataFrame): The dataframe containing the experiments history (not modified)
                  experiment_name (string): The name of the experiment, used to locate the checkpoint file

          Returns:
                  concatenated_df (pandas.DataFrame): A new dataframe holding the previous history plus the row of this iteration
  '''
  # Column name -> value for the row describing this iteration
  record = {"method": method,
            "experiment": experiment_number,
            "iteration": iteration,
            "learning_rate": lr,
            "gamma": gamma,
            "reward_lower_bound": reward_lower_bound,
            "best_learning_rate": best_lr,
            "best_gamma": best_gamma,
            "best_reward_lower_bound": best_reward_lower_bound}

  # One-row dataframe for this iteration
  row_df = pd.DataFrame([list(record.values())], columns=list(record))

  # History followed by the new row, re-indexed from 0
  updated_df = pd.concat([experiment_df, row_df], ignore_index=True)

  # Persist after every iteration so an interrupted run can be resumed
  save_checkpoint(updated_df, experiment_name)
  return updated_df


def save_checkpoint(experiment_df,
                    experiment_name):
  '''
  Saves the experiments history dataframe as a csv file at the path given by get_filepath
          Parameters:
                  experiment_df (pandas.DataFrame): The dataframe containing the experiments history
                  experiment_name (string): The name of the experiment
          
          Returns:
                  None
  '''

  filepath = get_filepath(experiment_name)

  # to_csv does not create missing folders, so make sure the target folder
  # exists; otherwise the very first checkpoint of a fresh run fails
  directory = os.path.dirname(filepath)
  if directory:
    os.makedirs(directory, exist_ok=True)

  experiment_df.to_csv(filepath, index=False)


def load_checkpoint(experiment_name):
  '''
  Loads a checkpoint of an experiment given its name

  Relies on the module-level array experiment_configurations (its shape gives the
  number of experiments and iterations), which must be defined before calling.

          Parameters:
                  experiment_name (string): The name of the experiment
          Returns:
                  experiment_df (pandas.DataFrame): A dataframe with the experiment history
                  last_bo_experiment (int): The number of the last bayesian optimization experiment
                  last_rs_experiment (int): The number of the last random search experiment
                  last_bo_iteration (int): The iteration at which the bayesian optimization has to continue
                  last_rs_iteration (int): The iteration at which the random search has to continue
                  past_observations (pandas.Dataframe): With the learning rate and gammas of the last experiment's observations
                  past_results (numpy.array): A numpy array of shape (n,1) containing the reward_lower_bound column of the last experiment, exactly as stored in the csv. n being the number of iterations done in the experiment
                  best_init_y (float): The best_reward_lower_bound stored in the last row of the history
                  best_rs_lr: The best random search learning rate found (0 if there are no random search rows)
                  best_rs_gamma: The best random search gamma found (0 if there are no random search rows)
                  best_rs_r: The best random search reward lower bound found (0 if there are no random search rows)
                  bo_done: If the bayesian optimization has finished

          Raises:
                  ValueError: If the checkpoint file contains no iterations
  '''
  # First we retrieve the dataframe from its csv file
  filepath = get_filepath(experiment_name)
  experiment_df = pd.read_csv(filepath)

  if experiment_df.empty:
    raise ValueError(f"Checkpoint '{filepath}' has no recorded iterations to resume from")

  # The last row tells us which method / experiment / iteration was running
  last_row = experiment_df.iloc[-1]
  method = last_row["method"]
  exp = int(last_row["experiment"])
  last_iteration = int(last_row["iteration"])

  if method == 0:
    # Interrupted during bayesian optimization: random search has not started
    bo_done = False
    last_bo_experiment = exp
    last_rs_experiment = 0

    # Plus one because we want to start in the next one
    last_bo_iteration = last_iteration + 1
    last_rs_iteration = 1

  else:
    # Interrupted during random search: bayesian optimization is complete
    bo_done = True
    last_bo_experiment = experiment_configurations.shape[1]-1
    last_rs_experiment = exp

    # Plus one because we want to start in the next one
    last_bo_iteration = experiment_configurations.shape[2]-1
    last_rs_iteration = last_iteration + 1

  # Now lets get the current experiment data
  in_current_experiment = (experiment_df["method"]==method) & (experiment_df["experiment"]==exp)
  current_experiment_df = experiment_df[in_current_experiment]
  past_results = current_experiment_df.reward_lower_bound.to_numpy().reshape(-1,1)
  past_observations = current_experiment_df[["learning_rate", "gamma"]].reset_index(drop=True)

  # Best reward lower bound recorded so far for the experiment being resumed
  best_init_y = last_row["best_reward_lower_bound"]

  # Random search rows are the ones with method 1 (0 is bayesian optimization)
  rs_experiment_df = experiment_df[experiment_df["method"]==1]
  if rs_experiment_df.empty:
      best_rs_lr = 0
      best_rs_gamma = 0
      best_rs_r = 0
  else:
      last_rs_row = rs_experiment_df.iloc[-1]
      best_rs_lr = last_rs_row["best_learning_rate"]
      best_rs_gamma = last_rs_row["best_gamma"]
      best_rs_r = last_rs_row["best_reward_lower_bound"]
             
  return (experiment_df, 
         last_bo_experiment, 
         last_rs_experiment, 
         last_bo_iteration, 
         last_rs_iteration, 
         past_observations, 
         past_results, 
         best_init_y, 
         best_rs_lr, 
         best_rs_gamma, 
         best_rs_r,
         bo_done)

Now let's set the configuration for the experiments

In [5]:
import numpy as np
# The name of the file (WITHOUT EXTENSION) where the history of experiments will be saved
experiment_name = "lunar_lander_learning_rate_gamma_hebo"
# If true, this will look for the experiment history .csv of experiment_name and continue from there
continue_from_checkpoint = False

# Number of experiments per method
n_experiments = 8

# Number of iterations per experiment after the first random point being evaluated
n_iterations = 25

# Number of methods
n_methods = 2

# Index of Bayesian Optimization method
bo_method = 0

# Index of Random Search method
rs_method = 1

# Number of Hyperparameters
n_hyperparameters = 2

# Arrays containing the results and configurations of experiments
experiment_results = np.zeros((n_methods, n_experiments, n_iterations+1))
experiment_configurations = np.zeros((n_methods, n_experiments, n_iterations+1, n_hyperparameters))

# Now load checkpoint if necessary
if continue_from_checkpoint:
  (experiment_df,
   last_bo_experiment,
   last_rs_experiment,
   last_bo_iteration,
   last_rs_iteration,
   past_observations,
   past_rewards,
   best_init_y,
   best_observed_lr_rs,
   best_observed_gamma_rs,
   best_observed_result_rs,
   bo_done) = load_checkpoint(experiment_name)

  # Rebuild the optimizer and restore the observations of the interrupted experiment
  hebo_seq = HEBO(hyperparams_space, 
                  model_name = 'gpy')
  hebo_seq.X = past_observations
  # The history stores the real reward lower bound, but the optimizer minimizes
  # and was fed the reward multiplied by -1 (see evaluate_model), so the sign
  # has to be flipped back when restoring its observed values
  hebo_seq.y = -past_rewards
    
else: 
  # Fresh run: start from an empty history and write it out right away
  experiment_df = create_experiment_df()
  
  save_checkpoint(experiment_df, 
                  experiment_name)

  
  

First we try the Bayesian optimization method

In [None]:
# Bayesian optimization runs: every experiment evaluates one initial point
# (iteration 0) followed by n_iterations points suggested by HEBO, and the
# history is checkpointed after every evaluation.

# Decide where to start: at the checkpointed position or from scratch
if continue_from_checkpoint:
    init_experiment = last_bo_experiment
    init_iteration = last_bo_iteration
else:
    init_experiment = 0
    init_iteration = 1
    bo_done = False

if not bo_done:
    for e in range(init_experiment, n_experiments):
        print(f"EXPERIMENT {e}")
        # Every experiment starts fresh, except the one being resumed from a checkpoint
        if not (continue_from_checkpoint and init_experiment == e):
            # Create our optimizer
            hebo_seq = HEBO(hyperparams_space, 
                        model_name = 'gpy')


            # Do the first observation
            # NOTE(review): three candidates are requested, but target_function and
            # get_hyp_values only read the first row of the dataframe - confirm
            # whether n_suggestions=1 was intended
            new_candidates = hebo_seq.suggest(n_suggestions=3)
            hebo_seq.observe(new_candidates, target_function(new_candidates))
     
            # Initialize the observations
            # (target_function returns the reward lower bound multiplied by -1, so the minimum is the best)
            best_observed_result_bo = hebo_seq.y.min()
            best_observed_lr_bo, best_observed_gamma_bo = get_hyp_values(new_candidates)

            # Update the experiment history (iteration 0 is the initial point, which is also the best so far)
            experiment_df = update_experiment_history(bo_method,
                                                  e,
                                                  0,
                                                  best_observed_lr_bo, # The learning rate selected for this iteration
                                                  best_observed_gamma_bo, # The gamma selected for this iteration
                                                  -best_observed_result_bo, # The reward lower bound of the model (negated back, as target_function returns the reward multiplied by -1)
                                                  best_observed_lr_bo, # The best learning rate
                                                  best_observed_gamma_bo, # The best gamma
                                                  -best_observed_result_bo, # The best reward lower bound (negated back, as target_function returns the reward multiplied by -1)
                                                  experiment_df,
                                                  experiment_name)

        else:
            # Initialize the observations
            # (resumed experiment: hebo_seq was rebuilt from the checkpoint in the configuration cell)
            best_observed_result_bo = hebo_seq.y.min()
            best_observed_lr_bo = experiment_df.iloc[-1]["best_learning_rate"]
            best_observed_gamma_bo = experiment_df.iloc[-1]["best_gamma"]
        
    
        for i in range(init_iteration, n_iterations+1):
            # Obtain new candidates from HEBO
            new_candidates = hebo_seq.suggest(n_suggestions=1)
            candidate_lr_bo, candidate_gamma_bo = get_hyp_values(new_candidates)

            # Evaluate new candidates
            hebo_seq.observe(new_candidates, target_function(new_candidates))

            # A lower minimum can only come from the point just observed, so it becomes the new best
            if (hebo_seq.y.min() < best_observed_result_bo):
                best_observed_result_bo = hebo_seq.y.min()
                best_observed_lr_bo, best_observed_gamma_bo = get_hyp_values(new_candidates)

            # Show iteration info
            print(f"Number of iteration: {i}")
            print(f"  - Unnormalized learning rate: {candidate_lr_bo}")
            print(f"  - Unnormalized gamma: {candidate_gamma_bo}")
            print(f"  - Unstandardized lower reward bound: {-hebo_seq.y[-1][0]}")
            print(f"Best point performs this way: {-best_observed_result_bo}")


            # Record this iteration and checkpoint the history
            experiment_df = update_experiment_history(bo_method, 
                                                      e, 
                                                      i,
                                                      candidate_lr_bo, # The lr selected for this iteration
                                                      candidate_gamma_bo, # The gamma selected for this iteration
                                                      -hebo_seq.y[-1][0], # The reward lower bound of the model (negated back, as target_function returns the reward multiplied by -1)
                                                      best_observed_lr_bo, # The best learning rate
                                                      best_observed_gamma_bo, # The best gamma
                                                      -best_observed_result_bo, # The best reward lower bound (negated back, as target_function returns the reward multiplied by -1)
                                                      experiment_df,
                                                      experiment_name)
        
        # Only the resumed experiment starts mid-way; the following ones start at iteration 1
        init_iteration = 1

EXPERIMENT 0
