# Step 1: Install dependencies

# Step 2: Import libraries

Libraries used for the Bayesian Optimization Loop

In [1]:
import os
import torch
import numpy as np
import plotly
import plotly.graph_objects as go
import pandas as pd

import gymnasium as gym

import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

import botorch
from botorch.utils.transforms import standardize, normalize, unnormalize

Libraries used to save checkpoints in GDrive

# Step 3: Define objetive function
This will be the lower bound of the mean reward of a trained model 

In [2]:
def get_hyp_values(hyperparams_tensor):
  '''
  Returns a tuple of values from a tensor containing a hyperparameter configuration

          Parameters:
                  hyperparams_tensor (torch.DoubleTensor): A tensor of size 1xn (1 row, n columns) with n being the number of hyperparameters to tune
          
          Returns:
                  hyperparams_tuple (tuple): A tuple with the unpacked values of the hyperparams_tensor 

  '''
  hyperparams_list = [hyperparams_tensor[i].item() for i in range(len(hyperparams_tensor))]
  hyperparams_tuple = tuple(hyperparams_list)
  return tuple(hyperparams_list)


def create_model(hyperparams,
                 policy='MlpPolicy',
                 env_name='LunarLander-v2'):
  '''
  Returns a PPO model given a policy, environment, and hyperparameters of PPO

          Parameters:
                  hyperparams (torch.DoubleTensor): A tensor of size 1x2 (1 row, n columns) with the learning rate and gamma to train the model with
                  policy (str): The NN to train with PPO in the environment. Default is 'MlpPolicy'
                  env (stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv): Specifies the gym environment to use for the training

          Returns:
                  model (stable_baselines3.ppo.ppo.PPO): The model to train
  '''
  lr, gamma  = get_hyp_values(hyperparams)
  env = make_vec_env(env_name, n_envs=1)
  model = PPO(policy = policy,
              env = env,
              learning_rate = lr,
              n_steps = 1024,
              batch_size = 64,
              n_epochs = 4,
              gamma = gamma,
              gae_lambda = 0.98,
              ent_coef = 0.01,
              verbose=0)
  
  return model


def train_model(model, timesteps=1000000):
  '''
  Trains a PPO model during a number of timesteps
          
          Parameters:
                  model (stable_baselines3.ppo.ppo.PPO): The model to train
                  timesteps (int): The number of timesteps used to train the model

          Returns:
                  None
  '''
  model.learn(total_timesteps=timesteps)
  return


def evaluate_model(model, 
                   rl_env_name='LunarLander-v2', 
                   n_eval_episodes=25):
  '''
  Evaluates the model for a number of episodes in a specified environment, this environment MUST be the same as the one the model has been trained in.

          Parameters:
                  model (stable_baselines3.ppo.ppo.PPO): The model to train
                  rl_env_name (str): The name of the gym environment where the model has been trained
                  n_eval_episodes (int): The number of episodes for which the model will be evaluated to obtain a mean and standard deviation

          Returns:
                  lower_mean_reward (float): A tensor of size 1x1 (1 row, 1 column) containing the mean_reward
  '''
  eval_env = gym.make(rl_env_name)
  mean_reward, std_reward = evaluate_policy(model, 
                                            eval_env, 
                                            n_eval_episodes=n_eval_episodes, 
                                            deterministic=True)
  
  print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
  lower_mean_reward = mean_reward - std_reward
  return lower_mean_reward


def target_function(hyperparams, 
                    timesteps=1000000,
                    rl_env_name='LunarLander-v2'):
  '''
  Given a hyperparameter configuration, evaluates their performance
          Parameters:
                  hyperparams (torch.DoubleTensor): A tensor of size 1x2 (1 row, n columns) with the learning rate and gamma to train the model with
                  timesteps (int): timesteps (int): The number of timesteps used to train the model
                  rl_env_name (str): The name of the gym environment where the model has been trained

          Returns:
                  lower_mean_reward (float): A tensor of size 1x1 (1 row, 1 column) containing the mean_reward
  '''
  model = create_model(hyperparams, env_name=rl_env_name)
  
  train_model(model, 
              timesteps)
  
  lower_mean_reward = evaluate_model(model, 
                                     rl_env_name)
  
  return lower_mean_reward


# Step 4: Define hyperparameters to tune
First define the bounds

In [3]:
lr = 0.05
gamma = 0.85
# Define here the list of parameters to tune
hyperparams_list = [lr, gamma]
# Define the lower bounds of the parameters
lower_bounds = [0.0001, 0.8]
# Define the upper bounds of the parameters
upper_bounds = [0.1, 0.9997]

Then convert lists to tensors

In [4]:
# Create tensors with the hyperparameters configurations and bounds for BOTorch to use
hyperparams_tensor = torch.DoubleTensor([hyperparams_list])
bounds_tensor = torch.DoubleTensor([lower_bounds, upper_bounds])

# Step 5: Define functions needed for the Bayesian Optimization Loop

In [5]:
def generate_initial_data(bounds, 
                          n=3):
  '''
  Gets n values of the hyperparameter's bounded space and evaluates them
          Parameters:
                bounds (torch.DoubleTensor): The torch tensor containing the upper and lower bounds of the hyperparameters (lr and gamma in this case)
                n (int): The number of initial points to get. Default is 3
          
          Returns:
                train_x (torch.DoubleTensor): A tensor of size (n, 1) (n rows and 1 column) with the initial points
                exact_obj (torch.DoubleTensor): A tensor of size (n, 1) (n rows and 1 column) containing the evaluation of the model with the sampled hyperparameters values
                best_observed_vale: The best evaluation of the hyperparameters
  '''
  # Create our initial hyperparameter values
  lower_bounds = bounds[0]
  upper_bounds = bounds[1]
  train_x = torch.rand(n, len(lower_bounds), dtype=torch.double) * (upper_bounds - lower_bounds) + lower_bounds

  # Evaluate them and store them in a torch.Tensor
  exact_obj = torch.tensor([[target_function(hyp)] for hyp in train_x])

  # Get the best observed value
  best_observed_value = exact_obj.max().item()
  
  return train_x, exact_obj, best_observed_value

In [6]:
from botorch.acquisition.analytic import ExpectedImprovement, UpperConfidenceBound
from botorch.optim import optimize_acqf
from botorch.utils.transforms import standardize, normalize, unnormalize
from botorch.models import SingleTaskGP
from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood
from botorch import fit_gpytorch_model
  

def get_next_points(init_x,
                    init_y,
                    best_init_y,
                    normalized_bounds,
                    n_points=1):
  '''
  Function that computes the next point to add to the Gaussian Process and visualizes the acquisition function and function distribution
          Parameters:
                  init_x (torch.Tensor): A tensor of shape {iterations}x2 containing the previous hyperparameters
                  init_y (torch.Tensor): A tensor of shape {iterations}x1 containing the previous rewards of the models trained with the init_x hyperparameters values
                  best_init_y (float): Best reward obtained until the moment
                  normalized_bounds (torch.Tensor): Normalized bounds of the hyperparameter values in the form of tensors of shape 2x1 (2 rows, 1 column), first row containing lower bound, second containing upper bound
                  n_points (int): Number of candidates to obtain for the next iteration. Default is 1

          Returns:
                  candidates (torch.Tensor): A tensor of shape 1x2 containing the value of the hyperparameters that optimizes the acquisition function
  '''
  # Create our probabilistic model with the points
  single_model = SingleTaskGP(init_x, init_y)
  mll = ExactMarginalLogLikelihood(single_model.likelihood, 
                                   single_model)
  # Fit our model
  fit_gpytorch_model(mll)

  # Instantiate the acquisition function given our model
  UCB = UpperConfidenceBound(model=single_model,
                             beta=0.2,
                             maximize=True)
  
  # Maximize the acquisition function to obtain our candidates 
  candidates, _ = optimize_acqf(acq_function=UCB, 
                                bounds=normalized_bounds,
                                q=n_points, num_restarts=200,
                                raw_samples=512,
                                options={"batch_limit": 5, "maxiter": 200})

  return candidates

# Step 6: Set experiments' configuration

First let us define functions to save and load checkpoints

In [7]:
import itertools
def create_experiment_df():
  '''
  Creates an empty dataframe to save checkpoints
          Parameters:

          Returns:
                  experiment_df (pandas.DataFrame): An empty Dataframe with columns specified below that will be used to save the experiment history
  '''
  columns = ["method",
             "experiment",
             "iteration",
             "learning_rate",
             "gamma",
             "reward_lower_bound",
             "best_learning_rate",
             "best_gamma",
             "best_reward_lower_bound"
             ]

  experiment_df = pd.DataFrame(columns=columns)
  return experiment_df


def get_filepath(experiment_name):
  '''
  Returns the path of the csv of the experiment specified.
          Parameters:
                  experiment_name (string): The name of the experiment

          Returns:
                  filepath (string): The path to the .csv file that has the data of the experiment
  '''
  filepath = f"./experiments_results/{experiment_name}.csv"
  return filepath


def update_experiment_history(method, 
                              experiment_number, 
                              iteration,
                              lr,
                              gamma,
                              reward_lower_bound,
                              best_lr,
                              best_gamma,
                              best_reward_lower_bound,
                              experiment_df,
                              experiment_name):
  '''
  Updates the experiment dataframe and saves it in GDrive
          Parameters:
                  method (int): 0 if Bayesian Optimization, 1 if Random Search
                  experiment_number (int): The id of the experiment
                  iteration (int): The current iteration of the experiment
                  lr (float): The learning rate value selected for this iteration
                  gamma (float): The gamma value selected for this iteration
                  reward_lower_bound (float): The reward's lower bound obtained by the model trained this iteration
                  best_lr (float): The learning rate value that has induced the best reward lower bound
                  best_gamma (float): The gamma value that has induced the best reward lower bound
                  best_reward_lower_bound (float): The best reward lower bound obtained until this iteration
                  experiment_df (pandas.DataFrame): The dataframe containing the experiments history
                  experiment_name (string): The name of the experiment

          Returns:
                  concatenated_df (pandas.DataFrame): The updated experiments history dataframe
  '''
  # Define the columns of the dataframe
  columns = ["method",
             "experiment",
             "iteration",
             "learning_rate",
             "gamma",
             "reward_lower_bound",
             "best_learning_rate",
             "best_gamma",
             "best_reward_lower_bound"]

  # Set the values of the columns given by the iteration configuration and result
  iteration_results = [[method,
                        experiment_number,
                        iteration,
                        lr,
                        gamma,
                        reward_lower_bound,
                        best_lr,
                        best_gamma,
                        best_reward_lower_bound]]

  # Create a one row dataframe for this experiment
  new_iteration_df = pd.DataFrame(iteration_results, columns=columns)

  # Concatenate the experiments history with this experiment's results
  concatenated_df = pd.concat([experiment_df, new_iteration_df], ignore_index=True)

  # Save the updated history in google drive
  save_checkpoint(concatenated_df,
                  experiment_name)
  
  # Return the concatenated dataframe representing the updated experiment history
  return concatenated_df


def save_checkpoint(experiment_df,
                    experiment_name):
  '''
  Saves the experiments history dataframe in google drive
          Parameters:
                  experiment_df (pandas.DataFrame): The dataframe containing the experiments history
                  experiment_name (string): The name of the experiment
          
          Returns:
                  None
  '''

  filepath = get_filepath(experiment_name)
  experiment_df.to_csv(filepath, index=False)


def load_checkpoint(experiment_name,
                    experiment_results,
                    experiment_configurations):
  '''
  Loads a checkpoint of an experiment given its name

          Parameters:
                  experiment_name (string): The name of the experiment
                  experiment_results (numpy.array): A numpy array of three dimensions (method, iteration, best_result)
                  expeirment_configuration (numpy.array): A numpy array of three dimensions (method, iteration, best_learning rate)

          Returns:
                  experiment_df (pandas.DataFrame): A dataframe with the experiment history
  '''
  # First we retrieve the dataframe from GDrive
  filepath = get_filepath(experiment_name)
  experiment_df = pd.read_csv(filepath)

  # Now we iterate through the rows of the dataframe to update the experiment history numpy arrays that we will use later to compare the methods and plot results
  for index, row in experiment_df.iterrows():
      # Unpack the columns
      method, exp, iter, lr, gamma, rlb, best_lr, best_gamma, best_rlb = row.values
      # Add them to the experiments arrays
      experiment_results[int(method)][int(exp)][int(iter)] = best_rlb
      experiment_configurations[int(method)][int(exp)][int(iter)][0] = best_lr
      experiment_configurations[int(method)][int(exp)][int(iter)][1] = best_gamma
  

  method, exp, iter, lr, gamma, rlb, best_lr, best_gamma, best_rlb = experiment_df.iloc[-1]

  if method == 0:
    bo_done = False
    last_bo_experiment = int(exp)
    last_rs_experiment = 0

    # Plus one because we want to start in the next one
    last_bo_iteration = int(iter)+1
    last_rs_iteration = 1

  else:
    bo_done = True
    last_bo_experiment = experiment_configurations.shape[1]-1
    last_rs_experiment = int(exp)

    # Plus one because we want to start in the next one
    last_bo_iteration = experiment_configurations.shape[2]-1
    last_rs_iteration = int(iter)+1

  # Now lets get the initial data
  bo_experiment_df = experiment_df[(experiment_df["method"]==method) & (experiment_df["experiment"]==int(exp))]

  init_x = torch.DoubleTensor([[float(lr), float(gamma)] for (lr, gamma) in zip(bo_experiment_df.learning_rate.values, bo_experiment_df.gamma.values)])

  init_y = torch.DoubleTensor([[float(reward)] for reward in bo_experiment_df.reward_lower_bound.values])
  best_init_y = init_y.max().item()

  rs_experiment_df = experiment_df[experiment_df["method"]==0]
  if rs_experiment_df.empty:
      best_rs_lr = 0
      best_rs_gamma = 0
      best_rs_r = 0
  else:
      best_rs_lr = rs_experiment_df.iloc[-1]["best_learning_rate"]
      best_rs_gamma = rs_experiment_df.iloc[-1]["best_gamma"]
      best_rs_r = rs_experiment_df.iloc[-1]["best_reward_lower_bound"]
             
  return experiment_df, last_bo_experiment, last_rs_experiment, last_bo_iteration, last_rs_iteration, init_x, init_y, best_init_y, best_rs_lr, best_rs_gamma, best_rs_r, bo_done

Now let's set the configuration for the experiments

In [11]:
import numpy as np
# The name of the file (WITHOUT EXTENSION) where the history of experiments will be saved
experiment_name = "lunar_lander_learning_rate_gamma"
# If true, this will look for the experiment history .csv in google drive and continue from there
continue_from_checkpoint = False

# Number of experiments per method
n_experiments = 15

# Number of iterations per experiment after the first random point being evaluated
n_iterations = 25

# Number of methods
n_methods = 2

# Index of Bayesian Optimization method
bo_method = 0

# Index of Random Search method
rs_method = 1

# Number of Hyperparameters
n_hyperparameters = 2

# Arrays containing the results and configurations of experiments
experiment_results = np.zeros((n_methods, n_experiments, n_iterations+1))
experiment_configurations = np.zeros((n_methods, n_experiments, n_iterations+1, len(bounds_tensor[0])))

# Now load checkpoint if necessary
if continue_from_checkpoint:
  experiment_df, last_bo_experiment, last_rs_experiment, last_bo_iteration, last_rs_iteration, init_x, init_y, best_init_y, best_observed_lr_rs, best_observed_gamma_rs, best_observed_result_rs, bo_done = load_checkpoint(experiment_name,
                                            experiment_results,
                                            experiment_configurations) 
else: 
  experiment_df = create_experiment_df()
  
  save_checkpoint(experiment_df, 
                  experiment_name)

  
  

First we try the bayesian optimization method

In [9]:
if continue_from_checkpoint:
  init_experiment = last_bo_experiment
  init_iteration = last_bo_iteration
else:
  init_experiment = 0
  init_iteration = 1
  bo_done = False

if not bo_done:
  for e in range(init_experiment, n_experiments):
    print(f"EXPERIMENT {e}")
    if not (continue_from_checkpoint and init_experiment == e):
      # Sample initial hyperparameter values and evaluate the models obtained with them
      init_x, init_y, best_init_y = generate_initial_data(bounds_tensor,
                                                          1)

    # We normalize the bounds of the hyperparameters as BOTorch assumes this
    normalized_bounds = torch.tensor([np.zeros(len(bounds_tensor[0])), np.ones(len(bounds_tensor[0]))])

    # Normalize the hyperparameter as BOTorch assumes this
    init_x_normalized = normalize(init_x,
                                  bounds=bounds_tensor)

    # Standardize the objective as BOTorch assumes this
    init_y_standardized = standardize(init_y)

    # Obtain the best result among the initial random experiments
    best_init_y_standardized = init_y_standardized.max().item()

    best_observed_result_bo = best_init_y
    best_observed_lr_bo, best_observed_gamma_bo = get_hyp_values(init_x[0])

    if not (continue_from_checkpoint and init_experiment == e):
      experiment_df = update_experiment_history(bo_method, 
                                                e, 
                                                0,
                                                best_observed_lr_bo, # The learning rate selected for this iteration
                                                best_observed_gamma_bo, # The gamma selected for this iteration
                                                best_observed_result_bo, # The reward lower bound of the model
                                                best_observed_lr_bo, # The best learning rate
                                                best_observed_gamma_bo, # The best gamma
                                                best_observed_result_bo, # The reward lower bound
                                                experiment_df,
                                                experiment_name)
      
      experiment_configurations[rs_method,e,0,0] = best_observed_lr_bo
      experiment_configurations[rs_method,e,0,1] = best_observed_gamma_bo
      experiment_results[rs_method,e,0] = best_observed_result_bo

    for i in range(init_iteration, n_iterations+1):
      # Get the next points given our actual queries
      normalized_new_candidates = get_next_points(init_x_normalized,
                                                  init_y_standardized, 
                                                  best_init_y_standardized, 
                                                  normalized_bounds,
                                                  n_points=1)
    
      # Unnormalize the candidate hyperparameter value
      new_candidates = unnormalize(normalized_new_candidates,
                                   bounds=bounds_tensor)
      
      # Compute the performance of the model
      new_results = torch.tensor([[target_function(new_candidates[0])]])

      # Update our hyperparameters and rewards history
      init_x = torch.cat([init_x, new_candidates])
      init_y = torch.cat([init_y, new_results])

      # Normalize our updated hyperparameters and rewards history
      init_x_normalized = normalize(init_x, bounds=bounds_tensor)
      init_y_standardized = standardize(init_y)

      # Update the best reward
      best_init_y = init_y.max().item()
      best_init_y_standardized = init_y_standardized.max().item()
      
      # Show iteration info
      
      print(f"Number of iteration: {i}")
      print(f"  - Unnormalized learning rate: {new_candidates[0][0].item()}")
      print(f"  - Normalized learning rate: {normalized_new_candidates[0][0].item()}")
      print(f"  - Unnormalized gamma: {new_candidates[0][1].item()}")
      print(f"  - Normalized gamma: {normalized_new_candidates[0][1].item()}")
      print(f"  - Unstandardized lower reward bound: {new_results.item()}")
      print(f"  - Standardized lower reward bound: {init_y_standardized[-1].item()}")
      print(f"Best point performs this way: {best_init_y}")


      if best_observed_result_bo < new_results[0][0].item():
        best_observed_result_bo = new_results[0][0].item()
        best_observed_lr_bo, best_observed_gamma_bo = get_hyp_values(new_candidates[0])

      experiment_df = update_experiment_history(bo_method, 
                                                e, 
                                                i,
                                                new_candidates[0][0].item(), # The lr selected for this iteration
                                                new_candidates[0][1].item(), # The gamma selected for this iteration
                                                new_results[0][0].item(), # The reward lower bound of the model
                                                best_observed_lr_bo, # The best learning rate
                                                best_observed_gamma_bo, # The best gamma
                                                best_observed_result_bo, # The reward lower bound
                                                experiment_df,
                                                experiment_name)

      experiment_configurations[bo_method,e,i,0] = best_observed_lr_bo
      experiment_configurations[bo_method,e,i,1] = best_observed_gamma_bo
      experiment_results[bo_method,e,i] = best_observed_result_bo
      print('----------------------')
    init_iteration = 1

EXPERIMENT 0




mean_reward=-851.87 +/- 521.1562181712144


  normalized_bounds = torch.tensor([np.zeros(len(bounds_tensor[0])), np.ones(len(bounds_tensor[0]))])


mean_reward=-343.31 +/- 96.31171471689615
Number of iteration: 1
  - Unnormalized learning rate: 0.0001
  - Normalized learning rate: 0.0
  - Unnormalized gamma: 0.9997
  - Normalized gamma: 1.0
  - Unstandardized lower reward bound: -439.6173378644187
  - Standardized lower reward bound: 0.7071067811865476
Best point performs this way: -439.6173378644187
----------------------


KeyboardInterrupt: 

Now we perform a random search

In [12]:
if continue_from_checkpoint:
  init_experiment = last_rs_experiment
  init_iteration = last_rs_iteration
else:
  init_experiment = 0
  init_iteration = 1

for e in range(init_experiment, n_experiments):
  if not (continue_from_checkpoint and init_experiment == e) or (init_experiment==0 and init_iteration==1):
    # Initiate with a random value
    random_value = torch.rand(1, len(bounds_tensor[0])) * (bounds_tensor[1] - bounds_tensor[0]) + bounds_tensor[0]
    best_observed_result_rs = target_function(random_value[0])
    best_observed_lr_rs, best_observed_gamma_rs = get_hyp_values(random_value[0])
    # Update our experiments histories
    experiment_df = update_experiment_history(rs_method, 
                                              e, 
                                              0,
                                              best_observed_lr_rs, # The learning rate value selected for this iteration
                                              best_observed_gamma_rs, # The gamma value selected for this iteration
                                              best_observed_result_rs, # The reward lower bound of the model
                                              best_observed_lr_rs, # The best learning rate
                                              best_observed_gamma_rs, # The best gamma
                                              best_observed_result_rs, # The reward lower bound
                                              experiment_df,
                                              experiment_name)
  
  # Iterate with random search
  for i in range(init_iteration, n_iterations+1):
    # Get a new random value for the hyperparameter
    random_value = torch.rand(1, len(bounds_tensor[0])) * (bounds_tensor[1] - bounds_tensor[0]) + bounds_tensor[0]
    # Evaluate the model with that hyperparameter value
    rs_obj_fun_result = target_function(random_value[0])

    # Update best reward and candidate found if necessary
    if best_observed_result_rs < rs_obj_fun_result:
      best_observed_result_rs = rs_obj_fun_result
      best_observed_lr_rs, best_observed_gamma_rs = get_hyp_values(random_value[0])
    
    # Update our experiments histories
    experiment_df = update_experiment_history(rs_method, 
                                              e, 
                                              i,
                                              best_observed_lr_rs, # The learning rate value selected for this iteration
                                              best_observed_gamma_rs, # The gamma value selected for this iteration
                                              best_observed_result_rs, # The reward lower bound of the model
                                              best_observed_lr_rs, # The best learning rate
                                              best_observed_gamma_rs, # The best gamma
                                              best_observed_result_rs, # The reward lower bound
                                              experiment_df,
                                              experiment_name)
    
    experiment_configurations[rs_method,e,i,0] = best_observed_lr_rs
    experiment_configurations[rs_method,e,i,1] = best_observed_gamma_rs
    experiment_results[rs_method,e,i] = best_observed_result_rs

  init_iteration = 1



mean_reward=-263.70 +/- 204.69715022061874
mean_reward=-844.08 +/- 457.2561413948255
mean_reward=-507.06 +/- 58.7860923011357
mean_reward=-169.76 +/- 145.36366431745546
mean_reward=-1203.63 +/- 589.8601764455119


KeyboardInterrupt: 

# Step 7: Compare the results

First we give the recommendation as the best observed result

In [None]:
best_observed_result = np.max(experiment_results)
index_set = np.where(experiment_results==best_observed_result)
print("The best observed result is: " + str(best_observed_result))
print("The best observed result belong to the : " + str(index_set[0][0]) + " method. Its value is " + str(experiment_configurations[index_set][0]))

And now we plot the results to compare both methods

In [None]:
x = np.linspace(1, n_iterations, n_iterations).astype(int)
mean_bo = np.mean(experiment_results[0,:,:], axis=0)
mean_rs = np.mean(experiment_results[1,:,:], axis=0)
std_bo = np.std(experiment_results[0,:,:], axis=0) * 0.25
std_rs = np.std(experiment_results[1,:,:], axis=0) * 0.25
bo_ub_results = go.Scatter(x=x, y=mean_bo + std_bo, mode='lines', name="", line_color="green", line_width=0.1)
bo_results = go.Scatter(x=x, y=mean_bo, mode='lines', fill='tonexty', line_color="green", name="Bayesian Optimization")
bo_lb_results = go.Scatter(x=x, y=mean_bo - std_bo, mode='lines', fill='tonexty', name="", line_color="green", line_width=0.1)

rs_ub_results = go.Scatter(x=x, y=mean_rs + std_rs, mode='lines', name="", line_color="red", line_width=0.1)
rs_results = go.Scatter(x=x, y=mean_rs, mode='lines', fill='tonexty', line_color="red", name="Random Search")
rs_lb_results = go.Scatter(x=x, y=mean_rs - std_rs, mode='lines', fill='tonexty', name="", line_color="red", line_width=0.1)
  
fig = go.Figure()
fig.add_trace(bo_ub_results)
fig.add_trace(bo_results)
fig.add_trace(bo_lb_results)
fig.add_trace(rs_ub_results)
fig.add_trace(rs_results)
fig.add_trace(rs_lb_results)
fig.update_layout(title="Performance comparison between BO and RS", xaxis_title="Iterations", yaxis_title="Reward lower bound")
fig.show()