## Step 1: Install dependencies

In [None]:
!pip install importlib-metadata==4.12.0 # To overcome an issue with importlib-metadata https://stackoverflow.com/questions/73929564/entrypoints-object-has-no-attribute-get-digital-ocean
!pip install gym[box2d]
!pip install stable-baselines3[extra]
!pip install pyglet==1.5.1
!pip install ale-py==0.7.4 # To overcome an issue with gym (https://github.com/DLR-RM/stable-baselines3/issues/875)
!pip install botorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting importlib-metadata==4.12.0
  Downloading importlib_metadata-4.12.0-py3-none-any.whl (21 kB)
Installing collected packages: importlib-metadata
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 6.0.0
    Uninstalling importlib-metadata-6.0.0:
      Successfully uninstalled importlib-metadata-6.0.0
Successfully installed importlib-metadata-4.12.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swig==4.*
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Step 2: Import libraries

In [None]:
import os
import torch
import numpy as np
import plotly

import gym

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

## Step 3: Define hyperparameters

In [None]:
# Gym environment id; passed to make_vec_env to build the training environment.
rl_env_name = 'LunarLander-v2'

In [None]:
# Policy identifier handed to PPO by create_model.
policy = 'MlpPolicy'
# Vectorized training environment built from rl_env_name; reused for every model.
env = make_vec_env(rl_env_name)
# Initial PPO hyperparameter values (the first point given to the optimizer).
n_steps = 1024
batch_size = 64
n_epochs = 4
gamma = 0.999
gae_lambda = 0.98
ent_coef = 0.01

# The six values in the order in which create_model unpacks them.
hyperparams_list = [n_steps, batch_size, n_epochs, gamma, gae_lambda, ent_coef]
# Search-space limits, one entry per hyperparameter, in the same order as hyperparams_list.
lower_bounds = [100, 8, 2, 0.8, 0.8, 0.001]
upper_bounds = [4000, 256, 10, 0.999, 0.999, 0.05]

Convert lists to tensors

In [None]:
# The initial point becomes a single-row float64 tensor (one row per evaluated
# configuration); the bounds are stacked as two rows: lower first, then upper.
hyperparams_tensor = torch.tensor([hyperparams_list], dtype=torch.float64)
bounds_tensor = torch.tensor([lower_bounds, upper_bounds], dtype=torch.float64)

## Step 4: Create initial results

In [None]:
def get_hyp_values(hyperparams_tensor):
  """Return the first row of `hyperparams_tensor` as a tuple of Python numbers.

  Only row 0 is read; any additional rows are ignored.
  """
  first_row = hyperparams_tensor[0]
  return tuple(entry.item() for entry in first_row)


def create_model(policy,
                 env,
                 hyperparams):
  """Build an untrained PPO model from a tensor of hyperparameters.

  Args:
    policy: policy identifier forwarded to PPO (e.g. 'MlpPolicy').
    env: environment forwarded to PPO.
    hyperparams: tensor whose first row holds, in order, n_steps, batch_size,
      n_epochs, gamma, gae_lambda and ent_coef.

  Returns:
    A new PPO instance configured with those values.
  """
  n_steps, batch_size, n_epochs, gamma, gae_lambda, ent_coef = get_hyp_values(hyperparams)

  # The acquisition optimizer proposes continuous values, but these three are
  # counts: round to the nearest integer and keep them positive.
  n_steps = max(1, int(round(n_steps)))
  batch_size = max(1, int(round(batch_size)))
  n_epochs = max(1, int(round(n_epochs)))

  # Use the unpacked values; hard-coding them here would make every candidate
  # train the exact same configuration.
  model = PPO(policy=policy,
              env=env,
              n_steps=n_steps,
              batch_size=batch_size,
              n_epochs=n_epochs,
              gamma=gamma,
              gae_lambda=gae_lambda,
              ent_coef=ent_coef,
              verbose=0)

  return model

def train_model(model, total_timesteps=10000):
  """Train `model` by calling its `learn` method.

  Args:
    model: object exposing `learn(total_timesteps=...)`.
    total_timesteps: training budget forwarded to `learn`. Defaults to 10000,
      the value that was previously hard-coded.

  Returns:
    None. The value returned by `learn` is discarded.
  """
  model.learn(total_timesteps=total_timesteps)

def evaluate_model(model, env_name="LunarLander-v2", n_eval_episodes=10):
  """Evaluate `model` on a fresh environment and return its mean reward.

  Args:
    model: trained model passed to `evaluate_policy`.
    env_name: Gym environment id used to create the evaluation environment.
    n_eval_episodes: number of evaluation episodes.

  Returns:
    A 1 x 1 double tensor containing the mean episode reward.
  """
  eval_env = gym.make(env_name)
  try:
    mean_reward, std_reward = evaluate_policy(model,
                                              eval_env,
                                              n_eval_episodes=n_eval_episodes,
                                              deterministic=True)
  finally:
    # A new environment is created on every call; release it even when
    # evaluation fails so environments do not pile up across iterations.
    eval_env.close()

  print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
  return torch.DoubleTensor([[mean_reward]])


In [None]:
# Build the first PPO model from the initial hyperparameter tensor.
model = create_model(policy, 
                     env, 
                     hyperparams_tensor)

Train the model for the first time

In [None]:
# Train the initial model; train_model calls model.learn on it and returns nothing.
train_model(model)

Evaluate the model, create the rewards tensor (init_y) and get the best reward

In [None]:
# Score the initial configuration; the returned 1 x 1 tensor is the first
# training target for the Gaussian process.
rewards_tensor = evaluate_model(model)
# Reward is being maximized, so the incumbent is the largest value seen so far
# (the optimization loop tracks the maximum as well).
best_reward = rewards_tensor.max().item()

## Step 5: Use Gaussian Process with initial data

We now choose the surrogate model and the likelihood. In our case we use a classic (exact) Gaussian process and fit its hyperparameters by maximizing the exact marginal log likelihood (which can overfit when only a few points are available).

In [None]:
from botorch.models import SingleTaskGP, ModelListGP
from gpytorch.mlls import LeaveOneOutPseudoLikelihood, ExactMarginalLogLikelihood
from botorch.models.transforms.outcome import Standardize

# GP surrogate mapping hyperparameters (inputs) to mean reward (targets);
# Standardize(m=1) is applied as the outcome transform for the single output.
single_model = SingleTaskGP(hyperparams_tensor, 
                            rewards_tensor, 
                            outcome_transform=Standardize(m=1))

# Objective used to fit the GP: exact marginal log likelihood of the data.
mll = ExactMarginalLogLikelihood(single_model.likelihood, 
                                 single_model)

Now that our model is declared, we fit the previous points with the Gaussian process setting its hyperparameters via Exact Marginal log likelihood of the points. The output shows the default covariance function used by the GP and its hyper-hyperparameters. It also shows the Gaussian likelihood used and the homoskedastic noise added to the Matern Kernel to capture the noise of the data. 

In [None]:
# NOTE(review): newer BoTorch releases offer fit_gpytorch_mll as the replacement
# for fit_gpytorch_model — confirm against the installed version. The import is
# kept because get_next_hyperparameters also relies on this name.
from botorch import fit_gpytorch_model
# Fit the GP by optimizing the marginal log likelihood built from single_model.
fit_gpytorch_model(mll)

Now we declare the acquisition function that is going to be computed using the predictive distribution of the previous Gaussian process in all the input space. We will use the upper confidence bound.

In [None]:
from botorch.acquisition.analytic import UpperConfidenceBound #use the noisy version if the problem has noise

# The objective is the mean reward, which we want to maximize, so the
# acquisition function must be maximized too (maximize=False would steer the
# search toward low-reward regions).
UCB = UpperConfidenceBound(model=single_model,
                           beta=0.1,
                           maximize=True)

We will now optimize the acquisition function. The settings used here (number of restarts, raw samples, batch limit and maximum iterations) are reasonable heuristic defaults for finding the global optimum of the acquisition function.

In [None]:
from botorch.optim import optimize_acqf

# Optimize the acquisition function inside bounds_tensor and request a single
# candidate (q=1); the second return value is discarded.
candidates, _ = optimize_acqf(acq_function=UCB, 
                              bounds=bounds_tensor, 
                              q=1, 
                              num_restarts=200,
                              raw_samples=512, 
                              options={"batch_limit": 5, "maxiter": 200})
# Display the proposed candidate as the cell output.
candidates

We now have all the code for one iteration, so we can put it in a loop. To do so, we simply wrap the previous code in a function.



In [None]:
def get_next_hyperparameters(hyperparams_tensor,
                             rewards_tensor,
                             best_reward,
                             bounds_tensor,
                             n_points=1,
                             noise=np.float64(0.07)
                             ):
  """Propose the next hyperparameter candidates via GP-based Bayesian optimization.

  A SingleTaskGP is fit to the observations collected so far, and an Upper
  Confidence Bound acquisition function (maximized, since reward is maximized)
  is optimized inside `bounds_tensor`.

  Args:
    hyperparams_tensor: evaluated configurations, one row per configuration.
    rewards_tensor: observed rewards, one row per configuration.
    best_reward: currently unused; kept for interface compatibility.
    bounds_tensor: two-row tensor of lower (row 0) and upper (row 1) bounds.
    n_points: number of candidates requested from the optimizer (`q`).
    noise: currently unused; kept for interface compatibility.

  Returns:
    The candidates tensor returned by `optimize_acqf`.
  """
  # Function-scope import: Normalize is not among the notebook's existing imports.
  from botorch.models.transforms.input import Normalize

  # The inputs live on very different scales (thousands of steps vs. a
  # coefficient of ~0.01). Map them to the unit cube with the known bounds so
  # the GP is not dominated by the large-scale dimensions.
  input_dim = hyperparams_tensor.shape[-1]
  surrogate = SingleTaskGP(hyperparams_tensor,
                           rewards_tensor,
                           input_transform=Normalize(d=input_dim,
                                                     bounds=bounds_tensor),
                           outcome_transform=Standardize(m=1))
  marginal_ll = ExactMarginalLogLikelihood(surrogate.likelihood,
                                           surrogate)
  fit_gpytorch_model(marginal_ll)

  acquisition = UpperConfidenceBound(model=surrogate,
                                     beta=0.2,
                                     maximize=True)

  # Bounds stay in the original units; the model applies the input transform
  # internally, so candidates are returned in the original units as well.
  candidates, _ = optimize_acqf(acq_function=acquisition,
                                bounds=bounds_tensor,
                                q=n_points,
                                num_restarts=100,
                                raw_samples=512,
                                options={"batch_limit": 5, "maxiter": 200})

  return candidates

Finally, we embed the previous code into the Bayesian optimization loop

In [None]:
# Number of optimization rounds; each round trains and evaluates one new model.
# Note: this cell appends to hyperparams_tensor / rewards_tensor, so re-running
# it continues from the accumulated observations rather than starting over.
n_iterations = 5

for i in range(n_iterations):
  print(f"Number of iterations done: {i}")
  # Ask for one new candidate (the trailing 1 is n_points) given all
  # observations collected so far.
  new_hyperparams = get_next_hyperparameters(hyperparams_tensor, 
                                            rewards_tensor, 
                                            best_reward, 
                                            bounds_tensor, 
                                            1)
  print(new_hyperparams)
  
  # Pass the candidate to create_model (the shared `env` is reused), then train
  # and score the resulting model.
  model = create_model(policy,
                       env,
                       new_hyperparams)
  train_model(model)
  new_reward = evaluate_model(model)
  
  print(f"New candidates are: {new_hyperparams}")
  # Append the new observation as an extra row for the next round.
  hyperparams_tensor = torch.cat([hyperparams_tensor, new_hyperparams])
  rewards_tensor = torch.cat([rewards_tensor, new_reward])
 
  # Track the highest mean reward observed so far.
  best_reward = rewards_tensor.max().item()
  print(f"Best hyperparameters get this mean reward: {best_reward}")