# Step 1: Install dependencies

In [None]:
# Install the reinforcement-learning and Bayesian-optimisation packages used below.
# NOTE(review): the captured output of this cell shows the box2d-py wheel build failing;
# the cells visible in this notebook only use 'CartPole-v1' - confirm whether the
# [box2d] extra is actually required.
# NOTE(review): versions are not pinned, so a re-run may resolve different releases.
!pip install gym[box2d]
!pip install stable-baselines3[extra]
!pip install botorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swig==4.*
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting box2d-py==2.3.5
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 KB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not r

# Step 2: Import libraries

In [None]:
import os
import torch
import numpy as np
import plotly
import plotly.graph_objects as go

import gym

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

import botorch
from botorch.utils.transforms import standardize, normalize, unnormalize

# Step 3: Define objective function
The objective is a pessimistic estimate of a trained model's performance: the mean evaluation reward minus one standard deviation of that reward.

In [None]:
def get_hyp_values(hyperparams_tensor):
  '''
  Unpacks the first row of a hyperparameter tensor into a tuple of Python scalars

          Parameters:
                  hyperparams_tensor (torch.DoubleTensor): A tensor of size 1xn (1 row, n columns) with n being the number of hyperparameters to tune. Only row 0 is read

          Returns:
                  hyperparams_tuple (tuple): A tuple with the n values of row 0, in column order (empty when the row has no columns)

  '''
  # Iterating over the row yields 0-d tensors; .item() turns each one into a plain Python number
  return tuple(value.item() for value in hyperparams_tensor[0])


def create_model(lr,
                 policy='MlpPolicy',
                 env='CartPole-v1'):
  '''
  Builds an untrained PPO agent whose only tunable setting is the learning rate

          Parameters:
                  lr (float): Value forwarded as PPO's learning_rate
                  policy (str): Value forwarded as PPO's policy argument. Default is 'MlpPolicy'
                  env: Value forwarded as PPO's env argument. Default is the environment id 'CartPole-v1'

          Returns:
                  model (stable_baselines3.ppo.ppo.PPO): The freshly constructed, not yet trained model
  '''
  # Every setting other than the learning rate is held fixed across all trials
  fixed_settings = dict(n_steps=1024,
                        batch_size=64,
                        n_epochs=4,
                        gamma=0.999,
                        gae_lambda=0.98,
                        ent_coef=0.01,
                        verbose=0)

  return PPO(policy=policy,
             env=env,
             learning_rate=lr,
             **fixed_settings)


def train_model(model, timesteps=10000):
  '''
  Runs the learning loop of an already constructed model

          Parameters:
                  model (stable_baselines3.ppo.ppo.PPO): The model whose learn() method is called
                  timesteps (int): Passed to learn() as total_timesteps. Default is 10000

          Returns:
                  None
  '''
  # The result of learn() is deliberately discarded: callers keep using the `model` object they passed in
  model.learn(total_timesteps=timesteps)


def evaluate_model(model, 
                   rl_env_name='CartPole-v1', 
                   n_eval_episodes=10):
  '''
  Evaluates the model for a number of episodes in a specified environment, this environment MUST be the same as the one the model has been trained in.

          Parameters:
                  model (stable_baselines3.ppo.ppo.PPO): The trained model to evaluate
                  rl_env_name (str): The name of the gym environment where the model has been trained
                  n_eval_episodes (int): The number of episodes for which the model will be evaluated to obtain a mean and standard deviation

          Returns:
                  lower_mean_reward (float): The mean episode reward minus one standard deviation of the episode rewards
  '''
  # Imported here so the function works regardless of which import cells have been run
  from stable_baselines3.common.monitor import Monitor

  # evaluate_policy warns when the evaluation environment is not wrapped in a Monitor,
  # because other wrappers could otherwise alter the reported episode rewards and lengths
  eval_env = Monitor(gym.make(rl_env_name))
  try:
    mean_reward, std_reward = evaluate_policy(model, 
                                              eval_env, 
                                              n_eval_episodes=n_eval_episodes, 
                                              deterministic=True)
  finally:
    # The objective is evaluated many times; release each evaluation environment once it is done
    eval_env.close()
  
  print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

  # Pessimistic score: penalise configurations whose reward varies a lot between episodes
  lower_mean_reward = mean_reward - std_reward
  return lower_mean_reward


def target_function(hyperparams, 
                    timesteps=100000,
                    rl_env_name='CartPole-v1'):
  '''
  Given a hyperparameter configuration, trains a fresh model with it and evaluates its performance

          Parameters:
                  hyperparams (float): The value of the learning_rate to train the model with
                  timesteps (int): The number of timesteps used to train the model. Default is 100000
                  rl_env_name (str): The name of the gym environment used both to train and to evaluate the model

          Returns:
                  lower_mean_reward (float): The mean evaluation reward minus its standard deviation
  '''
  # Train in the same environment that is used for evaluation. Without forwarding rl_env_name
  # the model would always be trained on create_model's default environment.
  model = create_model(hyperparams,
                       env=rl_env_name)
  
  train_model(model, 
              timesteps)
  
  lower_mean_reward = evaluate_model(model, 
                                     rl_env_name)
  
  return lower_mean_reward


# Step 4: Define hyperparameters to tune
First define the bounds

In [None]:
# Example value for the single hyperparameter being tuned (the PPO learning rate)
learning_rate = 0.03
# List of hyperparameters to tune; the bound lists below follow the same order, one entry per hyperparameter
hyperparams_list = [learning_rate]
# Lower bound of each hyperparameter's search interval
lower_bounds = [0.0001]
# Upper bound of each hyperparameter's search interval
upper_bounds = [0.1]

Then convert lists to tensors

In [None]:
# Create tensors with the hyperparameters configurations and bounds for BOTorch to use
# Pack the configuration (1 row, one column per hyperparameter) and the bounds
# (row 0 = lower bounds, row 1 = upper bounds) into double-precision tensors for BOTorch
hyperparams_tensor = torch.tensor([hyperparams_list], dtype=torch.double)
bounds_tensor = torch.tensor([lower_bounds, upper_bounds], dtype=torch.double)

# Step 5: Get initial data points

In [None]:
def generate_initial_data(upper_bound, 
                          lower_bound, 
                          n=3):
  '''
  Samples n values uniformly from the hyperparameter's bounded space and evaluates them
          Parameters:
                upper_bound (float): The upper bound of the hyperparameter value
                lower_bound (float): The lower bound of the hyperparameter value
                n (int): The number of initial points to get. Default is 3
          
          Returns:
                train_x (torch.DoubleTensor): A tensor of size (n, 1) (n rows and 1 column) with the initial points
                exact_obj (torch.DoubleTensor): A tensor of size (n, 1) (n rows and 1 column) containing the evaluation of the model with the sampled hyperparameters values
                best_observed_value (float): The best (largest) evaluation among the sampled hyperparameters
  '''
  # Uniform samples in [0, 1) rescaled to the interval [lower_bound, upper_bound)
  train_x = torch.rand(n, 1, dtype=torch.double) * (upper_bound - lower_bound) + lower_bound

  # Evaluate every sampled value. The dtype is set explicitly so the objective tensor is always
  # double precision like train_x, instead of depending on the type of number target_function returns.
  exact_obj = torch.tensor([[target_function(float(lr))] for lr in train_x],
                           dtype=torch.double)

  # Get the best observed value
  best_observed_value = exact_obj.max().item()
  
  return train_x, exact_obj, best_observed_value

In [None]:
# Draw and evaluate 3 random learning rates (each evaluation trains a full model).
# NOTE(review): hyperparams_tensor, rewards_tensor and best_reward are not read by any later
# cell visible in this notebook; the optimisation cell calls generate_initial_data again.
hyperparams_tensor, rewards_tensor, best_reward = generate_initial_data(upper_bounds[0], lower_bounds[0], 3)



mean_reward=9.40 +/- 0.8
mean_reward=9.20 +/- 0.7483314773547883
mean_reward=9.50 +/- 0.5


In [None]:
from botorch.acquisition.analytic import ExpectedImprovement
from botorch.optim import optimize_acqf
from botorch.utils.transforms import standardize, normalize, unnormalize
from botorch.models import SingleTaskGP
from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood
from botorch import fit_gpytorch_model

def compute_acquisition_function(single_model, 
                                 best_init_y,
                                 l_bound=-2.,
                                 h_bound=10.,
                                 resolution=1000):
  
  '''
  Evaluates the Expected Improvement acquisition function on an evenly spaced grid of the bounded space
          Parameters:
                  single_model (botorch.models.gp_regression): A Gaussian Process regression model
                  best_init_y (float): The best lower_bound_reward obtained until the moment
                  l_bound (float): The lower bound of the hyperparameter value
                  h_bound (float): The upper bound of the hyperparameter value
                  resolution (int): The number of discretized points of the hyperparameter value bounded space

          Returns:
                  result_tensor (torch.Tensor): A 1-D tensor with {resolution} entries containing the evaluations of the acquisition function, in grid order
  '''
  # Discretize the bounded hyperparameter value space
  grid = torch.linspace(l_bound, 
                        h_bound,
                        steps=resolution)

  # Compute our acquisition function
  EI = ExpectedImprovement(model=single_model, 
                           best_f=best_init_y, 
                           maximize=True)

  # Evaluate the acquisition function one grid point at a time, each as a (1, 1) input.
  # Only the values are needed (they are plotted), so no autograd graph is built.
  with torch.no_grad():
    values = [EI(point.reshape(1, 1)).reshape(-1) for point in grid]

  # An empty grid (resolution=0) yields an empty result instead of an error
  if not values:
    return torch.empty(0)

  result_tensor = torch.cat(values)
  return result_tensor


def print_acquisition_function(acq_fun, 
                               iteration,
                               l_bound=-2.,
                               h_bound=10.,
                               resolution=1000, 
                               suggested=None):
  '''
  Plots the acquisition function given a series of evaluations previously computed
          Parameters:
                  acq_fun (torch.Tensor): A tensor with {resolution} evaluations of the acquisition function
                  iteration (int): The iteration number of the Gaussian Process, shown in the plot title
                  l_bound (float): The lower bound of the hyperparameter value (left end of the x axis)
                  h_bound (float): The upper bound of the hyperparameter value (right end of the x axis)
                  resolution (int): The number of discretized points of the hyperparameter value bounded space
                  suggested: The suggested value of the hyperparameter, read as suggested[0][0]. When None, the grid point that maximizes acq_fun is marked instead

          Returns:
                  None
  '''

  # Discretize the hyperparameter value space
  x = torch.linspace(l_bound, h_bound, steps=resolution).detach().numpy()

  # Plot our data
  data = go.Scatter(x=x, y=acq_fun, line_color="yellow")

  # Axis
  fig = go.Figure(data=data)
  fig.update_layout(title="Expected Improvement acquisition function. Iteration " + str(iteration), xaxis_title="input", yaxis_title="output")

  # Position of the red vertical line: the suggested point when given, otherwise the grid point
  # with the largest acquisition value. A single scalar is used so ties cannot produce an array.
  if suggested is None:
    marker_x = float(x[int(acq_fun.argmax())])
  else:
    marker_x = float(suggested[0][0])

  fig.add_vline(x=marker_x, line_width=3, line_color="red")
  fig.show()


def compute_predictive_distribution(single_model,
                                    best_init_y,
                                    l_bound=-2.,
                                    h_bound=10., 
                                    resolution=1000):
  '''
  Computes the predictive distribution of the function on an evenly spaced grid, given a Gaussian Process Regression model
          Parameters:
                  single_model (botorch.models.gp_regression): A Gaussian Process regression model; only its posterior() method is used
                  best_init_y (float): The best lower_bound_reward obtained until the moment. Not used by this function, kept for interface compatibility
                  l_bound (float): The lower bound of the hyperparameter value
                  h_bound (float): The upper bound of the hyperparameter value
                  resolution (int): The number of discretized points of the hyperparameter value bounded space

          Returns:
                  means_tensor (torch.Tensor): A 1-D tensor with {resolution} entries containing the posterior means of the discretized points
                  variances_tensor (torch.Tensor): A 1-D tensor with {resolution} entries containing the posterior variances of the discretized points
  '''
  # Discretize the hyperparameter value bounded space
  grid = torch.linspace(l_bound, h_bound, steps=resolution)

  means = []
  variances = []

  # Query the posterior once per grid point (as a (1, 1) input) and read both moments from it.
  # Only the values are needed (they are plotted), so no autograd graph is built.
  with torch.no_grad():
    for point in grid:
      posterior = single_model.posterior(point.reshape(1, 1))
      means.append(posterior.mean.reshape(-1))
      variances.append(posterior.variance.reshape(-1))

  # An empty grid (resolution=0) yields empty results instead of an error
  if not means:
    return torch.empty(0), torch.empty(0)

  means_tensor = torch.cat(means)
  variances_tensor = torch.cat(variances)

  return means_tensor, variances_tensor


def print_predictive_mean(predictive_mean, 
                          predictive_variance,
                          iteration, 
                          l_bound=-2.,
                          h_bound=10.,
                          resolution=1000,
                          suggested=None,
                          old_obs=[],
                          old_values=[]):
  '''
  Plots the function distribution obtained by the Gaussian Process: the predictive mean with a band of one standard deviation around it
          Parameters:
                  predictive_mean (torch.Tensor): A tensor with {resolution} entries containing the means of the discretized points
                  predictive_variance (torch.Tensor): A tensor with {resolution} entries containing the variances of the discretized points
                  iteration (int): The iteration number of the Gaussian Process, shown in the plot title
                  l_bound (float): The lower bound of the hyperparameter value (left end of the x axis)
                  h_bound (float): The upper bound of the hyperparameter value (right end of the x axis)
                  resolution (int): The number of discretized points of the hyperparameter value bounded space
                  suggested: The suggested value of the hyperparameter, read as suggested[0][0]. When None, the grid point that maximizes the predictive mean is marked instead
                  old_obs (list): Previously evaluated hyperparameter values, drawn as black markers (x coordinates). Not modified
                  old_values (list): The values observed for old_obs (y coordinates of the markers). Not modified

          Returns:
                  None
  '''

  # Discretize the space
  x = torch.linspace(l_bound, h_bound, steps=resolution).detach().numpy()

  # The band is one standard deviation wide on each side, i.e. the square root of the variance
  predictive_std = np.sqrt(predictive_variance)

  # Create figure
  fig = go.Figure()

  # Plot upper bound of the expected reward (predictive mean + one standard deviation)
  fig.add_trace(go.Scatter(x=x, 
                           y= predictive_mean + predictive_std,
                           mode='lines',
                           line=dict(color="#19D3F3",width =0.1),
                           name='upper bound'))
  
  # Plot predictive mean of each point's expected reward
  fig.add_trace(go.Scatter(x=x, 
                           y= predictive_mean,
                           mode='lines',
                           line=dict(color="blue"),
                           fill='tonexty',
                           name='predictive mean'))
  
  # Plot lower bound of the expected reward (predictive mean - one standard deviation)
  fig.add_trace(go.Scatter(x=x, y= predictive_mean - predictive_std,
                         mode='lines',
                         line=dict(color="blue", width =0.1),
                         fill='tonexty',
                         name='lower bound'))
  
  
  # Axis
  fig.update_layout(title="GP Predictive distribution. Iteration " + str(iteration), xaxis_title="Learning rate", yaxis_title="Expected reward", showlegend=False)

  # Position of the red vertical line: the suggested point when given, otherwise the grid point
  # with the largest predictive mean. A single scalar is used so ties cannot produce an array.
  if suggested is None:
    marker_x = float(x[int(predictive_mean.argmax())])
  else:
    marker_x = float(suggested[0][0])
  fig.add_vline(x=marker_x, line_width=3, line_color="red")

  # Plot old values
  if(len(old_obs)>0):
    fig.add_trace(go.Scatter(x=old_obs, y=old_values, mode = 'markers', marker_color="black", marker_size=10))

  fig.show()


def visualize_functions(single_model,
                        best_init_y,
                        best_candidate,
                        candidate_acq_fun,
                        iteration,
                        previous_observations,
                        previous_values,
                        bounds,
                        best_candidate_normalized):
  '''
  Draws two figures for one iteration: the Gaussian Process predictive distribution, then the acquisition function
          Parameters:
                  single_model (botorch.models.gp_regression): A Gaussian Process regression model
                  best_init_y (float): The best lower_bound_reward obtained until the moment
                  best_candidate: Accepted for interface compatibility; not used by this function
                  candidate_acq_fun: The candidate selected in this iteration, marked in both figures as the suggested point
                  iteration (int): The iteration number of the optimization process
                  previous_observations (list): Previously evaluated hyperparameter values, drawn as markers in the first figure
                  previous_values (list): The values observed for previous_observations
                  bounds (torch.Tensor): Bounds used for the x axis of both figures; bounds[0][0] is the lower end and bounds[1][0] the upper end
                  best_candidate_normalized: Accepted for interface compatibility; not used by this function

          Returns:
                  None

  '''

  # Both curves are evaluated on the fixed interval [0, 1] ...
  model_interval = dict(l_bound=0, h_bound=1)

  mean_curve, variance_curve = compute_predictive_distribution(single_model,
                                                               best_init_y,
                                                               **model_interval)

  # ... while the x axis of each figure spans the interval given by `bounds`
  axis_interval = dict(l_bound=bounds[0][0], h_bound=bounds[1][0])

  # First figure: predictive distribution together with the earlier observations
  print_predictive_mean(mean_curve,
                        variance_curve,
                        iteration,
                        suggested=candidate_acq_fun,
                        old_obs=previous_observations,
                        old_values=previous_values,
                        **axis_interval)

  # Second figure: acquisition function
  acquisition_curve = compute_acquisition_function(single_model,
                                                   best_init_y,
                                                   **model_interval)

  print_acquisition_function(acquisition_curve,
                             iteration,
                             suggested=candidate_acq_fun,
                             **axis_interval)
  

def get_next_points_and_visualize_norm(init_x,
                                       init_y, 
                                       best_init_y, 
                                       normalized_bounds, 
                                       iteration, 
                                       previous_observations,
                                       previous_values,
                                       bounds,
                                       n_points=1):
  '''
  Function that computes the next point to add to the Gaussian Process and visualizes the acquisition function and function distribution
          Parameters:
                  init_x (torch.Tensor): The hyperparameter values observed so far, used as training inputs of the Gaussian Process
                  init_y (torch.Tensor): The objective values observed for init_x, used as training targets of the Gaussian Process
                  best_init_y (float): The best value in init_y, used as the incumbent of Expected Improvement
                  normalized_bounds (torch.Tensor): The bounds within which the acquisition function is optimized
                  iteration (int): The iteration number of the optimization process, shown in the plot titles
                  previous_observations (list): Previously evaluated hyperparameter values, forwarded to the plots
                  previous_values (list): The values observed for previous_observations, forwarded to the plots
                  bounds (torch.Tensor): The original (unnormalized) bounds of the hyperparameter, used to map points back to real values
                  n_points (int): The number of candidates to request from the optimizer (its q argument). Default is 1

          Returns:
                  candidates (torch.Tensor): The candidate(s) returned by the optimizer, expressed within normalized_bounds
  '''
  # Create our model with the points
  single_model = SingleTaskGP(init_x, init_y)

  mll = ExactMarginalLogLikelihood(single_model.likelihood, single_model)
  fit_gpytorch_model(mll)

  # Instantiate the acquisition function given our model
  EI = ExpectedImprovement(model=single_model, best_f=best_init_y, maximize=True)
  
  # Optimize the acquisition function
  candidates, _ = optimize_acqf(acq_function=EI, 
                                bounds=normalized_bounds, 
                                q=n_points,
                                num_restarts=200,
                                raw_samples=512, 
                                options={"batch_limit": 5, "maxiter": 200})
  
  # Rows whose objective equals the incumbent; the first one is taken as the best point so far
  best_rows = (init_y == best_init_y).nonzero(as_tuple=True)[0]
  # Get our best candidate normalized for the GP to use
  best_candidate_normalized = init_x[best_rows][0][0]
  # Map it back with the ORIGINAL bounds: unnormalizing with the [0, 1] bounds would leave it unchanged
  best_candidate = unnormalize(best_candidate_normalized, bounds=bounds)

  # Visualize 
  visualize_functions(single_model, 
                      best_init_y,
                      best_candidate,
                      unnormalize(candidates, bounds=bounds),
                      iteration, previous_observations,
                      previous_values, 
                      bounds, 
                      best_candidate_normalized)

  return candidates

In [None]:
# Set the number of iterations of the GP
n_iterations=50

# Sample initial hyperparameter values and evaluate the models obtained with them
init_x, init_y, best_init_y = generate_initial_data(upper_bounds[0],
                                                    lower_bounds[0],
                                                    3)
# Keep inputs and targets in the same precision for the Gaussian Process
init_y = init_y.to(init_x.dtype)

# We normalize the bound of the hyperparameters as BOTorch assumes this.
# The bounds use the same (double) precision as the training data.
normalized_bounds = torch.tensor([[0.0], [1.0]], dtype=torch.double)
init_x_normalized = normalize(init_x, bounds=bounds_tensor)

# Standardize the objective as BOTorch assumes this
init_y_standardized = standardize(init_y)

# Obtain the best result among the initial random experiments
best_init_y_standardized = init_y_standardized.max().item()

# Per-iteration history: the normalized candidate and the standardized value observed for it
candidates=[]
results=[]
for i in range(n_iterations):
  print(f"Number of iterations done: {i}")
  # The GP is fitted on standardized rewards, so the markers drawn over its predictive
  # distribution must be the standardized rewards too (x values stay in original units)
  normalized_new_candidates = get_next_points_and_visualize_norm(init_x_normalized,
                                                                 init_y_standardized, 
                                                                 best_init_y_standardized, 
                                                                 normalized_bounds,
                                                                 i, 
                                                                 init_x,
                                                                 init_y_standardized,
                                                                 bounds_tensor,
                                                                 1)
  # Map the suggestion back to a real learning rate and evaluate it
  new_candidates = unnormalize(normalized_new_candidates, bounds=bounds_tensor)
  new_results = torch.tensor([[target_function(float(new_candidates))]], dtype=init_y.dtype)

  print(f"New candidates are: {new_candidates}")
  # Add the new observation and recompute the normalized / standardized views over ALL observations
  init_x = torch.cat([init_x, new_candidates])
  init_y = torch.cat([init_y, new_results])
  init_x_normalized = normalize(init_x, bounds=bounds_tensor)
  init_y_standardized = standardize(init_y)

  best_init_y = init_y.max().item()
  best_init_y_standardized = init_y_standardized.max().item()
  print(f"Best point performs this way: {best_init_y}")
  candidates.append(float(normalized_new_candidates[0][0]))
  # Standardizing a single value on its own always gives 0, so the new result's standardized
  # value is read from the tensor standardized over all observations (it is the last row)
  results.append(init_y_standardized[-1].item())
  print('----------------------')
  print('----------------------')

mean_reward=110.70 +/- 59.60880807397511
mean_reward=297.80 +/- 62.82961085348214
mean_reward=9.10 +/- 0.7000000000000001
Number of iterations done: 0



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=9.40 +/- 0.8
New candidates are: tensor([[0.1000]], dtype=torch.float64)
Best point performs this way: 234.97038914651787
Number of iterations done: 1



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 2



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 3



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0051]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 4



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=439.30 +/- 98.83121976379731
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 5



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=487.40 +/- 20.766318884193222
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 6



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=487.90 +/- 26.04016128982307
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 7



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 8



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 9



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 10



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=318.60 +/- 96.33815443530149
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 11



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=428.80 +/- 82.12161712971805
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 12



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=446.10 +/- 67.81069237222107
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 13



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=9.40 +/- 0.8
New candidates are: tensor([[0.0717]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 14



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=481.90 +/- 54.300000000000004
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 15



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 16



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=478.30 +/- 47.613128441638864
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 17



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=495.30 +/- 14.1
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 18



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 19



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=464.30 +/- 55.48702551047407
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 20



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0079]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 21



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=425.70 +/- 125.72831821033795
New candidates are: tensor([[0.0080]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 22



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 23



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 24



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 25



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 26



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 27



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=489.10 +/- 31.389329397105637
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 28



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=481.70 +/- 37.02445138013526
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 29



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 30



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=476.20 +/- 54.75180362325976
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 31



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 32



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 33



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=340.40 +/- 97.16707261207368
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 34



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=449.20 +/- 82.85746797965768
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 35



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=9.10 +/- 0.7000000000000001
New candidates are: tensor([[0.0079]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 36



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=471.30 +/- 59.799749163353525
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 37



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=491.70 +/- 22.360903380677627
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 38



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 39



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 40



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 41



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=485.10 +/- 43.05914537006046
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 42



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=486.00 +/- 42.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 43



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 44



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 45



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=453.70 +/- 72.25517282520332
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 46



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=489.30 +/- 24.091699815496625
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 47



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 48



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=488.80 +/- 33.6
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
Number of iterations done: 49



Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.



mean_reward=500.00 +/- 0.0
New candidates are: tensor([[0.0001]], dtype=torch.float64)
Best point performs this way: 500.0
