# BoTorch tutorial
Adapted from https://www.youtube.com/watch?v=BQ4kVn-Rt84

First, we install BoTorch.

In [None]:
!pip install botorch

Import libraries

In [None]:
import os
import torch
import numpy as np
import plotly

Objective function:

$e^{-(x-2)^2}+e^{-(x-6)^2/10} + \frac{1}{x^2+1}$

In [None]:
def target_function(individuals):
  """Evaluate the 1-D objective for a batch of candidate points.

  The objective is exp(-(x - 2)^2) + exp(-(x - 6)^2 / 10) + 1 / (x^2 + 1),
  computed on the first coordinate of every row.

  Args:
    individuals: 2-D array-like (tensor, NumPy array or nested sequence) with
      one candidate per row; only column 0 is read.

  Returns:
    A 1-D tensor holding one objective value per row of `individuals`.
  """
  if isinstance(individuals, torch.Tensor):
    points = individuals
  else:
    points = torch.as_tensor(np.asarray(individuals))
  if points.numel() == 0:
    # Nothing to evaluate: same result as building a tensor from an empty list.
    return torch.tensor([])
  if not points.is_floating_point():
    # Promote integer inputs so the result is a double-precision tensor.
    points = points.to(torch.double)
  x = points[:, 0]
  # One vectorized expression replaces the per-row Python loop.
  return torch.exp(-(x - 2) ** 2) + torch.exp(-(x - 6) ** 2 / 10) + 1 / (x ** 2 + 1)

Plot the objective function that we want to maximize.

In [None]:
import plotly.graph_objects as go

# Sample the input range [-2, 10] at 100 evenly spaced points.
x = np.linspace(-2., 10., 100)
# target_function reads x[0] from every row, so the grid is turned into a
# column of shape (100, 1) before it is evaluated.
x_new = x.reshape((100,-1))
z = target_function(x_new)

data = go.Scatter(x=x, y=z, line_color="#FE73FF")

fig = go.Figure(data=data)
fig.update_layout(title="Objective function", xaxis_title="input", yaxis_title="output")
fig.show()

Generate some data. First, we draw 10 random points; `torch.rand` samples them uniformly from $[0, 1)$, a sub-interval of the input space.

In [None]:
# Draw 10 one-dimensional points as a (10, 1) tensor; torch.rand samples uniformly from [0, 1).
train_x = torch.rand(10, 1)
train_x

Then we compute the latent function $f(x)$ at those points. In a real problem, each evaluation would be contaminated by noise: $y = f(x) + \epsilon$ with $\epsilon \sim \mathcal{N}(0, \sigma^2)$. Here we use the exact, noise-free values.

In [None]:
# target_function returns one value per row; unsqueeze(-1) adds a trailing
# dimension so the outputs form a column, like train_x.
exact_obj = target_function(train_x).unsqueeze(-1)
exact_obj

Let us see which is the best observed value so far.

In [None]:
# .item() converts the one-element tensor returned by max() into a Python number.
best_observed_value = exact_obj.max().item()
best_observed_value

We wrap all of the previous code into a function to be used freely

In [None]:
def generate_initial_data(n=10, l_bound=0., h_bound=1.):
  """Sample random inputs, evaluate the objective on them and report the best value.

  Args:
    n: Number of initial points to draw.
    l_bound: Lower end of the interval the points are drawn from.
    h_bound: Upper end of the interval the points are drawn from. With the
      defaults the points are the raw `torch.rand` samples in [0, 1).

  Returns:
    A tuple `(train_x, exact_obj, best_observed_value)`: the inputs as an
    (n, 1) double tensor, the objective values as an (n, 1) tensor, and the
    largest objective value as a Python float.
  """
  # Rescale the unit-interval samples to [l_bound, h_bound); with the default
  # limits this leaves them unchanged.
  train_x = l_bound + (h_bound - l_bound) * torch.rand(n, 1, dtype=torch.double)
  exact_obj = target_function(train_x).unsqueeze(-1)
  best_observed_value = exact_obj.max().item()
  return train_x, exact_obj, best_observed_value

In [None]:
# Try the helper with 20 initial points.
generate_initial_data(20)

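Let us now invoke this function to start the BO iteration and set the bounds of the 1-D $f(x)$. The code below uses $x \in [-2, 10]$, the range plotted above; the optimization loops later in the tutorial use $x \in [0, 10]$.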

In [None]:
init_x, init_y, best_init_y = generate_initial_data(20)
# Bounds are a 2 x d tensor: the first row holds the lower limits and the
# second row the upper limits. They are created in double precision to match
# the double-precision initial data.
# NOTE(review): the lower limit here is -2 while the optimization loops further
# down use 0 — confirm which one is intended.
bounds = torch.tensor([[-2.], [10.]], dtype=torch.double) #bounds for 2D: torch.tensor([[0., 1.], [10.,2.]])

We now choose the model and the likelihood that we will use. In our case we will use a classic Gaussian process and compute its hyper-parameters by maximizing the exact marginal log likelihood (which can overfit when only a few points are available).

In [None]:
from botorch.models import SingleTaskGP, ModelListGP
from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood

# Build the GP surrogate on the initial observations.
single_model = SingleTaskGP(init_x, init_y)
# The marginal log likelihood object ties the model to its likelihood; it is
# the object that gets fitted in the next step.
mll = ExactMarginalLogLikelihood(single_model.likelihood, single_model)

Now that our model is declared, we fit the Gaussian process to the previous points, setting its hyperparameters by maximizing the exact marginal log likelihood of the points. The output shows the default covariance function used by the GP and its hyper-hyperparameters. It also shows the Gaussian likelihood used and the homoskedastic noise added to the Matérn kernel to capture the noise of the data.

In [None]:
# `fit_gpytorch_model` was deprecated in favour of `fit_gpytorch_mll` and is no
# longer available in recent BoTorch releases. Import the current routine under
# the old name so the rest of the notebook keeps working on either version.
try:
  from botorch.fit import fit_gpytorch_mll as fit_gpytorch_model
except ImportError:
  # Older releases only provide the original name.
  from botorch import fit_gpytorch_model
fit_gpytorch_model(mll)

Now we declare the acquisition function that is going to be computed using the predictive distribution of the previous Gaussian process in all the input space. We will use the expected improvement

In [None]:
from botorch.acquisition.monte_carlo import qExpectedImprovement #use the noisy version if the problem has noise

# best_f is the incumbent: the best objective value observed in the initial data.
EI = qExpectedImprovement(model=single_model, best_f=best_init_y)

We will now optimize the acquisition function. All the hyperparameters used here are good heuristic defaults for trying to find the global optimum of the acquisition function.



In [None]:
from botorch.optim import optimize_acqf

# q=1 requests a single new point; the second return value is not used here.
candidates, _ = optimize_acqf(acq_function=EI, bounds=bounds, q=1, num_restarts=200, raw_samples=512, options={"batch_limit": 5, "maxiter": 200})

candidates

We now have all the code for one iteration, so we just need to put it in a loop. To do so, we first wrap the previous code into a function.

In [None]:
def get_next_points(init_x, init_y, best_init_y, bounds, n_points=1):
  """Fit a GP to the observations and propose the next points to evaluate.

  Args:
    init_x: Inputs observed so far.
    init_y: Objective values observed at `init_x`.
    best_init_y: Best objective value observed so far, used as the incumbent
      of the expected-improvement acquisition function.
    bounds: Search bounds handed to the acquisition optimizer.
    n_points: Number of points requested from the optimizer (its `q`).

  Returns:
    The candidates returned by `optimize_acqf`.
  """
  # Surrogate model, fitted through its exact marginal log likelihood.
  gp = SingleTaskGP(init_x, init_y)
  fit_gpytorch_model(ExactMarginalLogLikelihood(gp.likelihood, gp))

  acquisition = qExpectedImprovement(model=gp, best_f=best_init_y)
  optimizer_options = {"batch_limit": 5, "maxiter": 200}
  next_points, _ = optimize_acqf(
    acq_function=acquisition,
    bounds=bounds,
    q=n_points,
    num_restarts=200,
    raw_samples=512,
    options=optimizer_options,
  )
  return next_points


We test the function

In [None]:
# Propose one new point from the initial data.
get_next_points(init_x, init_y, best_init_y, bounds, n_points=1)

Finally, we embed the previous code into the Bayesian optimization loop

In [None]:
n_iterations=2

init_x, init_y, best_init_y = generate_initial_data(20)
# Search domain as a 2 x 1 tensor (lower limit, then upper limit), created in
# double precision to match the double-precision initial data.
bounds = torch.tensor([[0.], [10.]], dtype=torch.double)

for i in range(n_iterations):
  print(f"Number of iterations done: {i}")
  # Fit the surrogate on everything observed so far and ask for one new point.
  new_candidates = get_next_points(init_x, init_y, best_init_y, bounds, 1)
  new_results = target_function(new_candidates).unsqueeze(-1)

  print(f"New candidates are: {new_candidates}")
  # Append the new observation to the data set used by the next iteration.
  init_x = torch.cat([init_x, new_candidates])
  init_y = torch.cat([init_y, new_results])

  best_init_y = init_y.max().item()
  print(f"Best point performs this way: {best_init_y}")


Get the best observed result of the optimization. We can compare it with the figure of the objective function to check whether the result matches the maximum, i.e. whether the optimization has been successful.

In [None]:
print(f"Best observed result: {best_init_y}")
# Select the rows of init_x whose observed value equals the best one, then take
# the first match and its single coordinate.
best_candidate = init_x[((init_y == best_init_y).nonzero(as_tuple=True)[0])][0][0]
print(f"Best location of observed result: {best_candidate}")

In [None]:
def print_objective_function(best_candidate, iteration):
  """Plot the objective on [-2, 10] and mark `best_candidate` with a red vertical line.

  Args:
    best_candidate: Input location at which the vertical line is drawn.
    iteration: Iteration number shown in the figure title.
  """
  grid = np.linspace(-2., 10., 100)
  # The objective expects one point per row.
  values = target_function(grid.reshape((100, -1)))

  fig = go.Figure(data=go.Scatter(x=grid, y=values, line_color="#FE73FF"))
  fig.update_layout(
    title="Objective function. Iteration " + str(iteration),
    xaxis_title="input",
    yaxis_title="output",
  )
  fig.add_vline(x=best_candidate, line_width=3, line_color="red")
  fig.show()


# Mark the best observed location on the objective plot.
print_objective_function(best_candidate, 1)

In [None]:
# Evaluate the acquisition function at a single test point: the left edge of
# the plotting range, shaped as one point with one coordinate.
# NOTE(review): single_model is the module-level model fitted on the first
# initial data set, while best_init_y comes from the optimization loop —
# confirm this pairing is intended.
x = torch.linspace(-2., 10., steps=100)
x_test = torch.tensor([x[0]]).unsqueeze(-1)
EI = qExpectedImprovement(model=single_model, best_f=best_init_y)
EI(x_test)

We can also plot the acquisition function, with its maximum, which is the point suggested to be evaluated in the next iteration

In [None]:
def compute_acquisition_function(single_model, best_init_y, l_bound=-2., h_bound=10., resolution=1000):
  """Evaluate expected improvement on an evenly spaced grid.

  Args:
    single_model: Fitted surrogate model passed to `qExpectedImprovement`.
    best_init_y: Best objective value observed so far (the incumbent).
    l_bound: Left end of the grid.
    h_bound: Right end of the grid.
    resolution: Number of grid points.

  Returns:
    A 1-D tensor with one acquisition value per grid point.
  """
  grid = torch.linspace(l_bound, h_bound, steps=resolution)
  EI = qExpectedImprovement(model=single_model, best_f=best_init_y)
  # Shape (resolution, 1, 1): a batch of independent single-point queries,
  # evaluated in one call instead of one call per grid point. No gradients are
  # needed for plotting, so the autograd graph is not built.
  with torch.no_grad():
    return EI(grid.reshape(-1, 1, 1))

In [None]:
def print_acquisition_function(acq_fun, iteration, l_bound=-2., h_bound=10., resolution=1000, suggested=None):
  """Plot the acquisition function and mark one location with a red vertical line.

  Args:
    acq_fun: 1-D tensor of acquisition values on the grid defined by
      `l_bound`, `h_bound` and `resolution`.
    iteration: Iteration number shown in the figure title.
    l_bound: Left end of the grid.
    h_bound: Right end of the grid.
    resolution: Number of grid points.
    suggested: Optional candidates; when given, the line is drawn at
      `suggested[0][0]`, otherwise at the grid point with the largest value.
  """
  x = torch.linspace(l_bound, h_bound, steps=resolution).detach().numpy()
  data = go.Scatter(x=x, y=acq_fun, line_color="yellow")

  fig = go.Figure(data=data)
  fig.update_layout(title="Expected Improvement acquisition function. Iteration " + str(iteration), xaxis_title="input", yaxis_title="output")
  if suggested is None:
    # argmax yields a single grid index even when the maximum is attained at
    # several grid points, so the marker position is always one number.
    marker_x = float(x[int(acq_fun.argmax())])
  else:
    marker_x = float(suggested[0][0])
  fig.add_vline(x=marker_x, line_width=3, line_color="red")
  fig.show()
  

In [None]:
# Evaluate and plot the acquisition function of the module-level single_model.
# NOTE(review): single_model was fitted on the first initial data set, while
# best_init_y comes from the optimization loop — confirm this pairing is intended.
acq_fun = compute_acquisition_function(single_model, best_init_y)
print_acquisition_function(acq_fun, 1)

We can also plot the GP predictive mean and standard deviation, i.e. its predictive distribution, over the whole input space.

In [None]:
def compute_predictive_distribution(single_model, best_init_y, l_bound=-2., h_bound=10., resolution=1000):
  """Evaluate the model's posterior mean and variance on an evenly spaced grid.

  Args:
    single_model: Model exposing `posterior(X)` with `mean` and `variance`.
    best_init_y: Unused; kept so existing callers keep working.
    l_bound: Left end of the grid.
    h_bound: Right end of the grid.
    resolution: Number of grid points.

  Returns:
    A tuple `(mean, variance)` of 1-D tensors with one entry per grid point.
  """
  grid = torch.linspace(l_bound, h_bound, steps=resolution)
  # One posterior call over the whole (resolution, 1) grid replaces two calls
  # per grid point. No gradients are needed for plotting.
  with torch.no_grad():
    posterior = single_model.posterior(grid.unsqueeze(-1))
    return posterior.mean.reshape(-1), posterior.variance.reshape(-1)

In [None]:
def print_predictive_mean(predictive_mean, predictive_variance, iteration, l_bound=-2., h_bound=10., resolution=1000, suggested=None, old_obs=(), old_values=()):
  """Plot the predictive mean with a one-standard-deviation band.

  Args:
    predictive_mean: 1-D tensor of predictive means on the grid defined by
      `l_bound`, `h_bound` and `resolution`.
    predictive_variance: 1-D tensor of predictive variances on the same grid.
    iteration: Iteration number shown in the figure title.
    l_bound: Left end of the grid.
    h_bound: Right end of the grid.
    resolution: Number of grid points.
    suggested: Optional candidates; when given, the red line is drawn at
      `suggested[0][0]`, otherwise at the grid point with the largest mean.
    old_obs: Input locations of earlier observations to draw as markers.
    old_values: Objective values matching `old_obs`.
  """
  x = torch.linspace(l_bound, h_bound, steps=resolution).detach().numpy()
  predictive_std = np.sqrt(predictive_variance)

  fig = go.Figure()

  # Trace order matters: fill='tonexty' shades towards the trace added just
  # before, so the band is built as upper bound -> mean -> lower bound.
  fig.add_trace(go.Scatter(x=x, y=predictive_mean + predictive_std,
                           mode='lines',
                           line=dict(color="#19D3F3", width=0.1),
                           name='upper bound'))
  fig.add_trace(go.Scatter(x=x, y=predictive_mean,
                           mode='lines',
                           line=dict(color="blue"),
                           fill='tonexty',
                           name='predictive mean'))
  fig.add_trace(go.Scatter(x=x, y=predictive_mean - predictive_std,
                           mode='lines',
                           line=dict(color="blue", width=0.1),
                           fill='tonexty',
                           name='lower bound'))

  fig.update_layout(title="GP Predictive distribution. Iteration " + str(iteration), xaxis_title="input", yaxis_title="output", showlegend=False)

  if suggested is None:
    # argmax yields a single grid index even when the maximum is tied, so the
    # marker position is always one number.
    marker_x = float(x[int(predictive_mean.argmax())])
  else:
    marker_x = float(suggested[0][0])
  fig.add_vline(x=marker_x, line_width=3, line_color="red")

  if len(old_obs) > 0:
    fig.add_trace(go.Scatter(x=old_obs, y=old_values, mode='markers', marker_color="black", marker_size=10))

  fig.show()

In [None]:
# Evaluate and plot the predictive distribution of the module-level single_model.
# NOTE(review): this is the model fitted on the first initial data set, not a
# model from the optimization loop — confirm this is the one meant to be shown.
predictive_mean, predictive_variance = compute_predictive_distribution(single_model, best_init_y)
print_predictive_mean(predictive_mean, predictive_variance, 1)

We can embed all this logic into the BO loop to have visualizations of the objective function, GP predictive distribution and acquisition function in every iteration.

In [None]:
def visualize_functions(single_model, best_init_y, best_candidate, candidate_acq_fun, iteration, previous_observations, previous_values):
  """Draw the three diagnostic plots for one iteration.

  In order: the GP predictive distribution, the acquisition function, and the
  objective function with the best observed location marked.

  Args:
    single_model: Fitted surrogate model.
    best_init_y: Best objective value observed so far.
    best_candidate: Input location of the best observation.
    candidate_acq_fun: Candidates proposed by the acquisition optimizer; their
      first entry is marked on the first two plots.
    iteration: Iteration number shown in the titles.
    previous_observations: Input locations evaluated in earlier iterations.
    previous_values: Objective values matching `previous_observations`.
  """
  mean, variance = compute_predictive_distribution(single_model, best_init_y)
  print_predictive_mean(
    mean,
    variance,
    iteration,
    suggested=candidate_acq_fun,
    old_obs=previous_observations,
    old_values=previous_values,
  )

  ei_values = compute_acquisition_function(single_model, best_init_y)
  print_acquisition_function(ei_values, iteration, suggested=candidate_acq_fun)

  print_objective_function(best_candidate, iteration)

def get_next_points_and_visualize(init_x, init_y, best_init_y, bounds, iteration, previous_observations, previous_values, n_points=1):
  """Propose the next points to evaluate and plot the state of the optimization.

  Args:
    init_x: Inputs observed so far.
    init_y: Objective values observed at `init_x`.
    best_init_y: Best objective value observed so far.
    bounds: Search bounds handed to the acquisition optimizer.
    iteration: Iteration number shown in the plot titles.
    previous_observations: Input locations evaluated in earlier iterations.
    previous_values: Objective values matching `previous_observations`.
    n_points: Number of points requested from the optimizer (its `q`).

  Returns:
    The candidates returned by `optimize_acqf`.
  """
  # Surrogate model, fitted through its exact marginal log likelihood.
  gp = SingleTaskGP(init_x, init_y)
  fit_gpytorch_model(ExactMarginalLogLikelihood(gp.likelihood, gp))

  acquisition = qExpectedImprovement(model=gp, best_f=best_init_y)
  optimizer_options = {"batch_limit": 5, "maxiter": 200}
  next_points, _ = optimize_acqf(
    acq_function=acquisition,
    bounds=bounds,
    q=n_points,
    num_restarts=200,
    raw_samples=512,
    options=optimizer_options,
  )

  # Location of the best observation: first row whose value equals best_init_y.
  matching_rows = (init_y == best_init_y).nonzero(as_tuple=True)[0]
  best_location = init_x[matching_rows][0][0]

  visualize_functions(gp, best_init_y, best_location, next_points, iteration, previous_observations, previous_values)

  return next_points

In [None]:
n_iterations=10

init_x, init_y, best_init_y = generate_initial_data(20)
# Search domain as a 2 x 1 tensor (lower limit, then upper limit), created in
# double precision to match the double-precision initial data.
bounds = torch.tensor([[0.], [10.]], dtype=torch.double)

# Locations and values evaluated by the loop, kept as plain floats so they can
# be drawn as markers on the predictive-distribution plot.
candidates=[]
results=[]
for i in range(n_iterations):
  print(f"Number of iterations done: {i}")
  new_candidates = get_next_points_and_visualize(init_x, init_y, best_init_y, bounds, i, candidates, results, 1)
  new_results = target_function(new_candidates).unsqueeze(-1)

  print(f"New candidates are: {new_candidates}")
  # Append the new observation to the data set used by the next iteration.
  init_x = torch.cat([init_x, new_candidates])
  init_y = torch.cat([init_y, new_results])

  best_init_y = init_y.max().item()
  print(f"Best point performs this way: {best_init_y}")
  candidates.append(float(new_candidates[0][0]))
  results.append(float(new_results[0][0]))