# BOTorch tutorial
Here we illustrate how to tune a machine learning hyper-parameter using BOTorch.

First we install BOTorch

In [None]:
!pip install botorch

Import libraries. We will just tune the alpha regularization term of an MLPClassifier on a synthetic classification problem.

In [None]:
import os
import torch
import numpy as np
import plotly
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# Seed both NumPy and PyTorch so that every random draw below is reproducible.
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)

Objective function: an estimate of the generalization performance, measured by a lower confidence bound on the accuracy (mean minus one standard deviation of the fold scores) of a multilayer perceptron evaluated via k-fold cross-validation on a synthetic dataset of 20 variables $\mathbf{X}$, used to perform binary classification of the dummy variable $y$. We do not need to shuffle the dataset as it is synthetic, but watch out for that in real problems.

In [None]:
# Synthetic classification problem with 1000 samples; random_state is fixed
# so the same dataset is produced on every run.
X, y = make_classification(n_samples=1000, random_state=1)

We only get a low-fidelity estimate of the real performance of this classifier here, as I limit the number of epochs of the MLP to 150 for didactic purposes. Also, the variance of the estimate can be reduced by setting a higher number of folds, but I set it to 3, also for didactic purposes. We set the synthetic problem to be common for all the experiments. 

In [None]:
def target_function(alpha_regularizer=0.0001, seed=1, max_iter=150, cv=3):
  """Estimate a lower bound on the cross-validated accuracy of an MLP.

  Args:
    alpha_regularizer: value passed as ``alpha`` to ``MLPClassifier``.
    seed: ``random_state`` of the classifier.
    max_iter: ``max_iter`` of the classifier (150 reproduces the original
      low-fidelity setting).
    cv: ``cv`` argument of ``cross_val_score`` (3 reproduces the original
      setting).

  Returns:
    A one-element list holding ``mean - std`` of the fold scores. Callers
    rely on the list wrapper to build 2-D arrays/tensors of observations.
  """
  # The dataset is regenerated with a fixed random_state, so every call
  # evaluates exactly the same synthetic problem.
  X, y = make_classification(n_samples=1000, random_state=1)
  clf = MLPClassifier(random_state=seed, max_iter=max_iter, alpha=alpha_regularizer)
  scores = cross_val_score(clf, X, y, cv=cv)
  print("Estimation of the accuracy of the MLP on the synthetic dataset done")
  # Penalise the mean score by its spread across folds.
  return [scores.mean() - scores.std()]

Plot the accuracy as a function of the alpha hyper-parameter, which we want to maximize. Here we set the resolution to 50 for didactic purposes (100 gives a finer curve). As 3-fold CV fits three models for every value of alpha in order to estimate the accuracy, we are really training 150 models here (300 with a resolution of 100). So this may be computationally costly.

In [None]:
import plotly.graph_objects as go

# Search interval for alpha and number of grid points at which the objective
# is evaluated (set resolution = 100 for a finer but costlier curve).
lower_bound = 0.000001
upper_bound = 0.1
resolution = 50
x = np.linspace(lower_bound, upper_bound, resolution)
x_new = x[:, np.newaxis]
# target_function returns a one-element list; unwrap it to get a flat array.
z = np.array([target_function(alpha)[0] for alpha in x])

data = go.Scatter(x=x, y=z, line_color="#FE73FF")

fig = go.Figure(data=data)
fig.update_layout(
  title="Accuracy estimated by 3-fold-CV",
  xaxis_title="Alpha regularizer value",
  yaxis_title="Accuracy",
)
fig.show()

Generate some data. Example of 3 random points drawn from the input space, rescaled into the range [lower_bound, upper_bound].

In [None]:
# Three uniform draws from [0, 1), rescaled to the search interval.
train_x = lower_bound + (upper_bound - lower_bound) * torch.rand(3, 1)
print(train_x.min())
print(train_x.max())

Then we compute the latent function $f(x)$. In a real problem the evaluation would be contaminated by noise: $y = f(x) + \epsilon \quad s.t. \quad \epsilon \sim \mathcal{N}(0, \sigma^2)$

In [None]:
# Evaluate the objective at every sampled alpha; each call returns a
# one-element list, so the array has one row per sample.
exact_obj = np.array([target_function(alpha) for alpha in train_x.flatten().tolist()])
exact_obj

Let us see which is the best observed value so far. Assuming maximization as we are dealing with a lower bound on the accuracy.

In [None]:
# Largest objective value among the evaluated points, as a plain Python float.
best_observed_value = float(np.max(exact_obj))
best_observed_value

We wrap all of the previous code into a function to be used freely

In [None]:
def generate_initial_data(n=10):
  """Sample ``n`` random alphas and evaluate the objective at each of them.

  Reads the module-level ``lower_bound`` and ``upper_bound``.

  Returns:
    A tuple ``(train_x, exact_obj, best_observed_value)``: the sampled
    inputs (double tensor with one column), the objective values returned
    by ``target_function`` and the largest of those values as a float.
  """
  span = upper_bound - lower_bound
  train_x = lower_bound + span * torch.rand(n, 1, dtype=torch.double)
  evaluations = [target_function(float(alpha)) for alpha in train_x]
  exact_obj = torch.tensor(evaluations)
  return train_x, exact_obj, exact_obj.max().item()

In [None]:
# Quick check of the helper with three random points; the returned tuple is
# not stored, the notebook just displays it.
generate_initial_data(3)

Let us now invoke this function to start the BO iteration and set the bounds of the 1-D $f(x) : x \in [10^{-6}, 0.1]$

In [None]:
# Three random initial observations plus the best objective value among them.
init_x, init_y, best_init_y = generate_initial_data(3)
# Row 0 holds the lower bound(s), row 1 the upper bound(s), one column per
# input dimension. A 2-D example would be torch.tensor([[0., 1.], [10., 2.]]).
bounds = torch.tensor([[lower_bound], [upper_bound]])

We now declare a set of functions that will help us to visualize all the components of Bayesian optimization.

In [None]:
from botorch.acquisition.analytic import ExpectedImprovement
from botorch.optim import optimize_acqf
from botorch.utils.transforms import standardize, normalize, unnormalize
from botorch.models import SingleTaskGP
from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood
from botorch import fit_gpytorch_model

def print_objective_function(best_candidate, iteration, l_bound=0, h_bound=1):
  """Plot the objective on a 100-point grid and mark ``best_candidate``.

  Every grid point triggers a full call to ``target_function``, so this is
  expensive.

  Args:
    best_candidate: x position of the vertical marker (number or
      one-element tensor).
    iteration: iteration index shown in the plot title.
    l_bound: lower end of the plotted interval.
    h_bound: upper end of the plotted interval.
  """
  # Bounds may arrive as 0-dim tensors; convert them to plain floats.
  x = np.linspace(float(l_bound), float(h_bound), 100)
  # target_function takes ONE scalar alpha and returns a one-element list,
  # so the grid is evaluated point by point and each result is unwrapped.
  z = [target_function(float(alpha))[0] for alpha in x]

  data = go.Scatter(x=x, y=z, line_color="#FE73FF")

  fig = go.Figure(data=data)
  fig.update_layout(title="Objective function. Iteration " + str(iteration), xaxis_title="input", yaxis_title="output")
  fig.add_vline(x=float(best_candidate), line_width=3, line_color="red")
  fig.show()


def compute_acquisition_function(single_model, best_init_y, l_bound=-2., h_bound=10., resolution=1000):
  """Evaluate Expected Improvement on an evenly spaced 1-D grid.

  Args:
    single_model: fitted model handed to ``ExpectedImprovement``.
    best_init_y: incumbent value used as ``best_f``.
    l_bound: lower end of the grid.
    h_bound: upper end of the grid.
    resolution: number of grid points.

  Returns:
    A 1-D tensor with one acquisition value per grid point (empty when
    ``resolution`` is 0).
  """
  EI = ExpectedImprovement(model=single_model, best_f=best_init_y, maximize=True)
  grid = torch.linspace(l_bound, h_bound, steps=resolution)
  # Each point is fed as a (1, 1) tensor, exactly as before. The single
  # returned value is detached and flattened to a scalar so the results can
  # be stacked explicitly.
  values = [EI(point.reshape(1, 1)).detach().reshape(()) for point in grid]
  if not values:
    return torch.tensor([])
  return torch.stack(values)

def print_acquisition_function(acq_fun, iteration, l_bound=-2., h_bound=10., resolution=1000, suggested=None):
  """Plot acquisition values over ``[l_bound, h_bound]`` with a red marker.

  Args:
    acq_fun: tensor of acquisition values, one per grid point.
    iteration: iteration index shown in the plot title.
    l_bound: lower end of the x axis (number or 0-dim tensor).
    h_bound: upper end of the x axis (number or 0-dim tensor).
    resolution: number of grid points; should match ``len(acq_fun)``.
    suggested: optional 2-D container whose ``[0][0]`` entry is the x
      position to mark. When omitted, the grid point with the largest
      acquisition value is marked.
  """
  x = torch.linspace(float(l_bound), float(h_bound), steps=resolution).numpy()
  data = go.Scatter(x=x, y=acq_fun, line_color="yellow")

  fig = go.Figure(data=data)
  fig.update_layout(title="Expected Improvement acquisition function. Iteration " + str(iteration), xaxis_title="input", yaxis_title="output")
  if suggested is None:
    # argmax yields a single index even with ties, so the marker position
    # is always one scalar.
    marker_x = float(x[int(torch.argmax(acq_fun))])
  else:
    marker_x = float(suggested[0][0])
  fig.add_vline(x=marker_x, line_width=3, line_color="red")
  fig.show()

def compute_predictive_distribution(single_model, best_init_y, l_bound=-2., h_bound=10., resolution=1000):
  """Evaluate the model's posterior mean and variance on a 1-D grid.

  Args:
    single_model: object exposing ``posterior(X)`` whose result has
      ``mean`` and ``variance`` attributes.
    best_init_y: unused; kept so the signature matches
      ``compute_acquisition_function``.
    l_bound: lower end of the grid.
    h_bound: upper end of the grid.
    resolution: number of grid points.

  Returns:
    A tuple ``(means, variances)`` of 1-D tensors with one entry per grid
    point (both empty when ``resolution`` is 0).
  """
  grid = torch.linspace(l_bound, h_bound, steps=resolution)
  means = []
  variances = []
  for point in grid:
    # Query the posterior once per point and reuse it for both moments.
    posterior = single_model.posterior(point.reshape(1, 1))
    means.append(posterior.mean.detach().reshape(()))
    variances.append(posterior.variance.detach().reshape(()))
  if not means:
    return torch.tensor([]), torch.tensor([])
  return torch.stack(means), torch.stack(variances)

def print_predictive_mean(predictive_mean, predictive_variance, iteration, l_bound=-2., h_bound=10., resolution=1000, suggested=None, old_obs=(), old_values=()):
  """Plot the predictive mean with a one-standard-deviation band.

  Args:
    predictive_mean: tensor of posterior means, one per grid point.
    predictive_variance: tensor of posterior variances, one per grid point.
    iteration: iteration index shown in the plot title.
    l_bound: lower end of the x axis (number or 0-dim tensor).
    h_bound: upper end of the x axis (number or 0-dim tensor).
    resolution: number of grid points; should match the tensor lengths.
    suggested: optional 2-D container whose ``[0][0]`` entry is the x
      position to mark. When omitted, the grid point with the largest
      predictive mean is marked.
    old_obs: x positions of previous observations (drawn as black markers).
    old_values: objective values of previous observations.
  """
  x = torch.linspace(float(l_bound), float(h_bound), steps=resolution).numpy()
  std = np.sqrt(predictive_variance)

  fig = go.Figure()

  # Trace order matters: 'tonexty' fills towards the previously added trace,
  # so the band is drawn as upper bound -> mean -> lower bound.
  fig.add_trace(go.Scatter(x=x, y=predictive_mean + std,
                           mode='lines',
                           line=dict(color="#19D3F3", width=0.1),
                           name='upper bound'))
  fig.add_trace(go.Scatter(x=x, y=predictive_mean,
                           mode='lines',
                           line=dict(color="blue"),
                           fill='tonexty',
                           name='predictive mean'))
  fig.add_trace(go.Scatter(x=x, y=predictive_mean - std,
                           mode='lines',
                           line=dict(color="blue", width=0.1),
                           fill='tonexty',
                           name='lower bound'))

  fig.update_layout(title="GP Predictive distribution. Iteration " + str(iteration), xaxis_title="input", yaxis_title="output", showlegend=False)

  if suggested is None:
    # argmax yields a single index even with ties, so the marker position
    # is always one scalar.
    marker_x = float(x[int(torch.argmax(predictive_mean))])
  else:
    marker_x = float(suggested[0][0])
  fig.add_vline(x=marker_x, line_width=3, line_color="red")

  # len() rather than truthiness: old_obs may be a multi-element tensor,
  # whose boolean value is ambiguous.
  if len(old_obs) > 0:
    fig.add_trace(go.Scatter(x=old_obs, y=old_values, mode='markers', marker_color="black", marker_size=10))

  fig.show()

def visualize_functions(single_model, best_init_y, best_candidate, candidate_acq_fun, iteration, previous_observations, previous_values, bounds, best_candidate_normalized):
  """Plot the GP predictive distribution and the acquisition function.

  Both quantities are computed on the [0, 1] grid and drawn on an x axis
  spanning ``bounds[0][0]`` to ``bounds[1][0]``. ``best_candidate`` and
  ``best_candidate_normalized`` are accepted but not used here.
  """
  lower, upper = bounds[0][0], bounds[1][0]

  mean, variance = compute_predictive_distribution(single_model, best_init_y, l_bound=0, h_bound=1)
  print_predictive_mean(
    mean,
    variance,
    iteration,
    l_bound=lower,
    h_bound=upper,
    suggested=candidate_acq_fun,
    old_obs=previous_observations,
    old_values=previous_values,
  )

  ei_values = compute_acquisition_function(single_model, best_init_y, l_bound=0, h_bound=1)
  print_acquisition_function(ei_values, iteration, l_bound=lower, h_bound=upper, suggested=candidate_acq_fun)
  # The objective itself is not plotted: it is very expensive to evaluate and
  # the predictive mean ends up looking just like it. To plot it anyway:
  # print_objective_function(best_candidate, iteration, l_bound=bounds[0][0], h_bound=bounds[1][0])

def get_next_points_and_visualize_norm(init_x, init_y, best_init_y, normalized_bounds, iteration, previous_observations, previous_values, bounds, n_points=1):
  """Fit a GP, maximise Expected Improvement and plot the current state.

  Args:
    init_x: training inputs given to the GP (the caller passes inputs
      normalized with ``bounds``).
    init_y: training targets given to the GP (the caller passes
      standardized values).
    best_init_y: incumbent value used as ``best_f``; also used to locate the
      best observation in ``init_y``.
    normalized_bounds: bounds over which the acquisition is optimised.
    iteration: iteration index forwarded to the plots.
    previous_observations: raw inputs drawn as markers.
    previous_values: raw objective values drawn as markers.
    bounds: bounds of the original input space, used to map values back
      from the normalized space.
    n_points: number of candidates requested from ``optimize_acqf``.

  Returns:
    The candidates returned by ``optimize_acqf`` (in the normalized space).
  """
  single_model = SingleTaskGP(init_x, init_y)
  mll = ExactMarginalLogLikelihood(single_model.likelihood, single_model)
  # NOTE(review): recent BoTorch releases appear to expose this routine as
  # fit_gpytorch_mll - confirm against the installed version.
  fit_gpytorch_model(mll)

  EI = ExpectedImprovement(model=single_model, best_f=best_init_y, maximize=True)

  candidates, _ = optimize_acqf(acq_function=EI, bounds=normalized_bounds, q=n_points, num_restarts=200, raw_samples=512, options={"batch_limit": 5, "maxiter": 200})

  # First training point whose target equals the incumbent value.
  best_rows = (init_y == best_init_y).nonzero(as_tuple=True)[0]
  best_candidate_normalized = init_x[best_rows][0][0]
  # Map back with the ORIGINAL bounds; using normalized_bounds here would be
  # an identity transform and leave the value in the normalized space.
  best_candidate = unnormalize(best_candidate_normalized, bounds=bounds)

  visualize_functions(single_model, best_init_y, best_candidate, unnormalize(candidates, bounds=bounds), iteration, previous_observations, previous_values, bounds, best_candidate_normalized)

  return candidates

We can now embed this code into the BO loop. First three points are drawn at random.

In [None]:
n_iterations = 10

# Three random initial observations; inputs are normalized with `bounds` and
# targets standardized before they are handed to the GP.
init_x, init_y, best_init_y = generate_initial_data(3)
# Bounds are created in double precision to match init_x (torch.double), so
# inputs, bounds and optimizer candidates all share one dtype.
bounds = torch.tensor([[lower_bound], [upper_bound]], dtype=torch.double)
normalized_bounds = torch.tensor([[0.0], [1.0]], dtype=torch.double)
init_x_normalized = normalize(init_x, bounds=bounds)
init_y_standardized = standardize(init_y)
best_init_y_standardized = init_y_standardized.max().item()

# History of the suggestions: normalized inputs and standardized outcomes.
candidates = []
results = []
for i in range(n_iterations):
  print(f"Number of iterations done: {i}")
  normalized_new_candidates = get_next_points_and_visualize_norm(init_x_normalized, init_y_standardized, best_init_y_standardized, normalized_bounds, i, init_x, init_y, bounds, 1)
  # Evaluate the real objective at the suggestion, in the original space.
  new_candidates = unnormalize(normalized_new_candidates, bounds=bounds)
  new_results = torch.tensor([target_function(float(new_candidates))])

  print(f"New candidates are: {new_candidates}")
  # Append the new observation and recompute the transformed data sets.
  init_x = torch.cat([init_x, new_candidates])
  init_y = torch.cat([init_y, new_results])
  init_x_normalized = normalize(init_x, bounds=bounds)
  init_y_standardized = standardize(init_y)

  best_init_y = init_y.max().item()
  best_init_y_standardized = init_y_standardized.max().item()
  print(f"Best point performs this way: {best_init_y}")
  candidates.append(float(normalized_new_candidates[0][0]))
  # Standardizing the single new value on its own is meaningless (one sample
  # has no spread), so record its standardized value relative to all
  # observations: it is the last row of init_y_standardized.
  results.append(float(init_y_standardized[-1][0]))