# Using cluster centers instead of full initial data

In [49]:
import sys
import warnings
from pathlib import Path

import pandas as pd
import seaborn as sns
from botorch.test_functions.synthetic import Hartmann
from sklearn_extra.cluster import KMedoids

from baybe import Campaign
from baybe.objective import Objective
from baybe.parameters import NumericalContinuousParameter, TaskParameter
from baybe.searchspace import SearchSpace
from baybe.simulation import simulate_scenarios
from baybe.targets import NumericalTarget
from baybe.utils.plotting import create_example_plots

warnings.filterwarnings('ignore')

## Settings

The following settings are used to set up the problem:

In [50]:
# Problem size and simulation budget used throughout the notebook.
DIMENSION: int = 6  # number of inputs of the test function
BATCH_SIZE: int = 1  # recommendations requested in each DOE iteration
N_MC_ITERATIONS: int = 30  # Monte Carlo repetitions
N_DOE_ITERATIONS: int = 25  # DOE iterations per simulated run

In [51]:
# Single-target objective: minimize the quantity named "Target".
objective = Objective(
    targets=[NumericalTarget(mode="MIN", name="Target")],
    mode="SINGLE",
)

# The test function dictates the bounds of the search space.
BOUNDS = Hartmann(dim=DIMENSION).bounds

# One continuous parameter per input dimension; each row of the transposed
# bounds is unpacked into the (lower, upper) pair of one dimension.
params = [
    NumericalContinuousParameter(name=f"x{index}", bounds=(low, high))
    for index, (low, high) in enumerate(BOUNDS.T)
]

Next, we define a `TaskParameter` to encode the task context,
which allows the model to establish a relationship between the training data and
the data collected during the optimization process.
Because we want to obtain recommendations only for the test function, we explicitly
pass the `active_values` keyword.

In [52]:
# Task parameter labelling which function a data point belongs to.
# Only "Test_Function" is listed in `active_values`, so that recommendations
# are obtained for the test function only.
task_param = TaskParameter(
    name="Function",
    values=["Test_Function", "Training_Function"],
    active_values=["Test_Function"],
)

With the parameters at hand, we can now create our search space.


In [53]:

# Combine the continuous inputs with the task parameter and build the
# search space from the resulting parameter list.
parameters = params + [task_param]
searchspace = SearchSpace.from_product(parameters=parameters)

## Defining the Tasks

To demonstrate the transfer learning mechanism, we consider the problem of optimizing
the Hartmann function using training data from a slightly altered version of it.
The model is, of course, not aware of this relationship and needs to infer
it from the data gathered during the optimization process.

NOTE: Due to a bug in the code, we need to adjust the `botorch_function_wrapper` function to ignore the `TaskParameter`. This is intended to be fixed in a future version of BayBE.

In [54]:
def botorch_function_wrapper(test_function):
    """Turn a BoTorch-style test function into a callable on plain values.

    Args:
        test_function: Object exposing a ``forward`` method that accepts a
            tensor of inputs.

    Returns:
        A callable ``wrapper(*x)`` that ignores string-valued arguments
        (the task label), evaluates ``test_function.forward`` on the
        remaining values and returns the result as a ``float``.
    """

    def wrapper(*x) -> float:
        from torch import Tensor

        # Drop the task label regardless of its position in the argument
        # list; only the remaining values are cast to a tensor.
        numeric_inputs = [value for value in x if not isinstance(value, str)]
        result = test_function.forward(Tensor(numeric_inputs))
        # We do not need to return a tuple here.
        return float(result)

    return wrapper

# Once the bug is fixed, the existing wrapper can be used instead by uncommenting the following line.
# from baybe.utils.botorch_wrapper import botorch_function_wrapper

In [55]:
def shifted_hartmann(*x: float) -> float:
    """Evaluate a scaled and shifted variant of the Hartmann function.

    The inputs are evaluated through ``botorch_function_wrapper`` on a
    ``Hartmann`` instance of dimension ``DIMENSION``; the result is multiplied
    by 2.5 and offset by 3.25. No noise argument is passed to ``Hartmann``
    here.

    Args:
        *x: The input values of the function.

    Returns:
        The scaled and shifted function value as a float.
    """
    hartmann = Hartmann(dim=DIMENSION)
    # Unpack the arguments so that the wrapper receives the individual values
    # (and can apply its task-label handling) instead of one nested tuple.
    return 2.5 * botorch_function_wrapper(hartmann)(*x) + 3.25


# Lookup table from task label to the callable that evaluates that task.
# The keys match the values declared for the "Function" task parameter.
test_functions = {
    "Test_Function": botorch_function_wrapper(Hartmann(dim=DIMENSION)),
    "Training_Function": shifted_hartmann,
}

## Simulation Loop

We now simulate campaigns for different amounts of training data unveiled,
to show the impact of transfer learning on the optimization performance.
To average out and reduce statistical effects that might happen due to the random
sampling of the provided data, we perform several Monte Carlo runs.

The output of the following code is deleted to improve readability.

In [None]:
def _with_training_labels(frame: pd.DataFrame) -> pd.DataFrame:
    """Add the training-function target and task label to ``frame`` in place.

    Args:
        frame: Data frame whose rows are evaluated with the training function.

    Returns:
        The same frame, extended by the "Target" and "Function" columns.
    """
    frame["Target"] = frame.apply(test_functions["Training_Function"], axis=1)
    frame["Function"] = "Training_Function"
    return frame


# Output location for the plots; it does not depend on the loop variables.
path = Path(sys.path[0])

for n in (30, 50, 100, 250):
    results: list[pd.DataFrame] = []
    # Draw one random sample of n points per Monte Carlo run. The same samples
    # feed both the clustered scenarios and the full-data baseline.
    sampled_data = [
        searchspace.continuous.samples_random(n_points=n)
        for _ in range(N_MC_ITERATIONS)
    ]

    for num_clusters in (2, 5, 10, 15, 25):
        # Reduce every sample to its cluster centers and use those as the
        # initial training data of the corresponding Monte Carlo run.
        initial_data = []
        for data in sampled_data:
            kmedoids = KMedoids(n_clusters=num_clusters).fit(data)
            centers = pd.DataFrame(
                data=kmedoids.cluster_centers_,
                # Reuse the sampled column names instead of hard-coding them,
                # so the labels always match the clustered features.
                columns=data.columns,
            )
            initial_data.append(_with_training_labels(centers))

        campaign = Campaign(searchspace=searchspace, objective=objective)
        result_clustered = simulate_scenarios(
            {f"{num_clusters}": campaign},
            test_functions["Test_Function"],
            initial_data=initial_data,
            batch_size=BATCH_SIZE,
            n_doe_iterations=N_DOE_ITERATIONS,
        )
        results.append(result_clustered)

    # Provide a baseline by using all of the sampled data. The samples are
    # labelled in place only now, after clustering has used the raw inputs.
    campaign = Campaign(searchspace=searchspace, objective=objective)
    for data in sampled_data:
        _with_training_labels(data)
    result_baseline = simulate_scenarios(
        {"Baseline": campaign},
        test_functions["Test_Function"],
        initial_data=sampled_data,
        batch_size=BATCH_SIZE,
        n_doe_iterations=N_DOE_ITERATIONS,
    )
    results.append(result_baseline)

    # Keep the list of per-scenario frames and the combined frame under
    # separate names so that each name has a single type.
    combined = pd.concat(results).rename(columns={"Scenario": "Num. clusters"})

    # NOTE(review): `sns.lineplot` is called without an explicit `ax`; confirm
    # that `create_example_plots` closes the figure so that curves from a
    # previous value of `n` do not carry over into the next plot.
    ax = sns.lineplot(
        data=combined,
        marker="o",
        markersize=10,
        x="Num_Experiments",
        y="Target_CumBest",
        hue="Num. clusters",
    )
    create_example_plots(
        ax=ax,
        path=path,
        base_name=f"cluster_experiments_{n}",
    )

## Using 30 initial points

![image.png](results/cluster_experiments_30_dark.svg)

## Using 50 initial points

![image.png](results/cluster_experiments_50_dark.svg)

## Using 100 initial points

![image.png](results/cluster_experiments_100_dark.svg)

## Using 250 initial points

![image.png](results/cluster_experiments_250_fallback.svg)

## Interpretation

The key result of this study is that choosing clusters instead of providing the full available data definitely has an influence on the performance and might be worth a more in-depth investigation.
Interestingly, the baseline seems to perform worst in our experiments, although it has the maximum number of points available. This could have several reasons: the chosen function might not be well suited for this exercise, or the relatively large number of available points might distract the optimizer. It is also possible that the baseline simply needs more iterations to "catch up" with and overtake the experiments using clusters.