In [None]:
## Example for full simulation loop using a table-based lookup mechanism

# This example shows a simulation for a direct arylation where all combinations have been measured.
# This allows us to access information about previously conducted experiments from .xlsx-files.

# This example assumes some basic familiarity with using BayBE.
# We thus refer to [`campaign`](./../Basics/campaign.md) for a basic example.

### Necessary imports for this example

import os
import warnings
import sys
from pathlib import Path

import pandas as pd
import seaborn as sns

from baybe import Campaign
from baybe.objective import Objective
from baybe.parameters import (
    CategoricalParameter,
    NumericalDiscreteParameter,
    SubstanceParameter,
    TaskParameter,
)
from baybe.recommenders import RandomRecommender
from baybe.searchspace import SearchSpace
from baybe.simulation import simulate_scenarios
from baybe.targets import NumericalTarget
from baybe.utils.plotting import create_example_plots

### Parameters for a full simulation loop

# For the full simulation, we need to define some additional parameters.
# These are the number of DOE iterations per run, the number of Monte Carlo runs, and the batch size per iteration.

N_DOE_ITERATIONS = 10  # passed to the simulation as `n_doe_iterations`
N_MC_ITERATIONS = 15  # passed to the simulation as `n_mc_iterations`
BATCH_SIZE = 2  # passed to the simulation as `batch_size`

# The data and results folders are resolved relative to the current working
# directory, which therefore has to be the "scripts" directory.
script_path = Path.cwd()
if script_path.name != "scripts":
    # Abort with the message attached to the exception itself: this produces a
    # non-zero exit status and cannot be hidden by an active warning filter.
    raise SystemExit(
        "Please run this script from the 'scripts' directory to ensure that the data is loaded correctly."
    )

results_dir = script_path.parent / "results"
data_dir = script_path.parent / "data"
if not data_dir.exists():
    raise FileNotFoundError(
        f"Data directory {data_dir} not found. Please make sure to run this script from the 'scripts' directory."
    )
# Create the results folder on first use; fails early if the path exists but
# is not a directory.
results_dir.mkdir(exist_ok=True)
print(f"Current directory is {script_path}.")
print(f"Data is loaded from {data_dir}.")
print(f"Results will be saved in {results_dir}.")

# Silence all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings of the libraries used below - consider narrowing it.
warnings.filterwarnings('ignore')


In [None]:

### Lookup functionality and data creation

# We read the information about the conducted experiments from a .xlsx-file.
# This data set was obtained from [Shields, B.J., Stevens et al. Nature 590, 89–96 (2021)](https://doi.org/10.1038/s41586-021-03213-y) and contains measurements of a reaction yield,
# varying typical reaction conditions.
# The lookup file is searched for in the data directory first and, as a fallback, in the current working directory.

# Try the data directory first and fall back to the current working directory.
lookup_candidates = [data_dir / "lookup.xlsx", "./lookup.xlsx"]
for lookup_path in lookup_candidates:
    try:
        lookup = pd.read_excel(lookup_path)
        break
    except FileNotFoundError:
        continue
else:
    # Fail here with a clear message instead of continuing without `lookup`,
    # which would only surface later as an unrelated NameError.
    raise FileNotFoundError(
        "Could not find 'lookup.xlsx' in any of: "
        + ", ".join(str(candidate) for candidate in lookup_candidates)
    )

# Show the column types before and after the conversion, then a preview.
print(lookup.dtypes)
# The temperature is compared against string labels further below, so it is
# stored as strings rather than numbers.
lookup["Temp_C"] = lookup["Temp_C"].astype(str)
print(lookup.dtypes)
print(lookup.head())


In [None]:

# Substance tables: each dictionary maps a substance label to its SMILES string.
# They are passed as `data` to the `SubstanceParameter`s of the search space.
# NOTE(review): the labels presumably have to match the entries of the lookup
# table exactly - verify before "correcting" any spelling (e.g. "Butyornitrile").

# Solvents
dict_solvent = {
    "DMAc": r"CC(N(C)C)=O",
    "Butyornitrile": r"CCCC#N",
    "Butyl Ester": r"CCCCOC(C)=O",
    "p-Xylene": r"CC1=CC=C(C)C=C1",
}
# Bases
dict_base = {
    "Potassium acetate": r"O=C([O-])C.[K+]",
    "Potassium pivalate": r"O=C([O-])C(C)(C)C.[K+]",
    "Cesium acetate": r"O=C([O-])C.[Cs+]",
    "Cesium pivalate": r"O=C([O-])C(C)(C)C.[Cs+]",
}
# Ligands (long strings are split across adjacent literals, which Python
# concatenates into a single string)
dict_ligand = {
    "BrettPhos": r"CC(C)C1=CC(C(C)C)=C(C(C(C)C)=C1)C2=C(P(C3CCCCC3)C4CCCCC4)C(OC)="
    "CC=C2OC",
    "Di-tert-butylphenylphosphine": r"CC(C)(C)P(C1=CC=CC=C1)C(C)(C)C",
    "(t-Bu)PhCPhos": r"CN(C)C1=CC=CC(N(C)C)=C1C2=CC=CC=C2P(C(C)(C)C)C3=CC=CC=C3",
    "Tricyclohexylphosphine": r"P(C1CCCCC1)(C2CCCCC2)C3CCCCC3",
    "PPh3": r"P(C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3",
    "XPhos": r"CC(C1=C(C2=CC=CC=C2P(C3CCCCC3)C4CCCCC4)C(C(C)C)=CC(C(C)C)=C1)C",
    "P(2-furyl)3": r"P(C1=CC=CO1)(C2=CC=CO2)C3=CC=CO3",
    "Methyldiphenylphosphine": r"CP(C1=CC=CC=C1)C2=CC=CC=C2",
    "1268824-69-6": r"CC(OC1=C(P(C2CCCCC2)C3CCCCC3)C(OC(C)C)=CC=C1)C",
    "JackiePhos": r"FC(F)(F)C1=CC(P(C2=C(C3=C(C(C)C)C=C(C(C)C)C=C3C(C)C)C(OC)=CC=C2OC)"
    r"C4=CC(C(F)(F)F)=CC(C(F)(F)F)=C4)=CC(C(F)(F)F)=C1",
    "SCHEMBL15068049": r"C[C@]1(O2)O[C@](C[C@]2(C)P3C4=CC=CC=C4)(C)O[C@]3(C)C1",
    "Me2PPh": r"CP(C)C1=CC=CC=C1",
}

### Objective and substance encoding
# Single-target objective: the "yield" target is to be maximized.
yield_target = NumericalTarget(name="yield", mode="MAX")
objective = Objective(mode="SINGLE", targets=[yield_target])
# Encoding handed to every SubstanceParameter of the search space.
encoding = "RDKIT"

In [None]:

### Transfer learning simulation
# For every temperature, a campaign is simulated in which that temperature is the
# only active task. Measurements taken at the other temperatures are supplied as
# initial data in varying fractions and compared against a baseline without any.

temperatures = ["90", "105", "120"]
sample_fractions = [0.01, 0.1, 0.5]

for temp in temperatures:
    print(f"\n\nTemperature: {temp}")
    excluded_temps = [t for t in temperatures if t != temp]
    print(f"Taking additional data from {excluded_temps} into account.\n")
    campaign = Campaign(
        searchspace=SearchSpace.from_product(
            parameters=[
                SubstanceParameter(name="Solvent", data=dict_solvent, encoding=encoding),
                SubstanceParameter(name="Base", data=dict_base, encoding=encoding),
                SubstanceParameter(name="Ligand", data=dict_ligand, encoding=encoding),

                ### FOR TRANSFER LEARNING ###
                # NOTE - old definition: NumericalDiscreteParameter(name="Temp_C", values=[90, 105, 120], tolerance=2),
                # The temperature acts as the task: all temperatures are known
                # values, but only the current one is active.
                TaskParameter(
                    name="Temp_C",
                    values=temperatures,
                    active_values=[temp],
                ),
                #############################

                NumericalDiscreteParameter(
                    name="Concentration", values=[0.057, 0.1, 0.153]
                ),
            ]
        ),
        objective=objective,
    )

    # Training pool: all measurements except those at the current temperature.
    lookup_other_data = lookup[lookup["Temp_C"] != temp].copy(deep=True)
    # Write the training pool to a file for inspection.
    path_to_lookup = results_dir / f".lookup_{temp}.csv"
    lookup_other_data.to_csv(path_to_lookup, index=False)

    # Simulate a full optimization loop for every fraction of initial data.
    fraction_results: list[pd.DataFrame] = []
    for p in sample_fractions:
        print("Fraction of data used: ", p)
        # One sample of the training pool per Monte Carlo run. The run index
        # seeds the sampling so that repeated executions draw the same subsets.
        initial_data = [
            lookup_other_data.sample(frac=p, random_state=mc_run)
            for mc_run in range(N_MC_ITERATIONS)
        ]
        # NOTE(review): both `initial_data` and `n_mc_iterations` are passed here;
        # confirm how the installed BayBE version combines them (it may run every
        # initial data set `n_mc_iterations` times).
        result_fraction = simulate_scenarios(
            {f"{int(100*p)}": campaign},
            lookup,
            initial_data=initial_data,
            batch_size=BATCH_SIZE,
            n_doe_iterations=N_DOE_ITERATIONS,
            n_mc_iterations=N_MC_ITERATIONS,
        )
        fraction_results.append(result_fraction)

    # Baseline: the same campaign without any initial data.
    print("Fraction of data used: 0.0")
    result_baseline = simulate_scenarios(
        {"0": campaign},
        lookup,
        batch_size=BATCH_SIZE,
        n_doe_iterations=N_DOE_ITERATIONS,
        n_mc_iterations=N_MC_ITERATIONS,
    )
    # Combine all scenarios and rename the column for more reasonable plotting.
    results = pd.concat([result_baseline, *fraction_results]).rename(
        columns={"Scenario": "% of data used"}
    )

    # First, write the results to a file.
    path_to_results = results_dir / f".results_{temp}.csv"
    results.to_csv(path_to_results, index=False)

    # Let's visualize the results. As you can see, the amount of initial data used has a significant impact on the performance.
    # NOTE(review): `sns.lineplot` draws on the current matplotlib axes; confirm
    # that `create_example_plots` closes the figure, otherwise the curves of
    # previous temperatures accumulate in later plots.
    ax = sns.lineplot(
        data=results,
        marker="o",
        markersize=10,
        x="Num_Experiments",
        y="yield_CumBest",
        hue="% of data used",
    )
    create_example_plots(
        ax=ax,
        path=results_dir,
        base_name=f"transfer_learning_{temp}_bq{BATCH_SIZE}_ndi{N_DOE_ITERATIONS}_mc{N_MC_ITERATIONS}",
    )