# Simulation Mode

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from baybe import Campaign
from baybe.objective import Objective
from baybe.parameters import NumericalDiscreteParameter, SubstanceParameter
from baybe.recommenders import RandomRecommender, TwoPhaseMetaRecommender
from baybe.searchspace import SearchSpace
from baybe.simulation import simulate_scenarios
from baybe.targets import NumericalTarget



In [2]:
SMOKE_TEST = "SMOKE_TEST" in os.environ
N_MC_ITERATIONS = 2 if SMOKE_TEST else 5
N_DOE_ITERATIONS = 2 if SMOKE_TEST else 5
BATCH_SIZE = 1 if SMOKE_TEST else 3

In [3]:
lookup = pd.read_excel('data/filtered_AA2024.xlsx')

In [4]:
unique_SMILES = lookup.SMILES.unique()

def list_to_dict(input_list):
    return {item: item for item in input_list}

smiles_dict =list_to_dict(unique_SMILES)

In [5]:
from baybe.parameters import NumericalDiscreteParameter, NumericalContinuousParameter
from baybe.parameters import SubstanceParameter
from baybe.searchspace import SearchSpace

encoding_choice = ["MORDRED", "RDKIT", "MORGAN_FP"]

parameters = [ 
NumericalDiscreteParameter(
    name="Time_h",
    values = lookup['Time_h'].unique()
    # tolerance = 0.004, assume certain experimental noise for each parameter measurement?
),
NumericalDiscreteParameter(
    name="pH",
    values = lookup['pH'].unique()
    # tolerance = 0.004
    ),  
NumericalDiscreteParameter( # Set this as continuous, the values seem quite small?
    name="Inhib_Concentrat_M",
    values=lookup['Inhib_Concentrat_M'].unique(), # Remove data outliers like 0.1?
    # tolerance = 0.004
    ),
NumericalDiscreteParameter(
    name="Salt_Concentrat_M",
    values=lookup['Salt_Concentrat_M'].unique(),
    # tolerance = 0.004
    ),
SubstanceParameter(
    name="SMILES",
    data=smiles_dict,
    encoding="MORGAN_FP", # Which is better?
    decorrelate=0.7,  # Change threshold to avoid overfitting?
    )
]

In [6]:
searchspace = SearchSpace.from_product(parameters)

MemoryError: Unable to allocate 11.9 GiB for an array with shape (384, 4151250) and data type int64

In [None]:
from baybe.targets import NumericalTarget
from baybe.objective import Objective

target = NumericalTarget(
    name="Efficiency",
    mode="MAX",
)
objective = Objective(mode="SINGLE", targets=[target])

In [None]:
campaign = Campaign(searchspace=searchspace, objective=objective)
campaign_rand = Campaign(
    searchspace=searchspace,
    recommender=TwoPhaseMetaRecommender(recommender=RandomRecommender()),
    objective=objective,
)

In [None]:
scenarios = {"Test_Scenario": campaign, "Random": campaign_rand}

In [None]:
results = simulate_scenarios(
    scenarios,
    lookup,
    batch_size=BATCH_SIZE,
    n_doe_iterations=N_DOE_ITERATIONS,
    n_mc_iterations=N_MC_ITERATIONS,
    impute_mode="best",
)

In [None]:
""" max_yield = lookup["yield"].max()
sns.lineplot(
    data=results, x="Num_Experiments", y="yield_CumBest", hue="Scenario", marker="x"
)
plt.plot([3, 3 * N_DOE_ITERATIONS], [max_yield, max_yield], "--r")
plt.legend(loc="lower right")
plt.gcf().set_size_inches(20, 8)
plt.savefig("./run_impute_mode.png")
"""

######################

# Introduction

This project will focus on exploring the capabilities of Bayesian optimization, specifically employing BayBE, in the discovery of novel corrosion inhibitors for materials design. Initially, we will work with a randomly chosen subset from a comprehensive database of electrochemical responses of small organic molecules. Our goal is to assess how Bayesian optimization can speed up the screening process across the design space to identify promising compounds. We will compare different strategies for incorporating alloy information, while optimizing the experimental parameters with respect to the inhibitive performance of the screened compounds.

# Initizalization

Loading libraries and data files:

In [None]:
import pandas as pd
import numpy as np

df_AA2024 = pd.read_excel('data/filtered_AA2024.xlsx')
# df_AA1000 = pd.read_excel('data/filtered_AA1000.xlsx')
# df_Al = pd.read_excel('data/filtered_Al.xlsx')

In [None]:
print(df_AA2024.describe())

In [None]:
print(df_AA2024.head())

# Data Processing

### Extract all unique SMILES values into dictionary

In [None]:
unique_SMILES = df_AA2024.SMILES.unique()

def list_to_dict(input_list):
    return {item: item for item in input_list}

smiles_dict =list_to_dict(unique_SMILES)

# Bayesian Optimization

## Search Space

In [None]:
from baybe.parameters import NumericalDiscreteParameter, NumericalContinuousParameter
from baybe.parameters import SubstanceParameter
from baybe.searchspace import SearchSpace

encoding_choice = ["MORDRED", "RDKIT", "MORGAN_FP"]

parameters = [ 
NumericalDiscreteParameter(
    name="Time_h",
    values = df_AA2024['Time_h'].unique()
    # tolerance = 0.004, assume certain experimental noise for each parameter measurement?
),
NumericalDiscreteParameter(
    name="pH",
    values = df_AA2024['pH'].unique()
    # tolerance = 0.004
    ),  
NumericalDiscreteParameter( # Set this as continuous, the values seem quite small?
    name="Inhib_Concentrat_M",
    values=df_AA2024['Inhib_Concentrat_M'].unique(), # Remove data outliers like 0.1?
    # tolerance = 0.004
    ),
NumericalDiscreteParameter(
    name="Salt_Concentrat_M",
    values=df_AA2024['Salt_Concentrat_M'].unique(),
    # tolerance = 0.004
    ),
SubstanceParameter(
    name="SMILES",
    data=smiles_dict,
    encoding="MORGAN_FP", # Which is better?
    decorrelate=0.7,  # Change threshold to avoid overfitting?
    )
]

These calculations will typically result in 500 to 1500 numbers per molecule. **To avoid detrimental effects on the surrogate model fit, we reduce the number of descriptors via decorrelation before using them.** For instance, the decorrelate option in the example above specifies that only descriptors with a correlation lower than 0.7 to any other descriptor will be kept. This usually reduces the number of descriptors to 10-50, depending on the specific items in data.

### Custom descriptors

In [None]:
"""
The encoding concept introduced above is generalized by the CustomParameter. Here, the user is expected to provide their own descriptors for the encoding.

Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the SubstanceParameter. 
Still, one could provide experimental measurements or common metrics used to classify polymers:
from baybe.parameters import CustomDiscreteParameter

# Create or import new dataframe containing custom descriptors

descriptors = pd.DataFrame(
    {
        "Glass_Transition_TempC": [20, -71, -39],
        "Weight_kDalton": [120, 32, 241],
    },
    index=["Polymer A", "Polymer B", "Polymer C"],  # put labels in the index
)

CustomDiscreteParameter(
    name="Polymer",
    data=descriptors,
    decorrelate=True,  # optional, uses default correlation threshold = 0.7?
)
""" 

In [None]:
searchspace = SearchSpace.from_product(parameters)

In [None]:
print(searchspace)

## Objective

In [None]:
from baybe.targets import NumericalTarget
from baybe.objective import Objective

target = NumericalTarget(
    name="Efficiency",
    mode="MAX",
)
objective = Objective(mode="SINGLE", targets=[target])

## Recommender

In [None]:
from baybe.recommenders import RandomRecommender, SequentialGreedyRecommender
from baybe.surrogates import GaussianProcessSurrogate

available_surr_models = [
    "GaussianProcessSurrogate", 
    "BayesianLinearSurrogate",
    "MeanPredictionSurrogate",
    "NGBoostSurrogate",
    "RandomForestSurrogate"
]

available_acq_functions = [
    "qPI",  # q-Probability Of Improvement
    "qEI",  # q-Expected Improvement
    "qUCB", # q-upper confidence bound with beta of 1.0
]

# Defaults anyway
SURROGATE_MODEL = GaussianProcessSurrogate()
ACQ_FUNCTION = "qEI" # q-Expected Improvement, only q-fuctions are available for batch_size > 1

seq_greedy_recommender = SequentialGreedyRecommender(
        surrogate_model=SURROGATE_MODEL,
        acquisition_function_cls=ACQ_FUNCTION,
        hybrid_sampler="Farthest", # find more details in the documentation
        sampling_percentage=0.3, # should be relatively low
        allow_repeated_recommendations=False,
        allow_recommending_already_measured=False,
    )

# Campaign

In [None]:
from baybe.strategies import TwoPhaseStrategy
from baybe import Campaign

strategy = TwoPhaseStrategy(
    initial_recommender = RandomRecommender(),  # Initial recommender
    # Doesn't matter since I already have training data, BUT CAN BE USED FOR BENCHMARKING
    recommender = seq_greedy_recommender,  # Bayesian model-based optimization
    switch_after=1  # Switch to the model-based recommender after 1 batches = immediately
)

campaign = Campaign(searchspace, objective, strategy)

In [None]:
print(campaign)

### Get recommendations

In [None]:
new_rec = campaign.recommend(batch_size=1) # TEST with different batch sizes for optimal performance
print("\n\nRecommended experiments: ")
print(new_rec.to_markdown())

In [None]:
# Get and input efficiency value from Excel table, for specific SMILES component first, 
# then for the closest values of the rest of the parameters

new_rec["Efficiency"] = [0.1]
campaign.add_measurements(new_rec)

print("\n\nRecommended experiments with measured values: ")
print(new_rec.to_markdown())

In [None]:
new_new_rec = campaign.recommend(batch_size=1) # TEST with different batch sizes for optimal performance
print("\n\nRecommended experiments: ")
print(new_new_rec.to_markdown())

In [None]:
print(campaign)

### Merge all results into a dataframe

In [None]:
results = pd.concat([new_rec, new_new_rec]) # etc.
print("\n\nAll experiments with measured values: ")
print(results.to_markdown())

# Benchmarking

# Transfer Learning

https://emdgroup.github.io/baybe/examples/Transfer_Learning/basic_transfer_learning.html

https://emdgroup.github.io/baybe/userguide/transfer_learning.html

https://emdgroup.github.io/baybe/userguide/simulation.html

https://emdgroup.github.io/baybe/examples/Backtesting/impute_mode.html