# Introduction

This project will focus on exploring the capabilities of Bayesian optimization, specifically employing BayBE, in the discovery of novel corrosion inhibitors for materials design. Initially, we will work with a randomly chosen subset from a comprehensive database of electrochemical responses of small organic molecules. Our goal is to assess how Bayesian optimization can speed up the screening process across the design space to identify promising compounds. We will compare different strategies for incorporating alloy information, while optimizing the experimental parameters with respect to the inhibitive performance of the screened compounds.

# Initialization

Loading libraries and data files:

In [1]:
import pandas as pd
import numpy as np
from baybe import Campaign

df_AA2024 = pd.read_excel('../data/filtered_AA2024.xlsx')
df_AA1000 = pd.read_excel('../data/filtered_AA1000.xlsx')
df_Al = pd.read_excel('../data/filtered_Al.xlsx')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np

def random_subsample(df, num_samples):
    np.random.seed(42)
    indices = np.random.choice(df.index, num_samples, replace=False)
    subsampled_df = df.loc[indices]
    return subsampled_df 

In [3]:
df_AA2024.head()

Unnamed: 0,SMILES,Time_h,pH,Inhib_Concentrat_M,Salt_Concentrat_M,Efficiency
0,COCCOC(=O)OCSc1nc2c(s1)cccc2,24.0,4.0,0.001,0.1,0.0
1,COCCOC(=O)OCSc1nc2c(s1)cccc2,24.0,10.0,0.001,0.1,0.0
2,Cc1ccc(c(c1)n1nc2c(n1)cccc2)O,24.0,4.0,0.001,0.1,30.0
3,Cc1ccc(c(c1)n1nc2c(n1)cccc2)O,24.0,10.0,0.001,0.1,30.0
4,Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O,24.0,4.0,0.001,0.1,30.0


In [4]:
unique_smiles = df_AA2024.SMILES.unique()
unique_times = df_AA2024.Time_h.unique()
unique_pH = df_AA2024.pH.unique()
unique_inhib_conc = df_AA2024.Inhib_Concentrat_M.unique()
unique_salt_conc = df_AA2024.Salt_Concentrat_M.unique()

time_min, time_max = df_AA2024.Time_h.min(), df_AA2024.Time_h.max()
pH_min, pH_max = df_AA2024.pH.min(), df_AA2024.pH.max()
inhib_conc_min, inhib_conc_max = df_AA2024.Inhib_Concentrat_M.min(), df_AA2024.Inhib_Concentrat_M.max()
salt_conc_min, salt_conc_max = df_AA2024.Salt_Concentrat_M.min(), df_AA2024.Salt_Concentrat_M.max()
efficiency_min, efficiency_max = df_AA2024.Efficiency.min(), df_AA2024.Efficiency.max()

# Data Processing

# Data Anaylsis

# Bayesian Optimization

In [5]:
from baybe import Campaign

from baybe.targets import NumericalTarget
from baybe.objective import Objective
from baybe.searchspace import SubspaceDiscrete
from baybe.searchspace import SearchSpace
from baybe.parameters import NumericalDiscreteParameter
from baybe.parameters import SubstanceParameter


def list_to_dict(input_list):
    return {item: item for item in input_list}

smiles_dict =list_to_dict(unique_smiles)


target = NumericalTarget(name="Efficiency", mode="MAX", bounds=(efficiency_min, efficiency_max), transformation="LINEAR")
objective = Objective(mode="SINGLE", targets=[target])

parameters = [
NumericalDiscreteParameter(
    name="Time_h",
    values=df_AA2024['Time_h'].unique(),
    # tolerance = 0.004, assume certain experimental noise for each parameter measurement?
),
NumericalDiscreteParameter(
    name="pH",
    values=df_AA2024['pH'].unique(),
    # tolerance = 0.004
    ),  
NumericalDiscreteParameter( # Set this as continuous, the values seem quite small?
    name="Inhib_Concentrat_M",
    values= df_AA2024['Inhib_Concentrat_M'].unique(),
    # tolerance = 0.004
    ),
NumericalDiscreteParameter(
    name="Salt_Concentrat_M",
    values=df_AA2024['Salt_Concentrat_M'].unique(),
    # tolerance = 0.004
    ),
SubstanceParameter(
    name="SMILES",
    data=smiles_dict,
    encoding="MORGAN_FP",  # optional
    decorrelate=0.7,  # optional
    )        
    ]
# searchspace = SubspaceDiscrete.from_product(parameters=parameters)
df_no_target = df_AA2024.drop('Efficiency', axis=1)

searchspace = SearchSpace.from_dataframe(df = df_no_target, parameters=parameters)

campaign = Campaign(
    searchspace=searchspace,  # Required
    objective=objective,  # Required
    # recommender=recommender,  # Optional
)

In [6]:
searchspace

SearchSpace(discrete=SubspaceDiscrete(parameters=[NumericalDiscreteParameter(name='Time_h', encoding=None, _values=[0.5, 1.0, 2.0, 3.0, 6.0, 24.0, 48.0, 72.0, 96.0, 120.0, 144.0, 168.0, 192.0, 240.0, 288.0, 336.0, 360.0, 384.0, 432.0, 480.0, 528.0, 576.0, 600.0, 624.0, 672.0], tolerance=0.0), NumericalDiscreteParameter(name='pH', encoding=None, _values=[0.0, 3.3, 4.0, 4.4, 5.4, 5.5, 5.6, 7.0, 10.0], tolerance=0.0), NumericalDiscreteParameter(name='Inhib_Concentrat_M', encoding=None, _values=[1e-05, 5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0008, 0.001, 0.0012, 0.0018, 0.0024, 0.003, 0.005, 0.01, 0.011, 0.021, 0.022, 0.031, 0.033, 0.042, 0.044, 0.05, 0.1], tolerance=0.0), NumericalDiscreteParameter(name='Salt_Concentrat_M', encoding=None, _values=[0.0, 0.01, 0.05, 0.1, 0.5, 0.6], tolerance=0.0), SubstanceParameter(name='SMILES', data={'COCCOC(=O)OCSc1nc2c(s1)cccc2': 'COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O': 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O', 'Clc1ccc(

In [7]:
from baybe.simulation import simulate_experiment

lookup=df_AA2024
BATCH_SIZE = 1
N_DOE_ITERATIONS = 5
N_MC_ITERATIONS = 5

results = simulate_experiment(
    campaign,
    df_AA2024,
    batch_size=BATCH_SIZE,
    n_doe_iterations=N_DOE_ITERATIONS,
    impute_mode="best",
)


  stdvs = Y.std(dim=-2, keepdim=True)
  Ymean, Ystd = torch.mean(Y, dim=-2), torch.std(Y, dim=-2)
The lookup rows with indexes [297 300 303 306] seem to be duplicates regarding parameter values. Choosing a random one.
The lookup rows with indexes [297 300 303 306] seem to be duplicates regarding parameter values. Choosing a random one.
The lookup rows with indexes [297 300 303 306] seem to be duplicates regarding parameter values. Choosing a random one.
The lookup rows with indexes [297 300 303 306] seem to be duplicates regarding parameter values. Choosing a random one.
Input row with index 297 has multiple matches with the search space. This could indicate that something went wrong. Matching only first occurrence.
Input row with index 300 has multiple matches with the search space. This could indicate that something went wrong. Matching only first occurrence.
Input row with index 303 has multiple matches with the search space. This could indicate that something went wrong. Matching o

In [8]:
results

Unnamed: 0,Iteration,Num_Experiments,Efficiency_Measurements,Efficiency_IterBest,Efficiency_CumBest
0,0,1,[60.35],60.35,60.35
1,1,5,"[40.0, 73.0, 40.0, 43.0]",73.0,73.0
2,2,6,[78.26],78.26,78.26
3,3,7,[89.68],89.68,89.68
4,4,8,[50.0],50.0,89.68


In [9]:
df_AA2024.Efficiency.max()

100.0

In [None]:
results = simulate_scenarios(
    # Necessary
    campaign=campaign,
    # Technically optional but should always be set
    lookup=lookup,
    # Optional
    batch_size=batch_size,
    n_doe_iterations=n_doe_iterations,
    initial_data=initial_data,
    random_seed=random_seed,
    impute_mode=impute_mode,
    noise_percent=noise_percent,
)

## Search Space

## Target & Objective

In [None]:
from baybe.targets import NumericalTarget
from baybe.objective import Objective

target = NumericalTarget(
    name="Efficiency",
    mode="MAX",
)
objective = Objective(mode="SINGLE", targets=[target])

## Recommender

# Benchmarking

# Transfer Learning