# Introduction

This project will focus on exploring the capabilities of Bayesian optimization, specifically employing BayBE, in the discovery of novel corrosion inhibitors for materials design. Initially, we will work with a randomly chosen subset from a comprehensive database of electrochemical responses of small organic molecules. Our goal is to assess how Bayesian optimization can speed up the screening process across the design space to identify promising compounds. We will compare different strategies for incorporating alloy information, while optimizing the experimental parameters with respect to the inhibitive performance of the screened compounds.

# Initizalization

Loading libraries and data files:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from baybe import Campaign
from baybe.objective import Objective
from baybe.parameters import NumericalDiscreteParameter, SubstanceParameter
from baybe.recommenders import RandomRecommender, TwoPhaseMetaRecommender
from baybe.searchspace import SearchSpace
from baybe.simulation import simulate_scenarios
from baybe.targets import NumericalTarget

df_AA2024 = pd.read_excel('data/filtered_AA2024.xlsx')
df_AA1000 = pd.read_excel('data/filtered_AA1000.xlsx')
df_Al = pd.read_excel('data/filtered_Al.xlsx')

df_active = df_AA2024

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SMOKE_TEST = "SMOKE_TEST" in os.environ

N_MC_ITERATIONS = 2 if SMOKE_TEST else 5
N_DOE_ITERATIONS = 2 if SMOKE_TEST else 5
BATCH_SIZE = 1 if SMOKE_TEST else 3

In [4]:
lookup = df_active

In [5]:
# chemical space dictionary
unique_SMILES = df_active.SMILES.unique()

def list_to_dict(input_list):
    return {item: item for item in input_list}

smiles_dict =list_to_dict(unique_SMILES)

In [6]:
# parameters
parameters = [
NumericalDiscreteParameter(
    name="Time_h",
    values=df_active['Time_h'].unique(),
    # tolerance = 0.004, assume certain experimental noise for each parameter measurement?
),
NumericalDiscreteParameter(
    name="pH",
    values=df_active['pH'].unique(),
    # tolerance = 0.004
    ),  
NumericalDiscreteParameter( # Set this as continuous, the values seem quite small?
    name="Inhib_Concentrat_M",
    values= df_active['Inhib_Concentrat_M'].unique(),
    # tolerance = 0.004
    ),
NumericalDiscreteParameter(
    name="Salt_Concentrat_M",
    values=df_active['Salt_Concentrat_M'].unique(),
    # tolerance = 0.004
    ),
SubstanceParameter(
    name="SMILES",
    data=smiles_dict,
    encoding="MORDRED",  # optional
    decorrelate=0.7,  # optional
    )        
    ]
parameters

[NumericalDiscreteParameter(name='Time_h', encoding=None, _values=[0.5, 1.0, 2.0, 3.0, 6.0, 24.0, 48.0, 72.0, 96.0, 120.0, 144.0, 168.0, 192.0, 240.0, 288.0, 336.0, 360.0, 384.0, 432.0, 480.0, 528.0, 576.0, 600.0, 624.0, 672.0], tolerance=0.0),
 NumericalDiscreteParameter(name='pH', encoding=None, _values=[0.0, 3.3, 4.0, 4.4, 5.4, 5.5, 5.6, 7.0, 10.0], tolerance=0.0),
 NumericalDiscreteParameter(name='Inhib_Concentrat_M', encoding=None, _values=[1e-05, 5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0008, 0.001, 0.0012, 0.0018, 0.0024, 0.003, 0.005, 0.01, 0.011, 0.021, 0.022, 0.031, 0.033, 0.042, 0.044, 0.05, 0.1], tolerance=0.0),
 NumericalDiscreteParameter(name='Salt_Concentrat_M', encoding=None, _values=[0.0, 0.01, 0.05, 0.1, 0.5, 0.6], tolerance=0.0),
 SubstanceParameter(name='SMILES', data={'COCCOC(=O)OCSc1nc2c(s1)cccc2': 'COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O': 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O', 'Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O': 'Clc1ccc(cc

In [7]:
df_no_target = lookup.drop('Efficiency', axis=1)

searchspace = SearchSpace.from_dataframe(df = df_no_target, parameters=parameters)
print('Print test 1')
objective = Objective(
    mode="SINGLE", targets=[NumericalTarget(name="Efficiency", mode="MAX")]
)

Print test 1


In [8]:
print('Print test 2')

Print test 2


In [9]:
searchspace

SearchSpace(discrete=SubspaceDiscrete(parameters=[NumericalDiscreteParameter(name='Time_h', encoding=None, _values=[0.5, 1.0, 2.0, 3.0, 6.0, 24.0, 48.0, 72.0, 96.0, 120.0, 144.0, 168.0, 192.0, 240.0, 288.0, 336.0, 360.0, 384.0, 432.0, 480.0, 528.0, 576.0, 600.0, 624.0, 672.0], tolerance=0.0), NumericalDiscreteParameter(name='pH', encoding=None, _values=[0.0, 3.3, 4.0, 4.4, 5.4, 5.5, 5.6, 7.0, 10.0], tolerance=0.0), NumericalDiscreteParameter(name='Inhib_Concentrat_M', encoding=None, _values=[1e-05, 5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0008, 0.001, 0.0012, 0.0018, 0.0024, 0.003, 0.005, 0.01, 0.011, 0.021, 0.022, 0.031, 0.033, 0.042, 0.044, 0.05, 0.1], tolerance=0.0), NumericalDiscreteParameter(name='Salt_Concentrat_M', encoding=None, _values=[0.0, 0.01, 0.05, 0.1, 0.5, 0.6], tolerance=0.0), SubstanceParameter(name='SMILES', data={'COCCOC(=O)OCSc1nc2c(s1)cccc2': 'COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O': 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O', 'Clc1ccc(

In [10]:
campaign = Campaign(searchspace=searchspace, objective=objective)
campaign_rand = Campaign(
    searchspace=searchspace,
    recommender=TwoPhaseMetaRecommender(recommender=RandomRecommender()),
    objective=objective,
)

In [11]:
scenarios = {"Test_Scenario": campaign, "Random": campaign_rand}

In [12]:
results = simulate_scenarios(
    scenarios,
    lookup,
    batch_size=BATCH_SIZE,
    n_doe_iterations=N_DOE_ITERATIONS,
    n_mc_iterations=N_MC_ITERATIONS,
    impute_mode="ignore",
)

  0%|          | 0/10 [00:00<?, ?it/s]


IndexError: boolean index did not match indexed array along dimension 0; dimension is 611 but corresponding boolean dimension is 921

# Non - simulation stuff

In [6]:
# targets 
from baybe.targets import NumericalTarget
from baybe.objective import Objective

target = NumericalTarget(
    name="Efficiency",
    mode="MAX",
)
objective = Objective(mode="SINGLE", targets=[target])

In [7]:
# parameters
parameters = [
NumericalDiscreteParameter(
    name="Time_h",
    values=df_active['Time_h'].unique(),
    # tolerance = 0.004, assume certain experimental noise for each parameter measurement?
),
NumericalDiscreteParameter(
    name="pH",
    values=df_active['pH'].unique(),
    # tolerance = 0.004
    ),  
NumericalDiscreteParameter( # Set this as continuous, the values seem quite small?
    name="Inhib_Concentrat_M",
    values= df_active['Inhib_Concentrat_M'].unique(),
    # tolerance = 0.004
    ),
NumericalDiscreteParameter(
    name="Salt_Concentrat_M",
    values=df_active['Salt_Concentrat_M'].unique(),
    # tolerance = 0.004
    ),
SubstanceParameter(
    name="SMILES",
    data=smiles_dict,
    encoding="MORDRED",  # optional
    decorrelate=0.7,  # optional
    )        
    ]
parameters

[NumericalDiscreteParameter(name='Time_h', encoding=None, _values=[0.5, 1.0, 2.0, 3.0, 6.0, 24.0, 48.0, 72.0, 96.0, 120.0, 144.0, 168.0, 192.0, 240.0, 288.0, 336.0, 360.0, 384.0, 432.0, 480.0, 528.0, 576.0, 600.0, 624.0, 672.0], tolerance=0.0),
 NumericalDiscreteParameter(name='pH', encoding=None, _values=[0.0, 3.3, 4.0, 4.4, 5.4, 5.5, 5.6, 7.0, 10.0], tolerance=0.0),
 NumericalDiscreteParameter(name='Inhib_Concentrat_M', encoding=None, _values=[1e-05, 5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0008, 0.001, 0.0012, 0.0018, 0.0024, 0.003, 0.005, 0.01, 0.011, 0.021, 0.022, 0.031, 0.033, 0.042, 0.044, 0.05, 0.1], tolerance=0.0),
 NumericalDiscreteParameter(name='Salt_Concentrat_M', encoding=None, _values=[0.0, 0.01, 0.05, 0.1, 0.5, 0.6], tolerance=0.0),
 SubstanceParameter(name='SMILES', data={'COCCOC(=O)OCSc1nc2c(s1)cccc2': 'COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O': 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O', 'Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O': 'Clc1ccc(cc

In [9]:
# define search space
df_no_target = lookup.drop('Efficiency', axis=1)

searchspace = SearchSpace.from_dataframe(df = df_no_target, parameters=parameters)
searchspace

SearchSpace(discrete=SubspaceDiscrete(parameters=[NumericalDiscreteParameter(name='Time_h', encoding=None, _values=[0.5, 1.0, 2.0, 3.0, 6.0, 24.0, 48.0, 72.0, 96.0, 120.0, 144.0, 168.0, 192.0, 240.0, 288.0, 336.0, 360.0, 384.0, 432.0, 480.0, 528.0, 576.0, 600.0, 624.0, 672.0], tolerance=0.0), NumericalDiscreteParameter(name='pH', encoding=None, _values=[0.0, 3.3, 4.0, 4.4, 5.4, 5.5, 5.6, 7.0, 10.0], tolerance=0.0), NumericalDiscreteParameter(name='Inhib_Concentrat_M', encoding=None, _values=[1e-05, 5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0008, 0.001, 0.0012, 0.0018, 0.0024, 0.003, 0.005, 0.01, 0.011, 0.021, 0.022, 0.031, 0.033, 0.042, 0.044, 0.05, 0.1], tolerance=0.0), NumericalDiscreteParameter(name='Salt_Concentrat_M', encoding=None, _values=[0.0, 0.01, 0.05, 0.1, 0.5, 0.6], tolerance=0.0), SubstanceParameter(name='SMILES', data={'COCCOC(=O)OCSc1nc2c(s1)cccc2': 'COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O': 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O', 'Clc1ccc(

In [10]:
# recommenders
from baybe.recommenders import RandomRecommender, SequentialGreedyRecommender
from baybe.surrogates import GaussianProcessSurrogate

SURROGATE_MODEL = GaussianProcessSurrogate()
ACQ_FUNCTION = "qEI" # q-Expected Improvement, only q-fuctions are available for batch_size > 1

seq_greedy_recommender = SequentialGreedyRecommender(
        surrogate_model=SURROGATE_MODEL,
        acquisition_function_cls=ACQ_FUNCTION,
        hybrid_sampler="Farthest", # find more details in the documentation
        sampling_percentage=0.3, # should be relatively low
        allow_repeated_recommendations=False,
        allow_recommending_already_measured=False,
    )

In [11]:
# campaign strategy
from baybe.strategies import TwoPhaseStrategy
from baybe import Campaign

strategy = TwoPhaseStrategy(
    initial_recommender = RandomRecommender(),  # Initial recommender
    # Doesn't matter since I already have training data, BUT CAN BE USED FOR BENCHMARKING
    recommender = seq_greedy_recommender,  # Bayesian model-based optimization
    switch_after=1  # Switch to the model-based recommender after 1 batches = immediately
)

# setup campaign
campaign = Campaign(searchspace, objective, strategy)
print(campaign)

[1mCampaign[0m
         
 [1mMeta Data[0m
 Batches Done: 0
 Fits Done: 0
 
 [1mSearch Space[0m
          
  [1mSearch Space Type: [0mDISCRETE
  
  [1mDiscrete Search Space[0m
               
   [1mDiscrete Parameters[0m
                    Name                        Type  Num_Values                   Encoding
   0              Time_h  NumericalDiscreteParameter          25                       None
   1                  pH  NumericalDiscreteParameter           9                       None
   2  Inhib_Concentrat_M  NumericalDiscreteParameter          25                       None
   3   Salt_Concentrat_M  NumericalDiscreteParameter           6                       None
   4              SMILES          SubstanceParameter         123  SubstanceEncoding.MORDRED
               
   [1mExperimental Representation[0m
        Time_h    pH  ...  Salt_Concentrat_M                                 SMILES
   0      24.0   4.0  ...               0.10           COCCOC(=O)OCSc1nc2c(s1



In [12]:
# recommendations 
new_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance
print("\n\nRecommended experiments: ")
print(new_rec.to_markdown())



Recommended experiments: 
|     |   Time_h |   pH |   Inhib_Concentrat_M |   Salt_Concentrat_M | SMILES                                                               |
|----:|---------:|-----:|---------------------:|--------------------:|:---------------------------------------------------------------------|
| 484 |    480   |    7 |                0.031 |                0.05 | C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Fe+2]   |
| 227 |      0.5 |    7 |                0.01  |                0.6  | C1=CC(=C(C=C1SSC2=CC(=C(C=C2)[N+](=O)[O-])C(=O)O)C(=O)O)[N+](=O)[O-] |
| 394 |    144   |    7 |                1e-05 |                0.01 | [N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[Ce+3]      |


In [13]:
new_rec["Efficiency"] = [79.8, 54.1, 59.4]
campaign.add_measurements(new_rec)

Input row with index 227 has multiple matches with the search space. This could indicate that something went wrong. Matching only first occurrence.


In [14]:
campaign

Campaign(searchspace=SearchSpace(discrete=SubspaceDiscrete(parameters=[NumericalDiscreteParameter(name='Time_h', encoding=None, _values=[0.5, 1.0, 2.0, 3.0, 6.0, 24.0, 48.0, 72.0, 96.0, 120.0, 144.0, 168.0, 192.0, 240.0, 288.0, 336.0, 360.0, 384.0, 432.0, 480.0, 528.0, 576.0, 600.0, 624.0, 672.0], tolerance=0.0), NumericalDiscreteParameter(name='pH', encoding=None, _values=[0.0, 3.3, 4.0, 4.4, 5.4, 5.5, 5.6, 7.0, 10.0], tolerance=0.0), NumericalDiscreteParameter(name='Inhib_Concentrat_M', encoding=None, _values=[1e-05, 5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0008, 0.001, 0.0012, 0.0018, 0.0024, 0.003, 0.005, 0.01, 0.011, 0.021, 0.022, 0.031, 0.033, 0.042, 0.044, 0.05, 0.1], tolerance=0.0), NumericalDiscreteParameter(name='Salt_Concentrat_M', encoding=None, _values=[0.0, 0.01, 0.05, 0.1, 0.5, 0.6], tolerance=0.0), SubstanceParameter(name='SMILES', data={'COCCOC(=O)OCSc1nc2c(s1)cccc2': 'COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O': 'Cc1ccc(c(c1)n1nc2c(n

In [15]:
second_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance
print("\n\nRecommended experiments: ")
print(second_rec.to_markdown())



Recommended experiments: 
|   index |   Time_h |   pH |   Inhib_Concentrat_M |   Salt_Concentrat_M | SMILES                                                 |
|--------:|---------:|-----:|---------------------:|--------------------:|:-------------------------------------------------------|
|     194 |       24 |   10 |               0.001  |                 0.1 | C1N2CN3CN1CN(C2)C3                                     |
|     297 |       24 |    0 |               0.0004 |                 0   | CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C  |
|     300 |       24 |    0 |               0.0004 |                 0   | CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C  |
|     303 |       24 |    0 |               0.0004 |                 0   | CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C  |
|     306 |       24 |    0 |               0.0004 |                 0   | CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C  |
|     586 |        3 |    0 |               0.00

# Data Analysis

# Bayesian Optimization

## Search Space

## Objective

## Recommender

# Benchmarking

# Transfer Learning