# Introduction

This project will focus on exploring the capabilities of Bayesian optimization, specifically employing BayBE, in the discovery of novel corrosion inhibitors for materials design. Initially, we will work with a randomly chosen subset from a comprehensive database of electrochemical responses of small organic molecules. Our goal is to assess how Bayesian optimization can speed up the screening process across the design space to identify promising compounds. We will compare different strategies for incorporating alloy information, while optimizing the experimental parameters with respect to the inhibitive performance of the screened compounds.

# Initialization

Loading libraries and data files:

In [3]:
import pandas as pd
import numpy as np
from baybe import Campaign

df_AA2024 = pd.read_excel('../data/filtered_AA2024.xlsx')
df_AA1000 = pd.read_excel('../data/filtered_AA1000.xlsx')
df_Al = pd.read_excel('../data/filtered_Al.xlsx')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import numpy as np

def random_subsample(df, num_samples):
    np.random.seed(42)
    indices = np.random.choice(df.index, num_samples, replace=False)
    subsampled_df = df.loc[indices]
    return subsampled_df 

In [5]:
df_AA2024.head()

Unnamed: 0,SMILES,Time_h,pH,Inhib_Concentrat_M,Salt_Concentrat_M,Efficiency
0,COCCOC(=O)OCSc1nc2c(s1)cccc2,24.0,4.0,0.001,0.1,0.0
1,COCCOC(=O)OCSc1nc2c(s1)cccc2,24.0,10.0,0.001,0.1,0.0
2,Cc1ccc(c(c1)n1nc2c(n1)cccc2)O,24.0,4.0,0.001,0.1,30.0
3,Cc1ccc(c(c1)n1nc2c(n1)cccc2)O,24.0,10.0,0.001,0.1,30.0
4,Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O,24.0,4.0,0.001,0.1,30.0


In [19]:
unique_smiles = df_AA2024.SMILES.unique()
unique_times = df_AA2024.Time_h.unique()
unique_pH = df_AA2024.pH.unique()
unique_inhib_conc = df_AA2024.Inhib_Concentrat_M.unique()
unique_salt_conc = df_AA2024.Salt_Concentrat_M.unique()

time_min, time_max = df_AA2024.Time_h.min(), df_AA2024.Time_h.max()
pH_min, pH_max = df_AA2024.pH.min(), df_AA2024.pH.max()
inhib_conc_min, inhib_conc_max = df_AA2024.Inhib_Concentrat_M.min(), df_AA2024.Inhib_Concentrat_M.max()
salt_conc_min, salt_conc_max = df_AA2024.Salt_Concentrat_M.min(), df_AA2024.Salt_Concentrat_M.max()
efficiency_min, efficiency_max = df_AA2024.Efficiency.min(), df_AA2024.Efficiency.max()

# Data Processing

In [7]:
import sys
sys.path.append('../utils')
from subsampling import random_subsample

In [8]:
random_subsample(df_AA2024, 50).shape

(50, 6)

# Data Anaylsis

In [17]:
df_AA2024[["SMILES", "Time_h", "pH", "Inhib_Concentrat_M", "Salt_Concentrat_M"]]

Unnamed: 0,SMILES,Time_h,pH,Inhib_Concentrat_M,Salt_Concentrat_M
0,COCCOC(=O)OCSc1nc2c(s1)cccc2,24.0,4.0,0.0010,0.10
1,COCCOC(=O)OCSc1nc2c(s1)cccc2,24.0,10.0,0.0010,0.10
2,Cc1ccc(c(c1)n1nc2c(n1)cccc2)O,24.0,4.0,0.0010,0.10
3,Cc1ccc(c(c1)n1nc2c(n1)cccc2)O,24.0,10.0,0.0010,0.10
4,Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O,24.0,4.0,0.0010,0.10
...,...,...,...,...,...
606,S=c1sc2c([nH]1)cccc2,24.0,7.0,0.0005,0.05
607,C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O,24.0,7.0,0.0005,0.05
608,C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O,24.0,7.0,0.0005,0.05
609,C(=O)(C(=O)[O-])[O-],24.0,7.0,0.0005,0.05


# Bayesian Optimization

In [36]:
from baybe import Campaign

from baybe.targets import NumericalTarget
from baybe.objective import Objective
from baybe.searchspace import SubspaceDiscrete
from baybe.parameters import NumericalDiscreteParameter
from baybe.parameters import SubstanceParameter


def list_to_dict(input_list):
    return {item: item for item in input_list}

smiles_dict =list_to_dict(unique_smiles)


target = NumericalTarget(name="Efficiency", mode="MAX", bounds=(efficiency_min, efficiency_max), transformation="LINEAR")
objective = Objective(mode="SINGLE", targets=[target])





parameters = [
    SubstanceParameter(
    name="Solvent",
    data=smiles_dict,
    encoding="MORGAN_FP",  # optional
    decorrelate=0.7,  # optional
),
    # NumericalDiscreteParameter(name="time", values=unique_times),
    # NumericalDiscreteParameter(name="pH", values=unique_pH),
    # NumericalDiscreteParameter(name="inhib_conc", values=unique_inhib_conc),
    # NumericalDiscreteParameter(name="salt_conc", values=unique_salt_conc),
]
# searchspace = SubspaceDiscrete.from_product(parameters=parameters)
searchspace = SubspaceDiscrete.from_dataframe(df_AA2024[["SMILES", "Time_h", "pH", "Inhib_Concentrat_M", "Salt_Concentrat_M"]])

campaign = Campaign(
    searchspace=searchspace,  # Required
    objective=objective,  # Required
    # recommender=recommender,  # Optional
)

AttributeError: type object 'SubstanceParameter' has no attribute 'name'

In [34]:
searchspace

SubspaceDiscrete(parameters=[CategoricalParameter(name='SMILES', _values=('COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O', 'Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O', 'On1nnc2c1cccc2', 'c1ncn[nH]1', 'Sc1n[nH]cn1', 'S[C]1NC2=C[CH]C=NC2=N1', 'S=c1[nH]c2c([nH]1)nccn2', 'Sc1ncc[nH]1', 'C=CC(=O)OCCOC(=O)OCCSc1ncccn1', 'CCSc1nnc(s1)N', 'CSc1nnc(s1)N', 'Cc1ccc2c(c1)nc([nH]2)S', 'OC(=O)CS', 'Sc1nc2c([nH]1)cccc2', 'OC(=O)c1ccccc1S', 'S=c1sc2c([nH]1)cccc2', 'OC(=O)c1cccnc1S', 'Sc1ncccn1', 'c1ccc(nc1)c1ccccn1', 'Sc1nnc(s1)S', 'Nc1cc(S)nc(n1)N', 'Nc1nc([nH]n1)C(=O)O', 'Nc1n[nH]cn1', 'OC(=O)c1n[nH]c(n1)N', 'Nc1n[nH]c(n1)S', 'CS[C]1N[N]C(=N1)N', 'C1=CC(=CC(=C1)S)C(=O)O', 'OC(=O)CCS', 'Oc1ccccc1c1nnc([nH]1)S', 'Nn1cnnc1', 'Nc1ccnc(n1)S', 'Nn1c(NN)nnc1S', 'Nn1c(S)nnc1c1ccccc1', 'Sc1nc(N)c2c(n1)[nH]nc2', 'Oc1ccc(cc1)C(=O)O', 'OC(=O)c1ccc(cc1)S', 'Cn1cnnc1S', 'Sc1nc(N)c(c(n1)S)N', 'Nc1ncncc1N', 'Nc1cc(N)nc(n1)S', 'Cc1cc(C)nc(n1)S', 'Clc1cccc(c1)c1n[nH]c(=S)[nH]1', 'COc1cccc(c1)c1n[nH]c(=S

In [None]:
results = simulate_scenarios(
    # Necessary
    campaign=campaign,
    # Technically optional but should always be set
    lookup=lookup,
    # Optional
    batch_size=batch_size,
    n_doe_iterations=n_doe_iterations,
    initial_data=initial_data,
    random_seed=random_seed,
    impute_mode=impute_mode,
    noise_percent=noise_percent,
)

## Search Space

## Target & Objective

In [None]:
from baybe.targets import NumericalTarget
from baybe.objective import Objective

target = NumericalTarget(
    name="Efficiency",
    mode="MAX",
)
objective = Objective(mode="SINGLE", targets=[target])

## Recommender

# Benchmarking

# Transfer Learning