In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# are re-imported before each cell runs (handy while editing library code).
%load_ext autoreload
%autoreload 2

In [2]:
#parameters
# Notebook-level inputs: edit these to run the workflow on different data.
# CSV of previous experimental results (loaded with DataSet.read_csv)
experimental_datafile = 'inputs/results_demo.csv'
# NOTE(review): not referenced by any cell visible here — confirm it is still needed
domain_configfile = 'inputs/borrowing_hydrogen_domain.json'
# Passed to tsemo.generate_experiments as the batch size for the next design
batch_size = 10

In [19]:
from summit.strategies import TSEMO
from summit.models import GPyModel
from summit.data import solvent_ds, ucb_ds, DataSet
from summit.domain import Domain, DescriptorsVariable,ContinuousVariable
from summit.initial_design import LatinDesigner
# from summit.optimizers import EnumerationOptimizer
# from summit.objective import HV

import GPy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [9]:
# Load the experimental results gathered so far
previous_results = DataSet.read_csv(experimental_datafile)

# Read the solubility table, index it by CAS number and wrap it as a DataSet
solubilities = DataSet.from_df(
    pd.read_csv('inputs/solubilities.csv').set_index('cas_number')
)

# Merge the solvent descriptors with the solubility data and with `ucb_ds`
# (index-on-index merge, so only solvents present in both sides are kept)
solvent_ds_full = solvent_ds.join(solubilities)
solvent_ds_final = pd.merge(solvent_ds_full, ucb_ds, left_index=True, right_index=True)
print(f"{solvent_ds_final.shape[0]} solvents for optimization")

# Double check that there are no NaNs in the descriptors.
# The previous check (`check.all() == False`) only failed when *every* entry
# was NaN, so a table with some missing descriptors passed silently.
# `any()` rejects the data as soon as a single NaN is present.
values = solvent_ds_final.descriptors_to_numpy()
values = values.astype(np.float64)
check = np.isnan(values)
assert not check.any(), f"{int(check.sum())} NaN value(s) found in the solvent descriptors"

solvent_ds_final.head(3) #Show first 3 rows of dataset

80 solvents for optimization


Unnamed: 0_level_0,stenutz_name,cosmo_name,chemical_formula,molecular_weight,density,molar_volume,refractive_index,molecular_refractive_power,dielectric_constant,dipole_moment,...,vapour_pressure,henry_constant,sigma_1,sigma_2,sigma_3,sigma_4,sigma_5,solubility,solvent_class,solvent_name
cas_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56-81-5,"1,2,3-propanetriol",glycerol,C3H8O3,92,1.26,73.1,1.475,20.56,47.0,2.56,...,0.01,3779.8,1.392,8.3081,2.465,0.5226,3.8766,0.0,Alcohol,Glycerol
107-06-2,"1,2-dichloroethane","1,2-dichloroethane",C2H4Cl2,99,1.253,79.0,1.445,21.01,10.36,1.83,...,222.09,2904.4,1.2021,10.4996,0.0,0.0,3.5413,-2.004364,Halogenated,"1,2-Dichloroethane (DCE)"
110-71-4,"1,2-dimethoxyethane","1,2-dimethoxyethane",C4H10O2,90,0.869,103.7,1.38,24.0,7.2,1.71,...,83.42,2260.69,0.0,12.4399,1.8467,0.0,1.8566,-2.891152,Ether,Dimethoxyethane (DME)


In [10]:
# Build the optimization domain: one descriptor-based input (the solvent)
# and two continuous outputs, each bounded to [0, 100].
solvent_variable = DescriptorsVariable(
    name='solvent',
    description='solvent for the borrowing hydrogen reaction',
    ds=solvent_ds_final,
)
yield_variable = ContinuousVariable(
    name='yield',
    description='relative yield to triphenylphosphine oxide determined by LCMS',
    bounds=[0, 100],
    is_output=True,
)
ee_variable = ContinuousVariable(
    name='enantiomeric_excess',
    description='enantiomeric excess determined by ratio of LCMS peaks',
    bounds=[0, 100],
    is_output=True,
)

# Register the variables in the same order as before: input first, then outputs
domain = Domain()
for domain_variable in (solvent_variable, yield_variable, ee_variable):
    domain += domain_variable
domain

0,1,2,3
Name,Type,Description,Values
solvent,"descriptors, input",solvent for the borrowing hydrogen reaction,80 examples of 19 descriptors
yield,"continuous, output",relative yield to triphenylphosphine oxide determined by LCMS,"[0,100]"
enantiomeric_excess,"continuous, output",enantiomeric excess determined by ratio of LCMS peaks,"[0,100]"


In [119]:
#Run the optimization
# The kernel spans every input dimension of the domain; ARD=True requests a
# separate lengthscale per dimension.
n_input_dims = domain.num_continuous_dimensions + domain.num_discrete_variables
kernel = GPy.kern.Matern52(input_dim=n_input_dims, ARD=True)

# One surrogate model per objective (yield and enantiomeric excess).
# Each model gets its own copy of the kernel: a GPy kernel is a parameterized
# object attached to the model that uses it, so handing the same instance to
# both models would make them share (or fight over) one set of hyperparameters.
models = [GPyModel(kernel=kernel.copy()) for _ in range(2)]
# Alternative configuration with an explicit acquisition function and optimizer:
# tsemo = TSEMO(domain, models, acquisition=HV(), optimizer=EnumerationOptimizer())
tsemo = TSEMO(domain, models)
design = tsemo.generate_experiments(previous_results, batch_size, normalize=True)

In [120]:
# Inspect the `x` array stored on the strategy after generate_experiments.
# NOTE(review): the values look standardized (normalize=True was passed) with
# one column per solvent descriptor — confirm against the TSEMO implementation.
tsemo.x

array([[ 0.51268313,  1.89359512, -0.14893776,  0.08738322, -0.14940013,
         1.0093385 ,  0.90643061,  0.91913817,  1.88725428,  2.24757626,
        -1.28461496, -0.7914231 ,  0.09050827,  0.64982969, -0.50399538,
         1.91318877, -0.37796447,  2.02494171,  0.54520272],
       [-0.29040094, -1.33496402,  0.15106039,  0.76055763,  0.27752606,
        -1.62981139, -1.91788437, -1.98340342, -1.01236819, -0.71562828,
         1.44243007,  0.9521677 , -0.6141113 , -0.73535091,  0.482755  ,
        -1.59885834, -0.37796447, -0.76710529, -2.04090369],
       [ 2.20489599,  0.00410758,  2.07557682, -0.0420734 ,  1.92080727,
         0.12158986,  1.46224545,  0.43538124,  1.05402943,  0.59336013,
         0.57760329, -0.78725806, -1.14274672, -0.74482403,  1.77857518,
         0.34704149, -0.37796447, -1.45708318, -0.12903125],
       [-1.17952973,  0.88313004, -1.32911916, -2.21694458, -1.51191006,
         0.71764966,  0.11794908, -0.43538124, -1.14568417, -0.98893356,
        -0.205

In [None]:
#Save design for next experiment(s) and model hyperparameters to disk
# The new batch is numbered one past the highest batch in the previous results
next_batch_num = previous_results['batch_number'].max() + 1
# Tag every suggested experiment with its batch number (stored as a metadata column)
design.insert(0, 'batch_number', np.ones(design.shape[0])*next_batch_num, type='METADATA')
design.to_csv(f'outputs/batch_{next_batch_num}.csv')
# The original path used an undefined name (`batch`) and raised NameError;
# name the parameter file after the batch number, matching the design CSV.
tsemo.save_params(f'outputs/batch_{next_batch_num}_params')