## Notebook for training GPR on DC dataset and generating set of suggestions

**Note: Dataset do not contain any NMC data**

## Batch: 6

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, PandasTools, Fragments, rdMolDescriptors, Descriptors, rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.PandasTools import ChangeMoleculeRendering
from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser
# Silence non-critical RDKit warnings to minimize unnecessary outputs
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
## import train_test_split from sklearn
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, RationalQuadratic, WhiteKernel, Matern, ConstantKernel, DotProduct, PairwiseKernel 
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.stats import norm
from scipy.optimize import minimize
from scipy.special import erf
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%%bash
pwd
ls -ltr

/Users/riteshk/Library/CloudStorage/Box-Box/Research-postdoc/AD-AFB/Nat-Comm-R2/active-learning_wo_nmc_data
total 15792
-rw-r--r--@ 1 riteshk  staff  1381340 Apr 17 21:00 active_learning_batch_7.ipynb
-rw-r--r--@ 1 riteshk  staff   964972 Apr 17 21:42 active_learning_batch_1.ipynb
-rw-r--r--@ 1 riteshk  staff   984021 Apr 24 10:28 active_learning_batch_2.ipynb
-rw-r--r--@ 1 riteshk  staff  1113818 Apr 24 10:28 active_learning_batch_3.ipynb
-rw-r--r--@ 1 riteshk  staff  1164096 Apr 24 10:28 active_learning_batch_4.ipynb
-rw-r--r--@ 1 riteshk  staff  1002304 Apr 24 10:28 active_learning_batch_5.ipynb
-rw-r--r--@ 1 riteshk  staff  1461868 Apr 24 10:29 active_learning_batch_6.ipynb


### Reading & standardizing datasets

In [3]:
rem_till_b5 = [33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 56, 57, 60, 61, 62, 66, 67, 68, 70, 73, 74, 75, 76, 77, 78, 79, 104, 110, 111, 112, 113, 117, 118, 119]
rem_till_b5 += [121, 123, 127, 128, 129]
rem_till_b5 += [141, 143, 144, 145, 149, 150, 151]
rem_till_b5 += [176, 177, 178, 181]
len(rem_till_b5)

51

In [None]:
df = pd.read_csv('../../datasets/batch-6/label_data_post_batch5.csv') 
df.drop(index=rem_till_b5, inplace=True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,norm_capacity_14,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23
0,COCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.845301,-0.995151,1.062720,-0.357552,-0.308720,0.309456,0.693325,0.613072,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
1,COCCOC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.183255,-0.948338,0.615257,-0.582269,-1.010187,-0.522075,0.496896,0.301582,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
2,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,0.421934,...,0.022233,0.02168,0.022967,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
3,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,0.421934,...,0.000573,0.00046,0.000360,0.000407,0.0006,0.000593,0.000447,0.000333,0.0,0.0
4,CCOCCOC(C)(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.879184,-1.539457,0.507075,-0.111830,0.800175,0.133053,0.926130,-0.208395,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,COCCCOCCCOC,[Li+].F[P-](F)(F)(F)(F)F,-0.633502,-1.410069,0.798630,-0.225138,-0.701008,0.879349,0.961948,0.459921,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
128,COCCCOCCCOC,[Li+].O=C1O[B-](F)(F)OC1=O,-0.633502,-1.410069,0.798630,-0.225138,-0.701008,0.879349,0.961948,0.459921,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
129,COC(CCl)(CCl)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.715544,-1.384987,0.702147,-0.545643,0.424318,0.071399,0.660587,0.237141,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
130,COCCOCC(=O)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.314981,0.184491,1.622378,-0.105444,-0.221744,0.414074,0.458720,0.608244,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0


In [6]:
X = df.iloc[:,2:27] ## PCA-reduced solvent & salt descriptors and other non-molecular features
y = df['norm_capacity_3'] # normalized discharge capacity at 20th cycle (target variable)
std_scale = StandardScaler().fit(X)
X_std = std_scale.transform(X)
X_std = pd.DataFrame(X_std, columns=X.columns)

### Active learning workflow

#### Choose best hyperparameters for each kernel

In [7]:
def negative_log_likelihood_rbf(params):
    noise_level, length_scale, alpha = params
    kernel = RBF(length_scale=length_scale)
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42) 
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

def negative_log_likelihood_rq(params):
    noise_level, length_scale, alpha_k, alpha = params 
    kernel = RationalQuadratic(length_scale=length_scale, alpha=alpha_k)
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

def negative_log_likelihood_rbf_expsin(params):
    noise_level, length_scale, periodicity, alpha = params 
    kernel = RBF(length_scale=length_scale) + ExpSineSquared(length_scale=length_scale, periodicity=periodicity)
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42) 
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

def negative_log_likelihood_matern(params):
    noise_level, length_scale, alpha = params
    kernel = Matern(length_scale=length_scale, nu=1.5)
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

def negative_log_likelihood_pairwise(params):
    noise_level, length_scale, alpha = params
    kernel = PairwiseKernel(metric="polynomial")
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

##### Pairwise kernel

In [8]:
initial_guess = [0.15, 0.01, 0.02] # initial guess for noise_level, length_scale, alpha
param_bounds = [(1e-4, 1.0), (1e-5, 50.0), (1e-4, 0.1)] # bounds for noise_level, length_scale, alpha
result = minimize(negative_log_likelihood_pairwise, initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha = optimized_hyperparameters
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha:", optimized_alpha)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized alpha: 0.0001


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


##### RationalQuadratic kernel

In [9]:
initial_guess = [0.15, 0.01, 0.01, 0.02] # initial guess for noise_level, length_scale, alpha_k, alpha
param_bounds = [(1e-4, 1.0), (1e-5, 50.0), (1e-5, 50.0), (1e-4, 0.1)] # bounds for noise_level, length_scale, alpha_k, alpha
result = minimize(negative_log_likelihood_rq, initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha_k, optimized_alpha = optimized_hyperparameters
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha_k:", optimized_alpha_k)
print("Optimized alpha:", optimized_alpha)



Optimized noise_level: 0.14958820461792924
Optimized length_scale: 0.009440526274305389
Optimized alpha_k: 50.0
Optimized alpha: 0.00783695778067083


##### Matern-3/2 kernel

In [10]:
initial_guess = [0.15, 0.01, 0.02] # initial guess for noise_level, length_scale, alpha
param_bounds = [(1e-4, 1.0), (1e-5, 50.0), (1e-4, 0.1)] # bounds for noise_level, length_scale, alpha
result = minimize(negative_log_likelihood_matern, initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha = optimized_hyperparameters
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha:", optimized_alpha)

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized alpha: 0.011076894284322281


##### RBF-ExpineSquared kernel

In [11]:
initial_guess = [0.15, 0.01, 1.0, 0.02] # initial guess for noise_level, length_scale, periodicity, alpha
param_bounds = [(1e-4, 1.0), (1e-5, 50.0), (1e-2, 10.0), (1e-4, 0.1)] # bounds for noise_level, length_scale, periodicity, alpha
result = minimize(negative_log_likelihood_rbf_expsin, initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_periodicity, optimized_alpha = optimized_hyperparameters
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized periodicity:", optimized_periodicity)
print("Optimized alpha:", optimized_alpha)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized periodicity: 1.0
Optimized alpha: 0.020000000052559454


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


#### Train surrogate models

Note: no need to run again, saved model checkpoints have been provided

In [None]:
## change all hyperparameters accordingly
optimized_pairwise_kernel = PairwiseKernel(metric="polynomial") + WhiteKernel(noise_level=0.15)
optimized_matern_kernel = Matern(length_scale=0.01, nu=1.5) + WhiteKernel(noise_level=0.15)
optimized_rbfexpsin_kernel = RBF(length_scale=0.01) + ExpSineSquared(length_scale=0.01, periodicity=1.0) + WhiteKernel(noise_level=0.15)
optimized_rq_kernel = RationalQuadratic(length_scale=0.009440526274305389, alpha=50.0) + WhiteKernel(noise_level=0.14958820461792924)

gpr_models = [GaussianProcessRegressor(kernel=optimized_pairwise_kernel, alpha=0.0001, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_matern_kernel, alpha=0.011076894284322281, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_rq_kernel, alpha=0.00783695778067083, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_rbfexpsin_kernel, alpha=0.020000000052559454, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)]

model_names = ['../../models/batch-6/pairwise_batch6_wo_nmc_data.pkl', '../../models/batch-6/matern_batch6_wo_nmc_data.pkl', '../../models/batch-6/rq_batch6_wo_nmc_data.pkl', '../../models/batch-6/rbf-ess_batch6_wo_nmc_data.pkl']
k = 0
for model in gpr_models:
    print("fitting model: ", k)
    model.fit(X_std, y)
    pickle.dump(model, open(model_names[k], 'wb'))
    k += 1

fitting model:  0


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


fitting model:  1
fitting model:  2
fitting model:  3


#### BMA: aggregate predictions

##### Acquisition function (Expected improvement)

In [14]:
## final corrected & verified one to be used
def calc_EI(y_pred, y_pred_un, y_pred_un_uncer, epsilon=0.01):
    y_best = np.max(y_pred)
    EI = []
    explore = []
    exploit = []

    for i in range(len(y_pred_un)):
        if y_pred_un_uncer[i] != 0:
            
            # Calculate the cumulative distribution function (CDF) for the Gaussian distribution
            z = (y_pred_un[i] - y_best - epsilon) / y_pred_un_uncer[i]
            # z = (y_pred_un[i] - y_best - epsilon) / y_pred_un_uncer[i]
            cdf_z = 0.5 * (1 + erf(z / np.sqrt(2)))
            pdf_z = np.exp(-0.5 * z**2) / np.sqrt(2 * np.pi)

            # Calculate Expected Improvement
            expected_improvement = y_pred_un_uncer[i] * (z * cdf_z) + y_pred_un_uncer[i] * pdf_z
            exploitation = y_pred_un_uncer[i] * z * cdf_z
            exploration = y_pred_un_uncer[i] * pdf_z
            EI.append(expected_improvement)
            explore.append(exploration)
            exploit.append(exploitation)
        else:
            EI.append(0.0)
    return EI, exploit, explore

In [None]:
## virtual search space for batch-4 (electrolytes containing solvent combinations tested in batch-3 removed)
path = '/Users/riteshk/Library/CloudStorage/Box-Box/Research-postdoc/AD-AFB/data-codes-sharing/datasets'
df_unlabel = pd.read_csv(f'../../datasets/batch-6/virtual_search_space_for_batch6.csv')
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,salt_ecfp_pca_5,salt_ecfp_pca_6,salt_ecfp_pca_7,salt_ecfp_pca_8,salt_ecfp_pca_9,mol_wt_solv,mol_wt_salt,conc_salt_1,theor_capacity,amt_electrolyte
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,73.052764,186.939685,1,150,50
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,114.079313,186.939685,1,150,50
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,116.094963,186.939685,1,150,50
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,70.058995,186.939685,1,150,50
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,72.044939,186.939685,1,150,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999851,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,193.077264,144.001775,1,150,50
999852,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,240.147392,144.001775,1,150,50
999853,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,251.046358,144.001775,1,150,50
999854,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,248.133634,144.001775,1,150,50


In [16]:
X_un = df_unlabel.iloc[:,2:]
X_un_std = std_scale.transform(X_un)
X_un_std = pd.DataFrame(X_un_std, columns=X_un.columns)

##### Calculate model weights & obtained aggregated mean ($\mu^{aggr}$), uncertainty ($\sigma^{aggr}$), & EI ($EI^{aggr}$)

In [None]:
# Calculate Model Weights using BMA (first order)
model_names = ['../../models/batch-6/pairwise_batch6_wo_nmc_data.pkl', '../../models/batch-6/matern_batch6_wo_nmc_data.pkl', '../../models/batch-6/rq_batch6_wo_nmc_data.pkl', '../../models/batch-6/rbf-ess_batch6_wo_nmc_data.pkl']
model_weights = []
uncertainties = []
predictions = []
y_label_preds = []
for model in model_names:
    gpr = pickle.load(open(model, 'rb'))
    y_un = gpr.predict(X_un_std)
    predictions.append(y_un)
    individual_uncertainties = gpr.predict(X_un_std, return_std=True)[1]
    uncertainties.append(individual_uncertainties)
    likelihoods = norm.pdf(y_un, loc=gpr.predict(X_un_std), scale=individual_uncertainties)
    prior_beliefs = 1.0  # Non-informative prior
    posterior = likelihoods * prior_beliefs
    model_weights.append(posterior / np.sum(posterior))
    y_ = gpr.predict(X_std)
    y_label_preds.append(y_)

In [18]:
df_unlabel['uncertainty_1'] = uncertainties[0]; df_unlabel['uncertainty_2'] = uncertainties[1]; df_unlabel['uncertainty_3'] = uncertainties[2]; df_unlabel['uncertainty_4'] = uncertainties[3]
df_unlabel['prediction_1'] = predictions[0]; df_unlabel['prediction_2'] = predictions[1]; df_unlabel['prediction_3'] = predictions[2]; df_unlabel['prediction_4'] = predictions[3]
df_unlabel['explore_1'] = calc_EI(y_label_preds[0], predictions[0], uncertainties[0])[2]; df_unlabel['exploit_1'] = calc_EI(y_label_preds[0], predictions[0], uncertainties[0])[1]
df_unlabel['explore_2'] = calc_EI(y_label_preds[1], predictions[1], uncertainties[1])[2]; df_unlabel['exploit_2'] = calc_EI(y_label_preds[1], predictions[1], uncertainties[1])[1]
df_unlabel['explore_3'] = calc_EI(y_label_preds[2], predictions[2], uncertainties[2])[2]; df_unlabel['exploit_3'] = calc_EI(y_label_preds[2], predictions[2], uncertainties[2])[1]
df_unlabel['explore_4'] = calc_EI(y_label_preds[3], predictions[3], uncertainties[3])[2]; df_unlabel['exploit_4'] = calc_EI(y_label_preds[3], predictions[3], uncertainties[3])[1]
df_unlabel['EI_1'] = calc_EI(y_label_preds[0], predictions[0], uncertainties[0])[0]; df_unlabel['EI_2'] = calc_EI(y_label_preds[1], predictions[1], uncertainties[1])[0]; df_unlabel['EI_3'] = calc_EI(y_label_preds[2], predictions[2], uncertainties[2])[0]; df_unlabel['EI_4'] = calc_EI(y_label_preds[3], predictions[3], uncertainties[3])[0]
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,explore_2,exploit_2,explore_3,exploit_3,explore_4,exploit_4,EI_1,EI_2,EI_3,EI_4
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.002186,-0.001969,0.010416,-0.008915,0.001387,-0.001260,0.000221,0.000217,0.001501,0.000127
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.005247,-0.004621,0.008152,-0.007066,0.005065,-0.004459,0.000785,0.000626,0.001087,0.000605
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.001150,-0.001051,0.007884,-0.006845,0.000623,-0.000574,0.000085,0.000100,0.001038,0.000049
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.007413,-0.006415,0.012493,-0.010571,0.006683,-0.005805,0.001235,0.000998,0.001922,0.000878
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.002138,-0.001927,0.010016,-0.008593,0.001053,-0.000961,0.000208,0.000211,0.001423,0.000091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999851,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,0.001767,-0.001602,0.006312,-0.005531,0.001156,-0.001055,0.000125,0.000164,0.000781,0.000101
999852,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,0.004622,-0.004112,0.007009,-0.006126,0.001678,-0.001524,0.000237,0.000510,0.000883,0.000154
999853,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,0.007203,-0.006302,0.008889,-0.007689,0.003378,-0.003018,0.000599,0.000901,0.001201,0.000359
999854,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,0.001787,-0.001624,0.007162,-0.006253,0.000264,-0.000246,0.000025,0.000163,0.000909,0.000018


In [19]:
def calc_aggr_uncer(uncer_1, w_1, pred_1, uncer_2, w_2, pred_2, uncer_3, w_3, pred_3, uncer_4, w_4, pred_4):
    uncer = [uncer_1, uncer_2, uncer_3, uncer_4]
    pred = [pred_1, pred_2, pred_3, pred_4]
    weight = [w_1, w_2, w_3, w_4]
    pred_aggr = w_1 * pred_1 + w_2 * pred_2 + w_3 * pred_3 + w_4 * pred_4
    sum = 0
    for i in range(4):
        sum += weight[i] * (uncer[i]**2 + (pred[i] - pred_aggr)**2)
    aggr_uncer = np.sqrt(sum)
    return aggr_uncer

In [20]:
df_unlabel['prediction_aggr'] = df_unlabel['prediction_1'] * model_weights[0] + df_unlabel['prediction_2'] * model_weights[1] + df_unlabel['prediction_3'] * model_weights[2] + df_unlabel['prediction_4'] * model_weights[3]
df_unlabel['uncertainty_aggr'] = calc_aggr_uncer(df_unlabel['uncertainty_1'], model_weights[0], df_unlabel['prediction_1'], df_unlabel['uncertainty_2'], model_weights[1], df_unlabel['prediction_2'], df_unlabel['uncertainty_3'], model_weights[2], df_unlabel['prediction_3'], df_unlabel['uncertainty_4'], model_weights[3], df_unlabel['prediction_4'])
df_unlabel['explore_aggr'] = df_unlabel['explore_1'] * model_weights[0] + df_unlabel['explore_2'] * model_weights[1] + df_unlabel['explore_3'] * model_weights[2] + df_unlabel['explore_4'] * model_weights[3]
df_unlabel['exploit_aggr'] = df_unlabel['exploit_1'] * model_weights[0] + df_unlabel['exploit_2'] * model_weights[1] + df_unlabel['exploit_3'] * model_weights[2] + df_unlabel['exploit_4'] * model_weights[3]
df_unlabel['ratio_aggr'] = df_unlabel['exploit_aggr'] / df_unlabel['explore_aggr']

## 'EI_aggr' is the final rank by which candidate electrolytes are selected for experimental validation
df_unlabel['EI_aggr'] = df_unlabel['EI_1'] * model_weights[0] + df_unlabel['EI_2'] * model_weights[1] + df_unlabel['EI_3'] * model_weights[2] + df_unlabel['EI_4'] * model_weights[3]
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.000221,0.000217,0.001501,0.000127,3.251651e-07,0.000547,1.718350e-08,-1.500905e-08,-0.873457,2.174452e-09
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.000785,0.000626,0.001087,0.000605,4.500738e-07,0.000566,2.554496e-08,-2.235475e-08,-0.875114,3.190213e-09
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.000085,0.000100,0.001038,0.000049,-7.699636e-08,0.000551,1.082722e-08,-9.539625e-09,-0.881078,1.287596e-09
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.001235,0.000998,0.001922,0.000878,8.705177e-07,0.000649,3.917685e-08,-3.364081e-08,-0.858691,5.536049e-09
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.000208,0.000211,0.001423,0.000091,2.545270e-07,0.000543,1.610424e-08,-1.408545e-08,-0.874643,2.018784e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999851,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,0.000125,0.000164,0.000781,0.000101,-7.312810e-08,0.000548,1.065562e-08,-9.483406e-09,-0.889991,1.172218e-09
999852,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,0.000237,0.000510,0.000883,0.000154,-1.722843e-07,0.000581,1.449939e-08,-1.285330e-08,-0.886472,1.646092e-09
999853,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,0.000599,0.000901,0.001201,0.000359,1.610598e-07,0.000556,2.344490e-08,-2.053503e-08,-0.875885,2.909876e-09
999854,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,0.000025,0.000163,0.000909,0.000018,-5.230923e-07,0.000672,9.076519e-09,-8.014594e-09,-0.883003,1.061925e-09


##### Save top 5000 predictions

In [32]:
df_unlabel_ = df_unlabel.copy()
df_unlabel_ = df_unlabel_.sort_values(by='EI_aggr', ascending=False)
df_unlabel_5000 = df_unlabel_.iloc[:5000,:]
df_unlabel_5000

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
332493,COCCCN1C(=O)CCCCC1C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.169587,-0.484007,1.174677,0.358038,-1.006078,0.638467,0.850387,0.255182,...,0.026366,0.020068,0.003556,0.024658,0.000002,0.001024,2.236542e-07,-1.452605e-07,-0.649487,7.839369e-08
285698,COCCOOC1CCCC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.388405,-0.527537,0.930537,-0.411043,-0.933764,1.138892,0.692328,-0.351117,...,0.021434,0.021523,0.005613,0.020996,0.000002,0.001090,2.297727e-07,-1.514951e-07,-0.659326,7.827755e-08
230728,COCCCCCOCCCl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.720305,-1.521431,0.842552,-0.023711,-0.722962,1.030389,0.932336,0.376740,...,0.020119,0.018376,0.006438,0.020778,0.000002,0.001114,2.288959e-07,-1.521279e-07,-0.664616,7.676795e-08
302690,COCCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801624,-1.450690,0.826390,-0.295080,-0.147032,1.006596,1.540045,0.094906,...,0.019959,0.024181,0.005345,0.015880,0.000002,0.001099,2.247264e-07,-1.483614e-07,-0.660187,7.636502e-08
162781,COCCCCCOCCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.721775,-1.496471,0.872212,-0.086397,-0.676900,0.979219,0.992173,0.364148,...,0.020129,0.019405,0.007433,0.017755,0.000002,0.001125,2.302560e-07,-1.539464e-07,-0.668588,7.630962e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327496,CCCS(=O)(=O)OCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.182448,-1.331591,0.667976,0.582480,-0.425401,0.803221,0.534043,0.495566,...,0.003847,0.003029,0.003192,0.002246,0.000001,0.000814,7.998745e-08,-6.567681e-08,-0.821089,1.431064e-08
11773,COCC(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.873248,-1.351804,0.787974,-0.607253,-0.663560,-0.249187,0.641711,0.413012,...,0.003894,0.002993,0.002344,0.002941,0.000001,0.000821,7.988083e-08,-6.557186e-08,-0.820871,1.430897e-08
105249,CCP(=O)(CCOC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.969289,-1.223450,0.896945,-0.054915,-0.435602,0.354200,0.532004,0.516892,...,0.003355,0.002963,0.003054,0.002669,0.000001,0.000833,8.017460e-08,-6.586600e-08,-0.821532,1.430860e-08
198671,CC1CCCC1OP(=O)=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.062683,-0.690392,0.726282,-0.437153,0.062610,0.280937,-0.152459,-0.987590,...,0.003609,0.003297,0.002691,0.003627,0.000001,0.000756,7.997274e-08,-6.566606e-08,-0.821106,1.430667e-08


In [33]:
df_unlabel_uniq = df_unlabel_5000.drop_duplicates(subset=['solv_comb_sm'], keep='first') ## only keeping unique solvent combinations for selection purposes; these compounds were manually searched in emolecules to finf purchasable compounds
df_unlabel_uniq['solv_comb_sm'] = df_unlabel_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
df_unlabel_uniq[['solv_comb_sm', 'salt_comb_sm', 'prediction_aggr', 'uncertainty_aggr', 'explore_aggr', 'exploit_aggr', 'ratio_aggr', 'EI_aggr']].to_csv('../datasets/batch-6/top_5000_suggestions_batch6_uniq_solvents_wo_nmc_data.csv', index=False)
df_unlabel_uniq

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unlabel_uniq['solv_comb_sm'] = df_unlabel_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
332493,COCCCN1C(=O)CCCCC1C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.169587,-0.484007,1.174677,0.358038,-1.006078,0.638467,0.850387,0.255182,...,0.026366,0.020068,0.003556,0.024658,0.000002,0.001024,2.236542e-07,-1.452605e-07,-0.649487,7.839369e-08
285698,COCCOOC1CCCC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.388405,-0.527537,0.930537,-0.411043,-0.933764,1.138892,0.692328,-0.351117,...,0.021434,0.021523,0.005613,0.020996,0.000002,0.001090,2.297727e-07,-1.514951e-07,-0.659326,7.827755e-08
230728,COCCCCCOCCCl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.720305,-1.521431,0.842552,-0.023711,-0.722962,1.030389,0.932336,0.376740,...,0.020119,0.018376,0.006438,0.020778,0.000002,0.001114,2.288959e-07,-1.521279e-07,-0.664616,7.676795e-08
302690,COCCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801624,-1.450690,0.826390,-0.295080,-0.147032,1.006596,1.540045,0.094906,...,0.019959,0.024181,0.005345,0.015880,0.000002,0.001099,2.247264e-07,-1.483614e-07,-0.660187,7.636502e-08
162781,COCCCCCOCCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.721775,-1.496471,0.872212,-0.086397,-0.676900,0.979219,0.992173,0.364148,...,0.020129,0.019405,0.007433,0.017755,0.000002,0.001125,2.302560e-07,-1.539464e-07,-0.668588,7.630962e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327496,CCCS(=O)(=O)OCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.182448,-1.331591,0.667976,0.582480,-0.425401,0.803221,0.534043,0.495566,...,0.003847,0.003029,0.003192,0.002246,0.000001,0.000814,7.998745e-08,-6.567681e-08,-0.821089,1.431064e-08
11773,COCC(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.873248,-1.351804,0.787974,-0.607253,-0.663560,-0.249187,0.641711,0.413012,...,0.003894,0.002993,0.002344,0.002941,0.000001,0.000821,7.988083e-08,-6.557186e-08,-0.820871,1.430897e-08
105249,CCP(=O)(CCOC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.969289,-1.223450,0.896945,-0.054915,-0.435602,0.354200,0.532004,0.516892,...,0.003355,0.002963,0.003054,0.002669,0.000001,0.000833,8.017460e-08,-6.586600e-08,-0.821532,1.430860e-08
198671,CC1CCCC1OP(=O)=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.062683,-0.690392,0.726282,-0.437153,0.062610,0.280937,-0.152459,-0.987590,...,0.003609,0.003297,0.002691,0.003627,0.000001,0.000756,7.997274e-08,-6.566606e-08,-0.821106,1.430667e-08


In [23]:
df_unlabel_uniq_ = pd.read_csv(f'{path}/batch-6/top_5000_suggestions_batch6_uniq_solvents.csv')
df_unlabel_uniq_

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,theor_capacity,amt_electrolyte,pressure_type,rank,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,COCCOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.863536,-1.039067,1.081461,-0.406529,-0.227115,0.260969,0.755056,0.576431,...,150,50,2,1095.0,1.846807e-06,0.000938,1.757061e-08,-1.520749e-08,-6.435340,2.363121e-09
1,[CH2]CCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.653605,-1.424194,0.878123,-0.406314,-0.163474,0.121483,0.449710,0.405105,...,150,50,2,3925.0,1.573056e-06,0.000830,9.237026e-09,-8.166634e-09,-7.629569,1.070393e-09
2,COCB(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.608169,-1.441212,0.926965,-0.546185,-0.079639,0.040126,0.487229,0.370232,...,150,50,2,4329.0,1.562349e-06,0.000826,8.804825e-09,-7.798488e-09,-7.749381,1.006337e-09
3,COCCO[Al],[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.843809,-0.989662,1.069399,-0.365416,-0.311995,0.308275,0.678411,0.599304,...,150,50,2,1636.0,1.779212e-06,0.000910,1.453641e-08,-1.267635e-08,-6.815016,1.860062e-09
4,COCCB=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801731,-1.105580,1.083006,-0.354707,-0.315271,0.212797,0.668411,0.618927,...,150,50,2,1185.0,1.840186e-06,0.000939,1.722264e-08,-1.495731e-08,-6.602702,2.265332e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3655,COCCCCCOOCCCOOC,[Li+].O=C1O[B-](F)(F)OC1=O,-0.963041,-1.537202,0.930112,-0.000216,-0.598388,0.780594,0.817360,0.704008,...,150,50,2,4797.0,1.271594e-06,0.000741,8.333448e-09,-7.393424e-09,-7.865138,9.400247e-10
3656,COCCC1CCCN(C(=O)C(F)F)C1,[Li+].O=C1O[B-](F)(F)OC1=O,0.751896,0.844789,0.729120,0.442497,-1.227714,-0.132727,1.217904,-0.219517,...,150,50,2,3226.0,5.196018e-07,0.000567,1.029346e-08,-9.081568e-09,-7.493699,1.211894e-09
3657,COCCC1CCCN(CC=O)C1,[Li+].O=C1O[B-](F)(F)OC1=O,1.183272,-0.146497,0.667712,0.318337,-1.237932,0.317185,1.474304,0.149830,...,150,50,2,1009.0,1.045575e-06,0.000710,1.843212e-08,-1.593775e-08,-6.389482,2.494372e-09
3658,COC1CCCCCC(=O)OC(C)C1,[Li+].O=C1O[B-](F)(F)OC1=O,1.135297,-0.466424,0.870969,-0.693492,-0.429577,0.719366,-0.422886,-0.750610,...,150,50,2,3983.0,5.996270e-07,0.000567,8.686140e-09,-7.627014e-09,-7.201240,1.059125e-09


In [25]:
df_unlabel_uniq_['solv_comb_sm'] = df_unlabel_uniq_['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
# df_sugg_comm = pd.merge(df_unlabel_uniq_, df_unlabel_uniq, on=['solv_comb_sm', 'salt_comb_sm'], how='inner') ## unique electrolytes
df_sugg_comm = pd.merge(df_unlabel_uniq_, df_unlabel_uniq, on=['solv_comb_sm'], how='inner') ## unique solvents
df_sugg_comm

Unnamed: 0,solv_comb_sm,salt_comb_sm_x,solv_ecfp_pca_0_x,solv_ecfp_pca_1_x,solv_ecfp_pca_2_x,solv_ecfp_pca_3_x,solv_ecfp_pca_4_x,solv_ecfp_pca_5_x,solv_ecfp_pca_6_x,solv_ecfp_pca_7_x,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr_y,uncertainty_aggr_y,explore_aggr_y,exploit_aggr_y,ratio_aggr_y,EI_aggr_y
0,COCCOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.863536,-1.039067,1.081461,-0.406529,-0.227115,0.260969,0.755056,0.576431,...,0.007718,0.007201,0.002005,0.006969,1.795432e-06,0.000950,1.263438e-07,-9.758212e-08,-0.772354,2.876170e-08
1,[CH2]CCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.653605,-1.424194,0.878123,-0.406314,-0.163474,0.121483,0.449710,0.405105,...,0.004561,0.004106,0.003493,0.002893,1.549200e-06,0.000861,9.342847e-08,-7.561331e-08,-0.809318,1.781517e-08
2,COCB(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.608169,-1.441212,0.926965,-0.546185,-0.079639,0.040126,0.487229,0.370232,...,0.004378,0.003909,0.002600,0.003268,1.513381e-06,0.000850,8.933122e-08,-7.255834e-08,-0.812239,1.677288e-08
3,COCCO[Al],[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.843809,-0.989662,1.069399,-0.365416,-0.311995,0.308275,0.678411,0.599304,...,0.006835,0.005997,0.001289,0.006434,1.764771e-06,0.000931,1.136080e-07,-8.879101e-08,-0.781556,2.481696e-08
4,COCCB=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801731,-1.105580,1.083006,-0.354707,-0.315271,0.212797,0.668411,0.618927,...,0.008662,0.007533,0.002428,0.007278,1.787946e-06,0.000953,1.328411e-07,-1.019333e-07,-0.767333,3.090780e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2806,COCCOC1CCCC(=O)CCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.390526,-0.208369,1.246751,-0.053624,-0.728419,0.751588,0.397039,-0.153624,...,0.005351,0.006418,0.002140,0.003626,1.180846e-06,0.000774,9.471173e-08,-7.606351e-08,-0.803105,1.864822e-08
2807,COCCOCCOC1CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.340693,-0.507196,0.871490,-0.409964,-1.059248,1.172802,0.773990,-0.333313,...,0.010835,0.014507,0.003072,0.006822,1.493009e-06,0.000894,1.487800e-07,-1.109183e-07,-0.745519,3.786172e-08
2808,COCCC1CCCC(=O)CC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.631822,-0.656586,1.123381,-0.360369,-0.423082,0.714197,0.992159,-0.337507,...,0.006763,0.007954,0.002342,0.004526,1.364119e-06,0.000832,1.111114e-07,-8.740974e-08,-0.786686,2.370162e-08
2809,COCCCCCCCOCC=O,[Li+].O=C1O[B-](F)(F)OC1=O,-0.707783,-1.113542,0.971515,0.124799,-0.614998,0.978602,0.770082,0.267719,...,0.005380,0.005877,0.003436,0.003576,1.460681e-06,0.000848,1.034388e-07,-8.266032e-08,-0.799123,2.077850e-08


### Check how many common solvents in sixth batch of labeled dataset

In [None]:
df_label_all = pd.read_csv('../../datasets/label_all_ecfp_pca_add_feat_incl_b7_090824.csv')
df_label_all

Unnamed: 0,solv_comb_sm,salt_comb_sm,batch,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
0,COCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-0.845301,-0.995151,1.062720,-0.357552,-0.308720,0.309456,0.693325,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,COCCOC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.183255,-0.948338,0.615257,-0.582269,-1.010187,-0.522075,0.496896,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,...,0.02168,0.022967,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,...,0.00046,0.000360,0.000407,0.0006,0.000593,0.000447,0.000333,0.0,0.0,0.0
4,CCOCCOC(C)(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-0.879184,-1.539457,0.507075,-0.111830,0.800175,0.133053,0.926130,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,CCOCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
204,COCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
205,COCCCOCC(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
206,COCCCOCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0


In [27]:
df_b6 = df_label_all.loc[df_label_all['expt_test'] == 6]
df_b6['solv_comb_sm'] = df_b6['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
df_b6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_b6['solv_comb_sm'] = df_b6['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


Unnamed: 0,solv_comb_sm,salt_comb_sm,batch,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
183,COCCOCCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,6.0,-0.899245,-1.039674,1.079794,-0.351354,-0.360827,0.421555,0.804488,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
184,COCCOCCOCCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,6.0,-0.899245,-1.039674,1.079794,-0.351354,-0.360827,0.421555,0.804488,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
185,COC1CCCC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,6.0,0.807314,-0.951595,0.87908,-0.605456,-0.644732,0.730071,0.164348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
186,COCCCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,6.0,-0.946328,-1.547746,0.906366,0.007913,-0.568101,0.750411,0.805634,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
187,COC(=O)C1CCCCCC1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,6.0,0.719334,0.63609,1.59074,-0.247047,-0.124693,0.317496,-0.297293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
188,COC(=O)C1CCCCCC1=O,[Li+].F[P-](F)(F)(F)(F)F,6.0,0.719334,0.63609,1.59074,-0.247047,-0.124693,0.317496,-0.297293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
189,COC(=O)C1CCCCCC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,6.0,0.719334,0.63609,1.59074,-0.247047,-0.124693,0.317496,-0.297293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
190,COCCOCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,6.0,-0.900062,-1.026422,1.018114,-0.356201,-0.343756,0.403599,0.757477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
191,CCCCOCCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,6.0,-1.067892,-1.253529,0.602357,0.775134,-0.714364,1.295648,0.646532,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
192,CCOCCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,6.0,-1.097146,-1.027,0.868116,0.014163,-0.384779,0.651601,0.660382,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0


In [28]:
uniq_sm_b6 = df_b6['solv_comb_sm'].unique()
uniq_sm_b6 = pd.DataFrame(uniq_sm_b6, columns=['solv_comb_sm'])
uniq_sm_b6

Unnamed: 0,solv_comb_sm
0,COCCOCCOCCOC
1,COCCOCCOCCOCCOC
2,COC1CCCC1
3,COCCCCCCOC
4,COC(=O)C1CCCCCC1=O
5,COCCOCOCCOC
6,CCCCOCCOCCOC
7,CCOCCOCCOC
8,COC1CCCCC1=O
9,COCCS(=O)(=O)F


In [29]:
df_comm = df_unlabel_uniq.merge(uniq_sm_b6, on='solv_comb_sm', how='right')
df_comm.dropna(inplace=True)
df_comm

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,COCCOCCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.899245,-1.039674,1.079794,-0.351354,-0.360827,0.421555,0.804488,0.673995,...,0.007099,0.006085,0.001228,0.007396,2e-06,0.000915,1.16616e-07,-9.049457e-08,-0.776005,2.612144e-08
1,COCCOCCOCCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.899245,-1.039674,1.079794,-0.351354,-0.360827,0.421555,0.804488,0.673995,...,0.005975,0.005023,0.000875,0.00421,2e-06,0.000855,9.437629e-08,-7.525482e-08,-0.797391,1.912148e-08
2,COC1CCCC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.807314,-0.951595,0.87908,-0.605456,-0.644732,0.730071,0.164348,-0.790465,...,0.013649,0.013696,0.006754,0.012106,2e-06,0.001008,1.866271e-07,-1.347927e-07,-0.722257,5.183445e-08
3,COCCCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.946328,-1.547746,0.906366,0.007913,-0.568101,0.750411,0.805634,0.679371,...,0.0115,0.010043,0.00263,0.010235,2e-06,0.001015,1.584409e-07,-1.173113e-07,-0.74041,4.112963e-08
4,COC(=O)C1CCCCCC1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.719334,0.63609,1.59074,-0.247047,-0.124693,0.317496,-0.297293,-0.87441,...,0.007706,0.006312,0.002257,0.005679,1e-06,0.000757,1.055838e-07,-8.375371e-08,-0.793244,2.183014e-08
5,COCCOCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.900062,-1.026422,1.018114,-0.356201,-0.343756,0.403599,0.757477,0.652219,...,0.006078,0.005205,0.001184,0.004236,2e-06,0.000879,9.846025e-08,-7.836905e-08,-0.795946,2.00912e-08
6,CCCCOCCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.067892,-1.253529,0.602357,0.775134,-0.714364,1.295648,0.646532,0.057886,...,0.005634,0.004394,0.002113,0.004266,2e-06,0.000854,9.692391e-08,-7.785793e-08,-0.803289,1.906598e-08
7,CCOCCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.097146,-1.027,0.868116,0.014163,-0.384779,0.651601,0.660382,0.56931,...,0.004554,0.004159,0.004821,0.00384,2e-06,0.000895,1.039622e-07,-8.327842e-08,-0.801045,2.068375e-08
8,COC1CCCCC1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.996265,-0.567873,1.168385,-0.494515,-0.269926,0.307456,-0.150987,-0.613881,...,0.010281,0.009184,0.003562,0.007305,2e-06,0.000894,1.398717e-07,-1.06687e-07,-0.762749,3.318474e-08
9,COCCS(=O)(=O)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.774656,-1.131648,1.061661,-0.391643,-0.188458,0.184919,0.733522,0.657518,...,0.009108,0.00787,0.001859,0.006789,2e-06,0.000933,1.297816e-07,-9.942971e-08,-0.766131,3.035191e-08


In [None]:
df_comm.to_csv('../../datasets/batch-6/labeled_batch6_uniq_solvents_wo_nmc_data.csv', index=False)

In [34]:
df_comm_label = df_unlabel_uniq.merge(df_b6, on=['solv_comb_sm'], how='left', suffixes=('', '_drop'))
df_comm_label.dropna(inplace=True)
df_comm_label = df_comm_label.loc[:, ~df_comm_label.columns.str.endswith('_drop')]
df_comm_label

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
89,COC1CCCC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.807314,-0.951595,0.87908,-0.605456,-0.644732,0.730071,0.164348,-0.790465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
240,COCCCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.946328,-1.547746,0.906366,0.007913,-0.568101,0.750411,0.805634,0.679371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
548,COC1CCCCC1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.996265,-0.567873,1.168385,-0.494515,-0.269926,0.307456,-0.150987,-0.613881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
549,COC1CCCCC1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.996265,-0.567873,1.168385,-0.494515,-0.269926,0.307456,-0.150987,-0.613881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
550,COC1CCCCC1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.996265,-0.567873,1.168385,-0.494515,-0.269926,0.307456,-0.150987,-0.613881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
584,COCCSC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.819883,-1.431011,0.956715,-0.464875,-0.484067,0.20612,0.691785,0.673957,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
585,COCCSC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.819883,-1.431011,0.956715,-0.464875,-0.484067,0.20612,0.691785,0.673957,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
732,COCCS(=O)(=O)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.774656,-1.131648,1.061661,-0.391643,-0.188458,0.184919,0.733522,0.657518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
1076,COCCOCCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.899245,-1.039674,1.079794,-0.351354,-0.360827,0.421555,0.804488,0.673995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
1649,COC(=O)C1CCCCCC1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.719334,0.63609,1.59074,-0.247047,-0.124693,0.317496,-0.297293,-0.87441,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0


In [None]:
# df_comm_label.to_csv('../datasets/batch-6/labeled_batch6_all_wo_nmc_data.csv', index=False) ## all common so no need to save

In [31]:
uniq_sm_comm = df_comm['solv_comb_sm'].unique()
for i in range(len(uniq_sm_b6)):
    if uniq_sm_b6['solv_comb_sm'][i] not in uniq_sm_comm:
        print(uniq_sm_b6['solv_comb_sm'][i])