## Notebook for training GPR on DC dataset and generating set of suggestions

**Note: The dataset does not contain any NMC data**

## Batch: 5

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, PandasTools, Fragments, rdMolDescriptors, Descriptors, rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.PandasTools import ChangeMoleculeRendering
from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser
# Silence non-critical RDKit warnings to minimize unnecessary outputs
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
## import train_test_split from sklearn
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, RationalQuadratic, WhiteKernel, Matern, ConstantKernel, DotProduct, PairwiseKernel 
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.stats import norm
from scipy.optimize import minimize
from scipy.special import erf
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%%bash
# Record the working directory and list its contents (long format, sorted by modification time, reversed).
pwd
ls -ltr

/Users/riteshk/Library/CloudStorage/Box-Box/Research-postdoc/AD-AFB/Nat-Comm-R2/active-learning_wo_nmc_data


total 16744
-rw-r--r--@ 1 riteshk  staff  1474650 Apr 17 21:00 active_learning_batch_6.ipynb
-rw-r--r--@ 1 riteshk  staff  1381340 Apr 17 21:00 active_learning_batch_7.ipynb
-rw-r--r--@ 1 riteshk  staff   964972 Apr 17 21:42 active_learning_batch_1.ipynb
-rw-r--r--@ 1 riteshk  staff   984020 Apr 22 09:52 active_learning_batch_2.ipynb
-rw-r--r--@ 1 riteshk  staff  1113817 Apr 22 11:07 active_learning_batch_3.ipynb
-rw-r--r--@ 1 riteshk  staff  1164233 Apr 22 11:35 active_learning_batch_4.ipynb
-rw-r--r--@ 1 riteshk  staff  1473193 Apr 24 09:57 active_learning_batch_5.ipynb


### Reading & standardizing datasets

In [3]:
# Row labels that are dropped from the labelled dataset before training
# (consumed by `df.drop(index=rem_till_b4)`).
# NOTE(review): presumably these are the NMC-related rows mentioned in the
# notebook header -- confirm. The three groups are kept in the order in which
# they were originally appended.
rem_till_b4 = [
    # first group
    33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 56, 57, 60, 61, 62, 66, 67, 68,
    70, 73, 74, 75, 76, 77, 78, 79, 104, 110, 111, 112, 113, 117, 118, 119,
    # second group
    121, 123, 127, 128, 129,
    # third group
    141, 143, 144, 145, 149, 150, 151,
]
len(rem_till_b4)

47

In [None]:
# Load the labelled dataset, remove the rows listed in `rem_till_b4`,
# and renumber the remaining rows from 0 (the old index is discarded).
df = (
    pd.read_csv('../../datasets/batch-5/label_data_post_batch4.csv')
    .drop(index=rem_till_b4)
    .reset_index(drop=True)
)
df

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,norm_capacity_14,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23
0,COCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.845301,-0.995151,1.062720,-0.357552,-0.308720,0.309456,0.693325,0.613072,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
1,COCCOC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.183255,-0.948338,0.615257,-0.582269,-1.010187,-0.522075,0.496896,0.301582,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
2,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,0.421934,...,0.022233,0.02168,0.022967,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
3,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,0.421934,...,0.000573,0.00046,0.000360,0.000407,0.0006,0.000593,0.000447,0.000333,0.0,0.0
4,CCOCCOC(C)(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.879184,-1.539457,0.507075,-0.111830,0.800175,0.133053,0.926130,-0.208395,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,CCCCOCCCl,[Li+].O=C1O[B-](F)(F)OC1=O,-0.744628,-1.849104,0.136047,0.898735,-0.158824,0.886538,0.120523,-0.324358,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
101,CCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.185937,-1.761041,0.562131,0.697612,-0.655805,1.009843,0.605023,0.421360,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
102,CO[Si](CCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(...,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.703865,-1.378061,0.988412,-0.661167,0.540599,-0.224957,0.849146,-0.022704,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
103,CO[Si](CCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(...,[Li+].F[P-](F)(F)(F)(F)F,-0.703865,-1.378061,0.988412,-0.661167,0.540599,-0.224957,0.849146,-0.022704,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0


In [5]:
# Feature matrix: the 25 columns at positions 2..26 of `df`
# (PCA-reduced solvent & salt descriptors and other non-molecular features).
X = df.iloc[:, 2:27]
# Target variable. The original note describes it as the normalized discharge
# capacity at the 20th cycle.
# NOTE(review): the column suffix is `_3`; confirm the column-to-cycle mapping.
y = df['norm_capacity_3']
# The scaler is fitted on the labelled features only; the same fitted object
# is reused later to transform the unlabelled search space.
std_scale = StandardScaler()
std_scale.fit(X)
X_std = pd.DataFrame(std_scale.transform(X), columns=X.columns)

### Active learning workflow

#### Choose best hyperparameters for each kernel

In [6]:
def _gpr_training_negative_log_likelihood(kernel, noise_level, alpha):
    """Shared objective body for all kernel-specific tuning functions.

    Builds `kernel + WhiteKernel(noise_level)`, fits a GaussianProcessRegressor
    on the module-level training data (`X_std`, `y`), predicts on that same
    training data, and returns the negated sum of Gaussian log-densities of `y`
    under the per-point predictive mean and standard deviation.

    Parameters
    ----------
    kernel : sklearn kernel
        Base kernel (without the white-noise term).
    noise_level : float
        Value passed to `WhiteKernel(noise_level=...)`.
    alpha : float
        Value passed to `GaussianProcessRegressor(alpha=...)`.

    Returns
    -------
    float
        Negative of the summed log-density (lower is better for `minimize`).

    Notes
    -----
    NOTE(review): the regressor is created with `optimizer="fmin_l_bfgs_b"`
    and `n_restarts_optimizer=10`, so kernel hyperparameters are tuned again
    inside `fit`; the values supplied here act as starting values. The score
    is computed on the training data, not as the GP marginal likelihood.
    Confirm this is the intended objective.
    """
    composite_kernel = kernel + WhiteKernel(noise_level=noise_level)
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood


def negative_log_likelihood_rbf(params):
    """Objective for an RBF + white-noise kernel.

    `params` is a sequence `(noise_level, length_scale, alpha)`.
    """
    noise_level, length_scale, alpha = params
    return _gpr_training_negative_log_likelihood(RBF(length_scale=length_scale), noise_level, alpha)


def negative_log_likelihood_rq(params):
    """Objective for a RationalQuadratic + white-noise kernel.

    `params` is a sequence `(noise_level, length_scale, alpha_k, alpha)`, where
    `alpha_k` is the kernel's own `alpha` and `alpha` is the regressor's `alpha`.
    """
    noise_level, length_scale, alpha_k, alpha = params
    kernel = RationalQuadratic(length_scale=length_scale, alpha=alpha_k)
    return _gpr_training_negative_log_likelihood(kernel, noise_level, alpha)


def negative_log_likelihood_rbf_expsin(params):
    """Objective for an RBF + ExpSineSquared + white-noise kernel.

    `params` is a sequence `(noise_level, length_scale, periodicity, alpha)`.
    The same `length_scale` is used for both the RBF and ExpSineSquared terms.
    """
    noise_level, length_scale, periodicity, alpha = params
    kernel = RBF(length_scale=length_scale) + ExpSineSquared(length_scale=length_scale, periodicity=periodicity)
    return _gpr_training_negative_log_likelihood(kernel, noise_level, alpha)


def negative_log_likelihood_matern(params):
    """Objective for a Matern (nu=1.5) + white-noise kernel.

    `params` is a sequence `(noise_level, length_scale, alpha)`.
    """
    noise_level, length_scale, alpha = params
    kernel = Matern(length_scale=length_scale, nu=1.5)
    return _gpr_training_negative_log_likelihood(kernel, noise_level, alpha)


def negative_log_likelihood_pairwise(params):
    """Objective for a polynomial PairwiseKernel + white-noise kernel.

    `params` is a sequence `(noise_level, length_scale, alpha)`. The middle
    entry is accepted only so the parameter layout matches the other
    objectives; it is not used by this kernel, so the objective does not
    depend on it.
    """
    noise_level, _length_scale, alpha = params  # length_scale intentionally unused
    kernel = PairwiseKernel(metric="polynomial")
    return _gpr_training_negative_log_likelihood(kernel, noise_level, alpha)

##### Pairwise kernel

In [7]:
# Tune (noise_level, length_scale, alpha) for the polynomial pairwise kernel.
# Note: `negative_log_likelihood_pairwise` unpacks but does not use
# length_scale, so the objective does not depend on it.
initial_guess = [
    0.15,  # noise_level
    0.01,  # length_scale
    0.02,  # alpha
]
param_bounds = [
    (1e-4, 1.0),   # noise_level
    (1e-5, 50.0),  # length_scale
    (1e-4, 0.1),   # alpha
]
result = minimize(negative_log_likelihood_pairwise, x0=initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha = result.x
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha:", optimized_alpha)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized alpha: 0.0001


##### RationalQuadratic kernel

In [8]:
# Tune (noise_level, length_scale, alpha_k, alpha) for the RationalQuadratic kernel.
# alpha_k is the kernel's own alpha; alpha is the regressor's alpha.
initial_guess = [
    0.15,  # noise_level
    0.01,  # length_scale
    0.01,  # alpha_k
    0.02,  # alpha
]
param_bounds = [
    (1e-4, 1.0),   # noise_level
    (1e-5, 50.0),  # length_scale
    (1e-5, 50.0),  # alpha_k
    (1e-4, 0.1),   # alpha
]
result = minimize(negative_log_likelihood_rq, x0=initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha_k, optimized_alpha = result.x
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha_k:", optimized_alpha_k)
print("Optimized alpha:", optimized_alpha)



Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized alpha_k: 0.01
Optimized alpha: 0.006256325235093756


##### Matern-3/2 kernel

In [9]:
# Tune (noise_level, length_scale, alpha) for the Matern-3/2 kernel.
initial_guess = [
    0.15,  # noise_level
    0.01,  # length_scale
    0.02,  # alpha
]
param_bounds = [
    (1e-4, 1.0),   # noise_level
    (1e-5, 50.0),  # length_scale
    (1e-4, 0.1),   # alpha
]
result = minimize(negative_log_likelihood_matern, x0=initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha = result.x
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha:", optimized_alpha)

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized alpha: 0.00999289432403267


##### RBF-ExpSineSquared kernel

In [10]:
# Tune (noise_level, length_scale, periodicity, alpha) for the RBF + ExpSineSquared kernel.
initial_guess = [
    0.15,  # noise_level
    0.01,  # length_scale
    1.0,   # periodicity
    0.02,  # alpha
]
param_bounds = [
    (1e-4, 1.0),   # noise_level
    (1e-5, 50.0),  # length_scale
    (1e-2, 10.0),  # periodicity
    (1e-4, 0.1),   # alpha
]
result = minimize(negative_log_likelihood_rbf_expsin, x0=initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_periodicity, optimized_alpha = result.x
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized periodicity:", optimized_periodicity)
print("Optimized alpha:", optimized_alpha)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized periodicity: 1.0
Optimized alpha: 0.02


#### Train surrogate models

Note: no need to run again, saved model checkpoints have been provided

In [None]:
## change all hyperparameters accordingly
# The kernel and alpha values below are hard-coded; they must be updated by hand
# whenever the tuning cells above are re-run and print different values.
optimized_pairwise_kernel = PairwiseKernel(metric="polynomial") + WhiteKernel(noise_level=0.15)
optimized_matern_kernel = Matern(length_scale=0.01, nu=1.5) + WhiteKernel(noise_level=0.15)
optimized_rbfexpsin_kernel = RBF(length_scale=0.01) + ExpSineSquared(length_scale=0.01, periodicity=1.0) + WhiteKernel(noise_level=0.15)
optimized_rq_kernel = RationalQuadratic(length_scale=0.01, alpha=0.01) + WhiteKernel(noise_level=0.15)

# Order matters: gpr_models[i] is saved to model_names[i]
# (pairwise, Matern, RationalQuadratic, RBF+ExpSineSquared).
gpr_models = [GaussianProcessRegressor(kernel=optimized_pairwise_kernel, alpha=0.0001, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_matern_kernel, alpha=0.00999289432403267, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_rq_kernel, alpha=0.006256325235093756, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_rbfexpsin_kernel, alpha=0.02, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)]

model_names = ['../../models/batch-5/pairwise_batch5_wo_nmc_data.pkl', '../../models/batch-5/matern_batch5_wo_nmc_data.pkl', '../../models/batch-5/rq_batch5_wo_nmc_data.pkl', '../../models/batch-5/rbf-ess_batch5_wo_nmc_data.pkl']
for k, (model, model_path) in enumerate(zip(gpr_models, model_names)):
    print("fitting model: ", k)
    model.fit(X_std, y)
    # Context manager guarantees the checkpoint file is flushed and closed
    # even if pickling fails part-way.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

fitting model:  0
fitting model:  1
fitting model:  2
fitting model:  3


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


#### BMA: aggregate predictions

##### Acquisition function (Expected improvement)

In [12]:
## final corrected & verified one to be used
def calc_EI(y_pred, y_pred_un, y_pred_un_uncer, epsilon=0.01):
    """Expected improvement (EI) of each candidate, split into its two terms.

    Parameters
    ----------
    y_pred : array-like
        Values whose maximum is used as the incumbent best (`y_best`).
    y_pred_un : array-like
        Predicted mean for each candidate.
    y_pred_un_uncer : array-like
        Predicted standard deviation for each candidate (same length as
        `y_pred_un`).
    epsilon : float, default 0.01
        Margin subtracted from the improvement before standardizing.

    Returns
    -------
    (EI, exploit, explore) : tuple of three lists
        All three lists have one entry per candidate, in input order:
        `exploit = sigma * z * Phi(z)`, `explore = sigma * phi(z)` and
        `EI = exploit + explore`, with `z = (mu - y_best - epsilon) / sigma`.
        Candidates with `sigma == 0` get 0.0 in all three lists.

    Notes
    -----
    The previous implementation appended 0.0 only to `EI` for zero-uncertainty
    candidates, which left `exploit` and `explore` shorter than `EI` and
    misaligned with the candidates. All three outputs are now kept aligned.
    The computation is vectorized instead of looping in Python.
    """
    mu = np.asarray(y_pred_un, dtype=float)
    sigma = np.asarray(y_pred_un_uncer, dtype=float)
    y_best = np.max(y_pred)

    # Only divide where sigma is non-zero; masked entries keep z = 0 and are
    # overwritten with 0.0 in the outputs below.
    has_uncertainty = sigma != 0
    z = np.zeros_like(mu)
    np.divide(mu - y_best - epsilon, sigma, out=z, where=has_uncertainty)

    # Standard normal CDF and PDF evaluated at z.
    cdf_z = 0.5 * (1 + erf(z / np.sqrt(2)))
    pdf_z = np.exp(-0.5 * z**2) / np.sqrt(2 * np.pi)

    exploit = np.where(has_uncertainty, sigma * z * cdf_z, 0.0)
    explore = np.where(has_uncertainty, sigma * pdf_z, 0.0)
    EI = exploit + explore
    return EI.tolist(), exploit.tolist(), explore.tolist()

In [None]:
## virtual search space for batch-5 (per the file name loaded below)
# NOTE(review): the previous comment read "batch-4 (electrolytes containing solvent
# combinations tested in batch-3 removed)", presumably carried over from an earlier
# notebook -- confirm which already-tested combinations were removed from this file.
# NOTE(review): `path` is a machine-specific absolute path and is not used in this
# cell; it is kept only in case later cells rely on it.
path = '/Users/riteshk/Library/CloudStorage/Box-Box/Research-postdoc/AD-AFB/data-codes-sharing/datasets'
df_unlabel = pd.read_csv('../../datasets/batch-5/virtual_search_space_for_batch5.csv')
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,salt_ecfp_pca_5,salt_ecfp_pca_6,salt_ecfp_pca_7,salt_ecfp_pca_8,salt_ecfp_pca_9,mol_wt_solv,mol_wt_salt,conc_salt_1,theor_capacity,amt_electrolyte
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,73.052764,186.939685,1,150,50
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,114.079313,186.939685,1,150,50
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,116.094963,186.939685,1,150,50
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,70.058995,186.939685,1,150,50
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,72.044939,186.939685,1,150,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999882,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,193.077264,144.001775,1,150,50
999883,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,240.147392,144.001775,1,150,50
999884,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,251.046358,144.001775,1,150,50
999885,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,248.133634,144.001775,1,150,50


In [14]:
# Select the feature columns by name (the same columns the scaler was fitted on)
# rather than by position. `iloc[:, 2:]` silently picks up the prediction /
# uncertainty / EI columns that later cells append to `df_unlabel`, so re-running
# this cell after those cells would feed the wrong columns to the scaler.
X_un = df_unlabel.loc[:, X.columns]
X_un_std = pd.DataFrame(std_scale.transform(X_un), columns=X_un.columns)

##### Calculate model weights & obtain aggregated mean ($\mu^{aggr}$), uncertainty ($\sigma^{aggr}$), & EI ($EI^{aggr}$)

In [None]:
# Calculate Model Weights using BMA (first order)
# For each saved surrogate: predict mean and std on the unlabelled search space,
# derive per-candidate weights, and predict on the labelled training data.
model_names = ['../../models/batch-5/pairwise_batch5_wo_nmc_data.pkl', '../../models/batch-5/matern_batch5_wo_nmc_data.pkl', '../../models/batch-5/rq_batch5_wo_nmc_data.pkl', '../../models/batch-5/rbf-ess_batch5_wo_nmc_data.pkl']
model_weights = []
uncertainties = []
predictions = []
y_label_preds = []
for model in model_names:
    # SECURITY: pickle.load can execute arbitrary code; only load checkpoints
    # produced by this project.
    with open(model, 'rb') as model_file:
        gpr = pickle.load(model_file)
    # One predict call returns both mean and std; previously the full search
    # space was predicted three times per model.
    y_un, individual_uncertainties = gpr.predict(X_un_std, return_std=True)
    predictions.append(y_un)
    uncertainties.append(individual_uncertainties)
    # NOTE(review): the density is evaluated at the predictive mean itself, so each
    # value equals 1 / (sigma * sqrt(2*pi)), and the normalization below sums over
    # candidates within one model (not across models). Confirm this is the intended
    # BMA weighting.
    likelihoods = norm.pdf(y_un, loc=y_un, scale=individual_uncertainties)
    prior_beliefs = 1.0  # Non-informative prior
    posterior = likelihoods * prior_beliefs
    model_weights.append(posterior / np.sum(posterior))
    # Predictions on the labelled data; their maximum is used as the incumbent in calc_EI.
    y_ = gpr.predict(X_std)
    y_label_preds.append(y_)

In [16]:
# Attach per-model uncertainty, prediction, explore/exploit and EI columns to
# `df_unlabel`. calc_EI is evaluated once per model (it was previously called
# three times per model, once for each returned component).
n_models = len(predictions)
ei_results = [
    calc_EI(y_label_preds[i], predictions[i], uncertainties[i])
    for i in range(n_models)
]

# Columns are added in the same order as before: all uncertainties, all
# predictions, explore/exploit pairs per model, then all EI columns.
for i in range(n_models):
    df_unlabel[f'uncertainty_{i + 1}'] = uncertainties[i]
for i in range(n_models):
    df_unlabel[f'prediction_{i + 1}'] = predictions[i]
for i, (_, exploit_i, explore_i) in enumerate(ei_results, start=1):
    df_unlabel[f'explore_{i}'] = explore_i
    df_unlabel[f'exploit_{i}'] = exploit_i
for i, (ei_i, _, _) in enumerate(ei_results, start=1):
    df_unlabel[f'EI_{i}'] = ei_i
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,explore_2,exploit_2,explore_3,exploit_3,explore_4,exploit_4,EI_1,EI_2,EI_3,EI_4
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.000777,-0.000712,0.005881,-0.005137,0.000299,-0.000277,0.000135,0.000065,0.000744,0.000022
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.003526,-0.003135,0.003799,-0.003373,0.001924,-0.001736,0.000896,0.000392,0.000426,0.000188
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.000697,-0.000641,0.003781,-0.003360,0.000041,-0.000039,0.000105,0.000056,0.000421,0.000002
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.002951,-0.002627,0.006095,-0.005302,0.000268,-0.000249,0.000807,0.000324,0.000792,0.000019
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.000832,-0.000762,0.005835,-0.005102,0.000034,-0.000032,0.000136,0.000070,0.000733,0.000002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999882,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,0.000434,-0.000402,0.001649,-0.001496,0.000144,-0.000134,0.000034,0.000032,0.000154,0.000009
999883,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,0.010109,-0.008752,0.004149,-0.003688,0.012696,-0.010919,0.000636,0.001357,0.000461,0.001777
999884,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,0.004981,-0.004416,0.003777,-0.003365,0.005653,-0.005003,0.000325,0.000565,0.000412,0.000650
999885,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,0.001276,-0.001166,0.003267,-0.002922,0.000701,-0.000647,0.000009,0.000110,0.000345,0.000054


In [17]:
def calc_aggr_uncer(uncer_1, w_1, pred_1, uncer_2, w_2, pred_2, uncer_3, w_3, pred_3, uncer_4, w_4, pred_4):
    """Combine the uncertainties of four models into one aggregate uncertainty.

    Computes ``sqrt(sum_i w_i * (uncer_i**2 + (pred_i - pred_aggr)**2))`` where
    ``pred_aggr = sum_i w_i * pred_i``. Each term therefore contains both the
    model's own uncertainty and its disagreement with the weighted prediction.

    Parameters
    ----------
    uncer_1 .. uncer_4 : scalar or array-like
        Per-model uncertainties (squared inside the formula).
    w_1 .. w_4 : scalar
        Per-model weights. They are used as given and are NOT normalized here.
    pred_1 .. pred_4 : scalar or array-like
        Per-model predictions.

    Returns
    -------
    Same shape as the inputs after broadcasting (e.g. a pandas Series when
    called with Series): the aggregate uncertainty.
    """
    uncertainties = (uncer_1, uncer_2, uncer_3, uncer_4)
    predictions = (pred_1, pred_2, pred_3, pred_4)
    weights = (w_1, w_2, w_3, w_4)
    pred_aggr = w_1 * pred_1 + w_2 * pred_2 + w_3 * pred_3 + w_4 * pred_4
    # Accumulate in model order; named `variance` so the builtin `sum` is not shadowed
    variance = 0
    for weight, uncer, pred in zip(weights, uncertainties, predictions):
        variance = variance + weight * (uncer**2 + (pred - pred_aggr)**2)
    return np.sqrt(variance)

In [18]:
def _weighted_model_sum(column_prefix):
    """Return the sum over models 1-4 of df_unlabel['<column_prefix>_<k>'] * model_weights[k-1]."""
    total = df_unlabel[f'{column_prefix}_1'] * model_weights[0]
    for model_idx in range(1, 4):
        total = total + df_unlabel[f'{column_prefix}_{model_idx + 1}'] * model_weights[model_idx]
    return total


df_unlabel['prediction_aggr'] = _weighted_model_sum('prediction')

# calc_aggr_uncer expects (uncertainty, weight, prediction) triples for models 1..4, in that order
uncer_args = []
for model_idx in range(4):
    uncer_args += [
        df_unlabel[f'uncertainty_{model_idx + 1}'],
        model_weights[model_idx],
        df_unlabel[f'prediction_{model_idx + 1}'],
    ]
df_unlabel['uncertainty_aggr'] = calc_aggr_uncer(*uncer_args)

df_unlabel['explore_aggr'] = _weighted_model_sum('explore')
df_unlabel['exploit_aggr'] = _weighted_model_sum('exploit')
df_unlabel['ratio_aggr'] = df_unlabel['exploit_aggr'] / df_unlabel['explore_aggr']

## 'EI_aggr' is the final rank by which candidate electrolytes are selected for experimental validation
df_unlabel['EI_aggr'] = _weighted_model_sum('EI')
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.000135,0.000065,0.000744,0.000022,4.097530e-07,0.000525,9.244044e-09,-8.188891e-09,-0.885856,1.055152e-09
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.000896,0.000392,0.000426,0.000188,5.659211e-07,0.000556,1.708202e-08,-1.507869e-08,-0.882723,2.003329e-09
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.000105,0.000056,0.000421,0.000002,-6.094674e-08,0.000519,5.855438e-09,-5.258022e-09,-0.897972,5.974159e-10
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.000807,0.000324,0.000792,0.000019,8.758390e-07,0.000615,1.805648e-08,-1.581621e-08,-0.875930,2.240269e-09
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.000136,0.000070,0.000733,0.000002,2.224184e-07,0.000523,8.820891e-09,-7.808843e-09,-0.885267,1.012047e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999882,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,0.000034,0.000032,0.000154,0.000009,-3.368679e-07,0.000542,2.680237e-09,-2.451858e-09,-0.914792,2.283785e-10
999883,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,0.000636,0.001357,0.000461,0.001777,6.105515e-08,0.000550,2.585780e-08,-2.250759e-08,-0.870437,3.350213e-09
999884,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,0.000325,0.000565,0.000412,0.000650,-3.405767e-08,0.000541,1.499479e-08,-1.333090e-08,-0.889035,1.663892e-09
999885,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,0.000009,0.000110,0.000345,0.000054,-5.943331e-07,0.000670,4.773768e-09,-4.312389e-09,-0.903351,4.613787e-10


##### Save top 5000 predictions

In [31]:
# Rank all candidates by aggregate expected improvement, best first.
# sort_values (inplace=False) already returns a new DataFrame, so df_unlabel stays
# untouched without first making a full copy of the whole frame.
df_unlabel_ = df_unlabel.sort_values(by='EI_aggr', ascending=False)
# Keep the 5000 highest-ranked rows
df_unlabel_5000 = df_unlabel_.iloc[:5000, :]
df_unlabel_5000

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
111650,COCCOCC(F)(F)OCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.979777,-1.126940,1.238547,-0.543210,0.172293,0.267553,1.378745,0.361646,...,0.007056,0.008397,0.011778,0.006699,2.905207e-06,0.001296,1.762955e-07,-1.260665e-07,-0.715086,5.022901e-08
223361,COCCOCC(F)(OC)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.020865,-1.124835,1.256503,-0.548366,0.183966,0.306774,1.383833,0.462789,...,0.006459,0.007786,0.008185,0.004010,3.315528e-06,0.001394,1.598982e-07,-1.160243e-07,-0.725613,4.387391e-08
93220,COCCOCC(F)(F)COC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.020868,-1.127152,1.205747,-0.580401,0.170861,0.298518,1.393101,0.414904,...,0.006943,0.008367,0.011648,0.001020,2.815127e-06,0.001257,1.492377e-07,-1.061523e-07,-0.711297,4.308535e-08
193863,COCCOCCC(F)(F)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.995461,-1.139384,1.208259,-0.557954,0.205390,0.322203,1.391092,0.433611,...,0.006013,0.007293,0.009366,0.003285,3.122797e-06,0.001343,1.544774e-07,-1.120330e-07,-0.725239,4.244443e-08
66101,COCCOCCOC(F)(F)Cl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.013246,-1.132293,1.203856,-0.549821,0.238278,0.290024,1.338129,0.446974,...,0.006374,0.007571,0.009752,0.000709,3.138033e-06,0.001344,1.440662e-07,-1.018548e-07,-0.707000,4.221138e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829989,COCCN1C(=O)C(=O)N(CC(=O)OC)C1=O,[Li+].O=C1O[B-](F)(F)OC1=O,-0.620284,0.443967,1.775680,0.572759,-0.228114,-0.256229,0.581298,1.189885,...,0.002361,0.002484,0.000512,0.003409,3.916426e-07,0.000582,4.670398e-08,-3.941107e-08,-0.843848,7.292905e-09
158383,COC1CCCCCCC(=O)N1C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.546040,-0.390739,1.161412,0.038221,-0.386243,-0.019659,0.246157,-0.135880,...,0.003053,0.001591,0.000680,0.001518,1.019267e-06,0.000694,4.683361e-08,-3.954074e-08,-0.844281,7.292869e-09
36017,COCCOOOCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.860078,-1.003831,1.024917,-0.374903,-0.298655,0.368557,0.715066,0.615098,...,0.003628,0.002092,0.000005,0.000153,1.568246e-06,0.000806,4.143030e-08,-3.413845e-08,-0.823997,7.291851e-09
280350,COCCOCCOP(=O)(OCC(C)Cl)OCC(C)Cl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.287747,-0.693303,0.903049,-0.447747,-0.868605,-0.385971,0.496917,0.417255,...,0.003023,0.001594,0.000503,0.001349,1.180810e-06,0.000736,4.644902e-08,-3.915794e-08,-0.843030,7.291087e-09


In [32]:
## Only keep unique solvent combinations for selection purposes (keep='first' retains the
## first row per solvent in df_unlabel_5000's order); these compounds were manually searched
## in emolecules to find purchasable compounds.
## .copy() makes the result an independent frame, so the column assignment below no longer
## raises SettingWithCopyWarning.
df_unlabel_uniq = df_unlabel_5000.drop_duplicates(subset=['solv_comb_sm'], keep='first').copy()
## Rewrite each solvent SMILES via an RDKit round trip (MolFromSmiles -> MolToSmiles)
df_unlabel_uniq['solv_comb_sm'] = df_unlabel_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
## NOTE(review): this writes under '../datasets/' while other cells in this notebook use
## '../../datasets/' -- confirm which directory is intended.
df_unlabel_uniq[['solv_comb_sm', 'salt_comb_sm', 'prediction_aggr', 'uncertainty_aggr', 'explore_aggr', 'exploit_aggr', 'ratio_aggr', 'EI_aggr']].to_csv('../datasets/batch-5/top_5000_suggestions_batch5_uniq_solvents_wo_nmc_data.csv', index=False)
df_unlabel_uniq

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unlabel_uniq['solv_comb_sm'] = df_unlabel_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
111650,COCCOCC(F)(F)OCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.979777,-1.126940,1.238547,-0.543210,0.172293,0.267553,1.378745,0.361646,...,0.007056,0.008397,0.011778,0.006699,0.000003,0.001296,1.762955e-07,-1.260665e-07,-0.715086,5.022901e-08
223361,COCCOCC(F)(OC)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.020865,-1.124835,1.256503,-0.548366,0.183966,0.306774,1.383833,0.462789,...,0.006459,0.007786,0.008185,0.004010,0.000003,0.001394,1.598982e-07,-1.160243e-07,-0.725613,4.387391e-08
93220,COCCOCC(F)(F)COC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.020868,-1.127152,1.205747,-0.580401,0.170861,0.298518,1.393101,0.414904,...,0.006943,0.008367,0.011648,0.001020,0.000003,0.001257,1.492377e-07,-1.061523e-07,-0.711297,4.308535e-08
193863,COCCOCCC(F)(F)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.995461,-1.139384,1.208259,-0.557954,0.205390,0.322203,1.391092,0.433611,...,0.006013,0.007293,0.009366,0.003285,0.000003,0.001343,1.544774e-07,-1.120330e-07,-0.725239,4.244443e-08
66101,COCCOCCOC(F)(F)Cl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.013246,-1.132293,1.203856,-0.549821,0.238278,0.290024,1.338129,0.446974,...,0.006374,0.007571,0.009752,0.000709,0.000003,0.001344,1.440662e-07,-1.018548e-07,-0.707000,4.221138e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16879,COCCOCC(=O)Cl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.946012,-0.249386,1.407532,-0.251236,-0.029404,0.433643,0.317942,0.652580,...,0.003647,0.001775,0.000445,0.000370,0.000001,0.000762,4.439069e-08,-3.709750e-08,-0.835704,7.293196e-09
158383,COC1CCCCCCC(=O)N1C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.546040,-0.390739,1.161412,0.038221,-0.386243,-0.019659,0.246157,-0.135880,...,0.003053,0.001591,0.000680,0.001518,0.000001,0.000694,4.683361e-08,-3.954074e-08,-0.844281,7.292869e-09
36017,COCCOOOCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.860078,-1.003831,1.024917,-0.374903,-0.298655,0.368557,0.715066,0.615098,...,0.003628,0.002092,0.000005,0.000153,0.000002,0.000806,4.143030e-08,-3.413845e-08,-0.823997,7.291851e-09
280350,COCCOCCOP(=O)(OCC(C)Cl)OCC(C)Cl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.287747,-0.693303,0.903049,-0.447747,-0.868605,-0.385971,0.496917,0.417255,...,0.003023,0.001594,0.000503,0.001349,0.000001,0.000736,4.644902e-08,-3.915794e-08,-0.843030,7.291087e-09


In [22]:
# Load a previously saved table of top-5000 unique-solvent suggestions for batch 5.
# NOTE(review): this file name lacks the '_wo_nmc_data' suffix of the file written above, so it is
# presumably the suggestion list from the run that included NMC data -- confirm.
# `path` is not defined in this part of the notebook; it must come from an earlier cell.
df_unlabel_uniq_ = pd.read_csv(f'{path}/batch-5/top_5000_suggestions_batch5_uniq_solvents.csv')
df_unlabel_uniq_

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,theor_capacity,amt_electrolyte,pressure_type,rank,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,CCCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.920924,-1.603036,0.550274,0.249128,-0.250719,0.486272,0.282188,0.202565,...,150,50,2,1482.0,1.377135e-06,0.000729,1.646288e-08,-1.366268e-08,-4.879178,2.800201e-09
1,[CH2]CCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.653605,-1.424194,0.878123,-0.406314,-0.163474,0.121483,0.449710,0.405105,...,150,50,2,4619.0,1.554081e-06,0.000790,1.153646e-08,-9.914691e-09,-6.113494,1.621772e-09
2,CCCOP(=O)(OC)OOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.976774,-1.307574,0.817352,0.341483,-0.258352,0.456097,0.319158,0.267196,...,150,50,2,2813.0,1.414756e-06,0.000742,1.333075e-08,-1.127442e-08,-5.482795,2.056328e-09
3,COCP(=O)(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.879609,-0.651601,1.280382,-0.395084,0.304172,0.069591,0.741684,0.261875,...,150,50,2,1832.0,1.588343e-06,0.000811,1.585504e-08,-1.336783e-08,-5.374617,2.487215e-09
4,CCCOP(=O)(OOC)OOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.968314,-1.300380,0.783393,0.353633,-0.220336,0.434773,0.277325,0.253531,...,150,50,2,4515.0,1.329633e-06,0.000711,1.119901e-08,-9.563052e-09,-5.845538,1.635957e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4994,CCC(=O)CC(CCOC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.451813,-0.194807,0.881349,0.225478,-0.903288,-0.207091,0.387309,0.198267,...,150,50,2,1753.0,1.129130e-06,0.000668,1.558214e-08,-1.304327e-08,-5.137434,2.538868e-09
4995,FCCOCC(F)(F)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.645508,-1.686106,0.770590,-0.476692,0.716019,-0.145151,0.818991,0.095029,...,150,50,2,1188.0,1.599224e-06,0.000809,1.842024e-08,-1.526848e-08,-4.844427,3.151761e-09
4996,COCOCOP=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.599196,-0.969392,0.889094,-0.231780,-0.288586,0.057383,0.397675,0.209634,...,150,50,2,4176.0,1.480941e-06,0.000762,1.163702e-08,-9.947592e-09,-5.888148,1.689426e-09
4997,COCCOCCOCC(=O)OCC(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.660152,0.037751,1.085719,-0.276596,-0.883826,-0.435727,0.325705,0.298334,...,150,50,2,2200.0,1.200160e-06,0.000711,1.534372e-08,-1.305337e-08,-5.699309,2.290343e-09


In [24]:
# Round-trip the loaded solvent SMILES through RDKit, as was done for df_unlabel_uniq,
# so that the same solvent is written identically in both tables
df_unlabel_uniq_['solv_comb_sm'] = df_unlabel_uniq_['solv_comb_sm'].map(
    lambda smiles: Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
)
# Unique electrolytes: rows whose solvent AND salt appear in both suggestion tables.
# (Merge on ['solv_comb_sm'] alone to compare unique solvents instead.)
df_sugg_comm = df_unlabel_uniq_.merge(
    df_unlabel_uniq,
    on=['solv_comb_sm', 'salt_comb_sm'],
    how='inner',
)
df_sugg_comm

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0_x,solv_ecfp_pca_1_x,solv_ecfp_pca_2_x,solv_ecfp_pca_3_x,solv_ecfp_pca_4_x,solv_ecfp_pca_5_x,solv_ecfp_pca_6_x,solv_ecfp_pca_7_x,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr_y,uncertainty_aggr_y,explore_aggr_y,exploit_aggr_y,ratio_aggr_y,EI_aggr_y
0,[CH2]CCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.653605,-1.424194,0.878123,-0.406314,-0.163474,0.121483,0.449710,0.405105,...,0.002837,0.001603,0.001120,0.000470,0.000002,0.000876,4.809631e-08,-4.049486e-08,-0.841954,7.601449e-09
1,COCP(=O)(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.879609,-0.651601,1.280382,-0.395084,0.304172,0.069591,0.741684,0.261875,...,0.002826,0.001939,0.000943,0.000269,0.000002,0.000834,4.601585e-08,-3.866208e-08,-0.840190,7.353771e-09
2,COOP(=O)(Cl)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.613540,-1.126666,1.207040,-0.490748,0.394677,-0.154634,0.552317,0.208962,...,0.002763,0.001849,0.001140,0.000247,0.000002,0.000848,4.673772e-08,-3.926824e-08,-0.840183,7.469488e-09
3,COCCOC(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.199042,-1.114139,0.826715,-0.257876,-0.665296,-0.070154,0.607672,0.315459,...,0.003226,0.002056,0.001656,0.000523,0.000002,0.000916,5.689007e-08,-4.739405e-08,-0.833081,9.496012e-09
4,COOCCC=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.722664,-1.098510,1.021394,-0.346581,-0.141532,0.057438,0.270262,0.297829,...,0.003204,0.001923,0.000857,0.000406,0.000002,0.000865,4.899609e-08,-4.102966e-08,-0.837407,7.966432e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2327,COCCCCCCCC(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.262920,-1.527847,0.459590,0.478765,-1.112345,0.253068,0.803576,0.318226,...,0.002751,0.001613,0.001564,0.001026,0.000002,0.000843,5.294291e-08,-4.454459e-08,-0.841370,8.398322e-09
2328,COCCOCC(C)(C)OC(=O)Cl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.197712,-0.252969,1.488802,-0.398167,0.617184,0.247025,1.354607,0.026354,...,0.004896,0.003678,0.000687,0.002185,0.000002,0.000864,7.260999e-08,-5.907934e-08,-0.813653,1.353065e-08
2329,COCCCN1C(=O)CCCCC1C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.169587,-0.484007,1.174677,0.358038,-1.006078,0.638467,0.850387,0.255182,...,0.007990,0.004357,0.000964,0.002075,0.000001,0.000771,8.008038e-08,-6.406773e-08,-0.800043,1.601265e-08
2330,COCCCN1C(=O)C2CCCCN2C(=O)C1C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.367853,-0.158817,1.005564,0.630241,-0.970872,0.322566,1.134869,0.413049,...,0.006757,0.004050,0.000834,0.002175,0.000001,0.000718,7.248189e-08,-5.873287e-08,-0.810311,1.374901e-08


### Check how many suggested solvents are also in the fifth batch of the labeled dataset

In [None]:
# Load the labeled dataset; the next cell selects its rows by their `expt_test` value
df_label_all = pd.read_csv('../../datasets/label_all_ecfp_pca_add_feat_incl_b7_090824.csv')
df_label_all

Unnamed: 0,solv_comb_sm,salt_comb_sm,batch,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
0,COCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-0.845301,-0.995151,1.062720,-0.357552,-0.308720,0.309456,0.693325,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,COCCOC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.183255,-0.948338,0.615257,-0.582269,-1.010187,-0.522075,0.496896,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,...,0.02168,0.022967,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,...,0.00046,0.000360,0.000407,0.0006,0.000593,0.000447,0.000333,0.0,0.0,0.0
4,CCOCCOC(C)(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-0.879184,-1.539457,0.507075,-0.111830,0.800175,0.133053,0.926130,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,CCOCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
204,COCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
205,COCCCOCC(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
206,COCCCOCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0


In [26]:
# Labeled rows with expt_test == 5. .copy() makes df_b5 an independent frame so the
# column assignment below does not raise SettingWithCopyWarning.
df_b5 = df_label_all.loc[df_label_all['expt_test'] == 5].copy()
# Round-trip the solvent SMILES through RDKit (MolFromSmiles -> MolToSmiles)
df_b5['solv_comb_sm'] = df_b5['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
df_b5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_b5['solv_comb_sm'] = df_b5['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


Unnamed: 0,solv_comb_sm,salt_comb_sm,batch,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
152,COCCOCC(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,5.0,-1.171924,-1.054353,0.913877,-0.50998,-0.920358,-0.036806,0.730854,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
153,COCCOCC(OC)OC,[Li+].F[P-](F)(F)(F)(F)F,5.0,-1.171924,-1.054353,0.913877,-0.50998,-0.920358,-0.036806,0.730854,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
154,COCCOCC(OC)OC,[Li+].O=C1O[B-](F)(F)OC1=O,5.0,-1.171924,-1.054353,0.913877,-0.50998,-0.920358,-0.036806,0.730854,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
155,COCCCCCl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,5.0,-0.895787,-1.564556,0.936152,-0.167522,-0.491035,0.631484,0.790138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
156,COCCCCCl,[Li+].F[P-](F)(F)(F)(F)F,5.0,-0.895787,-1.564556,0.936152,-0.167522,-0.491035,0.631484,0.790138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
157,COCCCCCl,[Li+].O=C1O[B-](F)(F)OC1=O,5.0,-0.895787,-1.564556,0.936152,-0.167522,-0.491035,0.631484,0.790138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
158,COCC(F)(F)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,5.0,-0.682909,-1.492217,1.039659,-0.738617,0.390641,-0.038716,1.00685,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
159,COCC(F)(F)C(F)(F)F,[Li+].F[P-](F)(F)(F)(F)F,5.0,-0.682909,-1.492217,1.039659,-0.738617,0.390641,-0.038716,1.00685,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
160,COCC(F)(F)C(F)(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,5.0,-0.682909,-1.492217,1.039659,-0.738617,0.390641,-0.038716,1.00685,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
161,CCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,5.0,-1.084384,-1.014022,0.862732,0.014255,-0.374,0.614768,0.628171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [27]:
# One row per distinct solvent SMILES in df_b5, in order of first appearance
uniq_sm_b5 = pd.DataFrame({'solv_comb_sm': df_b5['solv_comb_sm'].unique()})
uniq_sm_b5

Unnamed: 0,solv_comb_sm
0,COCCOCC(OC)OC
1,COCCCCCl
2,COCC(F)(F)C(F)(F)F
3,CCOCCOC
4,COC1CCCCC1
5,COCCCCCOC
6,COCCCOC
7,COCCCOCCCOC
8,CS(=O)(=O)F
9,COC(CCl)(CCl)OC


In [28]:
# Right join keeps every batch-5 solvent; a solvent absent from df_unlabel_uniq gets NaN in all
# suggestion columns and is removed by dropna(). Note that dropna() also removes any matched
# row that has a missing value in any column.
df_comm = pd.merge(df_unlabel_uniq, uniq_sm_b5, on='solv_comb_sm', how='right').dropna()
df_comm

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,COCCOCC(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.171924,-1.054353,0.913877,-0.50998,-0.920358,-0.036806,0.730854,0.526809,...,0.004604,0.002669,0.000302,0.002132,2e-06,0.000931,6.710532e-08,-5.478247e-08,-0.816365,1.232285e-08
1,COCCCCCl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.895787,-1.564556,0.936152,-0.167522,-0.491035,0.631484,0.790138,0.755838,...,0.007367,0.004516,0.000646,0.001406,2e-06,0.000924,8.205282e-08,-6.511312e-08,-0.793551,1.693969e-08
2,COCC(F)(F)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.682909,-1.492217,1.039659,-0.738617,0.390641,-0.038716,1.00685,0.213856,...,0.002874,0.002083,0.000801,0.000597,2e-06,0.001005,5.235966e-08,-4.38809e-08,-0.838067,8.478761e-09
3,CCOCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.084384,-1.014022,0.862732,0.014255,-0.374,0.614768,0.628171,0.548904,...,0.00324,0.001844,0.00059,0.000215,2e-06,0.00085,4.494923e-08,-3.760441e-08,-0.836598,7.344812e-09
4,COC1CCCCC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.869688,-0.928813,0.864417,-0.633461,-0.657134,0.776594,0.134201,-0.833312,...,0.002966,0.001647,0.001108,0.001199,1e-06,0.000729,4.876561e-08,-4.11749e-08,-0.844343,7.590709e-09
5,COCCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.946328,-1.547746,0.906366,0.007913,-0.568101,0.750411,0.805634,0.679371,...,0.0068,0.004156,0.000821,0.0039,2e-06,0.00098,9.422403e-08,-7.458818e-08,-0.791605,1.963585e-08
6,COCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.878583,-1.488406,0.875382,-0.187068,-0.506088,0.578477,0.819152,0.74472,...,0.006589,0.003943,0.000657,0.002634,2e-06,0.000954,8.509265e-08,-6.793436e-08,-0.798357,1.71583e-08
7,COCCCOCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.633502,-1.410069,0.79863,-0.225138,-0.701008,0.879349,0.961948,0.459921,...,0.006566,0.004158,0.000736,0.005968,2e-06,0.000984,1.007691e-07,-7.883983e-08,-0.782381,2.192927e-08
10,COCCOCC(=O)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.314981,0.184491,1.622378,-0.105444,-0.221744,0.414074,0.45872,0.608244,...,0.006874,0.003421,0.00072,0.001229,1e-06,0.000792,6.99895e-08,-5.647774e-08,-0.806946,1.351176e-08


In [None]:
# Save df_comm (batch-5 solvents that were found in df_unlabel_uniq) to CSV, without the index column
df_comm.to_csv('../../datasets/batch-5/labeled_batch5_uniq_solvents_wo_nmc_data.csv', index=False)

In [35]:
# Attach the batch-5 rows to the unique-solvent suggestions:
#   1. left-join on the solvent SMILES; columns present in both frames keep the suggestion's
#      value, and df_b5's version gets the '_drop' suffix
#   2. drop every row containing a NaN (this runs while the '_drop' columns are still present)
#   3. discard the '_drop' columns
# NOTE(review): the join key is the solvent only, so a suggestion is paired with every df_b5 row
# of that solvent whatever its salt, while the salt column kept is the suggestion's -- confirm
# this is intended.
df_comm_label = (
    df_unlabel_uniq
    .merge(df_b5, on=['solv_comb_sm'], how='left', suffixes=('', '_drop'))
    .dropna()
    .loc[:, lambda merged: ~merged.columns.str.endswith('_drop')]
)
df_comm_label

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
188,COCCCOCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.633502,-1.410069,0.79863,-0.225138,-0.701008,0.879349,0.961948,0.459921,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
189,COCCCOCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.633502,-1.410069,0.79863,-0.225138,-0.701008,0.879349,0.961948,0.459921,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
190,COCCCOCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.633502,-1.410069,0.79863,-0.225138,-0.701008,0.879349,0.961948,0.459921,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
300,COCCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.946328,-1.547746,0.906366,0.007913,-0.568101,0.750411,0.805634,0.679371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
301,COCCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.946328,-1.547746,0.906366,0.007913,-0.568101,0.750411,0.805634,0.679371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
302,COCCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.946328,-1.547746,0.906366,0.007913,-0.568101,0.750411,0.805634,0.679371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
477,COCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.878583,-1.488406,0.875382,-0.187068,-0.506088,0.578477,0.819152,0.74472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
478,COCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.878583,-1.488406,0.875382,-0.187068,-0.506088,0.578477,0.819152,0.74472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
479,COCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.878583,-1.488406,0.875382,-0.187068,-0.506088,0.578477,0.819152,0.74472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
500,COCCCCCl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.895787,-1.564556,0.936152,-0.167522,-0.491035,0.631484,0.790138,0.755838,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [None]:
# Save df_comm_label (suggestions joined with their batch-5 labeled rows) to CSV, without the index column
df_comm_label.to_csv('../../datasets/batch-5/labeled_batch5_all_wo_nmc_data.csv', index=False)

In [30]:
# Print each batch-5 solvent that has no row in df_comm
uniq_sm_comm = df_comm['solv_comb_sm'].unique()
for smiles in uniq_sm_b5['solv_comb_sm']:
    if smiles in uniq_sm_comm:
        continue
    print(smiles)

CS(=O)(=O)F
COC(CCl)(CCl)OC
