## Notebook for performing active learning to optimize anode-free lithium metal battery electrolytes

**Note: Dataset do not contain any NMC data**

## Batch: 2

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, PandasTools, Fragments, rdMolDescriptors, Descriptors
from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser
# Silence non-critical RDKit warnings to minimize unnecessary outputs
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, RationalQuadratic, WhiteKernel, Matern, ConstantKernel, DotProduct, PairwiseKernel 
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from scipy.stats import norm
from scipy.optimize import minimize
from scipy.special import erf
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
# import plotly.express as px

### Reading & standardizing datasets

In [None]:
## added the labeled data from the batch-1 suggestions to the in-house dataset manually (labeled dataset for batch-2)
df = pd.read_csv('../../datasets/batch-2/label_data_post_batch1.csv') 
df

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,norm_capacity_14,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23
0,COCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.845301,-0.995151,1.062720,-0.357552,-0.308720,0.309456,0.693325,0.613072,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
1,COCCOC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.183255,-0.948338,0.615257,-0.582269,-1.010187,-0.522075,0.496896,0.301582,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
2,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,0.421934,...,0.022233,0.02168,0.022967,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
3,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,0.421934,...,0.000573,0.00046,0.000360,0.000407,0.0006,0.000593,0.000447,0.000333,0.0,0.0
4,CCOCCOC(C)(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.879184,-1.539457,0.507075,-0.111830,0.800175,0.133053,0.926130,-0.208395,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,CC1(Cl)CCCCC1,[Li+].F[P-](F)(F)(F)(F)F,0.737807,-1.412186,0.505232,-0.561888,0.444367,0.068359,-1.010924,0.569775,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
67,CC1(Cl)CCCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.737807,-1.412186,0.505232,-0.561888,0.444367,0.068359,-1.010924,0.569775,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
68,ClC1(Cl)CCCC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.696267,-1.429634,0.575233,-0.595873,0.518935,-0.141128,-0.932757,0.463879,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
69,ClC1(Cl)CCCC1,[Li+].F[P-](F)(F)(F)(F)F,0.696267,-1.429634,0.575233,-0.595873,0.518935,-0.141128,-0.932757,0.463879,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0


In [3]:
X = df.iloc[:,2:27] ## PCA-reduced solvent & salt descriptors and other non-molecular features
y = df['norm_capacity_3'] # normalized discharge capacity at 20th cycle (target variable)
std_scale = StandardScaler().fit(X)
X_std = std_scale.transform(X)
X_std = pd.DataFrame(X_std, columns=X.columns)

### Active learning workflow

#### Choose best hyperparameters for each kernel

In [4]:
def negative_log_likelihood_rbf(params):
    noise_level, length_scale, alpha = params
    kernel = RBF(length_scale=length_scale)
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42) 
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

def negative_log_likelihood_rq(params):
    noise_level, length_scale, alpha_k, alpha = params 
    kernel = RationalQuadratic(length_scale=length_scale, alpha=alpha_k)
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

def negative_log_likelihood_rbf_expsin(params):
    noise_level, length_scale, periodicity, alpha = params 
    kernel = RBF(length_scale=length_scale) + ExpSineSquared(length_scale=length_scale, periodicity=periodicity)
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42) 
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

def negative_log_likelihood_matern(params):
    noise_level, length_scale, alpha = params
    kernel = Matern(length_scale=length_scale, nu=1.5)
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

def negative_log_likelihood_pairwise(params):
    noise_level, length_scale, alpha = params
    kernel = PairwiseKernel(metric="polynomial")
    white_kernel = WhiteKernel(noise_level=noise_level)
    composite_kernel = kernel + white_kernel
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood

##### Pairwise kernel

In [5]:
initial_guess = [0.15, 0.01, 0.02] # initial guess for noise_level, length_scale, alpha
param_bounds = [(1e-4, 1.0), (1e-5, 50.0), (1e-4, 0.1)] # bounds for noise_level, length_scale, alpha
result = minimize(negative_log_likelihood_pairwise, initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha = optimized_hyperparameters

## enter theses optimized hyperparameters into the combined GP model manually
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha:", optimized_alpha)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized alpha: 0.0001


##### RationalQuadratic kernel

In [6]:
initial_guess = [0.15, 0.01, 0.01, 0.02] # initial guess for noise_level, length_scale, alpha_k, alpha
param_bounds = [(1e-4, 1.0), (1e-5, 50.0), (1e-5, 50.0), (1e-4, 0.1)] # bounds for noise_level, length_scale, alpha_k, alpha
result = minimize(negative_log_likelihood_rq, initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha_k, optimized_alpha = optimized_hyperparameters

## enter theses optimized hyperparameters into the combined GP model manually
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha_k:", optimized_alpha_k)
print("Optimized alpha:", optimized_alpha)



Optimized noise_level: 0.15000208188687805
Optimized length_scale: 0.010000150849441643
Optimized alpha_k: 0.009999220374994854
Optimized alpha: 0.0065986738306711605


##### Matern-3/2 kernel

In [7]:
initial_guess = [0.15, 0.01, 0.02] # initial guess for noise_level, length_scale, alpha
param_bounds = [(1e-4, 1.0), (1e-5, 50.0), (1e-4, 0.1)] # bounds for noise_level, length_scale, alpha
result = minimize(negative_log_likelihood_matern, initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha = optimized_hyperparameters

## enter theses optimized hyperparameters into the combined GP model manually
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha:", optimized_alpha)



Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized alpha: 0.008427094762648479


##### RBF-ExpineSquared kernel

In [8]:
initial_guess = [0.15, 0.01, 1.0, 0.02] # initial guess for noise_level, length_scale, periodicity, alpha
param_bounds = [(1e-4, 1.0), (1e-5, 50.0), (1e-2, 10.0), (1e-4, 0.1)] # bounds for noise_level, length_scale, periodicity, alpha
result = minimize(negative_log_likelihood_rbf_expsin, initial_guess, bounds=param_bounds)
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_periodicity, optimized_alpha = optimized_hyperparameters

## enter theses optimized hyperparameters into the combined GP model manually
print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized periodicity:", optimized_periodicity)
print("Optimized alpha:", optimized_alpha)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized periodicity: 1.0
Optimized alpha: 0.020360887827203904


#### Train surrogate models

Note: no need to run again, saved model checkpoints have been provided

In [None]:
## add optimized hyperparameters from previous step
optimized_pairwise_kernel = PairwiseKernel(metric="polynomial") + WhiteKernel(noise_level=0.15)
optimized_matern_kernel = Matern(length_scale=0.01, nu=1.5) + WhiteKernel(noise_level=0.15)
optimized_rbfexpsin_kernel = RBF(length_scale=0.01) + ExpSineSquared(length_scale=0.01, periodicity=1.0) + WhiteKernel(noise_level=0.15)
optimized_rq_kernel = RationalQuadratic(length_scale=0.010000150849441643, alpha=0.009999220374994854) + WhiteKernel(noise_level=0.15000208188687805)

gpr_models = [GaussianProcessRegressor(kernel=optimized_pairwise_kernel, alpha=0.0001, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_matern_kernel, alpha=0.008427094762648479, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_rq_kernel, alpha=0.0065986738306711605, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_rbfexpsin_kernel, alpha=0.020360887827203904, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)]

model_names = ['../../models/batch-2/pairwise_batch2_wo_nmc_data.pkl', '../../models/batch-2/matern_batch2_wo_nmc_data.pkl', '../../models/batch-2/rq_batch2_wo_nmc_data.pkl', '../../models/batch-2/rbf-ess_batch2_wo_nmc_data.pkl']
k = 0
for model in gpr_models:
    print("fitting model: ", k)
    model.fit(X_std, y)
    pickle.dump(model, open(model_names[k], 'wb'))
    k += 1

fitting model:  0


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


fitting model:  1
fitting model:  2
fitting model:  3


#### BMA: aggregate predictions 

##### Acquisition function (Expected improvement)

In [11]:
## final corrected & verified one to be used
def calc_EI(y_pred, y_pred_un, y_pred_un_uncer, epsilon=0.01):
    y_best = np.max(y_pred)
    EI = []
    explore = []
    exploit = []

    for i in range(len(y_pred_un)):
        if y_pred_un_uncer[i] != 0:
            
            # Calculate the cumulative distribution function (CDF) for the Gaussian distribution
            z = (y_pred_un[i] - y_best - epsilon) / y_pred_un_uncer[i]
            # z = (y_pred_un[i] - y_best - epsilon) / y_pred_un_uncer[i]
            cdf_z = 0.5 * (1 + erf(z / np.sqrt(2)))
            pdf_z = np.exp(-0.5 * z**2) / np.sqrt(2 * np.pi)

            # Calculate Expected Improvement
            expected_improvement = y_pred_un_uncer[i] * (z * cdf_z) + y_pred_un_uncer[i] * pdf_z
            exploitation = y_pred_un_uncer[i] * z * cdf_z
            exploration = y_pred_un_uncer[i] * pdf_z
            EI.append(expected_improvement)
            explore.append(exploration)
            exploit.append(exploitation)
        else:
            EI.append(0.0)
    return EI, exploit, explore

In [None]:
## virtual search space for batch-2 (electrolytes containing solvent combinations tested in batch-1 removed)
path = '/Users/riteshk/Library/CloudStorage/Box-Box/Research-postdoc/AD-AFB/data-codes-sharing/datasets'
df_unlabel = pd.read_csv(f'../../datasets/batch-2/virtual_search_space_for_batch2.csv')
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,salt_ecfp_pca_5,salt_ecfp_pca_6,salt_ecfp_pca_7,salt_ecfp_pca_8,salt_ecfp_pca_9,mol_wt_solv,mol_wt_salt,conc_salt_1,theor_capacity,amt_electrolyte
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,73.052764,186.939685,1.0,150,50
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,114.079313,186.939685,1.0,150,50
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,116.094963,186.939685,1.0,150,50
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,70.058995,186.939685,1.0,150,50
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,72.044939,186.939685,1.0,150,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999956,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,193.077264,144.001775,1.0,150,50
999957,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,240.147392,144.001775,1.0,150,50
999958,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,251.046358,144.001775,1.0,150,50
999959,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,248.133634,144.001775,1.0,150,50


In [8]:
X_un = df_unlabel.iloc[:,2:]
X_un_std = std_scale.transform(X_un)
X_un_std = pd.DataFrame(X_un_std, columns=X_un.columns)

##### Calculate model weights & obtained aggregated mean ($\mu^{aggr}$), uncertainty ($\sigma^{aggr}$), & EI ($EI^{aggr}$)

In [None]:
# Calculate model weights using BMA (first order)
model_names = ['../../models/batch-2/pairwise_batch2_wo_nmc_data.pkl', '../../models/batch-2/matern_batch2_wo_nmc_data.pkl', '../../models/batch-2/rq_batch2_wo_nmc_data.pkl', '../../models/batch-2/rbf-ess_batch2_wo_nmc_data.pkl']
model_weights = []
uncertainties = []
predictions = []
y_label_preds = []
for model in model_names:
    gpr = pickle.load(open(model, 'rb'))
    y_un = gpr.predict(X_un_std)
    predictions.append(y_un)
    individual_uncertainties = gpr.predict(X_un_std, return_std=True)[1]
    uncertainties.append(individual_uncertainties)
    likelihoods = norm.pdf(y_un, loc=gpr.predict(X_un_std), scale=individual_uncertainties)
    prior_beliefs = 1.0  # Non-informative prior
    posterior = likelihoods * prior_beliefs
    model_weights.append(posterior / np.sum(posterior))
    y_ = gpr.predict(X_std)
    y_label_preds.append(y_)

In [12]:
df_unlabel['uncertainty_1'] = uncertainties[0]; df_unlabel['uncertainty_2'] = uncertainties[1]; df_unlabel['uncertainty_3'] = uncertainties[2]; df_unlabel['uncertainty_4'] = uncertainties[3]
df_unlabel['prediction_1'] = predictions[0]; df_unlabel['prediction_2'] = predictions[1]; df_unlabel['prediction_3'] = predictions[2]; df_unlabel['prediction_4'] = predictions[3]
df_unlabel['explore_1'] = calc_EI(y_label_preds[0], predictions[0], uncertainties[0])[2]; df_unlabel['exploit_1'] = calc_EI(y_label_preds[0], predictions[0], uncertainties[0])[1]
df_unlabel['explore_2'] = calc_EI(y_label_preds[1], predictions[1], uncertainties[1])[2]; df_unlabel['exploit_2'] = calc_EI(y_label_preds[1], predictions[1], uncertainties[1])[1]
df_unlabel['explore_3'] = calc_EI(y_label_preds[2], predictions[2], uncertainties[2])[2]; df_unlabel['exploit_3'] = calc_EI(y_label_preds[2], predictions[2], uncertainties[2])[1]
df_unlabel['explore_4'] = calc_EI(y_label_preds[3], predictions[3], uncertainties[3])[2]; df_unlabel['exploit_4'] = calc_EI(y_label_preds[3], predictions[3], uncertainties[3])[1]
df_unlabel['EI_1'] = calc_EI(y_label_preds[0], predictions[0], uncertainties[0])[0]; df_unlabel['EI_2'] = calc_EI(y_label_preds[1], predictions[1], uncertainties[1])[0]; df_unlabel['EI_3'] = calc_EI(y_label_preds[2], predictions[2], uncertainties[2])[0]; df_unlabel['EI_4'] = calc_EI(y_label_preds[3], predictions[3], uncertainties[3])[0]
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,explore_2,exploit_2,explore_3,exploit_3,explore_4,exploit_4,EI_1,EI_2,EI_3,EI_4
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.078116,-0.054653,0.017039,-0.014138,0.005411,-0.004772,0.000909,0.023462,0.002902,0.000639
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.018762,-0.015959,0.008237,-0.007146,0.001237,-0.001129,0.000288,0.002803,0.001091,0.000108
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.044365,-0.035889,0.011423,-0.009775,0.004132,-0.003697,0.000552,0.008476,0.001648,0.000434
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.040114,-0.029456,0.017443,-0.014328,0.001115,-0.001012,0.001319,0.010658,0.003115,0.000102
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.088196,-0.060876,0.017866,-0.014797,0.008353,-0.007269,0.001097,0.027320,0.003069,0.001084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999956,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,0.054423,-0.043286,0.008527,-0.007409,0.007459,-0.006570,0.000446,0.011137,0.001118,0.000889
999957,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,0.149594,-0.104279,0.015668,-0.013275,0.072629,-0.056169,0.000414,0.045315,0.002393,0.016459
999958,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,0.134185,-0.096421,0.016950,-0.014285,0.049940,-0.040280,0.001020,0.037764,0.002665,0.009660
999959,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,0.145054,-0.102178,0.015858,-0.013423,0.057100,-0.045688,0.000432,0.042876,0.002435,0.011412


In [13]:
def calc_aggr_uncer(uncer_1, w_1, pred_1, uncer_2, w_2, pred_2, uncer_3, w_3, pred_3, uncer_4, w_4, pred_4):
    uncer = [uncer_1, uncer_2, uncer_3, uncer_4]
    pred = [pred_1, pred_2, pred_3, pred_4]
    weight = [w_1, w_2, w_3, w_4]
    pred_aggr = w_1 * pred_1 + w_2 * pred_2 + w_3 * pred_3 + w_4 * pred_4
    sum = 0
    for i in range(4):
        sum += weight[i] * (uncer[i]**2 + (pred[i] - pred_aggr)**2)
    aggr_uncer = np.sqrt(sum)
    return aggr_uncer

In [14]:
df_unlabel['prediction_aggr'] = df_unlabel['prediction_1'] * model_weights[0] + df_unlabel['prediction_2'] * model_weights[1] + df_unlabel['prediction_3'] * model_weights[2] + df_unlabel['prediction_4'] * model_weights[3]
df_unlabel['uncertainty_aggr'] = calc_aggr_uncer(df_unlabel['uncertainty_1'], model_weights[0], df_unlabel['prediction_1'], df_unlabel['uncertainty_2'], model_weights[1], df_unlabel['prediction_2'], df_unlabel['uncertainty_3'], model_weights[2], df_unlabel['prediction_3'], df_unlabel['uncertainty_4'], model_weights[3], df_unlabel['prediction_4'])
df_unlabel['explore_aggr'] = df_unlabel['explore_1'] * model_weights[0] + df_unlabel['explore_2'] * model_weights[1] + df_unlabel['explore_3'] * model_weights[2] + df_unlabel['explore_4'] * model_weights[3]
df_unlabel['exploit_aggr'] = df_unlabel['exploit_1'] * model_weights[0] + df_unlabel['exploit_2'] * model_weights[1] + df_unlabel['exploit_3'] * model_weights[2] + df_unlabel['exploit_4'] * model_weights[3]
df_unlabel['ratio_aggr'] = df_unlabel['exploit_aggr'] / df_unlabel['explore_aggr']

## 'EI_aggr' is the final rank by which candidate electrolytes are selected for experimental validation
df_unlabel['EI_aggr'] = df_unlabel['EI_1'] * model_weights[0] + df_unlabel['EI_2'] * model_weights[1] + df_unlabel['EI_3'] * model_weights[2] + df_unlabel['EI_4'] * model_weights[3]
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.000909,0.023462,0.002902,0.000639,1.045655e-06,0.000838,1.375794e-07,-1.014139e-07,-0.737130,3.616545e-08
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.000288,0.002803,0.001091,0.000108,1.065547e-07,0.000688,3.693773e-08,-3.181854e-08,-0.861410,5.119184e-09
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.000552,0.008476,0.001648,0.000434,-2.398983e-08,0.000758,6.653234e-08,-5.512061e-08,-0.828479,1.141173e-08
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.001319,0.010658,0.003115,0.000102,2.209771e-06,0.001059,1.217579e-07,-9.308156e-08,-0.764480,2.867639e-08
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.001097,0.027320,0.003069,0.001084,9.515432e-07,0.000830,1.465012e-07,-1.072664e-07,-0.732188,3.923477e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999956,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,0.000446,0.011137,0.001118,0.000889,-3.737529e-08,0.000755,7.330802e-08,-5.996219e-08,-0.817949,1.334583e-08
999957,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,0.000414,0.045315,0.002393,0.016459,-1.355062e-07,0.000886,1.678592e-07,-1.232984e-07,-0.734535,4.456077e-08
999958,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,0.001020,0.037764,0.002665,0.009660,-1.130417e-07,0.000878,1.496175e-07,-1.134177e-07,-0.758051,3.619987e-08
999959,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,0.000432,0.042876,0.002435,0.011412,-2.484985e-07,0.000912,1.541178e-07,-1.146201e-07,-0.743717,3.949769e-08


##### Save top 5000 predictions

In [15]:
df_unlabel_ = df_unlabel.copy()
df_unlabel_ = df_unlabel_.sort_values(by='EI_aggr', ascending=False)
df_unlabel_5000 = df_unlabel_.iloc[:5000,:]
df_unlabel_5000

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
207196,COCCC(C)(OC)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.045483,-1.465658,0.913304,-0.851081,-0.030090,-0.187659,1.595853,0.080466,...,0.038035,0.070159,0.023896,0.039930,5.521936e-06,0.002041,5.109404e-07,-1.283994e-07,-0.251300,3.825411e-07
272192,COCC(C)(F)C(COP(=O)(OC)OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.130713,-1.196227,1.118658,-0.892056,0.131801,-0.453687,1.396408,-0.036814,...,0.029840,0.135356,0.013196,0.031315,4.003125e-06,0.001743,4.565801e-07,-9.540137e-08,-0.208948,3.611788e-07
167901,COCC(OC)C(C)(C)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.156855,-1.486546,0.826975,-0.953303,-0.020519,-0.323493,1.480633,0.007891,...,0.039976,0.078700,0.019732,0.046074,4.754064e-06,0.001890,5.034308e-07,-1.431874e-07,-0.284423,3.602434e-07
208198,COCC(OC)C(C)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.184279,-1.417835,0.839451,-0.986497,-0.092615,-0.403107,1.407091,-0.000894,...,0.038288,0.084398,0.019804,0.043248,4.682284e-06,0.001876,5.000531e-07,-1.400304e-07,-0.280031,3.600227e-07
226,CCCOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.751319,-1.772512,0.326835,0.350208,0.166223,0.187497,0.105344,-0.006552,...,0.004383,0.135314,0.016373,0.001129,3.929492e-06,0.001696,2.929142e-07,5.727469e-08,0.195534,3.501889e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81983,CCCC(C)SC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.744410,-1.677199,-0.506302,0.464625,-0.352549,-1.058846,-0.488425,-0.382818,...,0.002692,0.055134,0.008002,0.002771,1.907888e-06,0.001083,2.409567e-07,-1.501806e-07,-0.623268,9.077613e-08
226784,CCCC(CC)CCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.879378,-1.691560,-0.088478,0.478065,-0.328738,-0.575478,-0.121147,-0.363175,...,0.004438,0.044867,0.008498,0.001704,2.361761e-06,0.001182,2.449773e-07,-1.542120e-07,-0.629495,9.076535e-08
894001,CC(=O)N(C)C(C)(C)C(=O)N(C)C(C)(C)C(=O)N(C)C(C)...,[Li+].O=C1O[B-](F)(F)OC1=O,-0.928565,0.447297,-0.226258,-0.485250,2.078580,-0.119245,0.112154,0.021120,...,0.008655,0.093726,0.005757,0.063352,5.745796e-08,0.000995,2.722653e-07,-1.815126e-07,-0.666675,9.075271e-08
530877,CC(C)(C)[Si](P=PP[Si](C(C)(C)C)(C(C)(C)C)C(C)(...,[Li+].F[P-](F)(F)(F)(F)F,-0.399583,-1.704371,0.441560,-0.213358,1.183242,-0.425614,0.443844,-0.230001,...,0.009094,0.085199,0.005132,0.029422,5.743457e-07,0.000922,2.721467e-07,-1.813947e-07,-0.666533,9.075201e-08


In [None]:
df_unlabel_uniq = df_unlabel_5000.drop_duplicates(subset=['solv_comb_sm'], keep='first') ## only keeping unique solvent combinations for selection purposes; these compounds were manually searched in emolecules to finf purchasable compounds
df_unlabel_uniq['solv_comb_sm'] = df_unlabel_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
df_unlabel_uniq[['solv_comb_sm', 'salt_comb_sm', 'prediction_aggr', 'uncertainty_aggr', 'explore_aggr', 'exploit_aggr', 'ratio_aggr', 'EI_aggr']].to_csv('../datasets/batch-2/top_5000_suggestions_batch2_uniq_solvents_wo_nmc_data.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unlabel_uniq['solv_comb_sm'] = df_unlabel_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


In [29]:
df_unlabel_uniq.shape

(3901, 53)

In [25]:
df_unlabel_uniq_ = pd.read_csv(f'{path}/batch-2/top_5000_suggestions_batch2_uniq_solvents.csv')
df_unlabel_uniq_

Unnamed: 0,solv_comb_sm,salt_comb_sm,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,COCCC(C)(OC)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,3.374448e-06,0.001478,2.400529e-07,-1.399644e-07,-0.583056,1.000885e-07
1,COCCOCC(C)(OC)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,3.494807e-06,0.001493,2.183637e-07,-1.199578e-07,-0.549348,9.840593e-08
2,COCC(C)(COC)COCCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,3.600224e-06,0.001515,2.155350e-07,-1.176734e-07,-0.545960,9.786158e-08
3,COCCC(F)(F)CC(C)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,3.523449e-06,0.001501,2.244459e-07,-1.292276e-07,-0.575763,9.521829e-08
4,COCCOCC(OC)C(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,3.366983e-06,0.001464,2.220979e-07,-1.286134e-07,-0.579084,9.348453e-08
...,...,...,...,...,...,...,...,...
3753,COCC(CCl)(CN(C)C)CN(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.704869e-06,0.000934,9.056705e-08,-7.238532e-08,-0.799246,1.818173e-08
3754,COCCCN(C)OC(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.457475e-06,0.000870,9.051584e-08,-7.233746e-08,-0.799169,1.817838e-08
3755,COOCCOC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,3.038464e-06,0.001279,8.835231e-08,-7.017828e-08,-0.794300,1.817404e-08
3756,COS(C)(OC)N(C)CC(C)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.748043e-06,0.000948,9.088904e-08,-7.271590e-08,-0.800051,1.817314e-08


In [30]:
df_unlabel_uniq_['solv_comb_sm'] = df_unlabel_uniq_['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
# df_sugg_comm = pd.merge(df_unlabel_uniq_, df_unlabel_uniq, on=['solv_comb_sm', 'salt_comb_sm'], how='inner')
df_sugg_comm = pd.merge(df_unlabel_uniq_, df_unlabel_uniq, on=['solv_comb_sm'], how='inner')
df_sugg_comm

Unnamed: 0,solv_comb_sm,salt_comb_sm_x,prediction_aggr_x,uncertainty_aggr_x,explore_aggr_x,exploit_aggr_x,ratio_aggr_x,EI_aggr_x,salt_comb_sm_y,solv_ecfp_pca_0,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr_y,uncertainty_aggr_y,explore_aggr_y,exploit_aggr_y,ratio_aggr_y,EI_aggr_y
0,COCCC(C)(OC)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.000003,0.001478,2.400529e-07,-1.399644e-07,-0.583056,1.000885e-07,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.045483,...,0.038035,0.070159,0.023896,0.039930,0.000006,0.002041,5.109404e-07,-1.283994e-07,-0.251300,3.825411e-07
1,COCCOCC(C)(OC)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.000003,0.001493,2.183637e-07,-1.199578e-07,-0.549348,9.840593e-08,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.202645,...,0.026119,0.040530,0.023894,0.013687,0.000006,0.002077,4.558017e-07,-1.969775e-07,-0.432156,2.588242e-07
2,COCC(C)(COC)COCCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.000004,0.001515,2.155350e-07,-1.176734e-07,-0.545960,9.786158e-08,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.919862,...,0.028458,0.042501,0.020268,0.019680,0.000007,0.002185,4.753646e-07,-1.782574e-07,-0.374991,2.971072e-07
3,COCCC(F)(F)CC(C)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.000004,0.001501,2.244459e-07,-1.292276e-07,-0.575763,9.521829e-08,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.872025,...,0.032452,0.048856,0.019483,0.030252,0.000006,0.002117,4.932741e-07,-1.703440e-07,-0.345333,3.229302e-07
4,COCCOCC(OC)C(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.000003,0.001464,2.220979e-07,-1.286134e-07,-0.579084,9.348453e-08,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.298547,...,0.029832,0.051733,0.023027,0.016857,0.000006,0.001992,4.668306e-07,-1.875105e-07,-0.401667,2.793200e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2261,CON(OC)C(C)(C)C(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.000002,0.001011,9.027027e-08,-7.205947e-08,-0.798264,1.821081e-08,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.890848,...,0.014388,0.038291,0.007287,0.007678,0.000002,0.001179,2.828456e-07,-1.911715e-07,-0.675887,9.167407e-08
2262,CCC(OC)C(C)(Cl)Cl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.000002,0.001044,9.040837e-08,-7.219833e-08,-0.798580,1.821004e-08,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.069380,...,0.016860,0.029441,0.007374,0.010147,0.000003,0.001254,2.918759e-07,-1.995480e-07,-0.683674,9.232783e-08
2263,COP(=O)(OC)C(P(=O)(OC)OC)(P(=O)(OC)OC)P(=O)(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.000001,0.000847,9.019680e-08,-7.200153e-08,-0.798271,1.819527e-08,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.585034,...,0.012958,0.067662,0.005333,0.012059,0.000001,0.000993,2.816243e-07,-1.833975e-07,-0.651213,9.822683e-08
2264,CCC(OC)C(C)(C)OOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.000002,0.001042,9.027389e-08,-7.208313e-08,-0.798494,1.819076e-08,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.076093,...,0.016928,0.030414,0.007528,0.009588,0.000003,0.001255,2.926704e-07,-1.993926e-07,-0.681287,9.327784e-08


### Check how many common solvents in fist batch of labeled dataset

In [None]:
df_label_all = pd.read_csv('../../datasets/label_all_ecfp_pca_add_feat_incl_b7_090824.csv')
df_label_all

Unnamed: 0,solv_comb_sm,salt_comb_sm,batch,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
0,COCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-0.845301,-0.995151,1.062720,-0.357552,-0.308720,0.309456,0.693325,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,COCCOC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.183255,-0.948338,0.615257,-0.582269,-1.010187,-0.522075,0.496896,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,...,0.02168,0.022967,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,...,0.00046,0.000360,0.000407,0.0006,0.000593,0.000447,0.000333,0.0,0.0,0.0
4,CCOCCOC(C)(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-0.879184,-1.539457,0.507075,-0.111830,0.800175,0.133053,0.926130,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,CCOCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
204,COCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
205,COCCCOCC(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
206,COCCCOCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0


In [5]:
df_b2 = df_label_all.loc[df_label_all['expt_test'] == 2]
df_b2['solv_comb_sm'] = df_b2['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
df_b2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_b2['solv_comb_sm'] = df_b2['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


Unnamed: 0,solv_comb_sm,salt_comb_sm,batch,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
98,COC(OC)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.0,-0.763499,-1.479645,0.935298,-0.813958,0.157691,-0.72405,0.577195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
99,COC(OC)C(F)(F)F,[Li+].F[P-](F)(F)(F)(F)F,2.0,-0.763499,-1.479645,0.935298,-0.813958,0.157691,-0.72405,0.577195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
100,COC(OC)C(F)(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,2.0,-0.763499,-1.479645,0.935298,-0.813958,0.157691,-0.72405,0.577195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
101,COCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.0,-0.900533,-1.482508,0.838779,-0.836133,0.037687,-0.440202,0.923087,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
102,COCC(F)(F)C(F)F,[Li+].F[P-](F)(F)(F)(F)F,2.0,-0.900533,-1.482508,0.838779,-0.836133,0.037687,-0.440202,0.923087,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
103,COCC(F)(F)C(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,2.0,-0.900533,-1.482508,0.838779,-0.836133,0.037687,-0.440202,0.923087,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
104,COC(CC(F)(F)F)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.0,-0.874616,-1.416077,0.73366,-0.734611,0.144525,-0.589251,0.707575,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
105,COCCC(C)(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.0,-0.898381,-1.467829,1.08657,-0.709556,0.226229,0.227359,1.573875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
106,COCC(C)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.0,-1.094894,-1.404056,0.585391,-0.764865,-1.127708,-0.580104,0.501909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
107,COCC(C)OC,[Li+].F[P-](F)(F)(F)(F)F,2.0,-1.094894,-1.404056,0.585391,-0.764865,-1.127708,-0.580104,0.501909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [6]:
uniq_sm_b2 = df_b2['solv_comb_sm'].unique()
uniq_sm_b2 = pd.DataFrame(uniq_sm_b2, columns=['solv_comb_sm'])
uniq_sm_b2

Unnamed: 0,solv_comb_sm
0,COC(OC)C(F)(F)F
1,COCC(F)(F)C(F)F
2,COC(CC(F)(F)F)OC
3,COCCC(C)(OC)OC
4,COCC(C)OC
5,CO[Si](CCC(F)(F)C(F)(F)C(F)(F)F)(OC)OC
6,COCCOCOCC(F)(F)F
7,COC(CCl)OC
8,COC(F)(F)C(F)Cl
9,COC(C)CC(OC)OC


In [17]:
df_comm = df_unlabel_uniq.merge(uniq_sm_b2, on='solv_comb_sm', how='right')
df_comm.dropna(inplace=True)
df_comm

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,COC(OC)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.763499,-1.479645,0.935298,-0.813958,0.157691,-0.72405,0.577195,-0.183907,...,0.014202,0.031163,0.008857,0.009706,4e-06,0.001488,3.237248e-07,-2.086293e-07,-0.644465,1.150955e-07
1,COCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.900533,-1.482508,0.838779,-0.836133,0.037687,-0.440202,0.923087,0.022392,...,0.019412,0.029121,0.014165,0.010894,5e-06,0.00175,3.77074e-07,-2.242732e-07,-0.594773,1.528007e-07
3,COCCC(C)(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.898381,-1.467829,1.08657,-0.709556,0.226229,0.227359,1.573875,0.327258,...,0.028999,0.023406,0.012415,0.022004,5e-06,0.001868,4.151318e-07,-2.339786e-07,-0.563625,1.811532e-07
4,COCC(C)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.094894,-1.404056,0.585391,-0.764865,-1.127708,-0.580104,0.501909,0.331475,...,0.021304,0.024032,0.011263,0.005068,4e-06,0.001531,3.149348e-07,-2.054018e-07,-0.652204,1.09533e-07
5,CO[Si](CCC(F)(F)C(F)(F)C(F)(F)F)(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.693253,-1.413398,1.016682,-0.620584,0.391537,-0.151953,0.987994,0.129135,...,0.013915,0.03865,0.005465,0.004172,3e-06,0.001394,2.906551e-07,-1.808117e-07,-0.622083,1.098434e-07
8,COC(F)(F)C(F)Cl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.743406,-1.4879,0.912048,-0.821155,0.293529,-0.651772,0.598413,-0.220786,...,0.013443,0.021839,0.007821,0.007472,4e-06,0.001461,2.892629e-07,-1.981544e-07,-0.685032,9.110852e-08
10,CCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.937292,-1.613544,0.672441,0.163123,-0.54302,0.617387,0.624449,0.415481,...,0.011924,0.071964,0.009948,0.002412,4e-06,0.001518,3.23448e-07,-1.371471e-07,-0.424016,1.863008e-07


In [None]:
df_comm.to_csv('../../datasets/batch-2/labeled_batch2_uniq_solvents_wo_nmc_data.csv', index=False)

In [27]:
df_comm_label = df_unlabel_uniq.merge(df_b2, on=['solv_comb_sm'], how='left', suffixes=('', '_drop'))
df_comm_label.dropna(inplace=True)
df_comm_label = df_comm_label.loc[:, ~df_comm_label.columns.str.endswith('_drop')]
df_comm_label

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
208,CCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.937292,-1.613544,0.672441,0.163123,-0.54302,0.617387,0.624449,0.415481,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
237,COCCC(C)(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.898381,-1.467829,1.08657,-0.709556,0.226229,0.227359,1.573875,0.327258,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
498,COCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.900533,-1.482508,0.838779,-0.836133,0.037687,-0.440202,0.923087,0.022392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
499,COCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.900533,-1.482508,0.838779,-0.836133,0.037687,-0.440202,0.923087,0.022392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
500,COCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.900533,-1.482508,0.838779,-0.836133,0.037687,-0.440202,0.923087,0.022392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1634,COC(OC)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.763499,-1.479645,0.935298,-0.813958,0.157691,-0.72405,0.577195,-0.183907,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1635,COC(OC)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.763499,-1.479645,0.935298,-0.813958,0.157691,-0.72405,0.577195,-0.183907,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1636,COC(OC)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.763499,-1.479645,0.935298,-0.813958,0.157691,-0.72405,0.577195,-0.183907,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1947,CO[Si](CCC(F)(F)C(F)(F)C(F)(F)F)(OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.693253,-1.413398,1.016682,-0.620584,0.391537,-0.151953,0.987994,0.129135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1972,COCC(C)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.094894,-1.404056,0.585391,-0.764865,-1.127708,-0.580104,0.501909,0.331475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [None]:
df_comm_label.to_csv('../../datasets/batch-2/labeled_batch2_all_wo_nmc_data.csv', index=False)

In [18]:
df_uncomm = df_unlabel_uniq.merge(uniq_sm_b2, on='solv_comb_sm', how='left')
df_uncomm

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,COCCC(C)(OC)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.045483,-1.465658,0.913304,-0.851081,-0.030090,-0.187659,1.595853,0.080466,...,0.038035,0.070159,0.023896,0.039930,5.521936e-06,0.002041,5.109404e-07,-1.283994e-07,-0.251300,3.825411e-07
1,COCC(C)(F)C(COP(=O)(OC)OC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.130713,-1.196227,1.118658,-0.892056,0.131801,-0.453687,1.396408,-0.036814,...,0.029840,0.135356,0.013196,0.031315,4.003125e-06,0.001743,4.565801e-07,-9.540137e-08,-0.208948,3.611788e-07
2,COCC(OC)C(C)(C)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.156855,-1.486546,0.826975,-0.953303,-0.020519,-0.323493,1.480633,0.007891,...,0.039976,0.078700,0.019732,0.046074,4.754064e-06,0.001890,5.034308e-07,-1.431874e-07,-0.284423,3.602434e-07
3,COCC(OC)C(C)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.184279,-1.417835,0.839451,-0.986497,-0.092615,-0.403107,1.407091,-0.000894,...,0.038288,0.084398,0.019804,0.043248,4.682284e-06,0.001876,5.000531e-07,-1.400304e-07,-0.280031,3.600227e-07
4,CCCOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.751319,-1.772512,0.326835,0.350208,0.166223,0.187497,0.105344,-0.006552,...,0.004383,0.135314,0.016373,0.001129,3.929492e-06,0.001696,2.929142e-07,5.727469e-08,0.195534,3.501889e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3896,CCCCC(P)CC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.914695,-1.678418,-0.846483,0.870248,-0.267899,-0.189597,-0.420312,-0.374877,...,0.002404,0.059174,0.007543,0.002208,1.663656e-06,0.001032,2.340421e-07,-1.432614e-07,-0.612118,9.078067e-08
3897,CC(C)C(C)(C)OCl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.725371,-1.468186,0.455797,-0.773105,0.566757,-1.217821,0.544821,-0.536231,...,0.011878,0.038771,0.006138,0.009823,2.281196e-06,0.001183,2.817587e-07,-1.909802e-07,-0.677815,9.077844e-08
3898,CCCC(C)SC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.744410,-1.677199,-0.506302,0.464625,-0.352549,-1.058846,-0.488425,-0.382818,...,0.002692,0.055134,0.008002,0.002771,1.907888e-06,0.001083,2.409567e-07,-1.501806e-07,-0.623268,9.077613e-08
3899,CCCC(CC)CCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.879378,-1.691560,-0.088478,0.478065,-0.328738,-0.575478,-0.121147,-0.363175,...,0.004438,0.044867,0.008498,0.001704,2.361761e-06,0.001182,2.449773e-07,-1.542120e-07,-0.629495,9.076535e-08


In [19]:
uniq_sm_comm = df_comm['solv_comb_sm'].unique()
for i in range(len(uniq_sm_b2)):
    if uniq_sm_b2['solv_comb_sm'][i] not in uniq_sm_comm:
        print(uniq_sm_b2['solv_comb_sm'][i])

COC(CC(F)(F)F)OC
COCCOCOCC(F)(F)F
COC(CCl)OC
COC(C)CC(OC)OC
