## Notebook for training GPR on DC dataset and generating set of suggestions

**Note: The dataset does not contain any NMC data.**

## Batch: 7

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, PandasTools, Fragments, rdMolDescriptors, Descriptors, rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.PandasTools import ChangeMoleculeRendering
from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser
# Silence non-critical RDKit warnings to minimize unnecessary outputs
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
## import train_test_split from sklearn
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, RationalQuadratic, WhiteKernel, Matern, ConstantKernel, DotProduct, PairwiseKernel 
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.stats import norm
from scipy.optimize import minimize
from scipy.special import erf
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
%%bash
# Print the working directory and a long listing sorted oldest-first, so the relative '../../' paths used in later cells can be checked.
pwd
ls -ltr

/Users/riteshk/Library/CloudStorage/Box-Box/Research-postdoc/AD-AFB/Nat-Comm-R2/active-learning_wo_nmc_data
total 13392
-rw-r--r--@ 1 riteshk  staff   964972 Apr 17 21:42 active_learning_batch_1.ipynb
-rw-r--r--@ 1 riteshk  staff   984021 Apr 24 10:28 active_learning_batch_2.ipynb
-rw-r--r--@ 1 riteshk  staff  1113818 Apr 24 10:28 active_learning_batch_3.ipynb
-rw-r--r--@ 1 riteshk  staff  1164096 Apr 24 10:28 active_learning_batch_4.ipynb
-rw-r--r--@ 1 riteshk  staff  1002304 Apr 24 10:28 active_learning_batch_5.ipynb
-rw-r--r--@ 1 riteshk  staff  1059070 Apr 24 12:09 active_learning_batch_6.ipynb
-rw-r--r--@ 1 riteshk  staff   555953 Apr 24 12:17 active_learning_batch_7.ipynb


### Reading & standardizing datasets

In [3]:
# Row labels that are dropped from the labeled dataset in the next cell (df.drop(index=rem_till_b6)).
# NOTE(review): presumably these are the NMC entries accumulated through batch 6 — confirm against the data.
rem_till_b6 = [
    33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
    56, 57,
    60, 61, 62,
    66, 67, 68,
    70,
    73, 74, 75, 76, 77, 78, 79,
    104,
    110, 111, 112, 113,
    117, 118, 119,
]
rem_till_b6.extend([121, 123, 127, 128, 129])
rem_till_b6.extend([141, 143, 144, 145, 149, 150, 151])
rem_till_b6.extend([176, 177, 178, 181])
# Displayed as the cell output: total number of excluded rows.
len(rem_till_b6)

51

In [None]:
# Load the labeled data collected through batch 6, remove the excluded rows,
# and renumber the remaining rows from 0 (the old index is discarded).
df = (
    pd.read_csv('../../datasets/batch-7/label_data_post_batch6.csv')
    .drop(index=rem_till_b6)
    .reset_index(drop=True)
)
df

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,norm_capacity_14,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23
0,COCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.845301,-0.995151,1.062720,-0.357552,-0.308720,0.309456,0.693325,0.613072,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
1,COCCOC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.183255,-0.948338,0.615257,-0.582269,-1.010187,-0.522075,0.496896,0.301582,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
2,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,0.421934,...,0.022233,0.02168,0.022967,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
3,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,0.421934,...,0.000573,0.00046,0.000360,0.000407,0.0006,0.000593,0.000447,0.000333,0.0,0.0
4,CCOCCOC(C)(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.879184,-1.539457,0.507075,-0.111830,0.800175,0.133053,0.926130,-0.208395,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,COC1CCCCC1=O,[Li+].F[P-](F)(F)(F)(F)F,0.996265,-0.567873,1.168385,-0.494515,-0.269926,0.307456,-0.150987,-0.613881,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
144,COC1CCCCC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.996265,-0.567873,1.168385,-0.494515,-0.269926,0.307456,-0.150987,-0.613881,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
145,COCCS(=O)(=O)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.774656,-1.131648,1.061661,-0.391643,-0.188458,0.184919,0.733522,0.657518,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0
146,COCCSC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.819883,-1.431011,0.956715,-0.464875,-0.484067,0.206120,0.691785,0.673957,...,0.000000,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0


In [5]:
# Feature matrix: positional columns 2..26 of the labeled frame
# (PCA-reduced solvent & salt descriptors and other non-molecular features).
X = df.iloc[:, 2:27]

# Target variable (per the original note: normalized discharge capacity at 20th cycle).
y = df['norm_capacity_3']

# Fit the scaler on the labeled features only; the same fitted scaler is reused later for the unlabeled candidates.
std_scale = StandardScaler()
std_scale.fit(X)
X_std = pd.DataFrame(std_scale.transform(X), columns=X.columns)

In [None]:
# Write the current labeled frame `df` to CSV; index=False keeps the row index out of the file.
df.to_csv('../../datasets/label_all_ecfp_pca_wo_nmc_data.csv', index=False)

### Active learning workflow

#### Choose best hyperparameters for each kernel

In [6]:
def _training_set_nll(kernel, noise_level, alpha):
    """Fit a GPR with `kernel + WhiteKernel` on (X_std, y) and score it on the same data.

    Shared implementation behind the per-kernel objectives below.

    Parameters
    ----------
    kernel : sklearn kernel
        Structural part of the covariance (RBF, Matern, ...).
    noise_level : float
        Initial `noise_level` of the additive WhiteKernel.
    alpha : float
        Value passed as `alpha` to GaussianProcessRegressor.

    Returns
    -------
    float
        Negative sum of Gaussian log-densities of `y` under the model's predictive
        mean and standard deviation evaluated at the training inputs `X_std`.

    Notes
    -----
    Reads the module-level `X_std` and `y`.
    NOTE(review): the regressor is created with optimizer="fmin_l_bfgs_b", so scikit-learn
    re-optimizes the kernel hyperparameters inside `fit`; the kernel values passed in here
    then act as starting points rather than fixed settings. This is consistent with the
    outer search returning its initial guess for noise_level/length_scale — confirm intent.
    """
    composite_kernel = kernel + WhiteKernel(noise_level=noise_level)
    gpr = GaussianProcessRegressor(kernel=composite_kernel, alpha=alpha, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)
    gpr.fit(X_std, y)
    pred_mean, pred_std = gpr.predict(X_std, return_std=True)
    log_likelihood = np.sum(norm.logpdf(y, loc=pred_mean, scale=pred_std))
    return -log_likelihood


def negative_log_likelihood_rbf(params):
    """Objective for an RBF + WhiteKernel GPR; `params` = (noise_level, length_scale, alpha)."""
    noise_level, length_scale, alpha = params
    return _training_set_nll(RBF(length_scale=length_scale), noise_level, alpha)


def negative_log_likelihood_rq(params):
    """Objective for a RationalQuadratic + WhiteKernel GPR; `params` = (noise_level, length_scale, alpha_k, alpha).

    `alpha_k` is the RationalQuadratic kernel's own `alpha`; the trailing `alpha` goes to the regressor.
    """
    noise_level, length_scale, alpha_k, alpha = params
    return _training_set_nll(RationalQuadratic(length_scale=length_scale, alpha=alpha_k), noise_level, alpha)


def negative_log_likelihood_rbf_expsin(params):
    """Objective for an RBF + ExpSineSquared + WhiteKernel GPR; `params` = (noise_level, length_scale, periodicity, alpha).

    The same `length_scale` is used for both the RBF and the ExpSineSquared terms.
    """
    noise_level, length_scale, periodicity, alpha = params
    kernel = RBF(length_scale=length_scale) + ExpSineSquared(length_scale=length_scale, periodicity=periodicity)
    return _training_set_nll(kernel, noise_level, alpha)


def negative_log_likelihood_matern(params):
    """Objective for a Matern (nu=1.5) + WhiteKernel GPR; `params` = (noise_level, length_scale, alpha)."""
    noise_level, length_scale, alpha = params
    return _training_set_nll(Matern(length_scale=length_scale, nu=1.5), noise_level, alpha)


def negative_log_likelihood_pairwise(params):
    """Objective for a polynomial PairwiseKernel + WhiteKernel GPR; `params` = (noise_level, length_scale, alpha).

    `length_scale` is unpacked so the parameter vector has the same layout as the other
    objectives, but it is not passed to the kernel, so the objective does not depend on it.
    """
    noise_level, length_scale, alpha = params  # length_scale intentionally unused (see docstring)
    return _training_set_nll(PairwiseKernel(metric="polynomial"), noise_level, alpha)

##### Pairwise kernel

In [7]:
# Outer search for the pairwise-kernel model; vector layout is (noise_level, length_scale, alpha).
initial_guess = [0.15, 0.01, 0.02]
param_bounds = [
    (1e-4, 1.0),   # noise_level
    (1e-5, 50.0),  # length_scale
    (1e-4, 0.1),   # alpha
]
result = minimize(fun=negative_log_likelihood_pairwise, x0=initial_guess, bounds=param_bounds)

# result.x holds the final parameter vector in the same order as initial_guess.
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha = optimized_hyperparameters

print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha:", optimized_alpha)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized alpha: 0.02000002616583303


##### RationalQuadratic kernel

In [8]:
# Outer search for the RationalQuadratic model; vector layout is (noise_level, length_scale, alpha_k, alpha).
initial_guess = [0.15, 0.01, 0.01, 0.02]
param_bounds = [
    (1e-4, 1.0),   # noise_level
    (1e-5, 50.0),  # length_scale
    (1e-5, 50.0),  # alpha_k
    (1e-4, 0.1),   # alpha
]
result = minimize(fun=negative_log_likelihood_rq, x0=initial_guess, bounds=param_bounds)

# result.x holds the final parameter vector in the same order as initial_guess.
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha_k, optimized_alpha = optimized_hyperparameters

print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha_k:", optimized_alpha_k)
print("Optimized alpha:", optimized_alpha)



Optimized noise_level: 0.16881592144304958
Optimized length_scale: 0.01010695613039165
Optimized alpha_k: 0.16153456472369815
Optimized alpha: 0.007884872005515384


##### Matern-3/2 kernel

In [9]:
# Outer search for the Matern-3/2 model; vector layout is (noise_level, length_scale, alpha).
initial_guess = [0.15, 0.01, 0.02]
param_bounds = [
    (1e-4, 1.0),   # noise_level
    (1e-5, 50.0),  # length_scale
    (1e-4, 0.1),   # alpha
]
result = minimize(fun=negative_log_likelihood_matern, x0=initial_guess, bounds=param_bounds)

# result.x holds the final parameter vector in the same order as initial_guess.
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_alpha = optimized_hyperparameters

print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized alpha:", optimized_alpha)



Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized alpha: 0.010685607486285659


##### RBF-ExpSineSquared kernel

In [10]:
# Outer search for the RBF + ExpSineSquared model; vector layout is (noise_level, length_scale, periodicity, alpha).
initial_guess = [0.15, 0.01, 1.0, 0.02]
param_bounds = [
    (1e-4, 1.0),   # noise_level
    (1e-5, 50.0),  # length_scale
    (1e-2, 10.0),  # periodicity
    (1e-4, 0.1),   # alpha
]
result = minimize(fun=negative_log_likelihood_rbf_expsin, x0=initial_guess, bounds=param_bounds)

# result.x holds the final parameter vector in the same order as initial_guess.
optimized_hyperparameters = result.x
optimized_noise_level, optimized_length_scale, optimized_periodicity, optimized_alpha = optimized_hyperparameters

print("Optimized noise_level:", optimized_noise_level)
print("Optimized length_scale:", optimized_length_scale)
print("Optimized periodicity:", optimized_periodicity)
print("Optimized alpha:", optimized_alpha)

Optimized noise_level: 0.15
Optimized length_scale: 0.01
Optimized periodicity: 1.0
Optimized alpha: 0.01999999204947953


#### Train surrogate models

Note: no need to run again, saved model checkpoints have been provided

In [None]:
## change all hyperparameters accordingly
## (the literal values below match the "Optimized ..." numbers printed by the search cells above)
optimized_pairwise_kernel = PairwiseKernel(metric="polynomial") + WhiteKernel(noise_level=0.15)
optimized_matern_kernel = Matern(length_scale=0.01, nu=1.5) + WhiteKernel(noise_level=0.15)
optimized_rbfexpsin_kernel = RBF(length_scale=0.01) + ExpSineSquared(length_scale=0.01, periodicity=1.0) + WhiteKernel(noise_level=0.15)
optimized_rq_kernel = RationalQuadratic(length_scale=0.01010695613039165, alpha=0.16153456472369815) + WhiteKernel(noise_level=0.16881592144304958)

# Order matters: gpr_models[i] is saved to model_names[i] (pairwise, matern, rq, rbf-ess).
gpr_models = [GaussianProcessRegressor(kernel=optimized_pairwise_kernel, alpha=0.02000002616583303, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_matern_kernel, alpha=0.010685607486285659, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_rq_kernel, alpha=0.007884872005515384, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42),
              GaussianProcessRegressor(kernel=optimized_rbfexpsin_kernel, alpha=0.01999999204947953, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=10, random_state=42)]

model_names = ['../../models/batch-7/pairwise_batch7_wo_nmc_data.pkl', '../../models/batch-7/matern_batch7_wo_nmc_data.pkl', '../../models/batch-7/rq_batch7_wo_nmc_data.pkl', '../../models/batch-7/rbf-ess_batch7_wo_nmc_data.pkl']

# Fit each surrogate on the standardized labeled data and write its checkpoint.
for k, (model, model_path) in enumerate(zip(gpr_models, model_names)):
    print("fitting model: ", k)
    model.fit(X_std, y)
    # Context manager guarantees the checkpoint file is flushed and closed, even if pickling fails.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

fitting model:  0


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


fitting model:  1
fitting model:  2
fitting model:  3


#### BMA: aggregate predictions

##### Acquisition function (Expected improvement)

In [12]:
## final corrected & verified one to be used
def calc_EI(y_pred, y_pred_un, y_pred_un_uncer, epsilon=0.01):
    """Expected improvement of each unlabeled candidate, split into its two terms.

    Parameters
    ----------
    y_pred : array-like
        Model predictions for the labeled set; their maximum is used as the incumbent `y_best`.
    y_pred_un : array-like
        Predicted means for the unlabeled candidates.
    y_pred_un_uncer : array-like
        Predicted standard deviations for the unlabeled candidates (same length as `y_pred_un`).
    epsilon : float, default 0.01
        Margin subtracted from the improvement before standardizing.

    Returns
    -------
    (EI, exploit, explore) : tuple of list of float
        Three lists, each with one entry per candidate, where
        z = (mu - y_best - epsilon) / sigma,
        exploit = sigma * z * Phi(z), explore = sigma * phi(z), EI = sigma * (z * Phi(z)) + sigma * phi(z).
        Candidates with zero uncertainty get 0.0 in all three lists, so the lists always stay
        aligned with the candidates (previously only `EI` received an entry in that case).
    """
    mu = np.asarray(y_pred_un, dtype=float)
    sigma = np.asarray(y_pred_un_uncer, dtype=float)
    y_best = np.max(y_pred)

    # Only divide where sigma is non-zero; masked entries keep z = 0 and are zeroed out below.
    has_uncertainty = sigma != 0
    z = np.zeros_like(mu)
    np.divide(mu - y_best - epsilon, sigma, out=z, where=has_uncertainty)

    # Standard normal CDF and PDF evaluated at z.
    cdf_z = 0.5 * (1 + erf(z / np.sqrt(2)))
    pdf_z = np.exp(-0.5 * z**2) / np.sqrt(2 * np.pi)

    # Same expressions (and operation order) as the scalar formulas, applied element-wise.
    expected_improvement = np.where(has_uncertainty, sigma * (z * cdf_z) + sigma * pdf_z, 0.0)
    exploitation = np.where(has_uncertainty, sigma * z * cdf_z, 0.0)
    exploration = np.where(has_uncertainty, sigma * pdf_z, 0.0)

    # Lists are returned to keep the original return type.
    return expected_improvement.tolist(), exploitation.tolist(), exploration.tolist()

In [13]:
## Virtual search space of candidate electrolytes, read from the batch-7 file below.
## NOTE(review): this comment previously read "virtual search space for batch-4 (electrolytes containing
## solvent combinations tested in batch-3 removed)", which looks carried over from an earlier batch notebook;
## presumably the analogous filtering was applied for batch-7 — TODO confirm.
## NOTE(review): `path` is an absolute, user-specific location (unlike the relative '../../datasets' paths
## used for the labeled data); it must be adjusted before running on another machine.
path = '/Users/riteshk/Library/CloudStorage/Box-Box/Research-postdoc/AD-AFB/data-codes-sharing/datasets'
df_unlabel = pd.read_csv(f'{path}/batch-7/virtual_search_space_for_batch7.csv')
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,salt_ecfp_pca_5,salt_ecfp_pca_6,salt_ecfp_pca_7,salt_ecfp_pca_8,salt_ecfp_pca_9,mol_wt_solv,mol_wt_salt,conc_salt_1,theor_capacity,amt_electrolyte
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,73.052764,186.939685,1,150,50
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,114.079313,186.939685,1,150,50
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,116.094963,186.939685,1,150,50
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,70.058995,186.939685,1,150,50
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.128733,-0.322339,0.258767,0.301779,-0.272648,72.044939,186.939685,1,150,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999821,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,193.077264,144.001775,1,150,50
999822,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,240.147392,144.001775,1,150,50
999823,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,251.046358,144.001775,1,150,50
999824,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,-0.387958,-0.076666,0.008496,0.161338,0.109063,248.133634,144.001775,1,150,50


In [14]:
# Candidate features: every column from position 2 onward.
X_un = df_unlabel.iloc[:, 2:]

# Apply the scaler already fitted on the labeled features (no refit), keeping the column names.
X_un_std = pd.DataFrame(std_scale.transform(X_un), columns=X_un.columns)

##### Calculate model weights & obtain aggregated mean ($\mu^{aggr}$), uncertainty ($\sigma^{aggr}$), & EI ($EI^{aggr}$)

In [None]:
# Calculate Model Weights using BMA (first order)
# For each saved surrogate: predict mean/std on the candidates, derive per-candidate weights,
# and predict on the labeled set (used later as the incumbent for expected improvement).
model_names = ['../../models/batch-7/pairwise_batch7_wo_nmc_data.pkl', '../../models/batch-7/matern_batch7_wo_nmc_data.pkl', '../../models/batch-7/rq_batch7_wo_nmc_data.pkl', '../../models/batch-7/rbf-ess_batch7_wo_nmc_data.pkl']
model_weights = []
uncertainties = []
predictions = []
y_label_preds = []
for model in model_names:
    # pickle.load can execute arbitrary code: only load checkpoints written by this project.
    # The context manager closes the file handle as soon as the model is deserialized.
    with open(model, 'rb') as model_file:
        gpr = pickle.load(model_file)
    # One predict call yields both mean and standard deviation (previously three separate passes over the candidates).
    y_un, individual_uncertainties = gpr.predict(X_un_std, return_std=True)
    predictions.append(y_un)
    uncertainties.append(individual_uncertainties)
    # The density is evaluated at its own mean, so it equals 1 / (sigma * sqrt(2*pi)) for every candidate.
    likelihoods = norm.pdf(y_un, loc=y_un, scale=individual_uncertainties)
    prior_beliefs = 1.0  # Non-informative prior
    posterior = likelihoods * prior_beliefs
    # NOTE(review): the normalization runs over the candidates of ONE model, so each entry of
    # model_weights is an array (one value per candidate, summing to 1 within the model); the
    # weights are not normalized across the four models — confirm this is the intended BMA form.
    model_weights.append(posterior / np.sum(posterior))
    y_ = gpr.predict(X_std)
    y_label_preds.append(y_)

In [16]:
# Attach per-model uncertainty, prediction, exploration/exploitation terms and EI to the candidate table.
# calc_EI is evaluated once per model (it was previously called three times per model, once per returned list).
n_models = len(predictions)

for m in range(n_models):
    df_unlabel[f'uncertainty_{m + 1}'] = uncertainties[m]
for m in range(n_models):
    df_unlabel[f'prediction_{m + 1}'] = predictions[m]

# calc_EI returns (EI, exploit, explore); EI is held back so the EI_* columns are added last,
# keeping the same column order as before.
ei_per_model = []
for m in range(n_models):
    ei_m, exploit_m, explore_m = calc_EI(y_label_preds[m], predictions[m], uncertainties[m])
    df_unlabel[f'explore_{m + 1}'] = explore_m
    df_unlabel[f'exploit_{m + 1}'] = exploit_m
    ei_per_model.append(ei_m)

for m, ei_m in enumerate(ei_per_model):
    df_unlabel[f'EI_{m + 1}'] = ei_m
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,explore_2,exploit_2,explore_3,exploit_3,explore_4,exploit_4,EI_1,EI_2,EI_3,EI_4
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,0.001675,-0.001517,0.006464,-0.005651,0.000413,-0.000383,2.114289e-05,0.000158,0.000813,0.000031
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,0.002430,-0.002187,0.004574,-0.004049,0.001019,-0.000931,5.550761e-05,0.000243,0.000525,0.000087
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,0.000626,-0.000577,0.004424,-0.003922,0.000116,-0.000109,4.273494e-06,0.000049,0.000503,0.000007
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,0.003753,-0.003330,0.006919,-0.006027,0.001425,-0.001292,9.528039e-05,0.000423,0.000891,0.000132
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,0.001571,-0.001425,0.006115,-0.005358,0.000378,-0.000351,1.946369e-05,0.000146,0.000756,0.000028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999821,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,0.000527,-0.000488,0.002952,-0.002647,0.000085,-0.000080,2.891597e-06,0.000040,0.000305,0.000005
999822,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,0.001972,-0.001788,0.003134,-0.002808,0.000468,-0.000433,1.540448e-05,0.000184,0.000326,0.000035
999823,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,0.001397,-0.001273,0.003403,-0.003040,0.000321,-0.000298,1.283842e-05,0.000124,0.000363,0.000023
999824,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,0.000715,-0.000659,0.003229,-0.002890,0.000060,-0.000057,9.509758e-07,0.000056,0.000339,0.000003


In [17]:
def calc_aggr_uncer(uncer_1, w_1, pred_1, uncer_2, w_2, pred_2, uncer_3, w_3, pred_3, uncer_4, w_4, pred_4):
    """Combine the uncertainties of four models into one aggregate uncertainty.

    The aggregate prediction is the weighted sum ``sum_i w_i * pred_i`` and the
    returned value is

        sqrt( sum_i w_i * (uncer_i**2 + (pred_i - pred_aggr)**2) )

    i.e. each model contributes its own variance plus its squared deviation
    from the aggregate prediction.

    Parameters
    ----------
    uncer_1 .. uncer_4 : scalar or array-like (e.g. pandas Series)
        Per-model uncertainty; it is squared inside the formula.
    w_1 .. w_4 : scalar
        Per-model weights. NOTE(review): the formula reads like a mixture
        variance, which presumes the weights sum to 1 -- this is not checked here.
    pred_1 .. pred_4 : scalar or array-like
        Per-model predictions, broadcast-compatible with the uncertainties.

    Returns
    -------
    Same shape as the inputs: the aggregated uncertainty (``np.sqrt`` of the
    weighted variance).
    """
    uncer = [uncer_1, uncer_2, uncer_3, uncer_4]
    pred = [pred_1, pred_2, pred_3, pred_4]
    weight = [w_1, w_2, w_3, w_4]
    pred_aggr = w_1 * pred_1 + w_2 * pred_2 + w_3 * pred_3 + w_4 * pred_4
    # Accumulate in model order; the name `variance` avoids shadowing the builtin `sum`.
    variance = 0
    for w_i, uncer_i, pred_i in zip(weight, uncer, pred):
        variance += w_i * (uncer_i**2 + (pred_i - pred_aggr)**2)
    return np.sqrt(variance)

In [18]:
def _weighted_sum(columns):
    """Return df_unlabel[columns[k]] * model_weights[k] summed over k, accumulated left to right."""
    total = df_unlabel[columns[0]] * model_weights[0]
    for k in range(1, len(columns)):
        total = total + df_unlabel[columns[k]] * model_weights[k]
    return total


# Weighted combination of the four per-model outputs into one aggregate column each.
df_unlabel['prediction_aggr'] = _weighted_sum(['prediction_1', 'prediction_2', 'prediction_3', 'prediction_4'])
df_unlabel['uncertainty_aggr'] = calc_aggr_uncer(
    uncer_1=df_unlabel['uncertainty_1'], w_1=model_weights[0], pred_1=df_unlabel['prediction_1'],
    uncer_2=df_unlabel['uncertainty_2'], w_2=model_weights[1], pred_2=df_unlabel['prediction_2'],
    uncer_3=df_unlabel['uncertainty_3'], w_3=model_weights[2], pred_3=df_unlabel['prediction_3'],
    uncer_4=df_unlabel['uncertainty_4'], w_4=model_weights[3], pred_4=df_unlabel['prediction_4'],
)
df_unlabel['explore_aggr'] = _weighted_sum(['explore_1', 'explore_2', 'explore_3', 'explore_4'])
df_unlabel['exploit_aggr'] = _weighted_sum(['exploit_1', 'exploit_2', 'exploit_3', 'exploit_4'])
# Ratio of the aggregated exploitation term to the aggregated exploration term.
df_unlabel['ratio_aggr'] = df_unlabel['exploit_aggr'] / df_unlabel['explore_aggr']

## 'EI_aggr' is the final rank by which candidate electrolytes are selected for experimental validation
df_unlabel['EI_aggr'] = _weighted_sum(['EI_1', 'EI_2', 'EI_3', 'EI_4'])
df_unlabel

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,CN(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.325380,-0.919052,-0.279367,-0.373535,0.936724,-0.161937,-0.377185,0.259444,...,2.114289e-05,0.000158,0.000813,0.000031,2.197736e-07,0.000516,9.280583e-09,-8.212216e-09,-0.884881,1.068367e-09
1,CN1CCN(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.013958,-0.868963,0.700464,0.728341,0.681048,-1.181697,0.018457,0.793475,...,5.550761e-05,0.000243,0.000525,0.000087,1.797495e-07,0.000511,8.892351e-09,-7.962856e-09,-0.895473,9.294946e-10
2,CN(C)C(=O)N(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.571448,0.079720,-0.117841,-0.342583,1.301206,-0.264839,-0.516489,0.217482,...,4.273494e-06,0.000049,0.000503,0.000007,-2.336839e-07,0.000547,5.306428e-09,-4.736602e-09,-0.892616,5.698258e-10
3,CB(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.152976,-1.321495,0.631553,-0.113769,0.728616,-0.486733,-0.311699,0.151570,...,9.528039e-05,0.000423,0.000891,0.000132,6.027135e-07,0.000559,1.437630e-08,-1.269898e-08,-0.883327,1.677322e-09
4,[CH2]N(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.283297,-0.876057,-0.318283,-0.547668,0.955202,-0.313588,-0.356929,0.299023,...,1.946369e-05,0.000146,0.000756,0.000028,1.595693e-07,0.000514,8.673957e-09,-7.689816e-09,-0.886541,9.841410e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999821,CC1ON(C)C(C)C1S(C)(=O)=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.771012,-1.078565,0.565077,0.216605,0.613562,-0.742397,-0.257837,0.243970,...,2.891597e-06,0.000040,0.000305,0.000005,-4.296195e-07,0.000583,3.602275e-09,-3.250286e-09,-0.902287,3.519891e-10
999822,COC(=O)C1(N2CCCN(C)C2=O)CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.881747,0.625623,1.808971,0.306581,0.123581,-0.469934,-0.451538,1.337094,...,1.540448e-05,0.000184,0.000326,0.000035,-3.354990e-07,0.000573,5.440780e-09,-4.913707e-09,-0.903125,5.270734e-10
999823,COC(=O)N1CCC(OS(C)(=O)=O)CC1=O,[Li+].O=C1O[B-](F)(F)OC1=O,0.700964,0.588889,1.459777,0.806459,-0.042994,-0.081386,0.196396,-0.195416,...,1.283842e-05,0.000124,0.000363,0.000023,-1.935858e-07,0.000533,5.243910e-09,-4.728246e-09,-0.901664,5.156632e-10
999824,CN(C)C(=O)CN(C)C(=O)C1CCCC1(F)F,[Li+].O=C1O[B-](F)(F)OC1=O,0.558235,0.889818,-0.240008,-0.734436,0.749283,0.799906,-0.856418,0.214675,...,9.509758e-07,0.000056,0.000339,0.000003,-6.422198e-07,0.000669,3.878268e-09,-3.492338e-09,-0.900489,3.859306e-10


##### Save top 5000 predictions: using EI

In [19]:
# Rank a copy of the candidates by aggregated EI (largest first) and keep the first 5000 rows;
# df_unlabel itself keeps its original row order.
df_unlabel_ = df_unlabel.copy()
df_unlabel_EI = df_unlabel_.sort_values('EI_aggr', ascending=False)
df_unlabel_EI_5000 = df_unlabel_EI.head(5000)
df_unlabel_EI_5000

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
302680,COCCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801624,-1.450690,0.826390,-0.295080,-0.147032,1.006596,1.540045,0.094906,...,0.011391,0.028903,0.004856,0.016248,2.200608e-06,0.001135,2.067450e-07,-1.349598e-07,-0.652784,7.178516e-08
86849,COCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.733879,-1.391349,0.795406,-0.490061,-0.085019,0.834661,1.553562,0.160255,...,0.010245,0.027209,0.004981,0.015165,2.194699e-06,0.001128,2.008737e-07,-1.332258e-07,-0.663232,6.764787e-08
163787,COCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.740833,-1.411703,0.814934,-0.453350,-0.102028,0.873072,1.559289,0.148085,...,0.010387,0.027447,0.004939,0.014643,2.189165e-06,0.001126,2.002533e-07,-1.328588e-07,-0.663454,6.739449e-08
297383,COCCCCOCCCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.776737,-1.511074,0.882413,-0.374763,-0.084476,0.831644,1.626411,0.214361,...,0.009960,0.026570,0.004339,0.014300,2.159075e-06,0.001115,1.950195e-07,-1.302508e-07,-0.667886,6.476876e-08
261974,COCCCCOCC(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.793614,-1.438015,0.929087,-0.416472,-0.097812,0.818234,1.708894,0.104504,...,0.009998,0.027206,0.004059,0.013668,2.152162e-06,0.001112,1.935233e-07,-1.289917e-07,-0.666544,6.453158e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557178,COCCCC(F)(F)C(C)(C)F,[Li+].F[P-](F)(F)(F)(F)F,-1.092402,-1.514164,1.042945,-0.491385,0.257059,0.368202,1.716456,0.310571,...,0.001225,0.003515,0.000556,0.001690,1.188154e-06,0.000742,4.890463e-08,-4.095622e-08,-0.837471,7.948406e-09
328715,CCCCOCCOCC(C)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.268444,-1.665272,0.063199,0.437676,-1.373638,0.289036,0.265833,-0.387373,...,0.001065,0.002926,0.001902,0.001390,1.153706e-06,0.000730,5.055914e-08,-4.261204e-08,-0.842816,7.947095e-09
310299,COP(Cl)(Cl)(C(C)(C)C)C(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.544877,-1.444331,1.132880,-0.893553,0.552419,-0.058011,1.101392,-0.175321,...,0.000950,0.003637,0.000729,0.001413,1.310937e-06,0.000769,4.870918e-08,-4.076368e-08,-0.836879,7.945505e-09
154433,COCCCN1CCN(C)C(C)(C)C1=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.459041,-0.740103,1.326976,0.602223,-0.458590,-0.005686,0.695918,1.862728,...,0.001608,0.003226,0.000932,0.002117,9.016408e-07,0.000672,5.018324e-08,-4.223836e-08,-0.841683,7.944885e-09


In [None]:
## only keeping unique solvent combinations for selection purposes; these compounds were manually searched in emolecules to find purchasable compounds
# The input is sorted by EI_aggr (descending), so keep='first' retains the best-ranked row per solvent.
# .copy() makes the result an independent frame, so the column assignment below no longer
# raises pandas' SettingWithCopyWarning.
df_unlabel_EI_uniq = df_unlabel_EI_5000.drop_duplicates(subset=['solv_comb_sm'], keep='first').copy()
# Rewrite every solvent SMILES in RDKit's canonical form so later merges compare like with like.
df_unlabel_EI_uniq['solv_comb_sm'] = df_unlabel_EI_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
# Persist only the identifying and aggregate columns of the shortlist.
df_unlabel_EI_uniq[['solv_comb_sm', 'salt_comb_sm', 'prediction_aggr', 'uncertainty_aggr', 'explore_aggr', 'exploit_aggr', 'ratio_aggr', 'EI_aggr']].to_csv('../../datasets/batch-7/top_5000_suggestions_batch7_uniq_solvents_wo_nmc_data_EI.csv', index=False)
df_unlabel_EI_uniq

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unlabel_EI_uniq['solv_comb_sm'] = df_unlabel_EI_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
302680,COCCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801624,-1.450690,0.826390,-0.295080,-0.147032,1.006596,1.540045,0.094906,...,0.011391,0.028903,0.004856,0.016248,2.200608e-06,0.001135,2.067450e-07,-1.349598e-07,-0.652784,7.178516e-08
86849,COCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.733879,-1.391349,0.795406,-0.490061,-0.085019,0.834661,1.553562,0.160255,...,0.010245,0.027209,0.004981,0.015165,2.194699e-06,0.001128,2.008737e-07,-1.332258e-07,-0.663232,6.764787e-08
163787,COCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.740833,-1.411703,0.814934,-0.453350,-0.102028,0.873072,1.559289,0.148085,...,0.010387,0.027447,0.004939,0.014643,2.189165e-06,0.001126,2.002533e-07,-1.328588e-07,-0.663454,6.739449e-08
297383,COCCCCOCCCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.776737,-1.511074,0.882413,-0.374763,-0.084476,0.831644,1.626411,0.214361,...,0.009960,0.026570,0.004339,0.014300,2.159075e-06,0.001115,1.950195e-07,-1.302508e-07,-0.667886,6.476876e-08
261974,COCCCCOCC(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.793614,-1.438015,0.929087,-0.416472,-0.097812,0.818234,1.708894,0.104504,...,0.009998,0.027206,0.004059,0.013668,2.152162e-06,0.001112,1.935233e-07,-1.289917e-07,-0.666544,6.453158e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242905,COCCCCC(C)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.324501,-0.990143,0.587856,-0.037702,-1.271595,-0.378606,0.513472,0.364708,...,0.001007,0.002774,0.001878,0.001298,1.303814e-06,0.000768,5.059380e-08,-4.263631e-08,-0.842718,7.957485e-09
63888,CCOCCP(=O)(CC)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.009239,-1.134707,0.815208,0.076368,-0.269881,0.374218,0.331322,0.397177,...,0.000532,0.002373,0.003144,0.000706,1.396598e-06,0.000790,4.901752e-08,-4.106291e-08,-0.837719,7.954616e-09
328715,CCCCOCCOCC(C)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.268444,-1.665272,0.063199,0.437676,-1.373638,0.289036,0.265833,-0.387373,...,0.001065,0.002926,0.001902,0.001390,1.153706e-06,0.000730,5.055914e-08,-4.261204e-08,-0.842816,7.947095e-09
310299,COP(Cl)(Cl)(C(C)(C)C)C(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.544877,-1.444331,1.132880,-0.893553,0.552419,-0.058011,1.101392,-0.175321,...,0.000950,0.003637,0.000729,0.001413,1.310937e-06,0.000769,4.870918e-08,-4.076368e-08,-0.836879,7.945505e-09


In [21]:
# Load a stored batch-7 EI suggestion table (unique solvents) from `{path}/batch-7/`.
# NOTE(review): `path` is defined outside this section; the file name lacks the
# 'wo_nmc_data' tag, so this is presumably the list from the run that included NMC data -- confirm.
df_unlabel_EI_uniq_ = pd.read_csv(f'{path}/batch-7/top_5000_suggestions_batch7_uniq_solvents_EI.csv')
df_unlabel_EI_uniq_

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,theor_capacity,amt_electrolyte,pressure_type,rank,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,COCCOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.863536,-1.039067,1.081461,-0.406529,-0.227115,0.260969,0.755056,0.576431,...,150,50,2,1608.0,1.768582e-06,0.000909,1.375098e-08,-1.214112e-08,-7.541743,1.609856e-09
1,COCCO[Al],[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.843809,-0.989662,1.069399,-0.365416,-0.311995,0.308275,0.678411,0.599304,...,150,50,2,1920.0,1.769618e-06,0.000908,1.251228e-08,-1.110397e-08,-7.884586,1.408313e-09
2,COCCB=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801731,-1.105580,1.083006,-0.354707,-0.315271,0.212797,0.668411,0.618927,...,150,50,2,1897.0,1.749084e-06,0.000903,1.270709e-08,-1.128760e-08,-7.951840,1.419495e-09
3,COCCOOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.867386,-1.041068,1.083540,-0.400942,-0.225466,0.304233,0.783784,0.599485,...,150,50,2,1190.0,1.899484e-06,0.000960,1.682915e-08,-1.482300e-08,-7.388799,2.006145e-09
4,COCCCC=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.046620,-1.064656,0.993638,-0.042687,-0.413475,0.534219,0.629863,0.690217,...,150,50,2,1334.0,1.772658e-06,0.000915,1.549996e-08,-1.364090e-08,-7.337516,1.859062e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4341,COCCOCCOC1CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.340693,-0.507196,0.871490,-0.409964,-1.059248,1.172802,0.773990,-0.333313,...,150,50,2,578.0,1.475336e-06,0.000844,2.328477e-08,-2.005211e-08,-6.202975,3.232660e-09
4342,COCCC1CCCC(=O)CC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.631822,-0.656586,1.123381,-0.360369,-0.423082,0.714197,0.992159,-0.337507,...,150,50,2,4264.0,1.261659e-06,0.000746,7.986280e-09,-7.167233e-09,-8.750702,8.190466e-10
4343,COCCCCCCCOCC=O,[Li+].O=C1O[B-](F)(F)OC1=O,-0.707783,-1.113542,0.971515,0.124799,-0.614998,0.978602,0.770082,0.267719,...,150,50,2,3241.0,1.343082e-06,0.000768,9.079260e-09,-8.082776e-09,-8.111294,9.964841e-10
4344,COCCC1CCCN(CC=O)C1,[Li+].O=C1O[B-](F)(F)OC1=O,1.183272,-0.146497,0.667712,0.318337,-1.237932,0.317185,1.474304,0.149830,...,150,50,2,2496.0,8.901077e-07,0.000652,1.066601e-08,-9.463832e-09,-7.872218,1.202181e-09


In [23]:
# Canonicalise the SMILES of the loaded suggestion table so they match the form used in df_unlabel_EI_uniq.
df_unlabel_EI_uniq_['solv_comb_sm'] = [
    Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    for smiles in df_unlabel_EI_uniq_['solv_comb_sm']
]
# Alternative (unique electrolytes): merge on both ['solv_comb_sm', 'salt_comb_sm'] instead.
# Here the overlap is computed on the solvent alone (unique solvents).
df_sugg_comm_EI = df_unlabel_EI_uniq_.merge(
    df_unlabel_EI_uniq,
    how='inner',
    on=['solv_comb_sm'],
)
df_sugg_comm_EI

Unnamed: 0,solv_comb_sm,salt_comb_sm_x,solv_ecfp_pca_0_x,solv_ecfp_pca_1_x,solv_ecfp_pca_2_x,solv_ecfp_pca_3_x,solv_ecfp_pca_4_x,solv_ecfp_pca_5_x,solv_ecfp_pca_6_x,solv_ecfp_pca_7_x,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr_y,uncertainty_aggr_y,explore_aggr_y,exploit_aggr_y,ratio_aggr_y,EI_aggr_y
0,COCCOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.863536,-1.039067,1.081461,-0.406529,-0.227115,0.260969,0.755056,0.576431,...,0.002432,0.006816,0.001046,0.003425,0.000002,0.000916,8.335992e-08,-6.669670e-08,-0.800105,1.666322e-08
1,COCCO[Al],[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.843809,-0.989662,1.069399,-0.365416,-0.311995,0.308275,0.678411,0.599304,...,0.002109,0.005997,0.000818,0.003010,0.000002,0.000907,7.573123e-08,-6.115652e-08,-0.807547,1.457471e-08
2,COCCB=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801731,-1.105580,1.083006,-0.354707,-0.315271,0.212797,0.668411,0.618927,...,0.002632,0.006548,0.000941,0.003439,0.000002,0.000903,8.233280e-08,-6.597195e-08,-0.801284,1.636086e-08
3,COCCOOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.867386,-1.041068,1.083540,-0.400942,-0.225466,0.304233,0.783784,0.599485,...,0.002472,0.007031,0.001147,0.003313,0.000002,0.000919,8.451689e-08,-6.752657e-08,-0.798971,1.699032e-08
4,COCCCC=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.046620,-1.064656,0.993638,-0.042687,-0.413475,0.534219,0.629863,0.690217,...,0.003267,0.007673,0.002792,0.004336,0.000002,0.000951,1.026523e-07,-8.107253e-08,-0.789778,2.157973e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513,CCCOCCCOCCCCOC,[Li+].O=C1O[B-](F)(F)OC1=O,-0.880135,-1.601934,0.533148,0.449946,-0.843933,1.229256,0.795983,0.214929,...,0.000970,0.003634,0.001123,0.001513,0.000001,0.000739,5.015293e-08,-4.205721e-08,-0.838579,8.095717e-09
3514,CCCOCCCCOCCCOC,[Li+].O=C1O[B-](F)(F)OC1=O,-0.880135,-1.601934,0.533148,0.449946,-0.843933,1.229256,0.795983,0.214929,...,0.000970,0.003634,0.001123,0.001513,0.000001,0.000739,5.015293e-08,-4.205721e-08,-0.838579,8.095717e-09
3515,BCCCCCCOC,[Li+].O=C1O[B-](F)(F)OC1=O,-0.925033,-1.540815,0.859871,0.021506,-0.581742,0.783126,0.821350,0.680388,...,0.001092,0.003439,0.001120,0.001628,0.000001,0.000822,5.337603e-08,-4.469209e-08,-0.837306,8.683944e-09
3516,COCCOCCOC1CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.340693,-0.507196,0.871490,-0.409964,-1.059248,1.172802,0.773990,-0.333313,...,0.001991,0.007307,0.001283,0.003194,0.000001,0.000803,7.735374e-08,-6.226052e-08,-0.804880,1.509323e-08


### Check how many solvents from the seventh batch of the labeled dataset are among the suggestions: using EI

In [None]:
# Load the labeled dataset; the rows with expt_test == 7 are selected from it in the next cell.
df_label_all = pd.read_csv('../../datasets/label_all_ecfp_pca_add_feat_incl_b7_090824.csv')
df_label_all

Unnamed: 0,solv_comb_sm,salt_comb_sm,batch,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
0,COCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-0.845301,-0.995151,1.062720,-0.357552,-0.308720,0.309456,0.693325,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,COCCOC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.183255,-0.948338,0.615257,-0.582269,-1.010187,-0.522075,0.496896,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,...,0.02168,0.022967,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-1.240814,-0.970769,0.671105,-0.607789,-1.124651,-0.436617,0.628824,...,0.00046,0.000360,0.000407,0.0006,0.000593,0.000447,0.000333,0.0,0.0,0.0
4,CCOCCOC(C)(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.0,-0.879184,-1.539457,0.507075,-0.111830,0.800175,0.133053,0.926130,...,0.00000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,CCOCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
204,COCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
205,COCCCOCC(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
206,COCCCOCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0


In [25]:
# Select the rows flagged as experimental test batch 7. The explicit .copy() detaches the
# subset from df_label_all, so the assignment below does not raise SettingWithCopyWarning.
df_b7 = df_label_all.loc[df_label_all['expt_test'] == 7].copy()
# Canonicalise the solvent SMILES with RDKit so they can be matched against the suggestion tables.
df_b7['solv_comb_sm'] = df_b7['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
df_b7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_b7['solv_comb_sm'] = df_b7['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


Unnamed: 0,solv_comb_sm,salt_comb_sm,batch,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test
199,COCCOS(=O)(=O)C(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
200,COCCOCC(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
201,COC1CCCCO1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
202,CCCCOCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
203,CCOCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
204,COCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
205,COCCCOCC(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
206,COCCCOCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0
207,COCCCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,7.0,,,,,,,,...,,,,,,,,,,7.0


In [26]:
# One-column frame of the distinct batch-7 solvent SMILES, in order of first appearance.
uniq_sm_b7 = pd.DataFrame({'solv_comb_sm': df_b7['solv_comb_sm'].unique()})
uniq_sm_b7

Unnamed: 0,solv_comb_sm
0,COCCOS(=O)(=O)C(F)(F)F
1,COCCOCC(F)F
2,COC1CCCCO1
3,CCCCOCCCOC
4,CCOCCCCOC
5,COCCCCOCC(F)(F)F
6,COCCCOCC(C)(C)C
7,COCCCOCC(F)(F)C(F)F
8,COCCCF


In [27]:
# Right-merge keeps one row per batch-7 solvent (in that order); solvents without a match in the
# EI shortlist come back filled with NaN and are removed by dropna().
df_comm_EI = (
    df_unlabel_EI_uniq
    .merge(uniq_sm_b7, how='right', on='solv_comb_sm')
    .dropna()
)
df_comm_EI

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_1,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
1,COCCOCC(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.095649,-0.965356,0.935134,-0.459764,-0.700674,-0.116387,0.853413,0.497956,...,0.001843,0.005286,0.001072,0.002439,1.59313e-06,0.000863,6.97396e-08,-5.691285e-08,-0.816076,1.282675e-08
2,COC1CCCCO1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,1.016226,-0.952835,1.050206,-0.993306,-0.181317,0.699064,0.240841,-0.729117,...,0.001599,0.003933,0.0015,0.00197,9.219843e-07,0.00068,5.510256e-08,-4.612454e-08,-0.837067,8.978017e-09
3,CCCCOCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.948577,-1.676686,0.484386,0.660827,-0.854012,1.322076,0.745136,0.142353,...,0.006298,0.0121,0.00342,0.008035,1.916427e-06,0.001008,1.396974e-07,-1.049203e-07,-0.751054,3.477707e-08
4,CCOCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.825595,-1.40477,0.601096,0.177182,-0.731191,1.110972,0.791357,0.32266,...,0.00588,0.013991,0.007192,0.007728,2.075048e-06,0.001062,1.574425e-07,-1.162994e-07,-0.738678,4.11431e-08
5,COCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.740833,-1.411703,0.814934,-0.45335,-0.102028,0.873072,1.559289,0.148085,...,0.010387,0.027447,0.004939,0.014643,2.189165e-06,0.001126,2.002533e-07,-1.328588e-07,-0.663454,6.739449e-08
6,COCCCOCC(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.786659,-1.417662,0.909559,-0.453183,-0.080803,0.779823,1.703168,0.116674,...,0.009872,0.026921,0.003975,0.01391,2.15221e-06,0.001112,1.930715e-07,-1.288073e-07,-0.667148,6.426424e-08
7,COCCCOCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.924801,-1.49945,0.749023,-0.542249,-0.491989,0.360325,1.438693,0.063002,...,0.006369,0.018534,0.004016,0.009281,2.033241e-06,0.001055,1.602412e-07,-1.151098e-07,-0.718353,4.513141e-08
8,COCCCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.888535,-1.532757,0.907942,-0.240821,-0.416103,0.506808,0.882704,0.708297,...,0.005984,0.012827,0.003082,0.008133,2.0682e-06,0.001049,1.422902e-07,-1.061526e-07,-0.746029,3.613763e-08


In [None]:
# Write the batch-7 solvents that also appear in the EI shortlist to CSV (row index omitted).
df_comm_EI.to_csv('../../datasets/batch-7/labeled_batch7_uniq_solvents_wo_nmc_data_EI.csv', index=False)

In [None]:
# df_comm_label = df_unlabel_EI_uniq.merge(df_b7, on=['solv_comb_sm'], how='left', suffixes=('', '_drop'))
# df_comm_label.dropna(inplace=True)
# df_comm_label = df_comm_label.loc[:, ~df_comm_label.columns.str.endswith('_drop')]
# df_comm_label

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,norm_capacity_15,norm_capacity_16,norm_capacity_17,norm_capacity_18,norm_capacity_19,norm_capacity_20,norm_capacity_21,norm_capacity_22,norm_capacity_23,expt_test


In [None]:
# df_comm_label.to_csv('../datasets/batch-7/labeled_batch7_all_wo_nmc_data.csv', index=False) # no need to save since ML models were not trained on this data

In [29]:
# Print every batch-7 solvent that did not make it into the EI shortlist overlap.
uniq_sm_comm = df_comm_EI['solv_comb_sm'].unique()
for smiles in uniq_sm_b7['solv_comb_sm']:
    if smiles in uniq_sm_comm:
        continue
    print(smiles)

COCCOS(=O)(=O)C(F)(F)F


##### Save top 5000 predictions: using greedy

In [30]:
# Greedy ranking: order a copy of the candidates by the plain (unweighted) mean of the four
# model predictions, highest first, and keep the first 5000 rows.
df_unlabel__ = df_unlabel.copy()
df_unlabel__['prediction_avg'] = (
    df_unlabel__
    .loc[:, ['prediction_1', 'prediction_2', 'prediction_3', 'prediction_4']]
    .mean(axis='columns')
)
df_unlabel_greed = df_unlabel__.sort_values('prediction_avg', ascending=False)
df_unlabel_greed_5000 = df_unlabel_greed.head(5000)
df_unlabel_greed_5000

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr,prediction_avg
43113,COCCCOCCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.637205,-1.420085,0.806963,-0.229475,-0.708541,0.911731,0.990308,0.472429,...,0.019294,0.005475,0.012299,0.000002,0.001163,1.875285e-07,-1.314580e-07,-0.701003,5.607048e-08,0.483545
122700,COCCCOCCO[AlH2],[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.668732,-1.441791,0.813432,-0.237769,-0.656388,0.923447,1.047836,0.455033,...,0.020539,0.006613,0.012095,0.000002,0.001150,1.921497e-07,-1.336854e-07,-0.695736,5.846430e-08,0.481648
162773,COCCCCCOCCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.721775,-1.496471,0.872212,-0.086397,-0.676900,0.979219,0.992173,0.364148,...,0.022099,0.006528,0.013557,0.000002,0.001144,1.986960e-07,-1.360901e-07,-0.684916,6.260597e-08,0.481360
36500,COCCCCOCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.652356,-1.457032,0.890263,-0.266606,-0.608907,0.841153,1.013318,0.390267,...,0.021114,0.006383,0.012794,0.000002,0.001139,1.928062e-07,-1.333576e-07,-0.691667,5.944855e-08,0.477486
302680,COCCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801624,-1.450690,0.826390,-0.295080,-0.147032,1.006596,1.540045,0.094906,...,0.028903,0.004856,0.016248,0.000002,0.001135,2.067450e-07,-1.349598e-07,-0.652784,7.178516e-08,0.476737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313736,COCCC1C(=O)N(C(C)(C)C)CCCN1C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.865239,-0.327340,1.239410,0.326160,-0.095916,-0.057758,1.683522,0.740168,...,0.005760,0.000994,0.003292,0.000001,0.000745,6.958697e-08,-5.681526e-08,-0.816464,1.277171e-08,0.271727
404488,COCCCCOC(F)F,[Li+].F[P-](F)(F)(F)(F)F,-1.200391,-1.606191,0.704259,-0.062248,-0.823102,0.301566,0.775988,0.472924,...,0.002463,0.001270,0.001117,0.000001,0.000746,4.310289e-08,-3.661641e-08,-0.849512,6.486478e-09,0.271713
175401,COCCCN1CCN(CC=O)CC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.157165,-0.674304,0.901134,0.706039,-0.612428,-0.003971,1.330715,1.518373,...,0.004755,0.001377,0.002845,0.000001,0.000741,6.660987e-08,-5.490847e-08,-0.824329,1.170140e-08,0.271700
60675,CCOCCOC(C)(C)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.028444,-1.454773,0.796553,-0.257764,0.402535,0.300929,1.077131,0.003570,...,0.003195,0.000800,0.001246,0.000001,0.000751,4.518240e-08,-3.806818e-08,-0.842545,7.114214e-09,0.271693


In [33]:
## only keeping unique solvent combinations for selection purposes; these compounds were manually searched in emolecules to find purchasable compounds
# The input is sorted by prediction_avg (descending), so keep='first' retains the best-ranked row
# per solvent. .copy() makes the result an independent frame, so the column assignment below no
# longer raises pandas' SettingWithCopyWarning.
df_unlabel_greed_uniq = df_unlabel_greed_5000.drop_duplicates(subset=['solv_comb_sm'], keep='first').copy()
# Rewrite every solvent SMILES in RDKit's canonical form so later merges compare like with like.
df_unlabel_greed_uniq['solv_comb_sm'] = df_unlabel_greed_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
# NOTE(review): this cell writes under '../datasets/' while the EI cells in this notebook read and
# write under '../../datasets/' -- confirm which directory is intended before re-running.
df_unlabel_greed_uniq[['solv_comb_sm', 'salt_comb_sm', 'explore_aggr', 'exploit_aggr', 'ratio_aggr', 'EI_aggr', 'prediction_avg']].to_csv('../datasets/batch-7/top_5000_suggestions_batch7_uniq_solvents_wo_nmc_data_greedy.csv', index=False)
df_unlabel_greed_uniq

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unlabel_greed_uniq['solv_comb_sm'] = df_unlabel_greed_uniq['solv_comb_sm'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))


Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr,prediction_avg
43113,COCCCOCCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.637205,-1.420085,0.806963,-0.229475,-0.708541,0.911731,0.990308,0.472429,...,0.019294,0.005475,0.012299,0.000002,0.001163,1.875285e-07,-1.314580e-07,-0.701003,5.607048e-08,0.483545
122700,COCCCOCCO[AlH2],[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.668732,-1.441791,0.813432,-0.237769,-0.656388,0.923447,1.047836,0.455033,...,0.020539,0.006613,0.012095,0.000002,0.001150,1.921497e-07,-1.336854e-07,-0.695736,5.846430e-08,0.481648
162773,COCCCCCOCCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.721775,-1.496471,0.872212,-0.086397,-0.676900,0.979219,0.992173,0.364148,...,0.022099,0.006528,0.013557,0.000002,0.001144,1.986960e-07,-1.360901e-07,-0.684916,6.260597e-08,0.481360
36500,COCCCCOCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.652356,-1.457032,0.890263,-0.266606,-0.608907,0.841153,1.013318,0.390267,...,0.021114,0.006383,0.012794,0.000002,0.001139,1.928062e-07,-1.333576e-07,-0.691667,5.944855e-08,0.477486
302680,COCCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.801624,-1.450690,0.826390,-0.295080,-0.147032,1.006596,1.540045,0.094906,...,0.028903,0.004856,0.016248,0.000002,0.001135,2.067450e-07,-1.349598e-07,-0.652784,7.178516e-08,0.476737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37703,COCCCN1CC(=O)N(C)C(=O)C1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.113020,-0.841355,1.191762,0.647325,-0.471654,-0.017506,0.929687,1.392153,...,0.003943,0.001258,0.002632,0.000001,0.000743,6.172139e-08,-5.123162e-08,-0.830047,1.048977e-08,0.271729
313736,COCCC1C(=O)N(C(C)(C)C)CCCN1C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.865239,-0.327340,1.239410,0.326160,-0.095916,-0.057758,1.683522,0.740168,...,0.005760,0.000994,0.003292,0.000001,0.000745,6.958697e-08,-5.681526e-08,-0.816464,1.277171e-08,0.271727
175401,COCCCN1CCN(CC=O)CC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.157165,-0.674304,0.901134,0.706039,-0.612428,-0.003971,1.330715,1.518373,...,0.004755,0.001377,0.002845,0.000001,0.000741,6.660987e-08,-5.490847e-08,-0.824329,1.170140e-08,0.271700
60675,CCOCCOC(C)(C)OC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.028444,-1.454773,0.796553,-0.257764,0.402535,0.300929,1.077131,0.003570,...,0.003195,0.000800,0.001246,0.000001,0.000751,4.518240e-08,-3.806818e-08,-0.842545,7.114214e-09,0.271693


In [32]:
# Load a stored batch-7 greedy suggestion table (unique solvents) from `{path}/batch-7/`.
# NOTE(review): `path` is defined outside this section; the file name lacks the
# 'wo_nmc_data' tag, so this is presumably the list from the run that included NMC data -- confirm.
df_unlabel_greed_uniq_ = pd.read_csv(f'{path}/batch-7/top_5000_suggestions_batch7_uniq_solvents_greedy.csv')
df_unlabel_greed_uniq_

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,amt_electrolyte,pressure_type,norm_capacity_3_avg_pred,rank,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr
0,COCCOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.863536,-1.039067,1.081461,-0.406529,-0.227115,0.260969,0.755056,0.576431,...,50,2,0.361556,963.0,0.000002,0.000909,1.375098e-08,-1.214112e-08,-7.541743,1.609856e-09
1,CCCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.920924,-1.603036,0.550274,0.249128,-0.250719,0.486272,0.282188,0.202565,...,50,2,0.272308,4969.0,0.000001,0.000742,4.638333e-09,-4.199308e-09,-9.565083,4.390247e-10
2,[CH2]CCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.653605,-1.424194,0.878123,-0.406314,-0.163474,0.121483,0.449710,0.405105,...,50,2,0.319932,2059.0,0.000002,0.000828,7.006357e-09,-6.317582e-09,-9.172202,6.887748e-10
3,COP(=O)(OC)OCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.621001,-1.115459,1.065245,-0.408802,0.071975,0.005767,0.326389,0.238781,...,50,2,0.282854,4145.0,0.000001,0.000760,4.322256e-09,-3.928531e-09,-9.977869,3.937245e-10
4,CCOP(=O)(OCC)OCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.104998,-0.671211,1.004061,0.135798,-0.179604,0.467863,0.406747,0.484254,...,50,2,0.276050,4683.0,0.000001,0.000747,4.879251e-09,-4.410976e-09,-9.419622,4.682753e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4991,COCCCS(C)(C)C,[Li+].O=C1O[B-](F)(F)OC1=O,-0.854245,-1.533978,0.921179,-0.305305,-0.172969,0.520020,1.193371,0.510727,...,50,2,0.279934,4361.0,0.000001,0.000749,4.680737e-09,-4.262400e-09,-10.188921,4.183368e-10
4992,CCCOCCOCCCOCCCOC,[Li+].O=C1O[B-](F)(F)OC1=O,-0.877738,-1.596169,0.522391,0.412222,-0.835663,1.200644,0.817318,0.236686,...,50,2,0.286137,3905.0,0.000001,0.000753,9.740583e-09,-8.683015e-09,-8.210357,1.057569e-09
4993,COCCOCCOC1CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.340693,-0.507196,0.871490,-0.409964,-1.059248,1.172802,0.773990,-0.333313,...,50,2,0.341674,1358.0,0.000001,0.000844,2.328477e-08,-2.005211e-08,-6.202975,3.232660e-09
4994,COCCC1CCCC(=O)CC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.631822,-0.656586,1.123381,-0.360369,-0.423082,0.714197,0.992159,-0.337507,...,50,2,0.281449,4240.0,0.000001,0.000746,7.986280e-09,-7.167233e-09,-8.750702,8.190466e-10


In [35]:
# Round-trip every solvent SMILES through RDKit (parse, then re-serialise) so the
# strings are in RDKit's canonical form; presumably this is what lets them match
# the `solv_comb_sm` values of `df_unlabel_greed_uniq` in the join below.
# NOTE: this overwrites the column of `df_unlabel_greed_uniq_` in place.
df_unlabel_greed_uniq_['solv_comb_sm'] = df_unlabel_greed_uniq_['solv_comb_sm'].map(
    lambda smi: Chem.MolToSmiles(Chem.MolFromSmiles(smi))
)

# Inner join on the solvent SMILES only, i.e. suggestions are matched per unique
# solvent. To match per unique electrolyte instead, join on
# ['solv_comb_sm', 'salt_comb_sm'].
df_sugg_comm_greed = df_unlabel_greed_uniq_.merge(
    df_unlabel_greed_uniq,
    how='inner',
    on=['solv_comb_sm'],
)
df_sugg_comm_greed

Unnamed: 0,solv_comb_sm,salt_comb_sm_x,solv_ecfp_pca_0_x,solv_ecfp_pca_1_x,solv_ecfp_pca_2_x,solv_ecfp_pca_3_x,solv_ecfp_pca_4_x,solv_ecfp_pca_5_x,solv_ecfp_pca_6_x,solv_ecfp_pca_7_x,...,EI_2,EI_3,EI_4,prediction_aggr_y,uncertainty_aggr_y,explore_aggr_y,exploit_aggr_y,ratio_aggr_y,EI_aggr_y,prediction_avg
0,COCCOF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.863536,-1.039067,1.081461,-0.406529,-0.227115,0.260969,0.755056,0.576431,...,0.006816,0.001046,0.003425,0.000002,0.000916,8.335992e-08,-6.669670e-08,-0.800105,1.666322e-08,0.360816
1,CCCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.920924,-1.603036,0.550274,0.249128,-0.250719,0.486272,0.282188,0.202565,...,0.002775,0.003349,0.001020,0.000001,0.000804,5.546364e-08,-4.624588e-08,-0.833805,9.217765e-09,0.304128
2,[CH2]CCOOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.653605,-1.424194,0.878123,-0.406314,-0.163474,0.121483,0.449710,0.405105,...,0.004021,0.001674,0.001752,0.000002,0.000836,6.121304e-08,-5.076171e-08,-0.829263,1.045133e-08,0.320825
3,COP(=O)(OC)OCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.621001,-1.115459,1.065245,-0.408802,0.071975,0.005767,0.326389,0.238781,...,0.002410,0.001074,0.000828,0.000001,0.000752,3.948753e-08,-3.362164e-08,-0.851450,5.865891e-09,0.273411
4,COCC=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.615647,-1.037902,1.027586,-0.346690,-0.027341,0.039713,0.286706,0.291601,...,0.002538,0.001512,0.000953,0.000001,0.000764,4.419388e-08,-3.746280e-08,-0.847692,6.731084e-09,0.281570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4038,CCOCCOCCCCCCOC,[Li+].F[P-](F)(F)(F)(F)F,-0.911426,-1.465305,0.625714,0.335631,-0.798999,1.302488,0.828702,0.305266,...,0.006134,0.002082,0.002588,0.000001,0.000828,7.580145e-08,-6.158173e-08,-0.812408,1.421972e-08,0.319906
4039,COCCCCCCC[Si](OC)OC,[Li+].O=C1O[B-](F)(F)OC1=O,-0.658691,-1.435181,0.817402,-0.029180,-0.729157,0.974158,0.848940,0.344663,...,0.003713,0.001496,0.001516,0.000001,0.000768,5.408555e-08,-4.522009e-08,-0.836085,8.865458e-09,0.285682
4040,BCCCCCCOC,[Li+].O=C1O[B-](F)(F)OC1=O,-0.925033,-1.540815,0.859871,0.021506,-0.581742,0.783126,0.821350,0.680388,...,0.003439,0.001120,0.001628,0.000001,0.000822,5.337603e-08,-4.469209e-08,-0.837306,8.683944e-09,0.311947
4041,COCCOCCOC1CCCC1,[Li+].O=C1O[B-](F)(F)OC1=O,0.340693,-0.507196,0.871490,-0.409964,-1.059248,1.172802,0.773990,-0.333313,...,0.007307,0.001283,0.003194,0.000001,0.000803,7.735374e-08,-6.226052e-08,-0.804880,1.509323e-08,0.304166


In [36]:
# Right join: every row of `uniq_sm_b7` is kept. Solvents with no counterpart in
# `df_unlabel_greed_uniq` come back with NaN-filled columns and are removed by
# dropna(), as is any other row that contains a missing value.
df_comm_greed = (
    df_unlabel_greed_uniq
    .merge(uniq_sm_b7, how='right', on='solv_comb_sm')
    .dropna()
)
df_comm_greed

Unnamed: 0,solv_comb_sm,salt_comb_sm,solv_ecfp_pca_0,solv_ecfp_pca_1,solv_ecfp_pca_2,solv_ecfp_pca_3,solv_ecfp_pca_4,solv_ecfp_pca_5,solv_ecfp_pca_6,solv_ecfp_pca_7,...,EI_2,EI_3,EI_4,prediction_aggr,uncertainty_aggr,explore_aggr,exploit_aggr,ratio_aggr,EI_aggr,prediction_avg
1,COCCOCC(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-1.095649,-0.965356,0.935134,-0.459764,-0.700674,-0.116387,0.853413,0.497956,...,0.005286,0.001072,0.002439,2e-06,0.000863,6.97396e-08,-5.691285e-08,-0.816076,1.282675e-08,0.333566
3,CCCCOCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.948577,-1.676686,0.484386,0.660827,-0.854012,1.322076,0.745136,0.142353,...,0.0121,0.00342,0.008035,2e-06,0.001008,1.396974e-07,-1.049203e-07,-0.751054,3.477707e-08,0.414533
4,CCOCCCCOC,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.825595,-1.40477,0.601096,0.177182,-0.731191,1.110972,0.791357,0.32266,...,0.013991,0.007192,0.007728,2e-06,0.001062,1.574425e-07,-1.162994e-07,-0.738678,4.11431e-08,0.441201
5,COCCCCOCC(F)(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.740833,-1.411703,0.814934,-0.45335,-0.102028,0.873072,1.559289,0.148085,...,0.027447,0.004939,0.014643,2e-06,0.001126,2.002533e-07,-1.328588e-07,-0.663454,6.739449e-08,0.472334
6,COCCCOCC(C)(C)C,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.786659,-1.417662,0.909559,-0.453183,-0.080803,0.779823,1.703168,0.116674,...,0.026921,0.003975,0.01391,2e-06,0.001112,1.930715e-07,-1.288073e-07,-0.667148,6.426424e-08,0.464032
7,COCCCOCC(F)(F)C(F)F,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.924801,-1.49945,0.749023,-0.542249,-0.491989,0.360325,1.438693,0.063002,...,0.018534,0.004016,0.009281,2e-06,0.001055,1.602412e-07,-1.151098e-07,-0.718353,4.513141e-08,0.43563
8,COCCCF,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-0.888535,-1.532757,0.907942,-0.240821,-0.416103,0.506808,0.882704,0.708297,...,0.012827,0.003082,0.008133,2e-06,0.001049,1.422902e-07,-1.061526e-07,-0.746029,3.613763e-08,0.430609


In [None]:
# Save the greedy-selection overlap (`df_comm_greed`) under the "_greedy" file name.
# This cell previously wrote `df_comm_EI` to this path, which would have stored the
# EI frame under the greedy label.
df_comm_greed.to_csv('../../datasets/batch-7/labeled_batch7_uniq_solvents_wo_nmc_data_greedy.csv', index=False)

In [38]:
# List the solvents in `uniq_sm_b7` that have no row in `df_comm_greed`.
uniq_sm_comm_greed = df_comm_greed['solv_comb_sm'].unique()

# Iterate over the column values directly instead of `series[i]` with a positional
# counter: `[i]` is a label lookup, so it only works when the index is exactly
# 0..n-1 and fails (or picks the wrong row) after any filtering or re-indexing.
for smiles in uniq_sm_b7['solv_comb_sm']:
    if smiles not in uniq_sm_comm_greed:
        print(smiles)

COCCOS(=O)(=O)C(F)(F)F
COC1CCCCO1
