## Notebook demonstrating how to assign electrolyte score (e$\textit{Score}$)

**Note: The examples shown here are for the solvent molecules in Figure 4(a) and the molecules from the eMolecules repository in Figure 4(c) of the manuscript.**

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, PandasTools, Fragments, rdMolDescriptors, Descriptors  
import numpy as np
from sklearn import preprocessing

In [8]:
## Predicted properties for the eMolecules dataset: ionic conductivity (Chemprop),
## oxidative stability (Chemprop), and CE (RF).
## Conductivity is recovered as exp(conductivity_log) and CE as 1 - exp(log_CI).
df_emol = pd.read_csv('../datasets/predicted/emolecules_predicted_cond_oxstab_ce.csv').assign(
    conductivity=lambda frame: np.exp(frame['conductivity_log']),
    coulombic_efficiency=lambda frame: 1 - np.exp(frame['log_CI']),
)
emol_cond, emol_ie, emol_ce = (
    df_emol[column] for column in ('conductivity', 'IE', 'coulombic_efficiency')
)
df_emol

Unnamed: 0,solv_comb_sm,salt_sm,conductivity_log,IE,log_CI,conductivity,coulombic_efficiency
0,CC(=O)C#N,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.730640,7.455977,-3.926734,15.342696,0.980292
1,CC(=O)C=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.711887,6.145564,-3.750991,15.057655,0.976506
2,O=C1CCO1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.706441,7.289484,-3.498769,14.975882,0.969765
3,CCCC=O,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.699731,6.095793,-4.128038,14.875723,0.983886
4,O=CC1CO1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,2.690302,6.506844,-2.901290,14.736120,0.945048
...,...,...,...,...,...,...,...
76603,C(CP(C1CCCCC1)C1CCCCC1)CP(C1CCCCC1)C1CCCCC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-5.543048,4.463987,-2.927842,0.003915,0.946488
76604,ClCC(CCl)OP(=O)(OC(CCl)CCl)OC(CCl)CCl,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-5.609042,5.600271,-3.285040,0.003665,0.962561
76605,C(CCP(C1CCCCC1)C1CCCCC1)CP(C1CCCCC1)C1CCCCC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-5.660888,4.461893,-2.927842,0.003479,0.946488
76606,S=P(N1CC1)(N1CC1)N1CCN(CC1)P(=S)(N1CC1)N1CC1,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,-6.062411,3.686485,-4.388423,0.002329,0.987580


#### Step 1: Normalization

In [10]:
## Original (raw) datasets of the three properties; the normalizers are fit on their value ranges
df_cond = pd.read_csv('../datasets/raw/EDB-1_cond.csv')    ## EDB-1 dataset (conductivity)
df_oxstab = pd.read_csv('../datasets/raw/MP_oxstab.csv')   ## MP dataset (oxidative stability)
df_ce = pd.read_csv('../datasets/raw/EDB-2_CE.csv')        ## EDB-2 dataset (CE)

## Property columns used further down
cond, oxstab, ce = df_cond['conductivity'], df_oxstab['IE'], df_ce['coulombic_efficiency']
df_ce

Unnamed: 0,solvent_1_smiles,solvent_2_smiles,solvent_3_smiles,salt_1_smiles,salt_2_smiles,additive_smiles,protocol,current_density,coulombic_efficiency,log(1-CE)
0,CC1COC(=O)O1,,,[Li+].[O-][Cl+3]([O-])([O-])[O-],,,0,2.0,0.80000,-1.609438
1,CC1COC(=O)O1,,,[Li+].[O-][Cl+3]([O-])([O-])[O-],,O=S=O,1,5.0,0.83600,-1.807889
2,C1CCOC1,,,F[As-](F)(F)(F)(F)F.[Li+],,,1,5.0,0.89400,-2.244316
3,CC1CCCO1,,,F[As-](F)(F)(F)(F)F.[Li+],,,1,0.9,0.97400,-3.649659
4,CC1CCCO1,,,F[As-](F)(F)(F)(F)F.[Li+],,,1,0.9,0.97400,-3.649659
...,...,...,...,...,...,...,...,...,...,...
137,CO[Si](C)(C)OC,,,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],,,2,0.5,0.99759,-6.028129
138,O=S1(=O)CCCC1,,,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],,,2,0.5,0.98200,-4.017384
139,O=S1(=O)CCCC1,FC(F)C(F)(F)COC(F)(F)C(F)F,,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],,,2,0.5,0.98800,-4.422849
140,COC1OCC(C(F)(F)F)O1,,,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],,,2,1.0,0.98700,-4.342806


In [8]:
## Range (max, min) of each predicted property in the eMolecules dataset
for predicted in (emol_cond, emol_ie, emol_ce):
    print(max(predicted), min(predicted))

15.342695755236896 0.0002894394413605
9.911123800551849 2.577424698153538
0.9933481568444376 0.910463446657438


In [15]:
## Fit a MinMax normalizer on the conductivity range of the original EDB-1 dataset.
## The grid runs from max down to min; a dense grid (1e6 points) is used to account
## for the wide variability in conductivity values.
cond_div = np.linspace(max(cond), min(cond), num=1_000_000).reshape(-1, 1)
scaler_cond = preprocessing.MinMaxScaler().fit(cond_div)  ## fit() returns the fitted scaler
cond_edb1_norm = scaler_cond.transform(cond_div)

In [10]:
## Side-by-side view of the EDB-1 conductivity grid and its normalized values.
## The normalized grid is `cond_edb1_norm`, produced when the conductivity scaler was fit.
tt = pd.DataFrame({
    'original_cond_value': cond_div.ravel(),
    'normalized_cond_value': cond_edb1_norm.ravel(),
})
tt

Unnamed: 0,original_cond_value,normalized_cond_value
0,6.679000e+01,1.000000
1,6.678993e+01,0.999999
2,6.678987e+01,0.999998
3,6.678980e+01,0.999997
4,6.678973e+01,0.999996
...,...,...
999995,2.673393e-04,0.000004
999996,2.005492e-04,0.000003
999997,1.337591e-04,0.000002
999998,6.696907e-05,0.000001


In [27]:
## Normalized conductivity column only; drop() returns a new frame, so `tt` is left untouched
tt_ = tt.drop(columns=['original_cond_value'])
tt_

Unnamed: 0,normalized_cond_value
0,1.000000
1,0.999999
2,0.999998
3,0.999997
4,0.999996
...,...
999995,0.000004
999996,0.000003
999997,0.000002
999998,0.000001


In [14]:
## Fit a MinMax normalizer on the oxidative stability range (max down to min, 1000 points)
oxstab_div = np.linspace(max(oxstab), min(oxstab), num=1000)
oxstab_div = oxstab_div.reshape(-1, 1)  ## column vector, as the scaler expects 2D input
scaler_oxstab = preprocessing.MinMaxScaler().fit(oxstab_div)
oxstab_mp_norm = scaler_oxstab.transform(oxstab_div)

In [16]:
## Fit a MinMax normalizer on the CE range (max down to min, 1000 points)
ce_div = np.linspace(max(ce), min(ce), num=1000)
ce_div = ce_div.reshape(-1, 1)  ## column vector, as the scaler expects 2D input
scaler_ce = preprocessing.MinMaxScaler().fit(ce_div)
ce_edb2_norm = scaler_ce.transform(ce_div)

**Normalizing the property values for the electrolytes shown in Fig. 4(a) of the manuscript**

In [29]:
## Normalized conductivity values for the Fig. 4(a) reference points
## order: max, PC, DME, E3F1, threshold, min
fig4a_cond_val = np.array([max(cond), 5.10, 21.90, 1.15, 1.00, min(cond)])[:, None]
fig4a_cond_norm = scaler_cond.transform(fig4a_cond_val)
print("normalized values: ", fig4a_cond_norm.reshape(1, 6)[0])

normalized values:  [1.         0.07635873 0.3278934  0.01721814 0.0149723  0.        ]


In [30]:
## Normalized oxidative stability values for the Fig. 4(a) reference points
## order: max, PC, DME, E3F1, threshold, min
fig4a_oxstab_val = np.array([max(oxstab), 7.0, 5.8, 5.67, 4.50, min(oxstab)])[:, None]
fig4a_oxstab_norm = scaler_oxstab.transform(fig4a_oxstab_val)
print("normalized values: ", fig4a_oxstab_norm.reshape(1, 6)[0])

normalized values:  [1.         0.63346325 0.5244437  0.51263325 0.40633919 0.        ]


In [None]:
## CE scaled values for the Fig. 4(a) reference points
## order: max, PC, DME, E3F1, threshold, min
## min(ce) is included so the array has six entries, matching the conductivity and
## oxidative stability cells.
## NOTE(review): the third solvent was labelled both "FDMB" and "E3F1" in this cell;
## E3F1 is used to agree with the other Fig. 4(a) cells -- confirm.
fig4a_ce_val = np.array([max(ce), 0.80, 0.9840, 0.9914, 0.985, min(ce)]).reshape(-1, 1)
## ravel() flattens whatever number of reference points is supplied
print("scaled values: ", scaler_ce.transform(fig4a_ce_val).ravel())

In [None]:
## Scale the predicted eMolecules conductivities with the normalizer fit on EDB-1
emol_cond = np.array(emol_cond).reshape(-1, 1)  ## column vector, as the scaler expects 2D input
emol_cond_sc = scaler_cond.transform(emol_cond)
print(emol_cond_sc)
## largest and smallest scaled value
print(*(extreme(emol_cond_sc) for extreme in (max, min)))

In [13]:
## Scale the predicted eMolecules oxidative stabilities (IE) with the oxidative stability normalizer
emol_ie = np.array(emol_ie).reshape(-1, 1)  ## column vector, as the scaler expects 2D input
emol_ie_sc = scaler_oxstab.transform(emol_ie)
print(emol_ie_sc)
## largest and smallest scaled value
print(*(extreme(emol_ie_sc) for extreme in (max, min)))

[[0.67488855]
 [0.55583803]
 [0.65976274]
 ...
 [0.40287714]
 [0.33243162]
 [0.50116425]]
[0.89793775] [0.23167395]


In [15]:
## Scale the predicted eMolecules CE values with the CE normalizer
emol_ce = np.array(emol_ce).reshape(-1, 1)  ## column vector, as the scaler expects 2D input
emol_ce_sc = scaler_ce.transform(emol_ce)
print(emol_ce_sc)
## largest and smallest scaled value
print(*(extreme(emol_ce_sc) for extreme in (max, min)))

[[0.97898841]
 [0.97482467]
 [0.96741305]
 ...
 [0.94181616]
 [0.98700209]
 [0.98186756]]
[0.99334524] [0.90220304]


In [14]:
## Check how the conductivity normalizer treats two test values near or beyond the
## ends of the range it was fit on.
## `scaler_cond` is the conductivity normalizer; no bare `scaler` is defined in the visible cells.
tt_val = np.array([78.90, 1.95e-9]).reshape(-1, 1)
scaler_cond.transform(tt_val)

array([[ 1.18131457e+00],
       [-2.65084594e-09]])

#### Step 2: Transformation

In [16]:
## Collect the solvent SMILES and the three predicted properties used for scoring.
## The oxidative stability column in df_emol is named 'IE' (there is no 'oxstab' column).
## .copy() keeps later column additions from touching df_emol.
target_df = df_emol[['solv_comb_sm', 'conductivity', 'IE', 'coulombic_efficiency']].copy()
target_df

Unnamed: 0,solv_comb_sm,conductivity,IE,coulombic_efficiency
0,CC(=O)C#N,15.342696,7.455977,0.980292
1,CC(=O)C=O,15.057655,6.145564,0.976506
2,O=C1CCO1,14.975882,7.289484,0.969765
3,CCCC=O,14.875723,6.095793,0.983886
4,O=CC1CO1,14.736120,6.506844,0.945048
...,...,...,...,...
76603,C(CP(C1CCCCC1)C1CCCCC1)CP(C1CCCCC1)C1CCCCC1,0.003915,4.463987,0.946488
76604,ClCC(CCl)OP(=O)(OC(CCl)CCl)OC(CCl)CCl,0.003665,5.600271,0.962561
76605,C(CCP(C1CCCCC1)C1CCCCC1)CP(C1CCCCC1)C1CCCCC1,0.003479,4.461893,0.946488
76606,S=P(N1CC1)(N1CC1)N1CCN(CC1)P(=S)(N1CC1)N1CC1,0.002329,3.686485,0.987580


In [21]:
## Attach the normalized property values computed in Step 1.
## The scaled arrays are named emol_*_sc and have shape (n, 1), so they are flattened to 1D columns.
target_df = target_df.assign(
    cond_std=emol_cond_sc.ravel(),
    IE_std=emol_ie_sc.ravel(),
    CE_std=emol_ce_sc.ravel(),
)

In [36]:
## Normalized conductivity, IE and CE of the four entries with the highest predicted conductivity.
## NOTE(review): `emol_cond_sorted` is not defined in the visible cells; a descending sort on
## 'conductivity' is assumed because it matches the recorded output -- confirm.
emol_cond_sorted = df_emol.sort_values('conductivity', ascending=False)
for fitted_scaler, column in (
    (scaler_cond, 'conductivity'),
    (scaler_oxstab, 'IE'),
    (scaler_ce, 'coulombic_efficiency'),
):
    ## [[column]] keeps the data 2D (n, 1), as the scalers expect
    print(fitted_scaler.transform(emol_cond_sorted[[column]].to_numpy())[:4])

[[0.22971546]
 [0.22544774]
 [0.22422341]
 [0.2227238 ]]
[[0.67488855]
 [0.55583803]
 [0.65976274]
 [0.55131638]]
[[0.97898841]
 [0.97482467]
 [0.96741305]
 [0.98293989]]


In [37]:
## Normalized conductivity, IE and CE of the four entries with the highest predicted IE.
## NOTE(review): `emol_ie_sorted` is not defined in the visible cells; a descending sort on
## 'IE' is assumed because it matches the recorded output -- confirm.
emol_ie_sorted = df_emol.sort_values('IE', ascending=False)
for fitted_scaler, column in (
    (scaler_cond, 'conductivity'),
    (scaler_oxstab, 'IE'),
    (scaler_ce, 'coulombic_efficiency'),
):
    ## [[column]] keeps the data 2D (n, 1), as the scalers expect
    print(fitted_scaler.transform(emol_ie_sorted[[column]].to_numpy())[:4])

[[0.03184317]
 [0.01362651]
 [0.01937463]
 [0.07361763]]
[[0.89793775]
 [0.81337397]
 [0.8034949 ]
 [0.79217832]]
[[0.97448888]
 [0.9710599 ]
 [0.97108931]
 [0.94601484]]


In [38]:
## Normalized conductivity, IE and CE of the four entries with the highest predicted CE.
## NOTE(review): `emol_ce_sorted` is not defined in the visible cells; a descending sort on
## 'coulombic_efficiency' is assumed because it matches the recorded output -- confirm.
## Rows tied on CE may come out in a different order than in the recorded output.
emol_ce_sorted = df_emol.sort_values('coulombic_efficiency', ascending=False)
for fitted_scaler, column in (
    (scaler_cond, 'conductivity'),
    (scaler_oxstab, 'IE'),
    (scaler_ce, 'coulombic_efficiency'),
):
    ## [[column]] keeps the data 2D (n, 1), as the scalers expect
    print(fitted_scaler.transform(emol_ce_sorted[[column]].to_numpy())[:4])

[[0.00208554]
 [0.00448441]
 [0.00167059]
 [0.00234439]]
[[0.36488452]
 [0.39933342]
 [0.38823372]
 [0.46413003]]
[[0.99334524]
 [0.99334524]
 [0.99334524]
 [0.99334524]]


1. Equal weights for $\sigma$, CE, & IE

In [22]:
## Weighted, thresholded property scores (equal weights).
## An entry scores only if it clears all three thresholds at once:
## conductivity >= 1.0, IE >= 4.5 and CE >= 0.985; otherwise all three partial scores are 0.
## NOTE(review): the weights are equal but sum to 0.99, not 1 -- confirm this is intended.
cond_wt = 0.33
ie_wt = 0.33
ce_wt = 0.33

## Boolean mask evaluated once for the whole frame instead of row by row.
## Positional alignment with target_df assumes both frames share the same row order.
passes_thresholds = (
    (df_emol['conductivity'] >= 1.0)
    & (df_emol['IE'] >= 4.5)
    & (df_emol['coulombic_efficiency'] >= 0.985)
).to_numpy()

## Lists are kept so downstream cells can index the scores by position.
cond_score_1 = np.where(passes_thresholds, cond_wt * target_df['cond_std'].to_numpy(), 0.0).tolist()
ie_score_1 = np.where(passes_thresholds, ie_wt * target_df['IE_std'].to_numpy(), 0.0).tolist()
ce_score_1 = np.where(passes_thresholds, ce_wt * target_df['CE_std'].to_numpy(), 0.0).tolist()

In [23]:
## Overall score per entry: sum of the three weighted partial scores
overall_score_1 = [
    cond_part + ie_part + ce_part
    for cond_part, ie_part, ce_part in zip(cond_score_1, ie_score_1, ce_score_1)
]