## Notebook to create dataset for training ML models using graph embeddings from Chemprop model for LUMO prediction

### Graph embeddings size: 200

**Created on 12th December, 2023; modified on 26th December, 2023**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
%%bash
pwd
ls -ltr

/Users/riteshk/Library/CloudStorage/Box-Box/Research-postdoc/liquid-electrolyte-ML/Science-rev-1/coulombic-efficiency_1.1/pretraining-datasets/LUMO/deploy/edb-2
total 12344
-rw-r--r--@ 1 riteshk  staff   133931 Dec 12 10:35 gr_salt_test_comb.csv
-rw-r--r--@ 1 riteshk  staff   974454 Dec 12 10:35 gr_salt_train_comb.csv
-rw-r--r--@ 1 riteshk  staff   125050 Dec 12 10:35 gr_salt_val_comb.csv
-rw-r--r--@ 1 riteshk  staff   136331 Dec 12 10:35 gr_solv_test_comb.csv
-rw-r--r--@ 1 riteshk  staff   991755 Dec 12 10:35 gr_solv_train_comb.csv
-rw-r--r--@ 1 riteshk  staff   127248 Dec 12 10:35 gr_solv_val_comb.csv
-rw-------@ 1 riteshk  staff     8714 Dec 12 10:39 train_CE_add.csv
-rw-------@ 1 riteshk  staff     7652 Dec 12 10:39 train_CE_comb.csv
-rw-------@ 1 riteshk  staff    49068 Dec 12 10:39 train_CE_pre_feat.csv
-rw-------@ 1 riteshk  staff     1249 Dec 12 10:39 val_CE_add.csv
-rw-------@ 1 riteshk  staff     1162 Dec 12 10:39 val_CE_comb.csv
-rw-------@ 1 riteshk  staff     7288 Dec 12 1

-rw-------@ 1 riteshk  staff   137952 Dec 26 13:31 gr_salt_ood_comb_200.csv
-rw-------@ 1 riteshk  staff   141351 Dec 26 13:31 gr_solv_ood_comb_200.csv
-rw-------@ 1 riteshk  staff    66374 Dec 26 14:06 gr_salt_test_comb_200.csv
-rw-------@ 1 riteshk  staff   483255 Dec 26 14:06 gr_salt_train_comb_200.csv
-rw-------@ 1 riteshk  staff    62084 Dec 26 14:06 gr_salt_val_comb_200.csv
-rw-------@ 1 riteshk  staff   492008 Dec 26 14:06 gr_solv_train_comb_200.csv
-rw-------@ 1 riteshk  staff    63141 Dec 26 14:06 gr_solv_val_comb_200.csv
-rw-------@ 1 riteshk  staff    67575 Dec 26 14:10 gr_solv_test_comb_200.csv
-rw-r--r--@ 1 riteshk  staff   137137 Dec 26 14:14 create_dataset_gr_200_add_feat_CE.ipynb


**Random split dataset (EDB-2)**

In [9]:
df_train_comb = pd.read_csv('./train_CE_comb.csv')
df_train_add = pd.read_csv('./train_CE_add.csv')
df_val_comb = pd.read_csv('./val_CE_comb.csv')
df_val_add = pd.read_csv('./val_CE_add.csv')
df_test_comb = pd.read_csv('./test_CE_comb.csv')
df_test_add = pd.read_csv('./test_CE_add.csv')
df_ood_comb = pd.read_csv('./CE_ood_comb.csv')
df_ood_add = pd.read_csv('./CE_ood_add.csv')

In [4]:
df_test_add

Unnamed: 0,mw_solv_1,mw_solv_2,mw_solv_3,solv_ratio_1,solv_ratio_2,solv_ratio_3,mw_salt_1,mw_salt_2,salt_1_conc,salt_2_conc,mw_add,protocol,current_density
0,88.016044,104.047344,0.0,0.25,0.75,0.0,151.980186,0.0,1.0,0.0,277.869633,0,0.5
1,182.070796,182.016634,0.0,0.25,0.75,0.0,186.939685,0.0,1.2,0.0,0.0,2,0.5
2,90.031694,182.016634,0.0,0.25,0.75,0.0,186.939685,0.0,2.5,0.0,0.0,2,0.5
3,90.06808,0.0,0.0,1.0,0.0,0.0,186.939685,286.933298,4.0,2.0,0.0,0,0.5
4,205.038434,0.0,0.0,1.0,0.0,0.0,186.939685,0.0,1.0,0.0,0.0,3,0.5
5,88.016044,74.036779,0.0,0.000999,0.999001,0.0,195.92802,0.0,1.0,0.0,0.0,1,1.5
6,156.003429,90.031694,0.0,0.5,0.5,0.0,151.980186,0.0,1.0,0.0,0.0,1,0.6
7,172.034729,0.0,0.0,1.0,0.0,0.0,186.939685,0.0,1.0,0.0,0.0,2,1.0
8,90.031694,182.016634,0.0,0.333333,0.666667,0.0,186.939685,0.0,1.2,0.0,0.0,1,0.5
9,88.016044,104.047344,0.0,0.5,0.5,0.0,151.980186,0.0,1.0,0.0,106.006622,1,0.5


In [5]:
gr_train_solv = pd.read_csv('./gr_solv_train_comb_200.csv')
gr_train_salt = pd.read_csv('./gr_salt_train_comb_200.csv')
gr_val_solv = pd.read_csv('./gr_solv_val_comb_200.csv')
gr_val_salt = pd.read_csv('./gr_salt_val_comb_200.csv')
gr_test_solv = pd.read_csv('./gr_solv_test_comb_200.csv')
gr_test_salt = pd.read_csv('./gr_salt_test_comb_200.csv')
gr_ood_solv = pd.read_csv('./gr_solv_ood_comb_200.csv')
gr_ood_salt = pd.read_csv('./gr_salt_ood_comb_200.csv')
gr_ood_solv

Unnamed: 0,solv_smile_comb,fp_0_mol_0,fp_1_mol_0,fp_2_mol_0,fp_3_mol_0,fp_4_mol_0,fp_5_mol_0,fp_6_mol_0,fp_7_mol_0,fp_8_mol_0,...,fp_190_mol_0,fp_191_mol_0,fp_192_mol_0,fp_193_mol_0,fp_194_mol_0,fp_195_mol_0,fp_196_mol_0,fp_197_mol_0,fp_198_mol_0,fp_199_mol_0
0,FC(F)(F)COCCOCCOCC(F)(F)F,0.159038,-0.000151,-0.008674,-0.004411,-0.008957,-0.003532,0.015135,0.017735,-0.009333,...,0.033149,-0.012161,-0.007187,-0.013353,-0.007202,-0.008244,-0.015047,-0.005401,-0.008498,-0.006235
1,FCCOCCOCCOCCF,0.062925,0.008317,-0.011106,-0.007193,-0.007158,-0.005362,-0.005563,0.009896,-0.009725,...,0.026003,-0.012498,-0.010052,-0.016475,-0.007119,-0.00737,-0.021913,-0.006744,-0.007426,-0.006957
2,FCCOB(OCCF)OCCF,0.107247,0.030299,-0.010817,-0.008009,-0.01227,-0.005464,-0.008932,-0.009274,-0.008047,...,0.038013,-0.014322,-0.011197,-0.017044,-0.00867,-0.012252,-0.021767,0.098848,-0.008047,-0.00709
3,CCOC(OCC)OCC,-0.006822,0.075634,-0.014248,-0.008905,-0.008846,-0.007019,-0.008252,-0.004568,-0.012605,...,-0.004574,-0.010823,-0.013742,-0.021678,-0.006187,-0.006198,-0.023088,-0.011233,-0.004924,-0.006328
4,COCCOCC(C)C,-0.006357,0.039191,-0.012564,-0.009986,0.0086,-0.006031,-0.00729,0.037543,-0.011298,...,0.055497,-0.012207,-0.012738,-0.019588,-0.005607,-0.006019,-0.024201,-0.009887,-0.007159,-0.007079
5,COCCOB(OCCOC)OCCOC,0.003128,0.018407,-0.012234,-0.00986,-0.010138,-0.006105,-0.010329,-0.00727,-0.008951,...,0.031993,-0.012784,-0.012376,-0.018368,-0.00746,-0.011098,-0.023513,0.077893,-0.008293,-0.007617
6,FCCOB(OCCF)OCCF,0.107247,0.030299,-0.010817,-0.008009,-0.01227,-0.005464,-0.008932,-0.009274,-0.008047,...,0.038013,-0.014322,-0.011197,-0.017044,-0.00867,-0.012252,-0.021767,0.098848,-0.008047,-0.00709
7,COCCOC(C)(C)C,-0.007731,0.00764,-0.013876,-0.011243,-0.008276,-0.006096,-0.00833,-0.004153,-0.010116,...,0.079477,-0.010948,-0.013928,-0.020828,-0.00669,-0.006499,-0.026614,-0.011198,-0.007982,-0.008566
8,CCCCOCCOC,-0.005973,0.069235,-0.012739,-0.009539,0.033017,-0.00621,-0.007513,0.033261,-0.010879,...,0.091746,-0.011189,-0.01271,-0.019293,-0.005581,-0.005448,-0.025357,-0.009299,-0.006406,-0.007482
9,COCCOC(C)C,-0.006045,0.049645,-0.013165,-0.010896,-0.005858,-0.00562,-0.007385,-0.003979,-0.011482,...,0.060141,-0.01257,-0.013267,-0.021017,-0.006409,-0.00657,-0.023954,-0.009722,-0.007077,-0.006834


In [8]:
## ignore this!!
# def create_dataset(split='train'):
#     if split == 'train':
#         # df = df_train_comb
#         df = f'df_{split}_comb'
#         df_add = df_train_add
#         gr_solv = gr_train_solv
#         gr_salt = gr_train_salt
#     elif split == 'val':
#         df = df_val_comb
#         df_add = df_val_add
#         gr_solv = gr_val_solv
#         gr_salt = gr_val_salt
#     elif split == 'test':
#         df = df_test_comb
#         df_add = df_test_add
#         gr_solv = gr_test_solv
#         gr_salt = gr_test_salt
#     else:
#         print('Error: split must be train, val, or test')
#         return

#     gr_solv_ = gr_solv.iloc[:,1:]
#     gr_salt_ = gr_salt.iloc[:,1:]
#     gr_solv_.columns = [('solv_'+x) for x in gr_solv_.columns]
#     gr_salt_.columns = [('salt_'+x) for x in gr_salt_.columns]
    

#     df_ = pd.concat([df[['solv_smile_comb', 'salt_smile_comb']], gr_solv_, gr_salt_, df_add, df['log(1-CE)']], axis=1)
#     return df_

In [6]:
## try this more general one
def create_dataset(split='train', std_add=False):
    if split == 'train' or 'val' or 'test' or 'iood' or 'lood' or 'ood':
        # df = df_train_comb
        df = f'df_{split}_comb'
        df_add = f'df_{split}_add'
        gr_solv = f'gr_{split}_solv'
        gr_salt = f'gr_{split}_salt'
        df = globals()[df]
        df_add = globals()[df_add]
        gr_solv = globals()[gr_solv]
        gr_salt = globals()[gr_salt]
    else:
        print('Error: split must be train, val, or test, iood, lood, ood')
        return
    
    if std_add == True:
        ## use StandardScaler to standardize the additive features
        scaler = StandardScaler()
        scaler.fit(df_add)
        df_add = pd.DataFrame(scaler.transform(df_add), columns=df_add.columns)
        # df_add = (df_add - df_add.mean())/df_add.std()

    gr_solv_ = gr_solv.iloc[:,1:]
    gr_salt_ = gr_salt.iloc[:,1:]
    gr_solv_.columns = [('solv_'+x) for x in gr_solv_.columns]
    gr_salt_.columns = [('salt_'+x) for x in gr_salt_.columns]
    

    df_ = pd.concat([df[['solv_smile_comb', 'salt_smile_comb']], gr_solv_, gr_salt_, df_add, df['log(1-CE)']], axis=1)
    return df_

In [7]:
gr_train_solv_ = gr_train_solv.iloc[:,1:]
gr_train_solv_.columns = [('solv_'+x) for x in gr_train_solv_.columns]
gr_train_solv_

Unnamed: 0,solv_fp_0_mol_0,solv_fp_1_mol_0,solv_fp_2_mol_0,solv_fp_3_mol_0,solv_fp_4_mol_0,solv_fp_5_mol_0,solv_fp_6_mol_0,solv_fp_7_mol_0,solv_fp_8_mol_0,solv_fp_9_mol_0,...,solv_fp_190_mol_0,solv_fp_191_mol_0,solv_fp_192_mol_0,solv_fp_193_mol_0,solv_fp_194_mol_0,solv_fp_195_mol_0,solv_fp_196_mol_0,solv_fp_197_mol_0,solv_fp_198_mol_0,solv_fp_199_mol_0
0,0.028599,0.031528,-0.013138,0.034162,-0.008458,-0.006427,-0.009211,0.015324,-0.010123,-0.009936,...,0.036609,-0.013629,-0.011892,-0.017518,-0.007229,-0.008200,-0.022971,-0.007925,0.000642,0.008547
1,-0.008580,-0.008070,-0.013210,-0.009722,-0.010997,-0.007725,-0.011858,-0.004951,-0.010775,-0.013031,...,-0.009986,-0.014910,-0.012864,-0.018439,-0.006711,-0.008819,-0.023484,-0.009390,0.015851,-0.005629
2,-0.005206,0.013194,-0.012076,-0.008618,-0.006324,-0.005831,-0.007005,0.032790,-0.010144,-0.007416,...,0.073685,-0.011648,-0.011043,-0.017493,-0.006465,-0.007201,-0.023135,-0.007566,-0.007684,-0.007349
3,0.070854,0.054446,-0.014466,0.087637,-0.011126,-0.007172,-0.011968,-0.006509,-0.010096,-0.013087,...,-0.009736,-0.016106,-0.012953,-0.017550,-0.008183,-0.009450,-0.022765,-0.008374,0.011050,0.028416
4,-0.005390,-0.004935,-0.011686,-0.010327,-0.004394,-0.005828,-0.007275,0.091065,-0.010062,-0.007175,...,0.116712,-0.012330,-0.011846,-0.017863,-0.005568,-0.006261,-0.021857,-0.007849,-0.007019,-0.007010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,0.069424,0.046331,-0.015139,0.083333,-0.011125,-0.007302,-0.011989,-0.007036,-0.010284,-0.013385,...,-0.008837,-0.015824,-0.013036,-0.018397,-0.009691,-0.010143,-0.023307,-0.008495,-0.002766,0.023127
109,0.130691,0.050793,-0.015092,0.162493,0.007925,-0.006418,-0.010257,-0.007271,-0.010462,-0.012527,...,-0.009223,-0.017342,-0.012355,-0.018624,-0.009288,-0.010069,-0.020309,-0.008691,0.001443,0.046280
110,-0.006364,0.055103,-0.013114,-0.009644,-0.006275,-0.006170,-0.007244,-0.003442,-0.011148,-0.006358,...,0.018936,-0.012198,-0.013361,-0.019687,-0.006173,-0.005499,-0.025035,-0.009832,-0.006860,-0.006746
111,0.218118,0.012724,-0.010560,0.073425,-0.011996,0.002649,-0.000737,0.001076,-0.008533,-0.010564,...,0.037698,-0.014318,-0.009197,-0.014209,0.018070,-0.011001,-0.014719,0.035924,0.011922,0.100208


In [10]:
train_gr_add = create_dataset(split='train')
val_gr_add = create_dataset(split='val')
test_gr_add = create_dataset(split='test')
ood_gr_add = create_dataset(split='ood')

In [11]:
col = list(train_gr_add.columns)

In [12]:
train_gr_add.to_csv('train_gr_comb_add.csv', index=False)
val_gr_add.to_csv('val_gr_comb_add.csv', index=False)
test_gr_add.to_csv('test_gr_comb_add.csv', index=False)
ood_gr_add.to_csv('ood_gr_comb_add.csv', index=False)

In [13]:
create_dataset(split='ood', std_add=True)

Unnamed: 0,solv_smile_comb,salt_smile_comb,solv_fp_0_mol_0,solv_fp_1_mol_0,solv_fp_2_mol_0,solv_fp_3_mol_0,solv_fp_4_mol_0,solv_fp_5_mol_0,solv_fp_6_mol_0,solv_fp_7_mol_0,...,solv_ratio_2,solv_ratio_3,mw_salt_1,mw_salt_2,salt_1_conc,salt_2_conc,mw_add,protocol,current_density,log(1-CE)
0,FC(F)(F)COCCOCCOCC(F)(F)F,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.159038,-0.000151,-0.008674,-0.004411,-0.008957,-0.003532,0.015135,0.017735,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-0.431933,-4.757323
1,FCCOCCOCCOCCF,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.062925,0.008317,-0.011106,-0.007193,-0.007158,-0.005362,-0.005563,0.009896,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-0.431933,-2.077424
2,FCCOB(OCCF)OCCF,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.107247,0.030299,-0.010817,-0.008009,-0.01227,-0.005464,-0.008932,-0.009274,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-0.431933,-3.661161
3,CCOC(OCC)OCC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.006822,0.075634,-0.014248,-0.008905,-0.008846,-0.007019,-0.008252,-0.004568,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-0.431933,-4.197091
4,COCCOCC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.006357,0.039191,-0.012564,-0.009986,0.0086,-0.006031,-0.00729,0.037543,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-0.431933,-3.661018
5,COCCOB(OCCOC)OCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.003128,0.018407,-0.012234,-0.00986,-0.010138,-0.006105,-0.010329,-0.00727,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-0.431933,-0.799155
6,FCCOB(OCCF)OCCF,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],0.107247,0.030299,-0.010817,-0.008009,-0.01227,-0.005464,-0.008932,-0.009274,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-2.215396,-3.408275
7,COCCOC(C)(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.007731,0.00764,-0.013876,-0.011243,-0.008276,-0.006096,-0.00833,-0.004153,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-0.431933,-4.033095
8,CCCCOCCOC,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.005973,0.069235,-0.012739,-0.009539,0.033017,-0.00621,-0.007513,0.033261,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-0.431933,-4.074542
9,COCCOC(C)C,O=S(=O)(F)[N-]S(=O)(=O)F.[Li+],-0.006045,0.049645,-0.013165,-0.010896,-0.005858,-0.00562,-0.007385,-0.003979,...,-0.377964,0.0,0.265394,0.0,-0.54259,0.0,0.0,0.6742,-0.431933,-4.199705
