## Notebook for generating multicomponent dataset (for Chemprop)

### Random splitting

**Created on 5th April 2022; last modified on 31st August**

In [10]:
import warnings
warnings.filterwarnings('ignore')
import requests
#from selenium import webdriver
import os
import json
import csv
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from rdkit import Chem
import rdkit.Chem.rdMolDescriptors as MolDescriptors
import rdkit.Chem.Descriptors as Descriptors
from sklearn.model_selection import train_test_split

In [2]:
%%bash
# Print the working directory, then list its contents in long format,
# sorted by modification time with the newest entry last (-t -r).
# NOTE(review): the saved output of this cell contains an absolute local path;
# consider clearing it before sharing the notebook.
pwd
ls -ltr

/Users/riteshkumar/Library/CloudStorage/Box-Box/Research-postdoc/liquid-electrolyte-ML/2022-01-24/ionic-conductivity_2.0/chemprop/multicomp-dataset/2D-feat/random-split
total 4168
drwx------  20 riteshkumar  staff     640 Aug 30 13:21 old-hyp-wo-mw-salt
-rw-r--r--@  1 riteshkumar  staff  728011 Aug 31 11:38 all_multi_comp.csv
-rw-r--r--@  1 riteshkumar  staff  626696 Aug 31 11:38 all_multi_comp_add.csv
-rw-r--r--@  1 riteshkumar  staff  703675 Aug 31 11:38 all_multi_comp_comb.csv
-rw-------@  1 riteshkumar  staff   66304 Aug 31 11:38 split_dataset.ipynb


In [3]:
# Load the per-component table: one SMILES column per solvent slot
# (solv_1_sm ... solv_4_sm), the salt SMILES (salt_sm) and conductivity_log.
df = pd.read_csv('all_multi_comp.csv')
df

Unnamed: 0,solv_1_sm,solv_2_sm,solv_3_sm,solv_4_sm,salt_sm,conductivity_log
0,O=C1OCCO1,COC(=O)OC,,,[Li+].F[P-](F)(F)(F)(F)F,2.459589
1,O=C1OCCO1,CCCCOC(=O)OC,,,[Li+].F[P-](F)(F)(F)(F)F,1.774952
2,O=C1OCCO1,CC(C)COC(=O)OC,,,[Li+].F[P-](F)(F)(F)(F)F,1.686399
3,O=C1OCCO1,CCC(C)OC(=O)OC,,,[Li+].F[P-](F)(F)(F)(F)F,1.871802
4,CCOC(=O)OCC,,,,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.741937
...,...,...,...,...,...,...
10191,CS(=O)C,COCCOC,,,[Li+].F[P-](F)(F)(F)(F)F,1.547563
10192,CS(=O)C,COCCOC,,,[Li+].F[P-](F)(F)(F)(F)F,2.624669
10193,CS(=O)C,COCCOC,,,[Li+].F[P-](F)(F)(F)(F)F,2.772589
10194,CS(=O)C,COCCOC,,,[Li+].F[P-](F)(F)(F)(F)F,2.687847


In [4]:
# Keep the column labels of df at hand and display them.
cols = df.columns
cols

Index(['solv_1_sm', 'solv_2_sm', 'solv_3_sm', 'solv_4_sm', 'salt_sm',
       'conductivity_log'],
      dtype='object')

In [5]:
# df_comb: combined solvent SMILES, salt SMILES and the conductivity_log target.
# df_add: additional numeric features (solvent ratios, molecular weights,
# salt concentration, temperature), as shown in the split outputs further down.
df_comb = pd.read_csv('all_multi_comp_comb.csv')
df_add = pd.read_csv('all_multi_comp_add.csv')

In [6]:
# Column-name groups, generated from the four solvent slots.
# NOTE(review): these names (solvent_1, salt_1, ...) differ from the *_sm names
# reported by df.columns in this notebook -- confirm where they are consumed.
cols_main = [f'solvent_{slot}' for slot in range(1, 5)] + ['salt_1', 'conductivity_log']
cols_add = ['conc_salt', 'temperature'] + [f'solv_ratio_{slot}' for slot in range(1, 5)]

In [7]:
## Frequency count: how often each solvent SMILES occurs across the solvent columns
# Column labels of df; the first four are the solvent SMILES columns.
cols = df.columns
def frequency_count(data=None, columns=None, n_solvent_cols=4):
    """Count how often each solvent SMILES occurs in the leading solvent columns.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Table to scan. Defaults to the notebook-level ``df`` so the original
        zero-argument call keeps working.
    columns : sequence of column labels, optional
        Column labels in order; only the first ``n_solvent_cols`` are scanned.
        Defaults to ``data.columns``.
    n_solvent_cols : int, default 4
        Number of leading columns treated as solvent slots.

    Returns
    -------
    dict
        Maps every non-missing cell value to its number of occurrences, keyed
        in order of first appearance when scanning row by row.
    """
    if data is None:
        data = df  # notebook-level frame, as used by the zero-argument call
    if columns is None:
        columns = data.columns
    solvent_cols = list(columns[:n_solvent_cols])

    freq_dct = {}
    # Positional, row-major scan over a plain object array: same visiting order
    # as the nested row/column loops, without per-cell label lookups.
    for row in data[solvent_cols].to_numpy(dtype=object):
        for smiles in row:
            # NaN never compares equal to anything (`x != np.nan` is always
            # True), so missing cells have to be detected with pd.isna.
            if pd.isna(smiles):
                continue
            freq_dct[smiles] = freq_dct.get(smiles, 0) + 1
    return freq_dct

# Display the SMILES -> occurrence-count mapping for the loaded table.
frequency_count()

{'O=C1OCCO1': 2078,
 'COC(=O)OC': 581,
 'CCCCOC(=O)OC': 1,
 'CC(C)COC(=O)OC': 1,
 'CCC(C)OC(=O)OC': 1,
 'CCOC(=O)OCC': 2487,
 'CCOC(=O)OC(C)F': 14,
 'CC(OC(=O)OCC(F)(F)F)F': 11,
 'FC1COC(=O)O1': 51,
 'C1(C(OC(=O)O1)F)F': 23,
 'CCOC(=O)OC': 704,
 'FC(F)C(F)(F)COC(F)(F)C(F)F': 31,
 'C1C(OC(=O)O1)CC(C(F)(F)F)(C(F)(F)F)F': 15,
 'COC(=O)OCC(F)(F)F': 10,
 'COC(F)(C(F)(F)C(F)(F)F)C(F)(C(F)(F)F)C(F)(F)F': 5,
 'CC(OC(F)(F)C(F)C(F)(F)F)C(F)(F)C(F)C(F)(F)F': 15,
 'CC1COC(=O)O1': 5885,
 'CCOCCOCC': 55,
 'ClCCl': 94,
 'C1CCOC1': 292,
 'Cc1ccccc1': 39,
 'CC1CCCO1': 270,
 'COCCOC': 729,
 'C1COCO1': 153,
 'O=c1occo1': 10,
 'C=CCCCOCC1COC(=O)O1': 5,
 'CO[Si](CCCOCC1COC(=O)O1)(OC)OC': 5,
 'CCO[Si](CCCOCC1COC(=O)O1)(OCC)OCC': 5,
 'C[Si](C)(C)O[Si](C)(C)CCCOCC1COC(=O)O1': 5,
 'CCCC(=O)OC': 6,
 'CN1CCCC1=O': 1,
 'CCCCOCCOCCCC': 19,
 'COP(=O)(OC)OC': 5,
 'O=C1CCCO1': 415,
 'COC(=O)OCCCF': 5,
 'CCCOC(=O)OC': 21,
 'COC(=O)OCCC(F)(F)F': 5,
 'COC(=O)OCC(F)(F)C(F)F': 5,
 'COC(=O)OCC(F)(F)C(F)(F)F': 5,
 'O=C1OCC(

In [9]:
# Target vector: the last column of df_comb (conductivity_log, per the output below).
y = df_comb.iloc[:,-1]
y

0        2.459589
1        1.774952
2        1.686399
3        1.871802
4        0.741937
           ...   
10191    1.547563
10192    2.624669
10193    2.772589
10194    2.687847
10195    0.746688
Name: conductivity_log, Length: 10196, dtype: float64

In [12]:
# Hold out 10 % of the rows of df_comb as the test set; the fixed seed makes
# the random split repeatable.
rest_df_comb, test_df_comb, y_rest, y_test = train_test_split(
    df_comb,
    y,
    test_size=0.1,
    random_state=0,
)
test_df_comb

Unnamed: 0,solv_comb_sm,salt_sm,conductivity_log
6642,CC1COC(=O)O1.O=C1OCCO1,[Li+].F[P-](F)(F)(F)(F)F,2.063161
3424,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].[B-]12(OC(=O)C(=O)O1)OC(=O)C(=O)O2,2.004640
3779,CC1COC(=O)O1.C1COC(=O)O1,[Li+].[B-]12(OC(=O)C(=O)O1)OC(=O)C(=O)O2,2.011817
7066,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].[B-](F)(F)(F)F,-1.681759
176,O=C1OCCO1,[Li+].F[As-](F)(F)(F)(F)F,2.754934
...,...,...,...
7780,CC1CCCO1.O=C1OCCO1.CC1COC(=O)O1,[Li+].F[As-](F)(F)(F)(F)F,1.266948
7497,C1COB(OCCOB2OCCCO2)OC1,[Li+].[O-]Cl(=O)(=O)=O,-8.727914
5473,CC1COC(=O)O1.CCOC(=O)OCC,CC[N+](CC)(CC)CC.F[P-](F)(F)(F)(F)F,2.120264
8756,COCCOCCOCCOCCOC,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,1.079429


In [13]:
# Split the additional-feature table with the same test_size and random_state
# as the df_comb split; the displayed row labels match those of test_df_comb.
# y_rest / y_test are assigned again by this call.
rest_df_add, test_df_add, y_rest, y_test = train_test_split(
    df_add,
    y,
    test_size=0.1,
    random_state=0,
)
test_df_add

Unnamed: 0,solv_ratio_1,solv_ratio_2,solv_ratio_3,solv_ratio_4,mol_wt_solv_1,mol_wt_solv_2,mol_wt_solv_3,mol_wt_solv_4,mol_wt_salt,conc_salt,temperature
6642,0.700000,0.300000,0.000,0.0,102.031694,88.016044,0.000000,0.0,151.980186,2.004762,60.00
3424,0.400000,0.600000,0.000,0.0,102.031694,118.062994,0.000000,0.0,193.984627,0.430000,59.00
3779,0.600000,0.400000,0.000,0.0,102.031694,88.016044,0.000000,0.0,193.984627,0.480000,59.00
7066,0.600000,0.400000,0.000,0.0,102.031694,118.062994,0.000000,0.0,94.018923,2.000000,-30.00
176,1.000000,0.000000,0.000,0.0,88.016044,0.000000,0.000000,0.0,195.928020,1.000000,60.00
...,...,...,...,...,...,...,...,...,...,...,...
7780,0.750000,0.125000,0.125,0.0,86.073165,88.016044,102.031694,0.0,195.928020,1.000000,-30.00
7497,1.000000,0.000000,0.000,0.0,230.113299,0.000000,0.000000,0.0,105.964516,1.000000,30.00
5473,0.800000,0.200000,0.000,0.0,102.031694,118.062994,0.000000,0.0,275.123756,1.160764,10.00
8756,1.000000,0.000000,0.000,0.0,222.146724,0.000000,0.000000,0.0,286.933298,2.347062,50.04


In [14]:
# Carve the validation set out of the non-test rows: 0.11 of the remaining
# 90 % is roughly 9.9 % of the full dataset.
train_df_comb, val_df_comb, y_train, y_val = train_test_split(
    rest_df_comb,
    y_rest,
    test_size=0.11,
    random_state=0,
)
train_df_comb

Unnamed: 0,solv_comb_sm,salt_sm,conductivity_log
8357,CCOCCOCCF.O=C1OCCO1,[Li+].F[P-](F)(F)(F)(F)F,1.887070
4397,CC1COC(=O)O1,[Li+].F[P-](F)(F)(F)(F)F,1.077993
4649,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].F[P-](F)(F)(F)(F)F,0.362224
8266,COCCOC.COCCOCC(F)(F)C(F)F,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,1.857859
1124,CC1COC(=O)O1,[Li+].F[P-](F)(F)(F)(F)F,1.629241
...,...,...,...
102,FC(F)C(F)(F)COC(F)(F)C(F)F.COC(=O)OCC(F)(F)F.O...,[Li+].F[P-](F)(F)(F)(F)F,1.098612
1664,CCCCCCCCCC1COC(=O)O1,[Li+].C(F)(F)(F)S(=O)(=O)[O-],-3.028255
8946,COCCOCCOCCOCCOC,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,1.182280
2190,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].F[P-](F)(F)(F)(F)F,2.515274


In [15]:
# Same train/validation split (same test_size and random_state) applied to the
# additional-feature rows; y_train / y_val are assigned again by this call.
train_df_add, val_df_add, y_train, y_val = train_test_split(
    rest_df_add,
    y_rest,
    test_size=0.11,
    random_state=0,
)
train_df_add

Unnamed: 0,solv_ratio_1,solv_ratio_2,solv_ratio_3,solv_ratio_4,mol_wt_solv_1,mol_wt_solv_2,mol_wt_solv_3,mol_wt_solv_4,mol_wt_salt,conc_salt,temperature
8357,0.708105,0.291895,0.0,0.0,136.089958,88.016044,0.000000,0.0,151.980186,1.0000,15.0
4397,1.000000,0.000000,0.0,0.0,102.031694,0.000000,0.000000,0.0,151.980186,0.4800,-0.2
4649,0.600000,0.400000,0.0,0.0,102.031694,118.062994,0.000000,0.0,151.980186,0.4800,-29.5
8266,0.333333,0.666667,0.0,0.0,90.068080,190.061692,0.000000,0.0,286.933298,1.0000,40.0
1124,1.000000,0.000000,0.0,0.0,102.031694,0.000000,0.000000,0.0,151.980186,1.7000,25.0
...,...,...,...,...,...,...,...,...,...,...,...
102,0.200000,0.200000,0.6,0.0,232.013441,158.019079,88.016044,0.0,151.980186,1.2000,25.0
1664,1.000000,0.000000,0.0,0.0,214.156895,0.000000,0.000000,0.0,155.968029,0.3000,25.0
8946,1.000000,0.000000,0.0,0.0,222.146724,0.000000,0.000000,0.0,286.933298,1.2700,30.0
2190,0.900000,0.100000,0.0,0.0,102.031694,118.062994,0.000000,0.0,151.980186,0.6524,59.0


In [16]:
# Write the validation split without the row labels (index=False), then display it.
val_df_comb.to_csv('val_multi_comp_comb.csv', index=False)
val_df_comb

Unnamed: 0,solv_comb_sm,salt_sm,conductivity_log
1992,O=C1OCCO1.CCOC(=O)OC.CC(=O)OC,[Li+].F[P-](F)(F)(F)(F)F,2.541602
9701,CS(=O)(=O)F,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.576613
7558,C1COB(OCCOB2OCCCO2)OC1.C1COC(=O)O1,[Li+].[O-]Cl(=O)(=O)=O,2.271271
7189,O=C1CCCO1,[B-](C1=CC=CC=C1)(C2=CC=CC=C2)(C3=CC=CC=C3)C4=...,-1.525329
2568,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].F[P-](F)(F)(F)(F)F,1.332102
...,...,...,...
1822,C1COC(=O)O1.CCOC(=O)OC.COC(=O)OC,[Li+].[B-]12(OC(=O)C(=O)O1)OC(=O)C(=O)O2,0.683489
2923,CC1COC(=O)O1,[Li+].[B-]12(OC(=O)C(=O)O1)OC(=O)C(=O)O2,1.087214
6641,CC1COC(=O)O1.O=C1OCCO1,[Li+].F[P-](F)(F)(F)(F)F,2.389038
6578,CC1COC(=O)O1.CCOC(=O)OCC,[B-](F)(F)(F)F.CC[N+](CC)(CC)CC,1.590862


In [17]:
# Write the matching additional features for the validation split, then display them.
# Row labels are dropped (index=False), so this file pairs with
# val_multi_comp_comb.csv by row order only.
val_df_add.to_csv('val_multi_comp_add.csv', index=False)
val_df_add

Unnamed: 0,solv_ratio_1,solv_ratio_2,solv_ratio_3,solv_ratio_4,mol_wt_solv_1,mol_wt_solv_2,mol_wt_solv_3,mol_wt_solv_4,mol_wt_salt,conc_salt,temperature
1992,0.30,0.50,0.20,0.0,88.016044,104.047344,74.036779,0.0,151.980186,1.500000,30.00000
9701,1.00,0.00,0.00,0.0,97.983779,0.000000,0.000000,0.0,286.933298,1.000000,1.73000
7558,0.25,0.75,0.00,0.0,230.113299,88.016044,0.000000,0.0,105.964516,2.500000,80.68127
7189,1.00,0.00,0.00,0.0,86.036779,0.000000,0.000000,0.0,561.450581,0.010483,25.00000
2568,0.30,0.70,0.00,0.0,102.031694,118.062994,0.000000,0.0,151.980186,2.316200,29.40000
...,...,...,...,...,...,...,...,...,...,...,...
1822,0.25,0.50,0.25,0.0,88.016044,104.047344,90.031694,0.0,193.984627,0.600000,-10.00000
2923,1.00,0.00,0.00,0.0,102.031694,0.000000,0.000000,0.0,193.984627,0.410000,19.50000
6641,0.70,0.30,0.00,0.0,102.031694,88.016044,0.000000,0.0,151.980186,1.500000,60.00000
6578,0.30,0.70,0.00,0.0,102.031694,118.062994,0.000000,0.0,217.162493,0.251163,60.00000


In [18]:
# Write the test split (SMILES + target) without the row labels.
test_df_comb.to_csv('test_multi_comp_comb.csv', index=False)

In [19]:
# Write the additional features of the test split without the row labels.
test_df_add.to_csv('test_multi_comp_add.csv', index=False)

In [20]:
# Write the training split (SMILES + target) without the row labels.
train_df_comb.to_csv('train_multi_comp_comb.csv', index=False)

In [21]:
# Write the additional features of the training split without the row labels.
train_df_add.to_csv('train_multi_comp_add.csv', index=False)