In [2]:
import pandas as pd
import cirpy
import re

In [6]:
### Load the two source tables used throughout the notebook
js_hsp = pd.read_csv("JoshuaSchrier_Hansen-Solubility-Parameters.csv")
initial_solvents = pd.read_csv('solvent_data-1.csv')

### Names of the solvents already present in the initial table
initial_solvents_names = initial_solvents['Solvent'].values

### Normalised lookup keys: lowercase, with every blank space removed.
### Later cells compare incoming solvent names against this list.
process = []
for solvent_name in initial_solvents_names:
    process.append(solvent_name.lower().replace(" ", ""))

In [7]:
### process wolfram HSP data and save as csv ###

# Two alternatives, tried at each position of the lower-cased solvent name:
#   group 1: a letter followed by "-" when another letter comes next
#            -> the hyphen is removed            (e.g. "n-butyl" -> "nbutyl")
#   group 2: a letter directly followed by a digit, or a digit directly
#            followed by a letter
#            -> a hyphen is inserted after it    (e.g. "1butanol" -> "1-butanol")
pattern = r"([a-z]-(?=[a-z]))|([a-z](?=[0-9])|[0-9](?=[a-z]))"


def _normalise_hyphens(match):
    """Replacement callback for `pattern` (see the comment above it)."""
    if match.group(1):
        return match.group(1)[:-1]
    return match.group(2) + "-"


def _bracket_value(cell):
    """Return the text between the first '[' and the next ',' of `cell`.

    Assumes each cell looks like ``Something[<value>, <unit>...]`` -- TODO confirm
    against the raw export.
    """
    return cell.split('[')[1].split(',')[0]


# Collect plain records and build the frame once; concatenating a one-row
# frame per iteration is quadratic and gave every row the index label 0.
records = []
for idx, row in js_hsp.iterrows():
    # The solvent name is the text inside the last pair of double quotes.
    solvent = row['Solvent'].split('"')[-2].lower()
    records.append({
        'Solvent': re.sub(pattern, _normalise_hyphens, solvent),
        'Molar Volume': _bracket_value(row['Volume']),
        'Dispersion': _bracket_value(row['hd']),
        'Polarity': _bracket_value(row['hp']),
        'H Bonding': _bracket_value(row['hh']),
    })

js_hsp_processed = pd.DataFrame(
    records,
    columns=['Solvent', 'Molar Volume', 'Dispersion', 'Polarity', 'H Bonding'],
)

# NOTE(review): the next cell reads 'original_datasets/js_hsp_processed.csv',
# whereas this writes to the working directory -- confirm whether the file is
# moved by hand before relying on Restart & Run All.
js_hsp_processed.to_csv('js_hsp_processed.csv')

In [9]:
### add non duplicate HSP solvents to original dataframe (converted strings to lowercase and no spaces. there are still some duplicates)
js_hsp_proc = pd.read_csv('original_datasets/js_hsp_processed.csv')

js_hsp_names = js_hsp_proc['Solvent']


jshp_keys = ['Solvent', 'Molar Volume', 'Dispersion', 'Polarity', 'H Bonding']

# add solvents from js_hsp
# A solvent is "new" when its normalised name (lowercase, spaces removed) is
# absent from `process`. Names added here are NOT fed back into the lookup, so
# repeated names inside js_hsp_proc are all appended (hence "still some
# duplicates" above).
known_names = set(process)
new_rows = [
    {key: row[key] for key in jshp_keys}
    for idx, row in js_hsp_proc.iterrows()
    if row['Solvent'].lower().replace(" ", "") not in known_names
]

if new_rows:
    # Build all new rows in one frame: columns the source does not provide, and
    # missing source values, become ''. A single concat replaces the per-row
    # concat, and ignore_index avoids many rows sharing index label 0.
    new_solvents = (
        pd.DataFrame(new_rows, dtype=object)
        .reindex(columns=initial_solvents.columns.union(jshp_keys, sort=False))
        .fillna('')
    )
    initial_solvents = pd.concat([initial_solvents, new_solvents], ignore_index=True)

initial_solvents.to_csv('updated_datasets/updated_solvents_JSHSP.csv')

In [12]:
### add non duplicate solvent from HSP Calcs
hsp_calc = pd.read_csv('original_datasets/HSP_Calculations-1.csv')
updated = pd.read_csv('updated_datasets/updated_solvents_JSHSP.csv')

updated_names = updated['Solvent'].values
formatted_updated_names = [x.lower().replace(" ", "") for x in updated_names]

hsp_calc_keys = ['Solvent', 'Molar Volume', 'Dispersion', 'Polarity', 'H Bonding']

#add solvents from hsp_calc
# This cell now relies only on names it defines itself: it iterates
# hsp_calc_keys (previously jshp_keys from an earlier cell) and takes the
# column layout from `updated` (previously from initial_solvents).
known_names = set(formatted_updated_names)
new_rows = [
    {key: row[key] for key in hsp_calc_keys}
    for idx, row in hsp_calc.iterrows()
    if row['Solvent'].lower().replace(" ", "") not in known_names
]

if new_rows:
    # One frame for all new rows; anything not supplied by hsp_calc becomes ''.
    new_solvents = (
        pd.DataFrame(new_rows, dtype=object)
        .reindex(columns=updated.columns.union(hsp_calc_keys, sort=False))
        .fillna('')
    )
    # Single concat instead of one per row; ignore_index keeps labels unique.
    updated = pd.concat([updated, new_solvents], ignore_index=True)

updated.to_csv('updated_datasets/updated_solvents_CALCHSP.csv')


In [36]:
updated_CALCHSP = pd.read_csv('updated_datasets/updated_solvents_CALCHSP.csv')
sussol = pd.read_csv('original_datasets/SUSSOL_data-1.csv', sep=';')

updated_calc_names = updated_CALCHSP['Solvent'].values
formatted_updated_calc_names = [x.lower().replace(" ", "") for x in updated_calc_names]

updated_sussol_keys = ['Solvent', 'Boiling Point', 'Melting Point', 'Vapour Pressure', 'Density', 'Dispersion', 'Polarity', 'H Bonding', 'Viscosity', 'LogP', 'Refractive Index', 'Molar Volume', 'Molecular Weight', 'CHEM21']


def _sussol_value(row, key):
    """Return row[key]; for 'Molecular Weight' strings, strip every ','.

    Non-string Molecular Weight values (e.g. a missing value read as NaN) are
    returned unchanged instead of raising AttributeError on `.replace`.
    """
    value = row[key]
    if key == 'Molecular Weight' and isinstance(value, str):
        return value.replace(',', '')
    return value


# Missing values are compared against '' below, so blank them once up front
# rather than calling fillna(inplace=True) on every matching row.
updated_CALCHSP = updated_CALCHSP.fillna('')

# Row position of the FIRST occurrence of each normalised name (the same row
# list.index() would return).
first_position = {}
for position, name in enumerate(formatted_updated_calc_names):
    first_position.setdefault(name, position)

#add solvents from sussol; for solvents already present, fill only empty cells
new_rows = []
for idx, row in sussol.iterrows():
    index = first_position.get(row['Solvent'].lower().replace(" ", ""))

    if index is None:
        # Unknown solvent: queue a new row. It is appended after the loop so
        # that, while existing rows are being updated, the frame keeps the
        # unique 0..n-1 labels from read_csv and label == position.
        # (Appending inside the loop gave every new row label 0, so a later
        # `.at[0, ...]` write could hit all of them.)
        new_rows.append({key: _sussol_value(row, key) for key in updated_sussol_keys})
        continue

    print(index)
    print(row['Solvent'])
    for key in updated_sussol_keys:
        if updated_CALCHSP.at[index, key] == '':
            updated_CALCHSP.at[index, key] = _sussol_value(row, key)

if new_rows:
    # Columns SUSSOL does not supply, and missing SUSSOL values, become ''.
    new_solvents = (
        pd.DataFrame(new_rows, dtype=object)
        .reindex(columns=updated_CALCHSP.columns.union(updated_sussol_keys, sort=False))
        .fillna('')
    )
    updated_CALCHSP = pd.concat([updated_CALCHSP, new_solvents], ignore_index=True)

updated_CALCHSP.to_csv('updated_datasets/updated_sussol.csv')



        

  updated_CALCHSP.fillna('',inplace=True)


68
1-butanol
99
3-pentanone
3
Acetic acid
4
Acetic anhydride
5
acetone
6
acetonitrile
7
anisole
8
benzene
10
benzyl alcohol
136
carbon disulfide
11
Carbon tetrachloride
137
chlorobenzene
12
chloroform
13
cyclohexane
141
cyclohexanol
14
cyclohexanone
15
Cyclopentyl methyl ether
16
Cyrene
17
dichloromethane
18
Diethyl ether
157
Diisopropyl ether
19
Dimethyl carbonate
43
dimethylformamide
20
dimethylsulfoxide
22
ethanol
23
Ethyl acetate
25
ethylene carbonate
26
ethylene glycol
178
formic acid
28
heptane
31
hexane
183
Isobutyl acetate
35
isopropylacetate
36
methanol
191
Methyl acetate
37
Methylcyclohexane
236
n-Butyl Acetate
241
n-Butyl Propionate
46
Nitromethane
48
pentane
49
propylene carbonate
51
pyridine
53
tetrahydrofuran
224
Tetrahydrofurfuryl alcohol
54
toluene


In [71]:
def drop_duplicates(df, outname, resolver=None):
    """Drop rows of `df` whose solvent resolves to an already-seen CAS result.

    Each value of ``df['Solvent']`` is passed to ``resolver(name, 'cas')``.
    The first row producing a given result is kept; later rows producing an
    equal result are removed. Rows for which the resolver returns ``None``
    are kept and reported on stdout as ``to_check: <index label> <name>`` so
    they can be checked manually.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Solvent' column. Modified IN PLACE (duplicate rows
        are dropped by index label).
    outname : str
        Output path WITHOUT extension. The de-duplicated frame is written to
        ``outname + '.csv'`` and the removed ``[name, index label]`` pairs,
        one per line, to ``outname + '.txt'``.
    resolver : callable, optional
        ``resolver(name, 'cas')`` returning ``None``, a string, or a list of
        strings. Defaults to ``cirpy.resolve``; pass a stub to run offline.

    Returns
    -------
    None
    """
    if resolver is None:
        resolver = cirpy.resolve

    seen_cas = set()
    removed = []

    for idx, solvent in zip(df.index, df['Solvent']):
        cas = resolver(solvent, 'cas')
        if cas is None:
            ### if cas number is None print name and index so we can manually check
            print('to_check:', idx, solvent)
            continue

        # A list result is keyed as a tuple so it can live in a set; equality
        # is the same element-wise comparison the list had.
        cas_key = tuple(cas) if isinstance(cas, list) else cas
        if cas_key in seen_cas:
            removed.append([solvent, idx])
        else:
            seen_cas.add(cas_key)

    # Drop once, after the scan, instead of mutating the frame mid-iteration.
    if removed:
        df.drop(index=[label for _, label in removed], inplace=True)

    print('removed', len(removed), 'Solvent')

    df.to_csv(outname + '.csv')
    with open(outname + '.txt', 'w') as f:
        for entry in removed:
            print(entry, file=f)
        

In [41]:
# Reload the SUSSOL-merged table from disk, then remove rows whose solvent
# resolves to a CAS result already seen earlier in the table. The function
# writes '<outname>.csv' and '<outname>.txt' itself.
updated_sussol = pd.read_csv('updated_datasets/updated_sussol.csv')
drop_duplicates(
    df=updated_sussol,
    outname='updated_datasets/first_duplicates_dropped_sussol',
)

to_check: 16 Cyrene
to_check: 35 Isopropylacetate
to_check: 42 Dimethylethylene urea
to_check: 44 Dimethylpropylene urea
to_check: 79 2,2-butoxyethoxyethylacetate
to_check: 116 benzylbutylphthalate
to_check: 119 bis-2-chloroisopropylether
to_check: 149 diethyleneglycolbutylether
to_check: 150 diethyleneglycoldimethylether
to_check: 151 diethyleneglycolmethylether
to_check: 152 diethyleneglycolmonoethylether
to_check: 160 dinoctylphthalate
to_check: 163 dipropyleneglycolbutylether
to_check: 164 dllimonene
to_check: 165 dowanoldpm
to_check: 174 ethyleneglycolbutylether
to_check: 175 ethyleneglycolmonophenylether
to_check: 183 isobutylacetate
to_check: 185 isopentylacetate
to_check: 188 mcresol
to_check: 190 mesityloxide
to_check: 198 nmethylpyrrolidine
to_check: 199 n,ndimethylacetamide
to_check: 200 n,ndimethylformamide
to_check: 201 n,ndimethylformamide
to_check: 202 n,ndimethylhydrazine
to_check: 206 oxylene
to_check: 212 propyleneglycolmonomethyletheracetate
to_check: 216 secbutylace

In [44]:
### Add Kamlet-Taft data to existing solvents or append new row if solvent doesnt exist
kt = pd.read_csv('original_datasets/kt_data-1.csv')
updated2 = pd.read_csv('updated_datasets/first_duplicates_dropped_sussol.csv')

updated2_names = updated2['Solvent'].values
formatted_updated2_names = [x.lower().replace(" ", "") for x in updated2_names]

kt_keys = ['Solvent', 'Alpha', 'Beta', 'Pi']

# Missing values are compared against '' below, so blank them once up front
# rather than calling fillna(inplace=True) on every matching row.
updated2 = updated2.fillna('')

# Row position of the FIRST occurrence of each normalised name (the same row
# list.index() would return).
first_position = {}
for position, name in enumerate(formatted_updated2_names):
    first_position.setdefault(name, position)

#add solvents from kt; for solvents already present, fill only empty cells
new_rows = []
for idx, row in kt.iterrows():
    index = first_position.get(row['Solvent'].lower().replace(" ", ""))

    if index is None:
        # Unknown solvent: queue a new row. It is appended after the loop so
        # that, while existing rows are being updated, the frame keeps the
        # unique 0..n-1 labels from read_csv and label == position.
        # (Appending inside the loop gave every new row label 0, so the
        # `.at[0, ...]` write for a match at position 0 could also overwrite
        # the freshly appended rows.)
        new_rows.append({key: row[key] for key in kt_keys})
        continue

    print(index)
    print(row['Solvent'])
    for key in kt_keys:
        if updated2.at[index, key] == '':
            updated2.at[index, key] = row[key]

if new_rows:
    # Columns the KT table does not supply, and missing KT values, become ''.
    new_solvents = (
        pd.DataFrame(new_rows, dtype=object)
        .reindex(columns=updated2.columns.union(kt_keys, sort=False))
        .fillna('')
    )
    updated2 = pd.concat([updated2, new_solvents], ignore_index=True)

updated2.to_csv('updated_datasets/updated_kt.csv')

  updated2.fillna('',inplace=True)


57
1,1,1-trichloroethane
58
1,1,2,2-tetrachloroethane
59
1,1-dichloroethane
62
1,2-dibromoethane
63
1,2-dichlorobenzene
0
1,2-dichloroethane
1
1,2-dimethoxyethane
65
1,3-dioxolane
68
1-chlorobutane
79
2-butanol
87
2-pentanone
92
3-pentanone
3
acetic acid
4
acetic anhydride
6
acetonitrile
99
acetophenone
8
benzene
106
benzonitrile
10
benzyl alcohol
112
bromobenzene
118
butyl acetate
119
butylamine
128
carbon disulfide
129
chlorobenzene
13
cyclohexane
133
cyclohexanol
14
cyclohexanone
243
dibutyl ether
17
dichloromethane
139
diethyl carbonate
18
diethyl ether
147
diethyl sulfide
138
diethylamine
148
diisopropyl ether
149
dimethyl phthalate
20
dimethyl sulfoxide
21
dioxane
22
ethanol
23
ethyl acetate
166
ethyl formate
167
formamide
168
formic acid
28
heptane
30
hexamethylphosphoramide
31
hexane
36
methanol
181
methyl acetate
184
morpholine
233
N,N-dimethylacetamide
45
nitrobenzene
46
nitromethane
48
pentane
195
phenol
51
pyridine
201
quinoline
207
sulfolane
53
tetrahydrofuran
54
toluene
2

In [49]:
updated_kt_data = pd.read_csv('updated_datasets/updated_kt.csv')
# drop_duplicates appends '.csv' / '.txt' to the name it is given, so pass the
# bare stem. Passing 'first_duplicate_kt.csv' produced
# 'first_duplicate_kt.csv.csv', while the following cells read
# 'first_duplicate_kt.csv'.
drop_duplicates(updated_kt_data, 'first_duplicate_kt')

                          
                          

to_check: 16 Cyrene
to_check: 35 Isopropylacetate
to_check: 42 Dimethylethylene urea
to_check: 44 Dimethylpropylene urea
to_check: 77 2,2-butoxyethoxyethylacetate
to_check: 108 benzylbutylphthalate
to_check: 111 bis-2-chloroisopropylether
to_check: 140 diethyleneglycolbutylether
to_check: 141 diethyleneglycoldimethylether
to_check: 142 diethyleneglycolmethylether
to_check: 143 diethyleneglycolmonoethylether
to_check: 151 dinoctylphthalate
to_check: 154 dipropyleneglycolbutylether
to_check: 155 dllimonene
to_check: 156 dowanoldpm
to_check: 164 ethyleneglycolbutylether
to_check: 165 ethyleneglycolmonophenylether
to_check: 173 isobutylacetate
to_check: 175 isopentylacetate
to_check: 178 mcresol
to_check: 180 mesityloxide
to_check: 188 nmethylpyrrolidine
to_check: 189 n,ndimethylhydrazine
to_check: 193 oxylene
to_check: 199 propyleneglycolmonomethyletheracetate
to_check: 203 secbutylacetate
to_check: 208 tertbutylacetate
to_check: 225 Dipropylene Glycol Mono n-Butyl Ether
to_check: 227 Gly

In [53]:

import cirpy

# Quick check of the name -> SMILES lookup on a single, known compound.
test_name = "2-Methyl-2-butanol"
smi_string = cirpy.resolve(test_name, 'smiles')
print(smi_string)

CCC(C)(C)O


In [67]:
# Resolve a SMILES string for every solvent name in the de-duplicated KT table.
# One lookup is made per name, in table order; the result (which may be None
# when nothing is found) is stored at the matching position of `smi_string`.
# A plain loop is used so that results gathered so far are kept if the cell is
# interrupted part-way through.
first_duplicate_data = pd.read_csv('first_duplicate_kt.csv')
all_solvent_names = first_duplicate_data['Solvent'].values

smi_string = []
for solvent_name in all_solvent_names:
    smi_string.append(cirpy.resolve(solvent_name, 'smiles'))





    
                                   

                            

KeyboardInterrupt: 

In [69]:
initial_data_acs = pd.read_csv('original_datasets/initial_solvent_data1.csv')
first_duplicate_data = pd.read_csv('first_duplicate_kt.csv')

# NOTE(review): `smi_string` is the list built by the SMILES-lookup cell. The
# saved output of that cell shows a KeyboardInterrupt, so the list may be
# incomplete -- confirm it was fully populated before trusting this merge.

# Collect one new row per ACS solvent whose SMILES is not in `smi_string`.
new_rows = []
for d, row in initial_data_acs.iterrows():
    smiles_string = row['smiles']
    if smiles_string in smi_string:
        continue

    names = cirpy.resolve(smiles_string, 'names')
    if names is None:
        # No name could be resolved: report it for a manual check instead of
        # failing on `None[0]`.
        print('to_check:', smiles_string)
        continue

    # The resolver returns a bare string when there is exactly one name and a
    # list when there are several; indexing a bare string with [0] would keep
    # only its first character.
    name_smi = names if isinstance(names, str) else names[0]
    new_rows.append({'Solvent': name_smi.lower().replace(' ', '')})

if new_rows:
    # Every column other than 'Solvent' is left as '' for the new rows.
    new_solvents = (
        pd.DataFrame(new_rows, dtype=object)
        .reindex(columns=first_duplicate_data.columns)
        .fillna('')
    )
    # Single concat instead of one per row; ignore_index keeps labels unique.
    first_duplicate_data = pd.concat([first_duplicate_data, new_solvents], ignore_index=True)

first_duplicate_data.to_csv('updated_datasets/second_duplicate_data.csv')


        
        
        
      

In [72]:
# Reload the table extended with the ACS solvents and run the CAS-based
# duplicate removal once more; the function writes '<outname>.csv' and
# '<outname>.txt' itself.
second_dataset = pd.read_csv('updated_datasets/second_duplicate_data.csv')
drop_duplicates(
    df=second_dataset,
    outname='updated_datasets/final_solvent_list',
)


to_check: 16 Cyrene
to_check: 35 Isopropylacetate
to_check: 42 Dimethylethylene urea
to_check: 44 Dimethylpropylene urea
to_check: 77 2,2-butoxyethoxyethylacetate
to_check: 108 benzylbutylphthalate
to_check: 111 bis-2-chloroisopropylether
to_check: 140 diethyleneglycolbutylether
to_check: 141 diethyleneglycoldimethylether
to_check: 142 diethyleneglycolmethylether
to_check: 143 diethyleneglycolmonoethylether
to_check: 151 dinoctylphthalate
to_check: 154 dipropyleneglycolbutylether
to_check: 155 dllimonene
to_check: 156 dowanoldpm
to_check: 164 ethyleneglycolbutylether
to_check: 165 ethyleneglycolmonophenylether
to_check: 173 isobutylacetate
to_check: 175 isopentylacetate
to_check: 178 mcresol
to_check: 180 mesityloxide
to_check: 188 nmethylpyrrolidine
to_check: 189 n,ndimethylhydrazine
to_check: 193 oxylene
to_check: 199 propyleneglycolmonomethyletheracetate
to_check: 203 secbutylacetate
to_check: 208 tertbutylacetate
to_check: 225 Dipropylene Glycol Mono n-Butyl Ether
to_check: 227 Gly

In [157]:
# Look up a CAS number for every solvent in the final list and store it in a
# new 'Cas Number' column. When the resolver returns a list, its first entry
# is used; when it returns None, the placeholder 'NOT FOUND' is stored.
# Only successfully resolved numbers are printed.
final_solvent_list = pd.read_csv('updated_datasets/final_solvent_list.csv')

cas_numbers = []

for solvent_name in final_solvent_list['Solvent']:
    resolved = cirpy.resolve(solvent_name, 'cas')

    if resolved is None:
        cas_numbers.append('NOT FOUND')
        continue

    first_cas = resolved[0] if isinstance(resolved, list) else resolved
    cas_numbers.append(first_cas)
    print(first_cas)

final_solvent_list['Cas Number'] = cas_numbers
final_solvent_list.to_csv('updated_datasets/final_list_casnumbers.csv')
    

107-06-2
173201-80-4
109-86-4
157090-22-7
108-24-7
67-64-1
54841-72-4
100-66-3
27271-55-2
98-08-8
1336-27-2
56-23-5
67-66-3
25012-93-5
11119-77-0
5614-37-9
1605-72-7
60-29-7
616-38-6
164071-41-4
28347-88-8
121182-78-3
141-78-6
97-64-3
96-49-1
37221-95-7
108-29-2
142-82-5
920-66-1
630-31-9
92112-69-1
123-51-3
68989-27-5
8013-70-5
54841-71-3
108-87-2
78-93-3
108-10-1
1634-04-4
25265-68-3
68-12-2
98-95-3
19527-13-0
26138-58-9
109-66-0
108-32-7
106-42-3
152758-95-7
75-65-0
77392-70-2
3101-08-4
28606-06-6
14314-42-2
74552-83-3
79-34-5
75-34-3
9002-85-1
119-64-2
624-61-3
95-50-1
591-21-9
646-06-0
105-05-5
107569-51-7
109-69-3
540-54-5
85566-12-7
107-98-2
78900-94-4
220713-26-8
770-35-4
71-23-8
112-70-9
108-83-8
4221-99-2
112-07-2
111-44-4
110-80-5
111-15-9
98-01-1
79-46-9
123-96-6
27154-67-2
22580-55-8
627-30-5
623-37-0
109-78-4
96-22-0
123-42-2
63121-19-7
40747-85-1
589-62-8
110-12-3
75-07-0
98-86-2
75-36-5
63908-52-1
107-18-6
71-41-0
100-52-7
65-85-0
100-47-0
120-51-4
103-50-4
81846-81-3
1