In [1]:
import time
import cirpy
import re
import chemicals

from pypdf import PdfReader
from tqdm import tqdm
from urllib.error import HTTPError 
from pathlib import Path
from urllib.parse import quote
from pubchemprops.pubchemprops import get_second_layer_props
import pubchempy as pcp
from rdkit.Chem.Descriptors import ExactMolWt

from IPython.display import Markdown, Image
import requests
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolsToGridImage


In [2]:
HERE = Path(_dh[-1])
DATA = HERE / "data"

In [3]:
final_solvent_list = pd.read_csv('updated_datasets/final_solvent_list.csv')

prop_pubchem = {'Molecular Weight' : 'MolecularWeight'}

for d, row in final_solvent_list.iterrows():
    solvent_name = row['Solvent']
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{solvent_name}/cids/JSON"
    r = requests.get(url)
    #r.raise_for_status()
    response = r.json()
    if "IdentifierList" in response:
        cid = response["IdentifierList"]["CID"][0]
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/BoilingPoint/JSON"
        r = requests.get(url)
        #r.raise_for_status()
        response = r.json()
        if "PropertyTable" in response:
            mol_weight = response["PropertyTable"]["Properties"][0]['BoilingPoint']
            print(solvent_name, mol_weight)
        #else:
            #raise ValueError(f"Could not find matches for PubChem CID: {cid}")
        #print(f"Molecular weight for {name} is:\n{mol_weight}")
    #else:
        #print(f"Could not find matches for compound: {solvent_name}")
    #print(f"PubChem CID for {solvent_name} is:\n{cid}")

In [4]:

#functions for each properties
def get_properties(cas_number):
    cas_props = get_second_layer_props(cas_number, ['Density', 'Melting Point', 'Boiling Point', 'Viscosity'])
    Density = (cas_props['Density'][0]['Value']['StringWithMarkup'][0]['String'][0:8])
    MeltingPoint = (cas_props['Melting Point'][1]['Value']['StringWithMarkup'][0]['String'][0:8])
    BoilingPoint = (cas_props['Boiling Point'][1]['Value']['StringWithMarkup'][0]['String'][0:8])
    Viscosity = (cas_props['Viscosity'][0]['Value']['StringWithMarkup'][0]['String'][0:5])
    return Density, MeltingPoint, BoilingPoint, Viscosity

D,M,B,V = get_properties('108-24-7')

print(D,M,B,V)



    








        


1.08 (US -73.4 °C 139.5 °C 0.843


In [5]:

def get_properties(cas_number):
    value_dict = {}
    #MolecularWeightprops = get_first_layer_props(cas_number, ['MolecularWeight'])
#    MolecularWeight = MolecularWeightprops['MolecularWeight'] 
    try:
        #cas_props = get_second_layer_props(cas_number, ['Density', 'Melting Point', 'Boiling Point', 'Viscosity', 'Vapor Pressure', 'LogP'])
        cas_props = get_second_layer_props(cas_number, ['Hazard Classes and Categories'])
        print(cas_props)
    except (HTTPError,UnicodeDecodeError):
        cas_props = {}
        print('Cas_NotFound',cas_number)
    for key, val in cas_props.items():
        for i in val:
            if 'Value' in i.keys():
                den_value = i['Value']
                if 'StringWithMarkup' in den_value.keys():
                    #print(den_value['StringWithMarkup'][0])
                    string_dict = den_value['StringWithMarkup'][0]
                    if 'String' in string_dict.keys():
                        if key in ['Melting Point', 'Boiling Point']:
                            if '°C' in string_dict['String']:
                                value_dict[key] = string_dict['String']
                        elif key == 'Vapor Pressure':
                            if 'mm Hg at 25 °C' in string_dict['String']:
                                value_dict[key] = string_dict['String']
                        elif key == 'Viscosity':
                            if 'Pa' in string_dict['String']:
                                value_dict[key] = string_dict['String']
                        #elif key == 'Other Experimental Properties':
                                print(value_dict[key])
                        else:
                            value_dict[key] = string_dict['String']
 #   cas_props['Molecular Weight'] = MolecularWeight
                        
                        
                        
    return value_dict

#MW,D,M,B,V,VP,LP = get_properties('107-06-2')
get_properties('67-64-1')

#print(MW,D,M,B,V,VP,LP)

{}


{}

In [6]:
def get_properties(cas_number):
    cas_props = get_second_layer_props(cas_number, ['Density', 'Melting Point', 'Boiling Point', 'Viscosity'])
    Density = (cas_props['Density'][0]['Value']['StringWithMarkup'][0]['String'][0:8])
    MeltingPoint = (cas_props['Melting Point'][1]['Value']['StringWithMarkup'][0]['String'][0:8])
    BoilingPoint = (cas_props['Boiling Point'][1]['Value']['StringWithMarkup'][0]['String'][0:8])
    Viscosity = (cas_props['Viscosity'][0]['Value']['StringWithMarkup'][0]['String'][0:15])
    return Density, MeltingPoint, BoilingPoint, Viscosity

D,M,B,V = get_properties('108-24-7')

print(D,M,B,V)

1.08 (US -73.4 °C 139.5 °C 0.843 mPa.s at 


In [7]:
def get_LogP(LogP):
    LogP_props = get_second_layer_props(LogP, ['LogP'])
    x = LogP_props['LogP']
    print(x)
    LogP = float(x[1]['Value']['StringWithMarkup'][0]['String'][0:5])
    return LogP

get_LogP('108-24-7')

[{'ReferenceNumber': 30, 'Reference': ['The Good Scents Company Information System'], 'Value': {'StringWithMarkup': [{'String': '-0.480'}]}}, {'ReferenceNumber': 31, 'Name': '', 'Value': {'StringWithMarkup': [{'String': '-0.27 (calculated)'}]}}]


-0.27

In [14]:
formatted_properties_values = pd.read_csv('updated_datasets/updated_with_viscosity_values.csv')
formatted_properties_values.fillna('', inplace = True)


dfindex = formatted_properties_values.index.values
#print(dfindex)
for idx in tqdm(dfindex):
    if idx > 56:
        cas_number = formatted_properties_values.loc[idx, 'Cas Number']
        prop_dict = get_properties(cas_number)
        for key, val in enumerate(prop_dict):
            key_values = formatted_properties_values.loc[idx, key]
            if key_values == '':
                formatted_properties_values.at[idx, key] = val

    formatted_properties_values.to_csv('updated_datasets/updated_with_all_viscosity_values.csv')
    
    

  formatted_properties_values.fillna('', inplace = True)
 15%|█▌        | 57/369 [00:04<00:23, 13.55it/s] 


KeyError: 0

In [15]:
formatted_solvent_properties = pd.read_csv('updated_datasets/formatted_properties_values.csv')
formatted_solvent_properties.fillna('', inplace = True)

dfindex = formatted_solvent_properties.index.values
#print(dfindex)
for idx in tqdm(dfindex):
    if idx > 56:
        cas_number = formatted_solvent_properties.loc[idx, 'Cas Number']
        formatted_smi_string = cirpy.resolve(cas_number, 'smiles')
        try:
            get_mol_weight = ExactMolWt(Chem.MolFromSmiles(formatted_smi_string))
        except TypeError:
            print(idx)
            get_mol_weight = ''
        mol_weight = formatted_solvent_properties.loc[idx, 'Molecular Weight']
        #print(mol_weight)
        if mol_weight == '':
               formatted_solvent_properties.at[idx, 'Molecular Weight'] = get_mol_weight

    formatted_solvent_properties.to_csv('updated_datasets/updated_mol_weight_values.csv')
        



  formatted_solvent_properties.fillna('', inplace = True)
 65%|██████▍   | 266/411 [03:15<02:05,  1.16it/s]

265


 65%|██████▍   | 267/411 [03:16<01:55,  1.25it/s]

266


 65%|██████▌   | 268/411 [03:17<01:52,  1.27it/s]

267


 65%|██████▌   | 269/411 [03:17<01:44,  1.36it/s]

268


 66%|██████▌   | 271/411 [03:19<01:44,  1.34it/s]

270


 66%|██████▌   | 272/411 [03:19<01:41,  1.36it/s]

271


 66%|██████▋   | 273/411 [03:20<01:52,  1.22it/s]

272


 67%|██████▋   | 274/411 [03:21<01:51,  1.23it/s]

273


 67%|██████▋   | 275/411 [03:22<01:44,  1.30it/s]

274


 67%|██████▋   | 276/411 [03:23<01:42,  1.31it/s]

275


 67%|██████▋   | 277/411 [03:23<01:35,  1.40it/s]

276


 72%|███████▏  | 294/411 [03:39<01:47,  1.09it/s]

293


100%|██████████| 411/411 [05:29<00:00,  1.25it/s]


In [16]:
def get_refractive(cas_number):
    value_dict = {}
    try:
        cas_props = get_second_layer_props(cas_number, ['Other Experimental Properties'])
    except (HTTPError,UnicodeDecodeError):
        print(cas_number)
    return cas_props

get_refractive('107-06-2')
        

{'Other Experimental Properties': [{'ReferenceNumber': 37,
   'Description': 'PEER REVIEWED',
   'Reference': ['IARC. Monographs on the Evaluation of the Carcinogenic Risk  of Chemicals to Humans. Geneva: World Health Organization,  International Agency for Research on Cancer, 1972-PRESENT.  (Multivolume work). Available at: https://monographs.iarc.fr/ENG/Classification/index.php, p. V20 430 (1979)'],
   'Value': {'StringWithMarkup': [{'String': '1 ppm in air = 4 mg/cu m'}]}},
  {'ReferenceNumber': 37,
   'Description': 'PEER REVIEWED',
   'Reference': ["Larranaga, M.D., Lewis, R.J. Sr., Lewis, R.A.; Hawley's Condensed Chemical Dictionary 16th  Edition. John Wiley & Sons, Inc. Hoboken, NJ 2016., p. 583"],
   'Value': {'StringWithMarkup': [{'String': 'Resistant to oxidation'}]}},
  {'ReferenceNumber': 37,
   'Description': 'PEER REVIEWED',
   'Reference': ['Snedecor G et al; Chloroethylenes. Kirk-Othmer Encyclopedia of Chemical Technology. (1999-2018). New York, NY: John Wiley & Sons. O

In [17]:
cas_props = get_second_layer_props('107-06-2', ['Other Experimental Properties'])
print(cas_props)


{'Other Experimental Properties': [{'ReferenceNumber': 37, 'Description': 'PEER REVIEWED', 'Reference': ['IARC. Monographs on the Evaluation of the Carcinogenic Risk  of Chemicals to Humans. Geneva: World Health Organization,  International Agency for Research on Cancer, 1972-PRESENT.  (Multivolume work). Available at: https://monographs.iarc.fr/ENG/Classification/index.php, p. V20 430 (1979)'], 'Value': {'StringWithMarkup': [{'String': '1 ppm in air = 4 mg/cu m'}]}}, {'ReferenceNumber': 37, 'Description': 'PEER REVIEWED', 'Reference': ["Larranaga, M.D., Lewis, R.J. Sr., Lewis, R.A.; Hawley's Condensed Chemical Dictionary 16th  Edition. John Wiley & Sons, Inc. Hoboken, NJ 2016., p. 583"], 'Value': {'StringWithMarkup': [{'String': 'Resistant to oxidation'}]}}, {'ReferenceNumber': 37, 'Description': 'PEER REVIEWED', 'Reference': ['Snedecor G et al; Chloroethylenes. Kirk-Othmer Encyclopedia of Chemical Technology. (1999-2018). New York, NY: John Wiley & Sons. Online Posting Date: 16 Jan 2

In [18]:
def Extracting_de_constants():
    with open('updated_datasets/Dilectric-Constants_organic_solvents_2.pdf', 'rb') as f:
        print(f)
        reader = PdfReader(f)
        de_constant = []
        com_names = []
        cas_num = []

        for j in tqdm(range (1,73)):
            pg = reader.pages [j]
            txt = pg.extract_text()
            #print(txt.strip())
            #print(txt.split('\n'))
            for n in txt.split('\n'):
                #print(n)
                numbers = re.findall(r"[-+]?(?:\d*\.*\d+)", n)
                alphabets = re.findall(r'[a-zA-Z]+', n)
                #print(alphabets,numbers)
                if len(numbers) > 0:
                    #print(alphabets, numbers [-1])
                    #de_constant.append(numbers[-1])
                    combined_name = ''
                    for d in alphabets:
                        combined_name = (combined_name + d+' ')
                        try:
                            formatted_cas = cirpy.resolve(combined_name, 'cas')
                        except HTTPError:
                            formatted_cas = None
                    if formatted_cas is not None:
                        cas_num.append(formatted_cas)
                        com_names.append(combined_name)
                        de_constant.append(numbers[-1])
                        #print(combined_name)
            data = {'Dielectric constant': de_constant,
                   'Solvent': com_names,
                   'Cas Number': cas_num} 
            print(data)
            df = pd.DataFrame(data)
            df.to_csv('Data_dielectric_constant.csv')
            
    return de_constant,com_names,cas_num

de_constant, com_names, cas_num = Extracting_de_constants()
print(len(de_constant))
#print(com_names)
print(len(com_names))
print(len(cas_num))
#print(de_constant,com_names)

<_io.BufferedReader name='updated_datasets/Dilectric-Constants_organic_solvents_2.pdf'>


  1%|▏         | 1/72 [00:39<47:17, 39.97s/it]

{'Dielectric constant': ['22', '7', '8.2', '6.1', '17.2', '-11.5', '4.5', '3.4', '2.2', '2.4', '6', '-1.8', '25', '22', '18.9', '16.5', '1.0072', '7.2', '7'], 'Solvent': ['ALLYL ALCOHOL ', 'ALLYL BROMIDE ', 'ALLYL CHLORIDE ', 'ALLYL IODIDE ', 'ALLYL ISOTHIOCYANATE ', 'ALUMINA ', 'ALUMINA ', 'ALUMINUM BROMIDE ', 'ALUMINUM FLUORIDE ', 'ALUMINUM OLEATE ', 'ALUMINUM PHOSPHATE ', 'ALUMINUM POWDER ', 'AMMONIA ', 'AMMONIA ', 'AMMONIA ', 'AMMONIA ', 'AMMONIA GAS ', 'AMMONIUM BROMIDE ', 'AMMONIUM CHLORIDE '], 'Cas Number': ['107-18-6', '106-95-6', '107-05-1', '556-56-9', ['58391-87-0', '50978-48-8', '57-06-7', '50888-64-7', '107231-30-1'], '1302-74-5', '1302-74-5', ['12794-92-2', '7727-15-3', '39380-76-2', '56803-00-0'], '7784-18-1', '688-37-9', ['135151-77-8', '13765-93-0', '36201-72-6', '37324-42-8', '51668-55-4', '52350-11-5', '7784-30-7', '8022-59-1', '89686-54-4', '93237-81-1'], ['113962-66-6', '12766-45-9', '182260-45-3', '37202-64-5', '39302-71-1', '39332-62-2', '80341-19-1', '91728-14-2

  3%|▎         | 2/72 [01:17<45:12, 38.75s/it]

{'Dielectric constant': ['22', '7', '8.2', '6.1', '17.2', '-11.5', '4.5', '3.4', '2.2', '2.4', '6', '-1.8', '25', '22', '18.9', '16.5', '1.0072', '7.2', '7', '5', '35.5', '15.8', '11.2', '5.1', '6.3', '6.6', '3.1', '5.7', '6.9', '9.1', '17.4', '4.6', '2', '4.7', '7.8', '7.3', '5.5', '15.8', '4.3', '3.2', '20.9', '33'], 'Solvent': ['ALLYL ALCOHOL ', 'ALLYL BROMIDE ', 'ALLYL CHLORIDE ', 'ALLYL IODIDE ', 'ALLYL ISOTHIOCYANATE ', 'ALUMINA ', 'ALUMINA ', 'ALUMINUM BROMIDE ', 'ALUMINUM FLUORIDE ', 'ALUMINUM OLEATE ', 'ALUMINUM PHOSPHATE ', 'ALUMINUM POWDER ', 'AMMONIA ', 'AMMONIA ', 'AMMONIA ', 'AMMONIA ', 'AMMONIA GAS ', 'AMMONIUM BROMIDE ', 'AMMONIUM CHLORIDE ', 'AMYL ACETATE ', 'AMYL ALCOHOL ', 'AMYL ALCOHOL ', 'AMYL ALCOHOL ', 'AMYL BENZOATE ', 'AMYL BROMIDE ', 'AMYL CHLORIDE ', 'AMYL ETHER ', 'AMYL FORMATE ', 'AMYL IODIDE ', 'AMYL NITRATE ', 'AMYL THIOCYANATE ', 'AMYLAMINE ', 'AMYLENE ', 'AMYLMERCAPTAN ', 'ANILINE ', 'ANILINE ', 'ANILINE ', 'ANISALDEHYDE ', 'ANISOLE ', 'ANTIMONY PENTACH

  4%|▍         | 3/72 [01:53<42:57, 37.36s/it]

{'Dielectric constant': ['22', '7', '8.2', '6.1', '17.2', '-11.5', '4.5', '3.4', '2.2', '2.4', '6', '-1.8', '25', '22', '18.9', '16.5', '1.0072', '7.2', '7', '5', '35.5', '15.8', '11.2', '5.1', '6.3', '6.6', '3.1', '5.7', '6.9', '9.1', '17.4', '4.6', '2', '4.7', '7.8', '7.3', '5.5', '15.8', '4.3', '3.2', '20.9', '33', '5.3', '1.5', '1.000513', '9', '7', '12.4', '7', '5.1', '9.4', '5.8', '11.4'], 'Solvent': ['ALLYL ALCOHOL ', 'ALLYL BROMIDE ', 'ALLYL CHLORIDE ', 'ALLYL IODIDE ', 'ALLYL ISOTHIOCYANATE ', 'ALUMINA ', 'ALUMINA ', 'ALUMINUM BROMIDE ', 'ALUMINUM FLUORIDE ', 'ALUMINUM OLEATE ', 'ALUMINUM PHOSPHATE ', 'ALUMINUM POWDER ', 'AMMONIA ', 'AMMONIA ', 'AMMONIA ', 'AMMONIA ', 'AMMONIA GAS ', 'AMMONIUM BROMIDE ', 'AMMONIUM CHLORIDE ', 'AMYL ACETATE ', 'AMYL ALCOHOL ', 'AMYL ALCOHOL ', 'AMYL ALCOHOL ', 'AMYL BENZOATE ', 'AMYL BROMIDE ', 'AMYL CHLORIDE ', 'AMYL ETHER ', 'AMYL FORMATE ', 'AMYL IODIDE ', 'AMYL NITRATE ', 'AMYL THIOCYANATE ', 'AMYLAMINE ', 'AMYLENE ', 'AMYLMERCAPTAN ', 'ANI

  4%|▍         | 3/72 [02:12<50:48, 44.18s/it]


KeyboardInterrupt: 

In [199]:
formatted_solvent_properties = pd.read_csv('updated_datasets/updated_Dielectric_constant_values.csv')
data_dielectric_constant = pd.read_csv('Data_dielectric_constant.csv')

all_cas_number = formatted_solvent_properties['Cas Number'].values.
#print(all_cas_number)

for idx in tqdm(data_dielectric_constant.index.values):
    cas_number = data_dielectric_constant.loc[idx, 'Cas Number']
    if isinstance(cas_number, str):
        if cas_number in all_cas_number:
            list_index = all_cas_number.index(cas_number)
            de_constant = data_dielectric_constant.loc[idx, 'Dielectric constant']
            formatted_solvent_properties.at[list_index, 'Dielectric Constant'] = de_constant
    else:
        for i in cas_number:
            if i in all_cas_number:
                list_index = all_cas_number.index(i)
                de_constant = data_dielectric_constant.loc[idx, 'Dielectric constant']
                formatted_solvent_properties.at[list_index, 'Dielectric Constant'] = de_constant

formatted_solvent_properties.to_csv('Updated_with_all_props_values.csv')
    
    
    
    


100%|██████████| 1955/1955 [00:00<00:00, 63252.50it/s]


In [16]:
dip_moment = chemicals.dipole.dipole_moment('67-64-1', method = 'MULLER')
print(dip_moment)

2.87998704896212


In [19]:
all_props_values = pd.read_csv('Updated_with_all_props_values.csv')
#data_dielectric_constant = pd.read_csv('Data_dielectric_constant.csv')
all_props_values.fillna('', inplace = True)
dfindex = all_props_values.index.values
#print(all_cas_number)

for idx in tqdm(dfindex):
    if idx > 56:
        cas_number = all_props_values.loc[idx, 'Cas Number']
        formatted_dipole_moment = chemicals.dipole.dipole_moment(cas_number, method = 'MULLER')
        dip_moment = all_props_values.loc[idx, 'Dipole Moment']
        if dip_moment == '':
               all_props_values.at[idx, 'Dipole Moment'] = formatted_dipole_moment
    all_props_values.to_csv('Updated_with_dip_moment.csv')

  all_props_values.fillna('', inplace = True)
100%|██████████| 411/411 [00:02<00:00, 189.49it/s]
