In [5]:
#! python3 - Script1.py - Retrieve SMILES codes from PubChem API 

'''This script enables automatically connecting to the PubChem database, 
transfer of CAS numbers which are converted to CID identifiers
as first step and then resolved to respective SMILES codes.'''

# Import the library necessary for making a web service request.
from os import chdir
import urllib.request, urllib.error
import json
import time
import pandas as pd

# Define working directory
chdir('/localhome/cschiebroek/MDFPs/mdfptools/carl/data_curation/sandbox')

# Function for resolving given CAS number into CID. Therefore
# variables for PUG-REST request URL pieces are defined 
def cas_to_cid(cas):
    path_prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    path_compound = '/compound/'
    path_name = 'name/'
    path_cas = cas
    path_cas_rest = '/cids/JSON'
    
    url = path_prolog + path_compound + path_name + path_cas + path_cas_rest
# Make a PUG-REST request and store the output in "request"
    print('cas_to_cid:', url)
    try:
        request = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        print('HTTPError while requesting cas', cas)
        return ''
    
    # Give the output/reply back as JSON and return CID number from function
    if request is not None:
        reply = request.read()
        if reply is not None and len(reply) > 0:
            json_out = json.loads(reply)
            cid = json_out['IdentifierList']['CID'][0]
            return cid
    return ''

# Function for searching and extracting SMILES code with entering CID 
def cid_to_smiles(cid):
    path_prolog = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
    path_compound = '/compound/'
    path_name = 'cid/'
    path_cid = str(cid)
    path_cid_rest = '/property/CanonicalSMILES/JSON'
    
    url = path_prolog + path_compound + path_name + path_cid + path_cid_rest
        
    # Make a PUG-REST request and store the output in "request"
    print('cid_to_smiles:', url)
    try:
        request = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        print('HTTPError while requesting cid', cid)
        return ''
    
    # Give the reply in JSON format, access and return the SMILES code
    if request is not None:
        reply = request.read()
        if reply is not None and len(reply) > 0:
            json_out = json.loads(reply)
            #return json_out
            smiles = json_out['PropertyTable']['Properties'][0]['CanonicalSMILES']
            return smiles
    return ''

# NOTE: to do this for many CAS numbers, iterate thru the given list and call above functions to 
# resolve to cid and, in turn, to SMILES. Sleep between each request to avoid overloading
# PubChem servers.

# Load list with CAS numbers where SMILES code is to be requested
df = missing_smiles
list_cas = df['cas'].astype(str).values.tolist()

# Both functions described above are now called in the third function map_cas_list_to_csv
def map_cas_list_to_csv(list_cas):
    output = ''
    for cas in list_cas:
        cid = cas_to_cid(cas)
        if len(str(cid)) > 0:
            smiles = cid_to_smiles(cid)
            if len(smiles) > 0:
                line = cas + '|' + str(cid) + '|' + smiles
                output = output + line + '\n' # create and concatenate output
                print(line)
                time.sleep(0.8) # sleep after each loop for 0,8 seconds
    return output

s_out = 'CAS|CID|SMILES\n'
output = map_cas_list_to_csv(list_cas) # call function for generating final result
final = s_out + output # now final contains a complete csv as string, just write it out to a file.

with open("RESULT_Substances_with_CAS_with_SMILES.csv", "w") as file:
    file.write(final)

cas_to_cid: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/599-04-2/cids/JSON
cid_to_smiles: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/439368/property/CanonicalSMILES/JSON
599-04-2|439368|CC1(COC(=O)C1O)C
cas_to_cid: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/599-04-2/cids/JSON
cid_to_smiles: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/439368/property/CanonicalSMILES/JSON
599-04-2|439368|CC1(COC(=O)C1O)C
cas_to_cid: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/599-04-2/cids/JSON
cid_to_smiles: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/439368/property/CanonicalSMILES/JSON
599-04-2|439368|CC1(COC(=O)C1O)C
cas_to_cid: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/329-01-1/cids/JSON
cid_to_smiles: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/9483/property/CanonicalSMILES/JSON
329-01-1|9483|C1=CC(=CC(=C1)N=C=O)C(F)(F)F
cas_to_cid: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/98-16-8/cids/JSO

URLError: <urlopen error [Errno 101] Network is unreachable>

In [3]:
#now see if we can get the correct smiles for the REACH data
import pickle
import pandas as pd
with open('../tmp_data/cas_to_smiles.pickle', 'rb') as handle:
    cas_to_smiles = pickle.load(handle)


df_reach_standardized_uncurated = pd.read_csv('/localhome/cschiebroek/MDFPs/mdfptools/carl/data_curation/tmp_data/REACH_standardized_not_cleaned.csv')
df_reach_standardized_uncurated_298 = df_reach_standardized_uncurated[df_reach_standardized_uncurated['Temperature_K'] == 298.15]
df_reach_standardized_uncurated_298
#get all the unique cas
cas = df_reach_standardized_uncurated_298['cas'].tolist()
#iterate over cas, get smiles from dict, None if not found, add to df and count number of None
smiles = []
none_count = 0
for ca in cas:
    try:
        smiles.append(cas_to_smiles[ca])
    except:
        smiles.append(None)
        none_count += 1
print(none_count)
#add smiles to df
df_reach_standardized_uncurated_298['SMILES_new_dict'] = smiles
#get where nan
missing_smiles = df_reach_standardized_uncurated_298[df_reach_standardized_uncurated_298['SMILES_new_dict'].isnull()]
missing_smiles


478


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reach_standardized_uncurated_298['SMILES_new_dict'] = smiles


Unnamed: 0,#name,cas,iupacName,P (upperQualifier),P (upperValue),P (lowerQualifier),P (lowerValue),P (unit),T (upperQualifier),T (upperValue),T (lowerQualifier),T (lowerValue),T (unit),VP_Pa,Temperature_K,SMILES,standardized_smiles,log10_VP_Pa,NumHeavyAtoms,SMILES_new_dict
15,"Ô±-hydroxy-Ô_,Ô_-dimethyl-Ô_-butyrolactone",599-04-2,"(3R)-dihydro-3-hydroxy-4,4-dimethyl-2(3H)-fura...",,,,4.120000e-04,mm Hg,,,,25.0,C,5.492866e-02,298.15,CC1(C)COC(=O)C1O,CC1(C)COC(=O)C1O,-1.260201,9,
16,"Ô±-hydroxy-Ô_,Ô_-dimethyl-Ô_-butyrolactone",599-04-2,"(3R)-3-hydroxy-4,4-dimethyloxolan-2-one",,,,1.100000e+00,Pa,,,,25.0,C,1.100000e+00,298.15,CC1(C)COC(=O)C1O,CC1(C)COC(=O)C1O,0.041393,9,
17,"Ô±-hydroxy-Ô_,Ô_-dimethyl-Ô_-butyrolactone",599-04-2,"(3R)-3-hydroxy-4,4-dimethyloxolan-2-one",,,,1.000000e+00,Pa,,,,25.0,C,1.000000e+00,298.15,CC1(C)COC(=O)C1O,CC1(C)COC(=O)C1O,0.000000,9,
23,"Ô±,Ô±,Ô±-trifluoro-3-tolyl isocyanate",329-01-1,1-isocyanato-3-(trifluoromethyl)benzene,,,,1.720000e+02,Pa,,,,25.0,C,1.720000e+02,298.15,FC(F)(F)c1cccc(c1)N=C=O,O=C=Nc1cccc(C(F)(F)F)c1,2.235528,13,
25,"Ô±,Ô±,Ô±-trifluoro-m-toluidine",98-16-8,3-(trifluoromethyl)aniline,,,,8.650000e+01,Pa,,,,25.0,C,8.650000e+01,298.15,Nc1cccc(c1)C(F)(F)F,Nc1cccc(C(F)(F)F)c1,1.937016,11,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4475,"trioctyl benzene-1,2,4-tricarboxylate",89-04-3,"trioctyl benzene-1,2,4-tricarboxylate",,,,1.270000e-03,Pa,,,,25.0,C,1.270000e-03,298.15,CCCCCCCCOC(=O)c1ccc(C(=O)OCCCCCCCC)c(c1)C(=O)O...,CCCCCCCCOC(=O)c1ccc(C(=O)OCCCCCCCC)c(C(=O)OCCC...,-2.896196,39,
4476,"trioctyl benzene-1,2,4-tricarboxylate",89-04-3,"trioctyl benzene-1,2,4-tricarboxylate",,,,2.580000e-02,Pa,,,,25.0,C,2.580000e-02,298.15,CCCCCCCCOC(=O)c1ccc(C(=O)OCCCCCCCC)c(c1)C(=O)O...,CCCCCCCCOC(=O)c1ccc(C(=O)OCCCCCCCC)c(C(=O)OCCC...,-1.588380,39,
4477,"trioctyl benzene-1,2,4-tricarboxylate",89-04-3,"trioctyl benzene-1,2,4-tricarboxylate",,,,1.680000e-07,Pa,,,,25.0,C,1.680000e-07,298.15,CCCCCCCCOC(=O)c1ccc(C(=O)OCCCCCCCC)c(c1)C(=O)O...,CCCCCCCCOC(=O)c1ccc(C(=O)OCCCCCCCC)c(C(=O)OCCC...,-6.774691,39,
4494,"tris(2-ethylhexyl) 2-(acetyloxy)propane-1,2,3-...",144-15-0,"tris(2-ethylhexyl) 2-acetoxypropane-1,2,3-tric...",,,,2.380000e-04,mm Hg,,,,25.0,C,3.173064e-02,298.15,CCCCC(CC)COC(=O)CC(CC(=O)OCC(CC)CCCC)(OC(C)=O)...,CCCCC(CC)COC(=O)CC(CC(=O)OCC(CC)CCCC)(OC(C)=O)...,-1.498521,40,
