In [1]:
# test zone

def search_pubchem_by_name_substances(industry_name, timeout=30):
    '''
    This searches for a CTL material's CID in PubChem.

    The compound name endpoint is queried first; if it yields no CID, the
    substance name endpoint is queried as a fallback.

    Arguments:
        industry_name (str) - the name of the material
        timeout (float) - seconds to wait for each HTTP request (default: 30)

    Value:
        CID (int) - the first CID reported for the material, or
        "SIDavailable" (str) - if the substance lookup reports an SID but no
        CID. In that case the SID is stored in the global name_to_SID_dict
        under industry_name.

    Raises:
        Exception - if neither lookup yields a CID or an SID. Errors raised by
        requests itself (e.g. on timeout) propagate unchanged.
    '''
    # Function-scope import: the notebook has no shared import cell above this one.
    from urllib.parse import quote

    # Percent-encode the whole name so characters such as '/', '?', '#', '%'
    # or ';' cannot change the structure of the request URL.
    encoded_name = quote(str(industry_name), safe='')
    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    url_comp = f"{base_url}/compound/name/{encoded_name}/cids/JSON"
    url_subs = f"{base_url}/substance/name/{encoded_name}/cids/JSON"

    response = requests.get(url_comp, timeout=timeout)
    if response.status_code == 200:
        cids = response.json().get('IdentifierList', {}).get('CID') or []
        if cids:
            return cids[0]

    print('--- Initial search unsuccessful')

    # Check for the substance name
    response = requests.get(url_subs, timeout=timeout)
    if response.status_code == 200:
        records = response.json().get('InformationList', {}).get('Information') or []

        # Prefer the first record that actually carries a CID; a record may
        # list only an SID, so the key must not be assumed to exist.
        for record in records:
            cids = record.get('CID')
            if cids:
                CID = cids[0]
                print(f"SUCCESS: {CID} added to dictionary.")
                return CID

        # No CID anywhere: remember the first SID so it can be resolved later.
        for record in records:
            SID = record.get('SID')
            # NOTE(review): the SID may arrive as a scalar or as a list
            # depending on the endpoint - both are accepted here; confirm
            # against the PUG REST response format.
            if isinstance(SID, (list, tuple)):
                SID = SID[0] if SID else None
            if SID is not None:
                name_to_SID_dict[industry_name] = SID
                return "SIDavailable"

    raise Exception(f"Error: Could not retrieve CID using the industry name alone. Status code: {response.status_code}")

In [2]:
# Restore the previously saved name -> CID dictionary from disk
import pickle
with open('dict_all_cells.pkl', mode='rb') as cid_cache_file:
    name_to_CID_dict = pickle.loads(cid_cache_file.read())

# Start with an empty name -> SID dictionary
name_to_SID_dict = dict()

In [3]:
# load dataset
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('df_all_ctls.pkl', 'rb') as f:
    df_ctls = pickle.load(f)

# Create a set of unique 'etl' and 'htl' materials.
# Each cell of these columns is iterated as a collection of material names
# (assumed to be a list per row - TODO confirm against the dataset).
# A single pass avoids the repeated list concatenation done by Series.sum()
# and also works when the dataframe has no rows.
unique_materials = {
    material
    for column in ('etl', 'htl')
    for materials_in_cell in df_ctls[column]
    for material in materials_in_cell
}

In [4]:
import requests

# 'none' is a placeholder entry rather than a material; discard() is a no-op
# when it is absent.
unique_materials.discard('none')

total_materials = len(unique_materials)

for index, material in enumerate(unique_materials):
    print(f"Progress: {index+1}/{total_materials}")
    # Only query PubChem for materials without a stored (non-None) entry.
    if name_to_CID_dict.get(material) is None:
        print(f"Next up: {material}")
        try:
            addition = search_pubchem_by_name_substances(material)
            name_to_CID_dict[material] = addition
        except Exception as error:
            # Catch Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit can still stop this long-running loop, and report the
            # actual cause instead of hiding it.
            print(f"Error: Could not find CID in substances for {material} ({type(error).__name__}: {error})")

Progress: 1/2591
Next up: B4PyMPM
--- Initial search unsuccessful
SUCCESS: 67967165 added to dictionary.
Progress: 2/2591
Next up: SnO2-np; ZnO-np
--- Initial search unsuccessful
Error: Could not find CID in substances for SnO2-np; ZnO-np
Progress: 3/2591
Next up: Isopropanol-hydroquinolatolithium
--- Initial search unsuccessful
Error: Could not find CID in substances for Isopropanol-hydroquinolatolithium
Progress: 4/2591
Progress: 5/2591
Next up: HT-ZnPc
--- Initial search unsuccessful
Error: Could not find CID in substances for HT-ZnPc
Progress: 6/2591
Progress: 7/2591
Next up: PSS-Na
--- Initial search unsuccessful
Error: Could not find CID in substances for PSS-Na
Progress: 8/2591
Next up: NiO@C
--- Initial search unsuccessful
Error: Could not find CID in substances for NiO@C
Progress: 9/2591
Next up: HBZ-70
--- Initial search unsuccessful
Error: Could not find CID in substances for HBZ-70
Progress: 10/2591
Next up: MC8-9-NPC
--- Initial search unsuccessful
Error: Could not find CI

In [5]:
# Caution: the original dictionary file is runtime-expensive to rebuild, so the
# updated mapping is written under a separate file name instead of overwriting it
import pickle
with open('dict_all_cells_updated.pkl', mode='wb') as updated_cache_file:
    updated_cache_file.write(pickle.dumps(name_to_CID_dict))

In [6]:
# Number of dictionary entries whose stored value is not None
count = len([value for value in name_to_CID_dict.values() if value is not None])
n_unique = len(unique_materials)

print(f"Total of unique materials: {n_unique}")
print(f"Successfully identified: {count}")
print(f"Proportion: {count / n_unique}")

# Fixed notes recorded from the original run
for note in (
    "419 were identified as compounds directly in PubChem.",
    "128 were identified after extracting the chemical name from the paper text.",
    "190 more were identified as substances in PubChem.",
    "The discrepancy of one entry remaining is due to some cells containing 'none' as ctl.",
    "runtime: ~11h",
):
    print(note)

Total of unique materials: 2591
Successfully identified: 738
Proportion: 0.2848321111539946
Of these, 419 were identified directly in PubChem, 128 were identifiedafter extracting the chemical name from the paper text.
The discrepancy of one entry remaining is due to some cells containing 'none' as ctl.
runtime: ~11h
