# CID retrieval

This notebook takes the industry name of a material as recorded in NOMAD and looks up its PubChem CID. The lookup proceeds in up to three steps, each one tried only if the previous one failed:
1. Attempt to get the PubChem CID with the industry name directly.
2. If that does not work, attempt to identify the more descriptive chemical name from the associated publication.
3. If that does not work, attempt to identify the chemical name from other publications using the same material.

In this manner, a dictionary of industry names and their PubChem CIDs is compiled.

In [30]:
import requests
import pickle
import os
import json
from groq import Groq
import httpx
import config # config needs to be a file containing a groq and elsevier api-key

# Step 1

In [31]:
def search_pubchem_by_name(industry_name):
    '''
    This searches for a CTL material's CID in PubChem via the PUG REST API.
    Argument: industry_name (str) - the name of the material
    Value: CID (int) - the first CID PubChem lists for that name
    Raises: Exception - if PubChem does not answer with HTTP 200, or if the
        answer does not contain a CID list
    '''
    # Imported locally so the function does not depend on another cell's imports.
    from urllib.parse import quote

    # Percent-encode the name before placing it in the URL path: names can
    # contain characters such as '/', '?' or '#' that would otherwise change
    # the path or truncate the request.
    encoded_name = quote(industry_name, safe='')
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/cids/JSON"

    # A timeout keeps one unresponsive request from blocking the whole run.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print('debug: search_pubchem_by_name unsuccessful')
        raise Exception(f"Error: Could not retrieve CID using the industry name alone. Status code: {response.status_code}")

    data = response.json()
    try:
        cid = data['IdentifierList']['CID'][0]
    except (KeyError, IndexError, TypeError) as err:
        # HTTP 200 but no CID list in the payload: report it as a failed lookup.
        print('debug: search_pubchem_by_name unsuccessful')
        raise Exception("Error: PubChem response did not contain a CID.") from err

    print('debug: success! found through search_pubchem_by_name!')
    return cid

# Step 2 and 3

In [75]:
def find_CID_in_text(industry_name, 
                     CTL_DOIs, 
                     paper_index = -1):
    '''
    This function tries to find the chemical name in the paper text and to
    retrieve the CID from PubChem. Papers are tried one after another,
    starting with the paper after paper_index, until one yields a CID.
    Arguments: 
        industry_name (str) - the name of the material
        CTL_DOIs (list) - list of DOIs of papers with that CTL
        paper_index (int) - index of the last paper already tried
            (-1 means none has been tried yet)
    Value: 
        CID (int) - the CID of the material
    Raises:
        Exception('All papers have been searched.') - if no remaining paper
            yields a CID. Callers compare against this exact message.
    Dependencies: 
        paper_from_publisher, llm_retrieve_name, search_pubchem_by_name
    '''
    # Papers are walked with a loop rather than recursion. In the recursive
    # form, a failed step could fall through to the next step with its input
    # undefined, and the same papers were then searched again.
    # The debug messages keep their original wording ("recurrence",
    # "Recurring...") so the log output is unchanged.
    first_index = max(paper_index + 1, 0)
    for current_index in range(first_index, len(CTL_DOIs)):
        print(f'debug: length of CTL_DOIs (1): {len(CTL_DOIs)}. Paper Index (0): {current_index - 1}')
        DOI = CTL_DOIs[current_index]

        # Step 1: fetch the paper text from the publisher.
        try:
            paper_text = paper_from_publisher(DOI)
        except Exception:
            print('debug: paper_from_publisher failed. Continuing with recurrence')
            continue

        # Step 2: let the LLM extract the chemical name from that text.
        try:
            print('debug: trying llm_retrieve_name')
            chem_name = llm_retrieve_name(paper_text, industry_name)
        except Exception:
            print('debug: llm_retrieve_name failed. Recurring...')
            continue

        # Step 3: look the extracted chemical name up in PubChem.
        try:
            print('debug: trying search_pubchem_by_name(chem_name)')
            return search_pubchem_by_name(chem_name)
        except Exception:
            print('debug: trying search_pubchem_by_name(chem_name) failed. Recurring...')
            continue

    # Every remaining paper has been tried without success.
    print(f'debug: length of CTL_DOIs (1): {len(CTL_DOIs)}. Paper Index (0): {max(paper_index, len(CTL_DOIs) - 1)}')
    print('debug: unsucessfully searched through all papers associated with that material')
    raise Exception('All papers have been searched.')


# Dependencies of the finding function
# search_pubchem_by_name is defined in previous code cell

# paper_from_publisher:
def paper_from_publisher(DOI):
    '''
    This function queries the Elsevier article API (ScienceDirect) for a paper
    and returns the description text from the response.
    Argument: DOI (str) - the DOI of the paper
    Value: paper_text (str) - the 'dc:description' entry of the response's
        'coredata' (presumably the abstract rather than the full text -
        TODO confirm against the Elsevier API documentation)
    Raises: Exception - if the request fails, the status code is not 200, or
        the response does not have the expected JSON structure
    '''

    def scidir_retrieve_paper(DOI, apikey):
        # Fetch the article record for DOI; raise unless the API answers 200.
        headers = {
            "X-ELS-APIKey": apikey,
            "Accept": 'application/json'
            }
        # NOTE(review): no view parameter is sent; an earlier "&view=FULL"
        # query string was defined but never appended to the URL.
        url = "https://api.elsevier.com/content/article/doi/" + DOI
        # The context manager closes the client's connections after the call.
        with httpx.Client(headers=headers) as client:
            r = client.get(url)
        print(f'debug: paper retrieval executed. This is the result: {r}')
        if r.status_code != 200:
            print('debug: paper retrieval exception will be raised')
            raise Exception(f"Error: The paper could not be found in ScienceDirect. Status code: {r.status_code}")
        return r

    # Get document
    try:
        scidir_response = scidir_retrieve_paper(DOI, config.api_key_elsevier)
        d = json.loads(scidir_response.text)
        return d['full-text-retrieval-response']['coredata']['dc:description']
    except Exception as err:
        # Any failure (HTTP error, bad JSON, missing key) is reported to the
        # caller as one error; the root cause stays attached via chaining.
        print("debug: Paper not found in ScienceDirect.")
        raise Exception("Error: Paper not found in ScienceDirect.") from err

# llm_retrieve_name
def llm_retrieve_name(paper_text,
                      industry_name, 
                      api_key=config.api_key_groq):
    '''
    Ask a Groq-hosted LLM which chemical name the industry name stands for,
    giving it the paper text to read.
    Arguments: 
        paper_text (str) - the text of the paper
        industry_name (str) - the name (abbreviation) of the material
        api_key (str) - Groq API key; defaults to config.api_key_groq
    Value: 
        chem_name (str) - the content of the model's first reply
    '''
    client = Groq(api_key=api_key)

    # The system prompt restricts the reply to the bare chemical name.
    system_prompt = "You are a solar cell scientist proficient in reading papers. You output only the chemical name of the compound asked for, nothing else."
    user_prompt = f"What is the chemical name pertaining to this abbreviation: {industry_name}? You can find it in this text: {paper_text}."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    completion = client.chat.completions.create(
        messages=conversation,
        model="llama3-70b-8192",
    )
    first_choice = completion.choices[0]
    return first_choice.message.content



# Populating the dictionary

In [116]:
# function for populating the dictionary
def name_to_cid(industry_name, df):
    '''
    Determine the PubChem CID for a material and store it in the module-level
    name_to_CID_dict under industry_name (None if no CID could be found).

    The industry name is first looked up directly in PubChem. If that fails,
    the papers in df that list the material are collected and handed to
    find_CID_in_text.
    Arguments:
        industry_name (str) - the name of the material
        df (DataFrame) - rows providing the columns 'etl', 'htl' and 'ref';
            a row counts as mentioning the material when industry_name is
            contained in any entry of its 'etl' or 'htl' value
    Value: None - the result is written to name_to_CID_dict
    '''
    try:
        CID = search_pubchem_by_name(industry_name)
    except Exception:
        print('debug: initial search_pubchem_by_name did not work. Trying paper reading.')
    else:
        name_to_CID_dict[industry_name] = CID
        return

    # build list of DOIs that mention the material
    mentioning_dois = []
    for etl, htl, ref in zip(df['etl'], df['htl'], df['ref']):
        in_etl = any(industry_name in item for item in etl)
        if in_etl or any(industry_name in item for item in htl):
            mentioning_dois.append(ref.replace('https://doi.org/', ''))
    # De-duplicate once, keeping first-seen order so that the order in which
    # papers are tried is the same on every run.
    CTL_DOIs = list(dict.fromkeys(mentioning_dois))

    if len(CTL_DOIs) == 0:
        print('debug: no references found in the dataframe for this material.')
    else:
        print('debug: list of papers successfully built')

    # go through the list of papers and try llm extraction
    try:
        CID = find_CID_in_text(industry_name, CTL_DOIs)
    except Exception as e:
        # Record None on every failure, so CID is always bound below.
        CID = None
        if str(e) == 'All papers have been searched.':
            print('debug: stopped recursion as all papers have been searched')
        else:
            print(f'debug: find_CID_in_text failed unexpectedly: {e}')
    name_to_CID_dict[industry_name] = CID



# Main

In [96]:
# Initialize dictionary
# Module-level result store: name_to_cid() writes one entry per industry name,
# holding the PubChem CID that was found or None when none was found.
name_to_CID_dict = {}

In [118]:
# which industry name are we identifying right now? later this will loop through
# all the materials in the dataframe
industry_name = "PEDOT:PSS"

# prepare df
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only
# load this file if it comes from a trusted source.
# name_to_cid reads the 'etl', 'htl' and 'ref' columns of the loaded object.
with open('df_some_test_ctls.pkl', 'rb') as f:
    df_some_test_ctls = pickle.load(f)

# Run the lookup; the result is stored in name_to_CID_dict, not returned.
name_to_cid(industry_name, df_some_test_ctls)

debug: search_pubchem_by_name unsuccessful
debug: initial search_pubchem_by_name did not work. Trying paper reading.
debug: list of papers successfully built
debug: length of CTL_DOIs (1): 6. Paper Index (0): -1
debug: paper retrieval executed. This is the result: <Response [404 Not Found]>
debug: paper retrieval exception will be raised
debug: Paper not found in Scopus.
debug: paper_from_publisher failed. Continuing with recurrence
debug: length of CTL_DOIs (1): 6. Paper Index (0): 0
debug: paper retrieval executed. This is the result: <Response [404 Not Found]>
debug: paper retrieval exception will be raised
debug: Paper not found in Scopus.
debug: paper_from_publisher failed. Continuing with recurrence
debug: length of CTL_DOIs (1): 6. Paper Index (0): 1
debug: paper retrieval executed. This is the result: <Response [404 Not Found]>
debug: paper retrieval exception will be raised
debug: Paper not found in Scopus.
debug: paper_from_publisher failed. Continuing with recurrence
debug: 

In [100]:
# Show the industry-name -> CID mapping collected so far (None = no CID found).
print(name_to_CID_dict)

{'spiro-meotad': 16161850, 'PCBM-60': None, 'D35': 445664}
