Description

This script performs the following tasks:

    Reads a CSV file containing a column named "InChIKey".
    Defines functions to retrieve the PubChem ID, the preferred IUPAC name, and the first listed synonym for each InChIKey using the PubChem REST API.
    Iterates over the DataFrame with a progress bar to retrieve and store the information.
    Measures and prints the runtime of the script.


Instructions for Use

    Ensure you have the required libraries installed:

```
pip install pandas requests tqdm
```

Update the input_file variable with the path to your input CSV file.

Run the script. The script will read the input CSV file, process the InChIKeys to retrieve PubChem IDs and compound information, and save the results to a new CSV file specified by output_file_path.


In [None]:
import pandas as pd
import requests
from tqdm import tqdm
import time

# Function to get PubChem ID from InChIKey
def get_pubchem_id_from_inchikey(inchikey):
    """Look up the PubChem compound ID (CID) for an InChIKey.

    Args:
        inchikey: InChIKey string to resolve through the PubChem PUG REST API.

    Returns:
        The stripped response text on HTTP 200, or "-" when the lookup
        fails (non-200 status, network error, or timeout).
    """
    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey"
    url = f"{base_url}/{inchikey}/cids/TXT"

    try:
        # Bound the wait so one stalled connection cannot hang the whole batch.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Treat transport failures like any other failed lookup instead of
        # aborting a long-running batch.
        return "-"

    if response.status_code != 200:
        return "-"

    # NOTE(review): the text is returned as-is; if PubChem lists several CIDs
    # for one key they would presumably arrive one per line -- confirm.
    return response.text.strip()

# Function to get compound information from InChIKey
def get_compound_info(inchikey):
    """Fetch the preferred IUPAC name and first synonym for an InChIKey.

    The compound record and the synonym list are served by two different
    PUG REST operations, so two requests are made: the full record supplies
    the IUPAC name, and the ``synonyms`` operation supplies the synonyms.

    Args:
        inchikey: InChIKey string to look up through the PubChem PUG REST API.

    Returns:
        A ``(preferred_name, synonym)`` tuple of strings. ``preferred_name``
        is '' when the record has no preferred IUPAC name, and ``synonym``
        is '' when no synonyms are listed. When a request fails, the
        affected value(s) are an ``"Error: ..."`` message instead.
    """
    # Base URL for PubChem REST API
    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    compound_url = f"{base_url}/compound/inchikey/{inchikey}"

    # Request the full compound record, which carries the IUPAC name props.
    try:
        response = requests.get(f"{compound_url}/JSON", timeout=30)
    except requests.RequestException as exc:
        error = f"Error: {exc}"
        return error, error

    if response.status_code != 200:
        error = f"Error: {response.status_code} - {response.text}"
        return error, error

    compounds = response.json().get('PC_Compounds') or [{}]

    # Find the 'IUPAC Name - Preferred' property
    preferred_name = ''
    for prop in compounds[0].get('props', []):
        urn = prop.get('urn', {})
        if urn.get('label') == 'IUPAC Name' and urn.get('name') == 'Preferred':
            preferred_name = prop.get('value', {}).get('sval', '')
            break

    # The record above has no "InformationList"; synonyms must be requested
    # from the dedicated synonyms operation.
    try:
        synonyms_response = requests.get(
            f"{compound_url}/synonyms/JSON", timeout=30
        )
    except requests.RequestException as exc:
        return preferred_name, f"Error: {exc}"

    if synonyms_response.status_code != 200:
        return preferred_name, (
            f"Error: {synonyms_response.status_code} - {synonyms_response.text}"
        )

    information = (
        synonyms_response.json().get('InformationList', {}).get('Information', [])
    )
    synonyms = information[0].get('Synonym', []) if information else []

    # Return the preferred name and the first synonym (usually the common name)
    return preferred_name, synonyms[0] if synonyms else ''

# Load the source table; it must contain an "InChIKey" column
input_file = "input.csv"  # Update with your input file path
df = pd.read_csv(input_file)

# Preview the first rows so the column layout can be checked by eye
print(df.head())

# Start the clock for the runtime report
start_time = time.time()

# Create empty result columns that the lookup loop fills in row by row
for result_column in ('PubChem_ID', 'Preferred_Name', 'Synonym'):
    df[result_column] = None

# Query PubChem once per row, showing progress, and write results back by label
for index, inchikey in tqdm(df['InChIKey'].items(), total=len(df)):
    cid = get_pubchem_id_from_inchikey(inchikey)
    name, first_synonym = get_compound_info(inchikey)

    df.at[index, 'PubChem_ID'] = cid
    df.at[index, 'Preferred_Name'] = name
    df.at[index, 'Synonym'] = first_synonym

end_time = time.time()

# Report how long the lookups took
elapsed_seconds = end_time - start_time
print(f"Runtime: {elapsed_seconds:.2f} seconds")

# Write the enriched table to a new CSV file, without the index column
output_file_path = "output.csv"  # Update with your desired output file path
df.to_csv(output_file_path, index=False)

print(f"Updated data with PubChem IDs and compound information saved to '{output_file_path}'.")
