In [1]:
import logging
import time
from tqdm import tqdm
import pandas as pd
import os
from pubchempy import get_compounds, BadRequestError, get_synonyms, PubChemHTTPError
from rdkit.Chem import CanonSmiles

In [2]:
# Logger for this notebook. `__name__` is passed as the variable (not the
# quoted string '__name__') so the logger is named after the running module
# and takes part in the normal logger hierarchy.
# NOTE(review): no handler is attached here; unless logging is configured
# elsewhere, INFO records reach only logging's last-resort handler, which
# emits WARNING and above — confirm whether the INFO messages should be shown.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [3]:
# Columns to keep when loading the InChIKey table (passed as `usecols` to read_csv).
cols = ['SureChEMBL_ID', 'InChIKey']

In [4]:
# Load the SureChEMBL-ID / InChIKey table, keeping only the columns in `cols`.
# The file is parsed as comma-delimited (sep=',') even though it is named .tsv.
inchikey_table_path = 'Inchikeys_10718652.tsv'
total_inchi_df = pd.read_csv(inchikey_table_path, usecols=cols, sep=',')
total_inchi_df

Unnamed: 0,SureChEMBL_ID,InChIKey
0,SCHEMBL4,FAKRSMQSSFJEIM-RQJHMYQMSA-N
1,SCHEMBL9,OYFJQPXVCSSHAI-QFPUQLAESA-N
2,SCHEMBL10,OYFJQPXVCSSHAI-BDURURIASA-N
3,SCHEMBL18,GBXSMTUPTTWBMN-XIRDDKMYSA-N
4,SCHEMBL23,PFGWGEPQIUAZME-NXSMLHPHSA-N
...,...,...
10718647,SCHEMBL24857923,RIJGJCBKSVIKAE-UHFFFAOYSA-N
10718648,SCHEMBL24863986,JTODHWHVVNHORP-UHFFFAOYSA-N
10718649,SCHEMBL24864054,KRZVSGUGLUVZPC-UHFFFAOYSA-N
10718650,SCHEMBL24869491,BRTUPHCRUZUMIG-UHFFFAOYSA-N


In [5]:
def check_smiles(smile: str):
    """Return the canonical form of a SMILES string, or None if it cannot be canonicalized.

    Any exception raised by ``CanonSmiles`` is caught, reported at INFO level
    on the notebook logger, and turned into a ``None`` result.
    """
    try:
        canonical = CanonSmiles(smi=smile)
    except Exception:
        logger.info(f'Cannot canonicalize {smile}')
        canonical = None
    return canonical

In [6]:
def get_cid_from_data(idx: str, idx_type: str, retries: int = 1, delay: float = 3):
    """Get the PubChem CID of the first compound matching a query.

    Potential curies include : smiles, inchikey, inchi, name

    :param idx: the identifier to look up (a SMILES string is canonicalized
        with ``check_smiles`` before the query).
    :param idx_type: the kind of identifier; matched case-insensitively.
    :param retries: how many times to retry after a ``PubChemHTTPError`` that
        is not a ``BadRequestError`` (treated as transient, e.g. throttling).
    :param delay: seconds to sleep before each retry.
    :return: the CID of the first hit, or ``None`` when the identifier is
        empty / could not be canonicalized, the request is rejected as bad,
        or PubChem returns no compounds.
    :raises PubChemHTTPError: if the error persists after ``retries`` retries.
    """
    namespace = idx_type.lower()
    if namespace == 'smiles':
        idx = check_smiles(idx)

    if not idx:
        return None

    attempts_left = retries
    while True:
        try:
            return get_compounds(idx, namespace)[0].cid
        except BadRequestError:
            # Listed before PubChemHTTPError so a malformed query is never retried.
            logger.info(f'Issue with {idx}')
            return None
        except IndexError:
            # Empty result list: nothing in PubChem matches this identifier.
            return None
        except PubChemHTTPError:
            # Same policy as get_chembl_id: pause and retry instead of letting
            # one throttled request abort a long-running batch.
            if attempts_left <= 0:
                raise
            attempts_left -= 1
            time.sleep(delay)

In [7]:
def get_chembl_id(pubchem_idx: str, retries: int = 1, delay: float = 3):
    """Map a PubChem CID to a ChEMBL id using the compound's PubChem synonyms.

    :param pubchem_idx: PubChem compound identifier passed to ``get_synonyms``.
    :param retries: how many times to retry after a ``PubChemHTTPError`` that
        is not a ``BadRequestError`` (treated as "too many requests").
    :param delay: seconds to sleep before each retry.
    :return: the first synonym starting with ``'CHEMBL'``, or ``None`` when
        the id is rejected as a bad request, no synonym record is returned,
        or no synonym carries the ``CHEMBL`` prefix.
    :raises PubChemHTTPError: if the error persists after ``retries`` retries.
    """
    attempts_left = retries
    while True:
        try:
            other_identifiers = get_synonyms(identifier=pubchem_idx)
            break
        except BadRequestError:  # incorrect pubchem id
            # Listed before PubChemHTTPError: a bad id cannot succeed on a
            # retry, so do not spend a sleep and a second request on it.
            return None
        except PubChemHTTPError:  # too many request
            if attempts_left <= 0:
                raise
            attempts_left -= 1
            time.sleep(delay)

    if not other_identifiers:
        return None

    # Only the first record is inspected; .get() tolerates a record that has
    # no 'Synonym' entry at all.
    for idx in other_identifiers[0].get('Synonym', ()):
        if idx.startswith('CHEMBL'):
            return idx

    return None

In [None]:
# Resumable SureChEMBL -> PubChem CID -> ChEMBL id mapping.
#
# Each finished row is appended to the checkpoint file straight away, so an
# interrupted run loses at most the row in flight, and a re-run skips every
# (SureChEMBL_ID, InChIKey) pair that is already on disk. Appending replaces
# the previous "concat + rewrite the whole file per row" approach, whose cost
# grew with the number of rows already processed.
output_path = 'PUBCHEM_CID_CHEMBL_ID.tsv'
output_columns = ['SureChEMBL_ID', 'InChIKey', 'PUBCHEM_CID', 'CHEMBL_ID']


def _as_nullable_cid(values):
    """Coerce CIDs (ints, floats such as 123.0, or 'None'/'nan' text) to nullable Int64."""
    return pd.to_numeric(values, errors='coerce').astype('Int64')


if os.path.exists(output_path):
    # Read CIDs as text first: older checkpoints may contain the literal
    # strings 'None' / 'nan' for missing values.
    df = pd.read_csv(output_path, sep='\t', dtype={'PUBCHEM_CID': str})
else:
    df = pd.DataFrame(columns=output_columns)
    # Create the file with its header so later appends only add data rows.
    df.to_csv(output_path, index=False, sep='\t')

# Nullable integers keep CIDs free of a trailing '.0' and write missing
# values as empty fields instead of the text 'None' / 'nan'.
df['PUBCHEM_CID'] = _as_nullable_cid(df['PUBCHEM_CID'])

# Set lookup on the (id, key) pair: constant-time, and it matches the pair
# itself rather than each value independently.
done_pairs = set(zip(df['SureChEMBL_ID'], df['InChIKey']))
new_rows = []
id_pairs = zip(total_inchi_df['SureChEMBL_ID'], total_inchi_df['InChIKey'])

try:
    for SCHEMBL_ID_, inchikey_ in tqdm(id_pairs, total=total_inchi_df.shape[0]):
        if (SCHEMBL_ID_, inchikey_) in done_pairs:
            continue

        pubchem_cid = get_cid_from_data(inchikey_, 'inchikey')
        chembl_id = get_chembl_id(pubchem_cid) if pubchem_cid else None

        record = (SCHEMBL_ID_, inchikey_, pubchem_cid, chembl_id)
        # Append just this row; the header was written when the file was created.
        pd.DataFrame([record], columns=output_columns).to_csv(
            output_path, mode='a', header=False, index=False, sep='\t'
        )
        new_rows.append(record)
        done_pairs.add((SCHEMBL_ID_, inchikey_))
finally:
    # Runs on normal completion and on interruption alike, so `df` always
    # mirrors what is on disk and a later `df.to_csv(...)` cannot drop rows.
    if new_rows:
        new_df = pd.DataFrame(new_rows, columns=output_columns)
        new_df['PUBCHEM_CID'] = _as_nullable_cid(new_df['PUBCHEM_CID'])
        df = pd.concat([df, new_df], ignore_index=True)

  0%|          | 938/10718652 [25:20<5002:25:58,  1.68s/it]

In [None]:
# Write the accumulated mapping table to the tab-separated results file.
df.to_csv(path_or_buf='PUBCHEM_CID_CHEMBL_ID.tsv', sep='\t', index=False)