In [6]:
import pandas as pd

from rdkit import Chem
from rdkit import RDLogger
# Disable RDKit error logging - prevent error messages from printing
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

import requests

from fcd_torch import FCD
import torch

In [102]:
# IMS data from Predicting differential ion mobility behaviour in silico using machine learning 
# --authors: Christian Ieritano, J. Larry Campbella, and W. Scott Hopkins
file_path = '/home/cmdunham/ChemicalDataGeneration/data/predicting_ion_mobility_iertiano_et_al_data.csv'
data = pd.read_csv(file_path)

file_path = '/home/cmdunham/ChemicalDataGeneration/data/predicting_ion_mobility_iertiano_et_al_inchikeys.csv'
inchikey_df = pd.read_csv(file_path)

# Importing the MoNA database of mass spec data, just for use as database to go from compound names to smiles and inchikeys. I haven't found a better way to do this efficiently.
file_path = '/home/cmdunham/ChemicalDataGeneration/data/MoNA_data/MoNA-export-GC-MS_Spectra.sdf'
suppl = Chem.SDMolSupplier(file_path)

In [3]:
data.head()

Unnamed: 0,Compound,m/z,SV 1500,SV 2000,SV 2500,SV 3000,SV 3250,SV 3500,SV 3750,SV 4000,Boltzmann-weighted CCS
0,5-Cl-MQOH,194.04,0.1,0.1,0.3,0.5,0.91,1.6,2.57,3.8,138.66
1,5-CN-MQOH,185.07,0.1,0.1,-0.2,-0.3,-0.14,0.2,0.77,1.6,132.21
2,5-F-MQOH,178.07,-0.2,-0.4,-0.5,-0.3,0.08,0.7,1.6,2.8,127.47
3,5-Me-MQOH,174.09,-0.2,-0.2,0.0,0.8,1.47,2.4,3.65,5.3,129.45
4,5-NO2-MQOH,205.06,0.2,-0.1,0.0,-0.4,-0.34,0.0,0.59,1.4,137.66


In [4]:
compounds = list(data['Compound'])
print(f'There are {len(set(compounds))} compounds in the ion mobility dataset.')

There are 409 compounds in the ion mobility dataset.


In [5]:
print(f"There are {len(data['Compound'])} compounds in the ion mobility dataset and {len(inchikey_df['Compound'])} compounds in the corresponding inchikey file.")
print(f"Of those, {len(set(inchikey_df['Compound'])& set(data['Compound']))} overlap between the two files.")

There are 409 compounds in the ion mobility dataset and 409 compounds in the corresponding inchikey file.
Of those, 406 overlap between the two files.


In [25]:
print(set(inchikey_df['Compound']) - set(data['Compound']))
print(set(data['Compound']) - set(inchikey_df['Compound']))

{'(+)-Propoxyphene', 'Norsertraline', 'Noroxycodone'}
{'Norsertraline HCL', 'Noroxycodone HCL', 'Propoxyphene'}


In [27]:
# changing the compound names in the inchikey df so they correspond to the names in the ims df
inchikey_df.loc[inchikey_df['Compound'] == '(+)-Propoxyphene'] = 'Propoxyphene'
inchikey_df.loc[inchikey_df['Compound'] == 'Norsertraline'] = 'Norsertraline HCL'
inchikey_df.loc[inchikey_df['Compound'] == 'Noroxycodone'] = 'Noroxycodone HCL'
print(set(data['Compound']) - set(inchikey_df['Compound']))

set()


In [8]:
# # creating a dict of compound names and inchikeys for all compounds in the ion mobility dataset
# compound_inchikey_dict = dict(zip(inchikey_df['Compound'], inchikey_df['InChIKey']))

In [57]:
inchikeys = []

for row in data.iterrows():
    try:
        compound = row[1][0]
        inchikey = inchikey_df.loc[inchikey_df['Compound'] == compound, 'InChI Key'].values[0]
        inchikeys.append(inchikey)
    except:
        inchikeys.append(None)

print(f'Of the {len(inchikeys)} compounds in the dataset, {round(100 * (1 - pd.Series(inchikeys).isna().sum()/len(inchikeys)), 2)}% have corresponding InChIKeys.')

Of the 409 compounds in the dataset, 96.82% have corresponding InChIKeys.


In [103]:
# add column for inchikeys to ims dataset
data_w_inchikey = data.copy()
data_w_inchikey['InChIKey'] = inchikeys

# double check that the InChIKeys column corresponds to correct compounds
total_errors = 0
for row in data_w_inchikey.iterrows():
    inchikey = row[1][-1]
    compound = row[1][0]
    try:
        correct_inchikey = inchikey_df.loc[inchikey_df['Compound'] == compound, 'InChI Key'].values[0]

    except:
        # for rows with no recorded inchikey, value = None
        correct_inchikey = None

    # if the inchikeys don't match and at least one of them is not none
    if inchikey != correct_inchikey:
        if inchikey == True or correct_inchikey == True:
            total_errors+=1

if not total_errors:
    print('All compound/InChIKey pairs are correct.')

All compound/InChIKey pairs are correct.


# Recording SMILES for each compound:
---
Chemception needs SMILES to generate embeddings. The ion mobility dataset has InCHIKey but no SMILES. Using [PubChem's API](https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest) and code from [this](https://bioinformatics.stackexchange.com/questions/10755/is-there-a-python-package-to-convert-inchi-to-molecular-structures) question on Stack Exchange.

In [107]:
# get the corresponding SMILES for each InChIKey in the dataset and store in a list
smiles_list = []
for inchikey in data_w_inchikey['InChIKey']:
    try:
        r = requests.get(f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/CanonicalSMILES/JSON').json()
        smiles = r['PropertyTable']['Properties'][0]['CanonicalSMILES']
        smiles_list.append(smiles)
    except:
        smiles_list.append(None)
smiles_list


In [122]:
all_comps = len(data_w_inchikey['InChIKey'])
comp_no_inchikey = data_w_inchikey['InChIKey'].isna().sum()
comp_w_inchikey = all_comps - comp_no_inchikey
comp_w_smiles = pd.Series(smiles_list).isna().sum()
print(f"Of the {comp_w_inchikey} compounds with InChIKeys in the dataset, {round(100 * (1 - (comp_w_smiles-data_w_inchikey['InChIKey'].isna().sum())/comp_w_inchikey), 2)}% have corresponding SMILES on PubChem.")

Of the 396 compounds with InChIKeys in the dataset, 93.69% have corresponding SMILES on PubChem.


In [123]:
data_w_inchikey['SMILES'] = smiles_list
data_w_inchikey.head()

Unnamed: 0,Compound,m/z,SV 1500,SV 2000,SV 2500,SV 3000,SV 3250,SV 3500,SV 3750,SV 4000,Boltzmann-weighted CCS,InChIKey,SMILES
0,5-Cl-MQOH,194.04,0.1,0.1,0.3,0.5,0.91,1.6,2.57,3.8,138.66,OPQODOXIDNYMKA-UHFFFAOYSA-N,CC1=NC2=C(C=CC(=C2C=C1)Cl)O
1,5-CN-MQOH,185.07,0.1,0.1,-0.2,-0.3,-0.14,0.2,0.77,1.6,132.21,NWUAFVKGPOBCOA-UHFFFAOYSA-N,CC1=NC2=C(C=CC(=C2C=C1)C#N)O
2,5-F-MQOH,178.07,-0.2,-0.4,-0.5,-0.3,0.08,0.7,1.6,2.8,127.47,YMDMCOFPYPOKJD-UHFFFAOYSA-N,CC1=NC2=C(C=CC(=C2C=C1)F)O
3,5-Me-MQOH,174.09,-0.2,-0.2,0.0,0.8,1.47,2.4,3.65,5.3,129.45,GQUFSGXAEOXQJC-UHFFFAOYSA-N,CC1=C2C=CC(=NC2=C(C=C1)O)C
4,5-NO2-MQOH,205.06,0.2,-0.1,0.0,-0.4,-0.34,0.0,0.59,1.4,137.66,XYPACLZTPMHPLB-UHFFFAOYSA-N,CC1=NC2=C(C=CC(=C2C=C1)[N+](=O)[O-])O


# Retrieving Chemception embeddings based on compound SMILES:
---

In [133]:
metadata = pd.read_feather('/home/cmdunham/ChemicalDataGeneration/data/BKG_SIM_ims_acbc_train_v1.1.09_meta.feather')
spectra = pd.read_feather('data/BKG_SIM_ims_acbc_train_v1.1.09_spectra.feather')
spectra.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/BKG_SIM_ims_acbc_train_v1.1.09_spectra.feather'

In [132]:
len('7495327EC0774220E92D21BDA65FE85126767BB41A298837780AA23DEB51B2E8')
len('9d64b368fb65d4faccfd2fb7be29948d9006f4aec658bb3f36a15bd77b1efcca')
len('BB124FC7D59E8733134E857C16E93A15ECCC4C94CCF4716BBF02B4ADADB24CE9')

64

In [129]:
metadata.head()

Unnamed: 0,level_0,index,Filename,TemperatureKelvin,PressureBar,PosPeak1Mobility,PosPeak2Mobility,PosPeak3Mobility,PosPeak1Amplitude,PosPeak2Amplitude,PosPeak3Amplitude,NegPeak1Mobility,NegPeak2Mobility,NegPeak3Mobility,NegPeak1Amplitude,NegPeak2Amplitude,NegPeak3Amplitude,Label
0,374419,0,JCAD_0053_2022-01-20-09-27-47_2022-01-20-14-27...,,1019.5,2.29,0.0,0.0,7776.0,0.0,0.0,2.129,0.0,0.0,7279.0,0.0,0.0,BKG
1,531524,0,IMS_VX_90%_0053_2022-04-25-09-10-20_2022-04-25...,,1020.3,2.283,0.0,0.0,7530.0,0.0,0.0,2.118,0.0,0.0,7062.0,0.0,0.0,BKG
2,315950,0,JCAD_0053_2022-01-20-09-34-32_2022-01-20-14-34...,,1019.5,2.291,0.0,0.0,7690.0,0.0,0.0,2.13,0.0,0.0,7317.0,0.0,0.0,BKG
3,374517,0,JCAD_0053_2022-01-20-09-24-09_2022-01-20-14-24...,,1019.5,2.291,0.0,0.0,7796.0,0.0,0.0,2.129,0.0,0.0,7312.0,0.0,0.0,BKG
4,374610,0,JCAD_0053_2022-01-20-09-17-04_2022-01-20-14-17...,,1019.2,2.289,0.0,0.0,7682.0,0.0,0.0,2.128,0.0,0.0,7373.0,0.0,0.0,BKG


# Using MoNA as InChIKey lookup:
---
**This was the previous approach before finding the PubChem API:**

Since the MoNA dataset has both SMILES and InChIKey, using that as a way to go from one to the other. I haven't found a way to do this more efficiently, the other option I'm aware of is manually looking up each compound's InCHIKey somewhere like NIST Webbook, which is VERY time consuming and prone to human error (namely me copying things incorrectly). 

MoNA samples contain SMILES in the comment, not as a property, so the SMILES must be pulled out of the comment to use.

In [12]:
def format_smiles(mol):
    """
    Extracts the SMILES string from the COMMENT property of an rdkit molecule.

    Parameters:
    mol: An rdkit molecule.

    Returns:
    str or None: The SMILES string if found, otherwise None.
    """
    
    comment = mol.GetProp('COMMENT').split('\n')
    smiles = None
    for line in comment:
        if line.split('=')[0] == 'SMILES':
            smiles = line.split('SMILES=')[1]

    return smiles

In [29]:
# get SMILES for any compounds in the MoNA database that are also in the Ion Mobiity dataset and create a dictionary with inchikeys as keys and SMILES as values
overlapping_mol_inchikey_smiles_dict = {}

for mol in suppl:
    if mol is not None:
        try:
            inchikey = mol.GetProp('INCHIKEY')

        # ignore any mols without inchikeys
        except:
            continue
        
        # if this compound is in the ion mobility dataset
        if inchikey in compound_inchikey_dict.values():
            # if this compound's smiles hasn't been recorded yet
            if not inchikey in overlapping_mol_inchikey_smiles_dict.keys():
                smiles = format_smiles(mol)
                if smiles:
                    overlapping_mol_inchikey_smiles_dict[inchikey] = smiles


In [30]:
# Get a list of compounds from the ion mobility dataset that also appear in the MoNA dataset.
compounds_in_both_datasets = []
for mol in suppl:
    if mol is not None:
        if mol.GetProp('NAME') in compounds:
            compounds_in_both_datasets.append(mol.GetProp('NAME'))

We see that there are more InChIKeys in common between the datasets than there are compounds names. This is likely due to the fact that InChIKeys are unique identifiers while the same chemical might can have multiple compound names, meaning that it could be listed one way in one dataset and another way in another.

In [31]:
print(f'There are {len(set(compounds_in_both_datasets))} compound names from the Ion Mobility dataset that appear in the MoNA dataset compared to {len(overlapping_mol_inchikey_smiles_dict)} inchikeys that appear in both datasets.')

There are 48 compound names from the Ion Mobility dataset that appear in the MoNA dataset compared to 76 inchikeys that appear in both datasets.


In [32]:
# make lists of inchikeys and SMILES to add as columns to the IMS dataframe
inchikeys = []
smiles = []

for mol in compounds:
    inchikey = compound_inchikey_dict[mol]
    
    # look mol up in dict and append corresponding inchikey to list
    inchikeys.append(inchikey)
    
    # if inchikey is in the dict of mols with SMILES from MoNA, get the SMILES and append to list
    if inchikey in overlapping_mol_inchikey_smiles_dict.keys():
        smiles.append(overlapping_mol_inchikey_smiles_dict[inchikey])
    else:
        smiles.append(None)

In [33]:
# Double checking that the recorded SMILES/InChIKey pairs correspond
total_errors = 0
for i, smile in enumerate(smiles):
    if smile:
        if not overlapping_mol_inchikey_smiles_dict[inchikeys[i]] == smiles[i]:
            print(f'SMILES/InChIKey pair at {i} does not match what is specified in the lookup dictionary.')
            total_errors+=1
if not total_errors:
    print('All SMILES/InChIKey pairs match what is specified in the lookup dictionary.')

All SMILES/InChIKey pairs match what is specified in the lookup dictionary.


Adding columns for InChIKey and SMILES. InChIKey will be needed for identification, SMILES for retrieving Chemception embeddings.

In [34]:
ims_data = data.copy()
ims_data['SMILES'] = smiles
ims_data['InChIKey'] = inchikeys
ims_data = ims_data.dropna(subset=['SMILES']).reset_index(drop=True)
# ims_data.drop(columns=['Compound'], inplace=True)
ims_data.head()

Unnamed: 0,Compound,m/z,SV 1500,SV 2000,SV 2500,SV 3000,SV 3250,SV 3500,SV 3750,SV 4000,Boltzmann-weighted CCS,SMILES,InChIKey
0,6-methyl-2-MQ,158.1,0.2,0.2,0.6,1.9,3.03,4.5,6.32,8.5,126.14,Cc(c2)cc(c1)c(c2)nc(C)c1,JJPSZKIOGBRMHK-UHFFFAOYSA-N
1,MQOH,160.08,-0.4,-0.6,-0.7,-0.4,0.06,0.8,1.89,3.4,125.75,Cc(c1)nc(c2)c(ccc2)c1,SMUQFGGVLNAIOZ-UHFFFAOYSA-N
2,Ametryn,228.13,0.44,1.15,2.38,4.04,5.07,6.37,7.67,9.29,151.92,CCNc(n1)nc(SC)nc(NC(C)C)1,RQVYBGPQFYCBGX-UHFFFAOYSA-N
3,Azoxystrobin,404.13,0.66,1.65,3.14,5.46,7.03,8.77,10.78,13.03,203.24,CO/C=C(\C1=CC=CC=C1OC2=NC=NC(=C2)OC3=CC=CC=C3C...,WFDXOXNFNRHQEC-GHRIWEEISA-N
4,Dimethoate,230.01,0.48,1.32,2.71,4.8,6.18,7.88,9.57,11.49,140.98,CNC(=O)CSP(=S)(OC)OC,MCWXGJITAZMZEV-UHFFFAOYSA-N
