In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem

In [2]:
# IMS data from "Predicting differential ion mobility behaviour in silico using machine learning"
# -- authors: Christian Ieritano, J. Larry Campbell, and W. Scott Hopkins
# Every input file lives under one directory; edit DATA_DIR to run this notebook on another machine.
DATA_DIR = '/home/cmdunham/ChemicalDataGeneration/data'

# Ion mobility measurements (one row per compound).
ion_mobility_path = f'{DATA_DIR}/predicting_ion_mobility_iertiano_et_al_data.csv'
data = pd.read_csv(ion_mobility_path)

# Table with 'Compound' and 'InChIKey' columns for the same dataset.
inchikeys_path = f'{DATA_DIR}/predicting_ion_mobility_iertiano_et_al_inchikeys.csv'
inchikeys = pd.read_csv(inchikeys_path)

# Importing the MoNA database of mass spec data, just for use as a database to go from compound names
# to SMILES and InChIKeys. I haven't found a better way to do this efficiently.
mona_sdf_path = f'{DATA_DIR}/MoNA_data/MoNA-export-GC-MS_Spectra.sdf'
suppl = Chem.SDMolSupplier(mona_sdf_path)

# The original cell left `file_path` pointing at the SDF file; keep that binding in case another cell reads it.
file_path = mona_sdf_path

In [3]:
# Preview the first five rows of the ion mobility table.
data.head(n=5)

Unnamed: 0,Compound,m/z,SV 1500,SV 2000,SV 2500,SV 3000,SV 3250,SV 3500,SV 3750,SV 4000,Boltzmann-weighted CCS
0,5-Cl-MQOH,194.04,0.1,0.1,0.3,0.5,0.91,1.6,2.57,3.8,138.66
1,5-CN-MQOH,185.07,0.1,0.1,-0.2,-0.3,-0.14,0.2,0.77,1.6,132.21
2,5-F-MQOH,178.07,-0.2,-0.4,-0.5,-0.3,0.08,0.7,1.6,2.8,127.47
3,5-Me-MQOH,174.09,-0.2,-0.2,0.0,0.8,1.47,2.4,3.65,5.3,129.45
4,5-NO2-MQOH,205.06,0.2,-0.1,0.0,-0.4,-0.34,0.0,0.59,1.4,137.66


In [9]:
# Compound names as a plain Python list (any duplicate names are kept in the list).
compounds = data['Compound'].tolist()

# Report how many distinct names there are.
unique_compound_count = len(set(compounds))
print(f'There are {unique_compound_count} compounds in the ion mobility dataset.')

There are 409 compounds in the ion mobility dataset.


In [55]:
# Lookup table mapping compound name -> InChIKey for the compounds in the ion mobility dataset.
# If a compound name occurs more than once, the last row's InChIKey is the one kept.
compound_inchikey_dict = {
    compound_name: inchikey
    for compound_name, inchikey in zip(inchikeys['Compound'], inchikeys['InChIKey'])
}

Chemception needs SMILES to generate embeddings. The ion mobility dataset has InChIKeys but no SMILES. Since the MoNA dataset has both, I'm using it as a way to go from one to the other. I haven't found a more efficient way to do this; the only other option I'm aware of is manually looking up each compound's InChIKey somewhere like the NIST WebBook, which is VERY time-consuming and error-prone.

MoNA records store the SMILES inside the free-text COMMENT property rather than as a property of its own, so the SMILES must be parsed out of the comment before it can be used.

In [85]:
def format_smiles(mol):
    """
    Extracts the SMILES string from the COMMENT property of a molecule.

    The COMMENT text is read as newline-separated ``KEY=value`` lines. The value
    of the ``SMILES`` line is returned with surrounding whitespace removed. If
    more than one ``SMILES`` line is present, the last non-empty one is used.

    Parameters:
    mol: An object representing a molecule. Its ``GetProp('COMMENT')`` is read;
        if that lookup raises KeyError the molecule is treated as having no SMILES.

    Returns:
    str or None: The SMILES string if found, otherwise None.
    """
    try:
        comment = mol.GetProp('COMMENT')
    except KeyError:
        # No COMMENT property, so there is nothing to search.
        return None

    smiles = None
    for line in comment.split('\n'):
        # Split on the first '=' only: SMILES strings themselves use '=' for double bonds.
        key, separator, value = line.partition('=')
        # `separator` is empty when the line has no '=', e.g. a bare "SMILES" line.
        if separator and key.strip() == 'SMILES':
            # strip() also drops a trailing '\r' left over from Windows line endings.
            value = value.strip()
            if value:
                smiles = value

    return smiles

In [90]:
# Build a mapping InChIKey -> SMILES for every ion-mobility compound that also occurs in MoNA.
# NOTE: the original cell also did `inchikeys = []`, which overwrote the InChIKey DataFrame
# loaded earlier and was never used; that assignment is intentionally gone.

# A set gives constant-time membership tests instead of scanning dict.values() per molecule.
target_inchikeys = set(compound_inchikey_dict.values())

inchikey_smiles_dict = {}
for mol in suppl:
    # The supplier yields None for records RDKit could not parse (see the ERROR log lines).
    if mol is None:
        continue

    try:
        inchikey = mol.GetProp('INCHIKEY')
    except (KeyError, UnicodeDecodeError):
        # Record has no readable INCHIKEY property, so there is nothing to match on.
        continue

    # Skip compounds outside the ion mobility dataset and those already recorded.
    if inchikey not in target_inchikeys or inchikey in inchikey_smiles_dict:
        continue

    try:
        smiles = format_smiles(mol)
    except (KeyError, IndexError, UnicodeDecodeError):
        # Missing or malformed COMMENT; leave this InChIKey for a later record to fill in.
        continue

    if smiles:
        inchikey_smiles_dict[inchikey] = smiles



[16:40:26] ERROR: Cannot convert '>  ' to int on line 223368
[16:40:26] ERROR: moving to the beginning of the next molecule
[16:40:26] ERROR: Cannot convert '>  ' to int on line 223605
[16:40:26] ERROR: moving to the beginning of the next molecule
[16:40:26] ERROR: Cannot convert '>  ' to int on line 223869
[16:40:26] ERROR: moving to the beginning of the next molecule
[16:40:26] ERROR: Cannot convert '>  ' to int on line 224100
[16:40:26] ERROR: moving to the beginning of the next molecule
[16:40:26] ERROR: Cannot convert '>  ' to int on line 224357
[16:40:26] ERROR: moving to the beginning of the next molecule
[16:40:26] ERROR: Cannot convert '>  ' to int on line 224623
[16:40:26] ERROR: moving to the beginning of the next molecule
[16:40:26] ERROR: Cannot convert '>  ' to int on line 237235
[16:40:26] ERROR: moving to the beginning of the next molecule
[16:40:26] ERROR: Cannot convert '>  ' to int on line 239071
[16:40:26] ERROR: moving to the beginning of the next molecule
[16:40:2

In [6]:
# Getting a list of compounds from the ion mobility dataset that also appear in the MoNA dataset.
# One entry is appended per matching MoNA record, so a name can appear more than once.

# A set gives constant-time lookups instead of scanning the `compounds` list per molecule.
ion_mobility_names = set(compounds)

compounds_in_both_datasets = []
for mol in suppl:
    # Skip records RDKit could not parse, and records that carry no NAME property
    # (GetProp would raise KeyError for those).
    if mol is None or not mol.HasProp('NAME'):
        continue
    mona_name = mol.GetProp('NAME')
    if mona_name in ion_mobility_names:
        compounds_in_both_datasets.append(mona_name)

We see that there are more InChIKeys in common between the datasets than there are compound names. This is likely because InChIKeys are unique identifiers, while the same chemical can have multiple names, meaning that it could be listed under one name in one dataset and under a different name in the other.

In [91]:
# Distinct compound names shared with MoNA vs. InChIKeys for which a SMILES was recovered.
shared_name_count = len(set(compounds_in_both_datasets))
shared_inchikey_count = len(inchikey_smiles_dict)
print(
    f'There are {shared_name_count} compound names from the Ion Mobility dataset that appear in the MoNA dataset '
    f'compared to {shared_inchikey_count} inchikeys that appear in both datasets.'
)

There are 48 compounds from the Ion Mobility dataset that appear in the MoNA dataset compared to 77 inchikeys that appear in both datasets.
