# Import libraries

In [1]:
import pandas as pd
import numpy as np
import glob
import requests
import time
import json
import io

Locate the GNPS library-search output file

In [8]:
# Locate the GNPS library-search export in the working directory.
# glob returns matches in no guaranteed order, so sort them to make the
# chosen file reproducible when several exports are present.
gnps_matches = sorted(glob.glob('MOLECULAR-LIBRARYSEARCH-*.tsv'))
if not gnps_matches:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        "No file matching 'MOLECULAR-LIBRARYSEARCH-*.tsv' in the working directory"
    )
gnps_file = gnps_matches[0]
gnps_file

'MOLECULAR-LIBRARYSEARCH-V2-a92dcdef-download_all_identifications-main.tsv'

In [9]:
# Load the GNPS library-search table (tab-separated).
df = pd.read_csv(gnps_file, sep='\t')

# Keep only the hits whose IonMode is negative (either capitalisation).
# reset_index() without drop=True keeps the original row labels as an
# 'index' column in the filtered frame.
negative_mode = df['IonMode'].isin(['Negative', 'negative'])
df = df.loc[negative_mode].reset_index()

print(df.shape)
df.head()

(15, 45)


Unnamed: 0,index,SpectrumID,Compound_Name,Ion_Source,Instrument,Compound_Source,PI,Data_Collector,Adduct,Precursor_MZ,...,NumberHits,full_CCMS_path,tags,MoleculeExplorerDatasets,MoleculeExplorerFiles,InChIKey,InChIKey-Planar,superclass,class,subclass
0,0,CCMSLIB00003136732,"Spectral Match to 9-Hydroxy-10E,12Z-octadecadi...",ESI,IT/ion trap,Isolated,Data from Albert Rivas Ubach,Data deposited by ARU_Metabolomics,M-H,295.23,...,1,Cayalaortiz/bog_metabolomics/ms2spectra_consen...,,0,0,,,,,
1,3,CCMSLIB00004706448,"3-[(2S,3R,4S,5R,6R)-4-[(2S,3R,4S,5S,6R)-4,5-di...",,ESI-QFT,isolated,MoNA,MoNA:VF-NPL-QEHF015385,[M-H]-,757.183,...,1,Cayalaortiz/bog_metabolomics/ms2spectra_consen...,,0,0,RWMOXJYSVNRRGE-UKBOIMOLSA-N,RWMOXJYSVNRRGE,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides
2,4,CCMSLIB00004690322,Khayanthone,,Linear Ion Trap,isolated,MoNA,MoNA:VF-NPL-LTQ006927,M-H,569.374,...,1,Cayalaortiz/bog_metabolomics/ms2spectra_consen...,,0,0,AKPJXLBXDLSOFY-YCUUMJCDSA-N,AKPJXLBXDLSOFY,Lipids and lipid-like molecules,Prenol lipids,Triterpenoids
3,5,CCMSLIB00000845927,"NCGC00380643-01!3,4,5-trihydroxy-6-[5-hydroxy-...",LC-ESI,Maxis II HD Q-TOF Bruker,isolated,Jadhav/Dorrestein,lfnothias,M-H,505.099,...,1,Cayalaortiz/bog_metabolomics/ms2spectra_consen...,,0,0,IHGAMZBDEOTTEW-UHFFFAOYSA-N,IHGAMZBDEOTTEW,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides
4,7,CCMSLIB00003109239,PI(20:4/0:0); [M-H]- C29H48O12P1,LC-ESI,CID; Velos,Commercial,Thomas Metz,Thomas Metz,M-H,620.296,...,1,Cayalaortiz/bog_metabolomics/ms2spectra_consen...,,11,353,,,,,


In [11]:
# Columns carried over from the GNPS table.
gnps_columns = ['Adduct', 'IonMode', 'InChIKey', '#Scan#',
                'superclass', 'class', 'subclass']

# Work on an independent copy and keep only hits that carry an InChIKey.
subset_df = df.copy()[gnps_columns]
subset_df = subset_df[subset_df['InChIKey'].notna()].reset_index(drop=True)

# Turn scan numbers into left-zero-padded feature labels ("FT" + 4 characters).
subset_df['#Scan#'] = "FT" + subset_df['#Scan#'].apply(lambda scan: format(scan, '0>4'))
subset_df = subset_df.rename(columns={'#Scan#': 'Features'})

# Move 'Features' to the front, leaving the other columns in their order.
leading = ['Features']
subset_df = subset_df[leading + [col for col in subset_df.columns if col not in leading]]

# Annotation columns to be filled by the lookups below; they start out as
# empty strings.
new_columns = ['MolecularFormula', 'ExactMass', 'InChI', 'CanonicalSmiles',
               'PubChem_CID', 'KEGG', 'BioCyc', 'LIPIDMAPS', 'ChEBI', 'HMDB']
list_columns = list(subset_df.columns) + new_columns
subset_df = subset_df.reindex(columns=list_columns, fill_value='')

print(subset_df.shape)
subset_df.head()

(8, 17)


Unnamed: 0,Features,Adduct,IonMode,InChIKey,superclass,class,subclass,MolecularFormula,ExactMass,InChI,CanonicalSmiles,PubChem_CID,KEGG,BioCyc,LIPIDMAPS,ChEBI,HMDB
0,FT0368,[M-H]-,negative,RWMOXJYSVNRRGE-UKBOIMOLSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,,,,,,
1,FT0173,M-H,negative,AKPJXLBXDLSOFY-YCUUMJCDSA-N,Lipids and lipid-like molecules,Prenol lipids,Triterpenoids,,,,,,,,,,
2,FT0109,M-H,negative,IHGAMZBDEOTTEW-UHFFFAOYSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,,,,,,
3,FT0069,M-H,negative,MLSVKXQHUTYXHD-VIZOYTHASA-N,Benzenoids,Benzene and substituted derivatives,Benzophenones,,,,,,,,,,
4,FT0133,M+FA-H,negative,WLCHQSHZHFLMJH-GAVPBTRYSA-N,Lipids and lipid-like molecules,Prenol lipids,Triterpenoids,,,,,,,,,,


In [12]:
# Translate every InChIKey into external database identifiers using the
# Chemical Translation Service (CTS).

# One session for all requests, so connections are reused instead of a new
# session being opened for every row.
s = requests.Session()
main_url = "https://cts.fiehnlab.ucdavis.edu/rest/"
cts_timeout = 30  # seconds; a stalled request raises instead of hanging forever

# CTS database name -> subset_df column that receives its identifiers.
db_to_column = {
    'KEGG': 'KEGG',
    'BioCyc': 'BioCyc',
    'LIPIDMAPS': 'LIPIDMAPS',
    'ChEBI': 'ChEBI',
    'Human Metabolome Database': 'HMDB',
    'PubChem CID': 'PubChem_CID',
}
list_dbs = list(db_to_column)

for index, row in subset_df.iterrows():
    query = row['InChIKey']

    # Default to an empty string so failed and empty lookups look the same
    # as the '' placeholders already in the frame.
    result_dict = {key: "" for key in list_dbs}
    result_dict['InChIKey'] = query

    for db in list_dbs:
        r = s.get(main_url + 'convert/InChIKey/' + db + '/' + query, timeout=cts_timeout)

        try:
            r = json.loads(r.text)
        except ValueError:
            # Body is not JSON (e.g. an HTML error page): report it and go on
            # rather than aborting the whole annotation run.
            print('CTS returned a non-JSON response for {0} / {1}; left blank'.format(query, db))
            continue

        # A dict payload is treated as an error response and skipped; a hit
        # is a non-empty list whose first element holds 'results'.
        if isinstance(r, list) and r and isinstance(r[0], dict):
            hits = r[0].get('results') or []
            # Multiple identifiers are stored comma-separated in one cell.
            result_dict[db] = ",".join(str(hit) for hit in hits)

    for db, column in db_to_column.items():
        subset_df.loc[index, column] = result_dict[db]

print('Done')
subset_df.head()

Done


Unnamed: 0,Features,Adduct,IonMode,InChIKey,superclass,class,subclass,MolecularFormula,ExactMass,InChI,CanonicalSmiles,PubChem_CID,KEGG,BioCyc,LIPIDMAPS,ChEBI,HMDB
0,FT0368,[M-H]-,negative,RWMOXJYSVNRRGE-UKBOIMOLSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,44715568,,,,,
1,FT0173,M-H,negative,AKPJXLBXDLSOFY-YCUUMJCDSA-N,Lipids and lipid-like molecules,Prenol lipids,Triterpenoids,,,,,6708528,,,,,
2,FT0109,M-H,negative,IHGAMZBDEOTTEW-UHFFFAOYSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,57509548,,,,,
3,FT0069,M-H,negative,MLSVKXQHUTYXHD-VIZOYTHASA-N,Benzenoids,Benzene and substituted derivatives,Benzophenones,,,,,38361890,,,,,
4,FT0133,M+FA-H,negative,WLCHQSHZHFLMJH-GAVPBTRYSA-N,Lipids and lipid-like molecules,Prenol lipids,Triterpenoids,,,,,10096940,,,,,


In [13]:
# Query PubChem (PUG REST) by InChIKey for molecular formula, exact mass,
# SMILES and InChI, and write them into subset_df.
properties = 'MolecularFormula,MolecularWeight,ExactMass,CanonicalSMILES,InChI'
pubchem_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/'
pubchem_timeout = 60  # seconds; a stalled request raises instead of hanging forever

# subset_df column -> acceptable header(s) in the PubChem CSV response.
# NOTE(review): PubChem appears to have renamed its SMILES properties in newer
# API releases, so the connectivity-only header is accepted as well - confirm
# against the current PUG REST property table.
pubchem_columns = {
    'MolecularFormula': ('MolecularFormula',),
    'ExactMass': ('ExactMass',),
    'CanonicalSmiles': ('CanonicalSMILES', 'ConnectivitySMILES'),
    'InChI': ('InChI',),
}

# Use a dedicated session so this cell does not depend on a variable left
# over from the previous cell's loop; the context manager closes it.
with requests.Session() as pubchem_session:
    for index, row in subset_df.iterrows():
        query = row['InChIKey']

        response = pubchem_session.get(
            pubchem_url + query + '/property/' + properties + '/CSV',
            timeout=pubchem_timeout,
        )

        if response.status_code == 404:
            # InChIKey unknown to PubChem: leave the placeholders untouched.
            pass
        elif not response.ok:
            # Any other HTTP error: report it and leave the row blank.
            print('PubChem lookup failed for {0} (HTTP {1}); left blank'.format(
                query, response.status_code))
        else:
            result_df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))

            if not result_df.empty:
                for column, headers in pubchem_columns.items():
                    header = next((h for h in headers if h in result_df.columns), None)
                    if header is None:
                        continue
                    # One InChIKey can map to several records; store the
                    # distinct non-missing values comma-separated.
                    values = result_df[header].dropna().unique()
                    subset_df.loc[index, column] = ",".join(str(value) for value in values)

        # Crude throttle: pause after every fifth row (including row 0).
        if index % 5 == 0:
            time.sleep(3)

print('Done')
subset_df.head()

Done


Unnamed: 0,Features,Adduct,IonMode,InChIKey,superclass,class,subclass,MolecularFormula,ExactMass,InChI,CanonicalSmiles,PubChem_CID,KEGG,BioCyc,LIPIDMAPS,ChEBI,HMDB
0,FT0368,[M-H]-,negative,RWMOXJYSVNRRGE-UKBOIMOLSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,C32H38O21,758.190558,InChI=1S/C32H38O21/c33-6-16-20(41)23(44)29(53-...,C1C(C(C(C(O1)OC2C(C(C(OC2OC3C(C(OC(C3O)OC4=C(O...,44715568,,,,,
1,FT0173,M-H,negative,AKPJXLBXDLSOFY-YCUUMJCDSA-N,Lipids and lipid-like molecules,Prenol lipids,Triterpenoids,C32H42O9,570.282883,InChI=1S/C32H42O9/c1-16(33)38-22-14-23(39-17(2...,CC(=O)OC1CC2C(C(CC(C2(C3C1(C45C(O4)C(=O)C(C5(C...,6708528,,,,,
2,FT0109,M-H,negative,IHGAMZBDEOTTEW-UHFFFAOYSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,C23H22O13,506.106041,InChI=1S/C23H22O13/c1-32-12-5-8(3-4-10(12)24)1...,COC1=C(C=CC(=C1)C2=C(C(=O)C3=C(C=C(C=C3O2)OC4C...,57509548,,,,,
3,FT0069,M-H,negative,MLSVKXQHUTYXHD-VIZOYTHASA-N,Benzenoids,Benzene and substituted derivatives,Benzophenones,C25H26O7,438.167853,InChI=1S/C25H26O7/c1-14(2)5-4-6-15(3)7-8-16-9-...,CC(=CCCC(=CCC1=C(C(=C(C=C1)O)C(=O)C2=C(C=C(C=C...,38361890,,,,,
4,FT0133,M+FA-H,negative,WLCHQSHZHFLMJH-GAVPBTRYSA-N,Lipids and lipid-like molecules,Prenol lipids,Triterpenoids,C30H46O5,486.334525,InChI=1S/C30H46O5/c1-16(2)17-10-13-30(25(34)35...,CC(=C)C1CCC2(C1C3CCC4C(C3(CC2)C)(CCC5C4(C(C(C5...,10096940,,,,,


In [14]:
# Write the annotated feature table to disk; the row index is left out of the CSV.
subset_df.to_csv(path_or_buf='../tables/summary_output_GNPS_e5.csv', index=False)