# Preprocess Raw Datasets
This notebook preprocesses the raw CSV data into a form compatible with the `specvae` codebase:
- Rename columns so that naming is consistent,
- Assign numeric labels to categorical columns,
- Fix values,
- Export the datasets as `MoNA_full.csv` and `HMDB_full.csv`.

In [1]:
import sys
import numpy as np
import pandas as pd

import specvae.utils as utils
import specvae.dataset as dt

In [2]:
# Taxonomy levels, in the order in which they appear in a taxonomy string.
tax_fields = ('kingdom', 'superclass', 'class', 'subclass')
def retrieve_fields(row):
    """Split a row's ``taxonomy`` string into one entry per taxonomy level.

    The taxonomy string is a ``;``-separated list of entries; only the text
    before the first ``:`` of each entry is kept. Entries are assigned
    positionally to the names listed in ``tax_fields``.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row[key]`` reads and writes, e.g. the
        ``pandas.Series`` handed over by ``DataFrame.apply(..., axis=1)``.
        It must provide a ``'taxonomy'`` entry and is modified in place.

    Returns
    -------
    The same ``row`` object, with one additional entry per name in
    ``tax_fields``. Levels that the taxonomy string does not provide are set
    to the empty string.
    """
    tax = row['taxonomy']
    # A missing taxonomy typically arrives as NaN (a float) rather than a
    # string and has no ``split``; treat any non-string as "no levels known"
    # instead of failing with AttributeError.
    if isinstance(tax, str):
        levels = [entry.split(':')[0] for entry in tax.split(';')]
    else:
        levels = []
    for position, field_name in enumerate(tax_fields):
        row[field_name] = levels[position] if position < len(levels) else ''
    return row

In [3]:
def assign_label(row, field_name, labels):
    """Return the position of ``row[field_name]`` within ``labels``.

    Parameters
    ----------
    row : mapping-like
        Row to classify (e.g. a ``pandas.Series``); must contain ``field_name``.
    field_name : str
        Name of the entry whose value is looked up.
    labels : list
        Known labels; the returned id is an index into this list.

    Returns
    -------
    int
        Index of the value in ``labels``, or ``-1`` when the value is not one
        of the labels (this includes missing values).
    """
    value = row[field_name]
    try:
        return labels.index(value)
    except ValueError:
        # ``list.index`` reports "not found" with ValueError. Catch only that,
        # so that unrelated failures are not silently turned into the
        # "unknown" label.
        return -1

## Preprocess MoNA dataset

In [4]:
# Both raw MoNA files live under <project>/.data/MoNA.
mona_dir = utils.get_project_path() / '.data' / 'MoNA'
data_path = mona_dir / 'MoNA.csv'
tax_data_path = mona_dir / 'MoNA_taxonomy.csv'

# Spectra table, opened through the project's MoNA dataset helper.
df = dt.MoNA.open(data_path)

# Taxonomy table: only the join key and the raw taxonomy string are needed.
tax_df = pd.read_csv(tax_data_path).loc[:, ['InChI', 'taxonomy']]
df

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,spectrum,InChI,molecular formula,total exact mass,SMILES,collision energy,ionization mode,instrument type,instrument,precursor m/z,precursor type,library,author,publication,structural_key,CASMI,split,collision_energy_new,id
0,52.073152:0.215740 53.039199:0.251984 55.05488...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,35HCD,positive,ESI-QFT,Thermo Q Exactive HF,395.125519,[M+K]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,35.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
1,50.179433:0.988081 52.761359:0.667573 53.03928...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,45HCD,positive,ESI-QFT,Thermo Q Exactive HF,395.125519,[M+K]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,45.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
2,50.382111:0.657423 52.393542:0.636186 52.67915...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,65HCD,positive,ESI-QFT,Thermo Q Exactive HF,395.125519,[M+K]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,65.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
3,51.129190:0.047539 51.138777:0.057363 52.37786...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,35HCD,positive,ESI-QFT,Thermo Q Exactive HF,374.196198,[M+NH4]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,35.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
4,50.399188:0.071114 51.370479:0.060948 51.44307...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,45HCD,positive,ESI-QFT,Thermo Q Exactive HF,374.196198,[M+NH4]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,45.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126080,137.0243:19.582412335086758 147.0447:10.025338...,InChI=1S/C26H30O13/c27-9-19-20(31)21(32)22(39-...,C26H30O13,,c1(ccc2c(c1)OC(CC2=O)c1ccc(cc1)OC1C(C(C(C(O1)C...,,,,,,,,,,FTVKHUHJWDMWIR,CASMI2017,test,,FTVKHUHJWDMWIR-UHFFFAOYSA-N
126081,107.0482:15.626124145649307 108.0474:1.8386026...,InChI=1S/C20H22O9/c21-8-16-17(25)18(26)20(29-1...,C20H22O9,,c1(cc(c2c(c1)OC(C(C2)O)c1ccc(cc1)O)O)OC1OC(C(C...,,,,,,,,,,XWFQRXGYJJOFCO,CASMI2017,test,,XWFQRXGYJJOFCO-UHFFFAOYSA-N
126082,85.0172:1.4486432966359415 159.1119:1.70324618...,InChI=1S/C46H76O18/c1-19(2)20(3)13-28(61-44-41...,C46H76O18,,C1C(CC2=CCC3C(C2(C1O)C)CCC1(C3CCC1C(C(CC(=C(C)...,,,,,,,,,,HBCGNAJUSBIBGA,CASMI2017,test,,HBCGNAJUSBIBGA-UHFFFAOYSA-N
126083,67.0506:1.4928155919057327 69.0683:0.805449211...,"InChI=1S/C26H40O8/c1-15-5-8-19-25(2,14-33-24-2...",C26H40O8,,O([C@@H]1O[C@@H]([C@@H](O)[C@H](O)[C@H]1O)CO)C...,,,,,,,,,,YGCYRQKJYWQXHG,CASMI2017,test,,YGCYRQKJYWQXHG-UHFFFAOYSA-N


In [5]:
def rename_column(df, source, target, drop=True):
    """Copy column ``source`` to ``target`` and, by default, remove ``source``.

    The new column is appended at the end of the frame (unless ``target``
    already exists, in which case it is overwritten in its current position).
    The input frame itself is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to rename.
    source : str
        Existing column name.
    target : str
        New column name.
    drop : bool, default True
        Remove ``source`` after copying. With ``False`` both columns are kept.

    Returns
    -------
    pandas.DataFrame
        The renamed frame, or ``df`` unchanged when ``source`` is missing
        (a message is printed) or when ``source`` equals ``target``.
    """
    if source not in df:
        print('\'%s\' doesn\'t exist' % source)
        return df
    if source == target:
        # Nothing to rename; dropping here would delete the only copy.
        return df
    # ``assign`` returns a new frame, so the caller's frame is not mutated.
    renamed = df.assign(**{target: df[source]})
    if drop:
        renamed = renamed.drop(columns=[source])
    return renamed

In [6]:
# (old name, new name) pairs, applied in this order. The original
# 'collision energy' column is kept under '..._old' and the cleaned
# 'collision_energy_new' column takes over the plain name.
column_renames = (
    ('molecular formula', 'molecular_formula'),
    ('total exact mass', 'total_exact_mass'),
    ('collision energy', 'collision_energy_old'),
    ('ionization mode', 'ionization_mode'),
    ('instrument type', 'instrument_type'),
    ('precursor m/z', 'precursor_mz'),
    ('precursor type', 'precursor_type'),
    ('collision_energy_new', 'collision_energy'),
)
for old_name, new_name in column_renames:
    df = rename_column(df, old_name, new_name)
df

Unnamed: 0,spectrum,InChI,SMILES,instrument,library,author,publication,structural_key,CASMI,split,id,molecular_formula,total_exact_mass,collision_energy_old,ionization_mode,instrument_type,precursor_mz,precursor_type,collision_energy
0,52.073152:0.215740 53.039199:0.251984 55.05488...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,C21H24O5,356.162374,35HCD,positive,ESI-QFT,395.125519,[M+K]+,35.0
1,50.179433:0.988081 52.761359:0.667573 53.03928...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,C21H24O5,356.162374,45HCD,positive,ESI-QFT,395.125519,[M+K]+,45.0
2,50.382111:0.657423 52.393542:0.636186 52.67915...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,C21H24O5,356.162374,65HCD,positive,ESI-QFT,395.125519,[M+K]+,65.0
3,51.129190:0.047539 51.138777:0.057363 52.37786...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,C21H24O5,356.162374,35HCD,positive,ESI-QFT,374.196198,[M+NH4]+,35.0
4,50.399188:0.071114 51.370479:0.060948 51.44307...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,C21H24O5,356.162374,45HCD,positive,ESI-QFT,374.196198,[M+NH4]+,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126080,137.0243:19.582412335086758 147.0447:10.025338...,InChI=1S/C26H30O13/c27-9-19-20(31)21(32)22(39-...,c1(ccc2c(c1)OC(CC2=O)c1ccc(cc1)OC1C(C(C(C(O1)C...,,,,,FTVKHUHJWDMWIR,CASMI2017,test,FTVKHUHJWDMWIR-UHFFFAOYSA-N,C26H30O13,,,,,,,
126081,107.0482:15.626124145649307 108.0474:1.8386026...,InChI=1S/C20H22O9/c21-8-16-17(25)18(26)20(29-1...,c1(cc(c2c(c1)OC(C(C2)O)c1ccc(cc1)O)O)OC1OC(C(C...,,,,,XWFQRXGYJJOFCO,CASMI2017,test,XWFQRXGYJJOFCO-UHFFFAOYSA-N,C20H22O9,,,,,,,
126082,85.0172:1.4486432966359415 159.1119:1.70324618...,InChI=1S/C46H76O18/c1-19(2)20(3)13-28(61-44-41...,C1C(CC2=CCC3C(C2(C1O)C)CCC1(C3CCC1C(C(CC(=C(C)...,,,,,HBCGNAJUSBIBGA,CASMI2017,test,HBCGNAJUSBIBGA-UHFFFAOYSA-N,C46H76O18,,,,,,,
126083,67.0506:1.4928155919057327 69.0683:0.805449211...,"InChI=1S/C26H40O8/c1-15-5-8-19-25(2,14-33-24-2...",O([C@@H]1O[C@@H]([C@@H](O)[C@H](O)[C@H]1O)CO)C...,,,,,YGCYRQKJYWQXHG,CASMI2017,test,YGCYRQKJYWQXHG-UHFFFAOYSA-N,C26H40O8,,,,,,,


### Preprocess columns

Preprocess ionization mode.

In [7]:
def ionstr2int(row):
    """Encode a row's ionization mode as 1 (positive) or 0 (anything else).

    The value of ``row['ionization_mode']`` is converted to a string and
    compared case-insensitively with ``'positive'``. Every other value,
    including a missing one, yields 0.
    """
    mode = str(row['ionization_mode']).lower()
    if mode == 'positive':
        return 1
    return 0

In [8]:
# Per-column label metadata; further entries are added as each categorical
# column is encoded.
meta = {
    'ionization_mode_id': {'n_class': 2, 'labels': ['negative', 'positive']},
}
# Row-wise encoding: 1 for positive, 0 for everything else.
df['ionization_mode_id'] = df.apply(ionstr2int, axis='columns')

Preprocess and classify instrument

In [9]:
# Distinct instrument names in order of first appearance; empty strings and
# non-string entries (such as missing values) are left out.
instrument_labels = [
    name for name in df['instrument'].unique()
    if isinstance(name, str) and name != ''
]
# Each row gets the index of its instrument in the label list (-1 if unknown).
df['instrument_id'] = df.apply(assign_label, axis=1, args=('instrument', instrument_labels))
meta['instrument_id'] = {'n_class': len(instrument_labels), 'labels': instrument_labels}
print(instrument_labels)
df

['Thermo Q Exactive HF', '6566 QTOF Agilent', '6567 QTOF Agilent', '6568 QTOF Agilent', '6569 QTOF Agilent', '6570 QTOF Agilent', '6571 QTOF Agilent', '6572 QTOF Agilent', '6573 QTOF Agilent', '6574 QTOF Agilent', '6575 QTOF Agilent', '6576 QTOF Agilent', '6577 QTOF Agilent', '6578 QTOF Agilent', '6579 QTOF Agilent', '6580 QTOF Agilent', '6581 QTOF Agilent', '6582 QTOF Agilent', '6583 QTOF Agilent', '6584 QTOF Agilent', '6585 QTOF Agilent', '6586 QTOF Agilent', '6587 QTOF Agilent', '6588 QTOF Agilent', '6589 QTOF Agilent', '6591 QTOF Agilent', '6592 QTOF Agilent', '6593 QTOF Agilent', '6594 QTOF Agilent', '6595 QTOF Agilent', '6596 QTOF Agilent', '6597 QTOF Agilent', '6598 QTOF Agilent', '6599 QTOF Agilent', '6601 QTOF Agilent', '6602 QTOF Agilent', '6603 QTOF Agilent', '6604 QTOF Agilent', '6605 QTOF Agilent', '6606 QTOF Agilent', '6607 QTOF Agilent', '6608 QTOF Agilent', '6609 QTOF Agilent', '6610 QTOF Agilent', '6611 QTOF Agilent', '6612 QTOF Agilent', '6613 QTOF Agilent', '6614 QTO

Unnamed: 0,spectrum,InChI,SMILES,instrument,library,author,publication,structural_key,CASMI,split,...,molecular_formula,total_exact_mass,collision_energy_old,ionization_mode,instrument_type,precursor_mz,precursor_type,collision_energy,ionization_mode_id,instrument_id
0,52.073152:0.215740 53.039199:0.251984 55.05488...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,C21H24O5,356.162374,35HCD,positive,ESI-QFT,395.125519,[M+K]+,35.0,1,0
1,50.179433:0.988081 52.761359:0.667573 53.03928...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,C21H24O5,356.162374,45HCD,positive,ESI-QFT,395.125519,[M+K]+,45.0,1,0
2,50.382111:0.657423 52.393542:0.636186 52.67915...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,C21H24O5,356.162374,65HCD,positive,ESI-QFT,395.125519,[M+K]+,65.0,1,0
3,51.129190:0.047539 51.138777:0.057363 52.37786...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,C21H24O5,356.162374,35HCD,positive,ESI-QFT,374.196198,[M+NH4]+,35.0,1,0
4,50.399188:0.071114 51.370479:0.060948 51.44307...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,C21H24O5,356.162374,45HCD,positive,ESI-QFT,374.196198,[M+NH4]+,45.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126080,137.0243:19.582412335086758 147.0447:10.025338...,InChI=1S/C26H30O13/c27-9-19-20(31)21(32)22(39-...,c1(ccc2c(c1)OC(CC2=O)c1ccc(cc1)OC1C(C(C(C(O1)C...,,,,,FTVKHUHJWDMWIR,CASMI2017,test,...,C26H30O13,,,,,,,,0,-1
126081,107.0482:15.626124145649307 108.0474:1.8386026...,InChI=1S/C20H22O9/c21-8-16-17(25)18(26)20(29-1...,c1(cc(c2c(c1)OC(C(C2)O)c1ccc(cc1)O)O)OC1OC(C(C...,,,,,XWFQRXGYJJOFCO,CASMI2017,test,...,C20H22O9,,,,,,,,0,-1
126082,85.0172:1.4486432966359415 159.1119:1.70324618...,InChI=1S/C46H76O18/c1-19(2)20(3)13-28(61-44-41...,C1C(CC2=CCC3C(C2(C1O)C)CCC1(C3CCC1C(C(CC(=C(C)...,,,,,HBCGNAJUSBIBGA,CASMI2017,test,...,C46H76O18,,,,,,,,0,-1
126083,67.0506:1.4928155919057327 69.0683:0.805449211...,"InChI=1S/C26H40O8/c1-15-5-8-19-25(2,14-33-24-2...",O([C@@H]1O[C@@H]([C@@H](O)[C@H](O)[C@H]1O)CO)C...,,,,,YGCYRQKJYWQXHG,CASMI2017,test,...,C26H40O8,,,,,,,,0,-1


In [10]:
# Distinct instrument types in order of first appearance; empty strings and
# non-string entries (such as missing values) are left out.
instrument_type_labels = [
    kind for kind in df['instrument_type'].unique()
    if isinstance(kind, str) and kind != ''
]
# Each row gets the index of its instrument type in the label list (-1 if unknown).
df['instrument_type_id'] = df.apply(
    assign_label, axis=1, args=('instrument_type', instrument_type_labels)
)
meta['instrument_type_id'] = {
    'n_class': len(instrument_type_labels),
    'labels': instrument_type_labels,
}
print(instrument_type_labels)

['ESI-QFT', 'LC-ESI-QTOF', 'LC-ESI-QFT', 'Q Exactive Focus Hybrid Quadrupole Orbitrap Mass Spectrometer (Thermo Fisher Scientific)', 'Q Exactive HF', 'LC-ESI-QEHF', 'ESI-ITFT', 'LC-ESI-ITFT', 'MALDI-QITTOF', 'ESI-QTOF', 'Linear Ion Trap', 'Waters SYNAPT', 'Quattro_QQQ', 'LC-APCI-ITFT', 'APCI-ITFT', 'LC-ESI-Q', 'LC-ESI-TOF', 'LC-ESI-QQ', 'LC-Q-TOF/MS', 'LC-ESI-HRMS', 'LC-ESI-HRMS-FT', 'LC-ESI-IT', 'Thermo LTQ', 'LC-ESI-QQQ', 'ESI-FT', 'LC-ESI-QIT', 'LC-ESI-ITTOF', 'Orbitrap', 'QIT', 'QqQ', 'Q-TOF', 'in source CID', 'LIT', 'LC-ESI-Q-Orbitrap', 'QIT-TOF', 'QIT-FT', 'LC-QTOF', 'APCI-ITTOF', 'LC-APPI-QQ']


In [11]:
# Distinct precursor types in order of first appearance; empty strings and
# non-string entries (such as missing values) are left out.
precursor_type_labels = [
    adduct for adduct in df['precursor_type'].unique()
    if isinstance(adduct, str) and adduct != ''
]
# Each row gets the index of its precursor type in the label list (-1 if unknown).
df['precursor_type_id'] = df.apply(
    assign_label, axis=1, args=('precursor_type', precursor_type_labels)
)
meta['precursor_type_id'] = {
    'n_class': len(precursor_type_labels),
    'labels': precursor_type_labels,
}
print(precursor_type_labels)

['[M+K]+', '[M+NH4]+', '[M+H]+', '[M-H]-', '[M+Na]+', '[M+2H]+', '[M]+', '[M+H+K]+', '[2M+Na]+', '[M+Cl]-', '[M-H2O+H]+', '[2M-H]-', '[2M+K]+', '[2M+NH4]+', '[M+ACN+H]+', '[2M+H]+', '[M+Na-2H]-', '[M+2Na-H]+', '[M-H2O-H]-', '[M+CH3OH+H]+', '[M+DMSO+H]+', '[M+Hac-H]-', '[M+FA-H]-', '[M+ACN+Na]+', '[M+K-2H]-', '[2M+ACN+H]+', '[M+H2+H]+', '[M+H]', 'M-H', 'M+Na', 'M+K', 'M+NH4', 'M+H', '[M+H-H2O]-', '[M-H]1-', 'M+Cl', '[2M+FA-H]-', '[M+H+Na]+', 'M+', '[M+H-H2O]+', '[M-H]', '[M-H]+', '[M]+*', '[M+2K-H]+', '[2M+3H2O+2H]+', '[M+2H]++', '[M]++', '[M-C3H7O2]-', '[M-2H]-', 'M+2Na', '[M+2Na]+', '[M+H-2H2O]+', '[M]-', 'carotenoid', '[M-C6H10O5+H]+', '[M+H-C9H10O5]+', '[M+H-NH3]+', '[M-OH]+', '[M+HCOO]-', '[M+Hac+Na-2H]-', 'M+H-H2O', '[M+HOO]-', '[M-CO2-H]-', '[M+CH3COO]-', '[M-C2H3O]-', '[M-2H2O+H]+', '[M+H-C12H20O9]+', '[M-2H]--', '[M-CH3]-', '[M+3Na]+', '[3M-H]-', '[M+CH3]+', '[M-H-CO2-2HF]-']


### Get taxonomy by matching on the `InChI` column

In [12]:
print("Get taxonomy by matching on inchi column...")
# Keep only taxonomy rows whose InChI occurs in the spectra table, expand the
# raw taxonomy string into one column per level, then discard the raw string.
tax_df = (
    tax_df.loc[tax_df['InChI'].isin(df['InChI'].unique())]
    .apply(retrieve_fields, axis=1)
    .drop(columns=['taxonomy'])
)
tax_df

Get taxonomy by matching on inchi column...


Unnamed: 0,InChI,kingdom,superclass,class,subclass
0,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins
1,InChI=1S/C15H26O2/c1-10-7-12(16)9-15(4)6-5-11(...,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Sesquiterpenoids
2,InChI=1S/C20H24O10/c1-11(13(22)9-21)19(27)28-1...,Organic compounds,Lipids and lipid-like molecules,Saccharolipids,
3,InChI=1S/C25H28O11/c26-10-17-20(28)21(29)22(30...,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Terpene glycosides
4,InChI=1S/C36H52O13/c1-17-26(39)27(40)29(42)32(...,Organic compounds,Lipids and lipid-like molecules,Steroids and steroid derivatives,Steroid lactones
...,...,...,...,...,...
15926,InChI=1S/C36H42O19/c1-15(50-23(39)8-5-16-3-6-1...,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Terpene glycosides
15927,InChI=1S/C18H20O4/c1-4-5-6-7-8-9-10-11-12-13-1...,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Fatty alcohol esters
15928,InChI=1S/C30H44O8/c1-17-27(33)24(35-3)14-26(37...,Organic compounds,Lipids and lipid-like molecules,Steroids and steroid derivatives,Steroid lactones
15929,InChI=1S/C17H24O5/c1-9(6-5-7-21-12(4)18)14-10(...,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Terpene lactones


In [13]:
def get_number_of_classes(df, field_name):
    """Return the sorted distinct non-empty values of column ``field_name``.

    Despite its name, this returns the list of class labels, not their count;
    callers take ``len()`` of the result to obtain the number of classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    field_name : str
        Column whose distinct values are collected.

    Returns
    -------
    list
        Unique values in sorted order, without the empty string and without
        missing values.
    """
    # Missing values are dropped first: np.unique sorts its input, and sorting
    # an object array that mixes strings with NaN/None raises TypeError.
    values = df[field_name].dropna().to_numpy()
    return [label for label in np.unique(values).tolist() if label != '']

### Derive distinct classes

In [14]:
# One label list per taxonomy level, in the order of tax_fields.
labels = [get_number_of_classes(tax_df, field_name) for field_name in tax_fields]
for field_name, field_labels in zip(tax_fields, labels):
    meta[field_name + '_id'] = {'n_class': len(field_labels), 'labels': field_labels}
    print("Number of distinct %s: %d" % (field_name, len(field_labels)))

Number of distinct kingdom: 2
Number of distinct superclass: 19
Number of distinct class: 280
Number of distinct subclass: 476


In [15]:
# Display the label metadata collected so far for the MoNA columns.
meta

{'ionization_mode_id': {'n_class': 2, 'labels': ['negative', 'positive']},
 'instrument_id': {'n_class': 305,
  'labels': ['Thermo Q Exactive HF',
   '6566 QTOF Agilent',
   '6567 QTOF Agilent',
   '6568 QTOF Agilent',
   '6569 QTOF Agilent',
   '6570 QTOF Agilent',
   '6571 QTOF Agilent',
   '6572 QTOF Agilent',
   '6573 QTOF Agilent',
   '6574 QTOF Agilent',
   '6575 QTOF Agilent',
   '6576 QTOF Agilent',
   '6577 QTOF Agilent',
   '6578 QTOF Agilent',
   '6579 QTOF Agilent',
   '6580 QTOF Agilent',
   '6581 QTOF Agilent',
   '6582 QTOF Agilent',
   '6583 QTOF Agilent',
   '6584 QTOF Agilent',
   '6585 QTOF Agilent',
   '6586 QTOF Agilent',
   '6587 QTOF Agilent',
   '6588 QTOF Agilent',
   '6589 QTOF Agilent',
   '6591 QTOF Agilent',
   '6592 QTOF Agilent',
   '6593 QTOF Agilent',
   '6594 QTOF Agilent',
   '6595 QTOF Agilent',
   '6596 QTOF Agilent',
   '6597 QTOF Agilent',
   '6598 QTOF Agilent',
   '6599 QTOF Agilent',
   '6601 QTOF Agilent',
   '6602 QTOF Agilent',
   '6603 QTOF

In [16]:
# Persist the metadata dict under <project>/.data/MoNA.
# NOTE(review): np.save appends ".npy" to a name without that suffix and stores
# a dict as a pickled object array, so readers presumably need
# np.load(..., allow_pickle=True) — confirm against the loading code.
np.save(utils.get_project_path() / '.data' / 'MoNA' / 'MoNA_meta', meta)

### Classify each row in dataset and assign label (index to class)

In [17]:
def assign_tax_label(row, labels):
    """Add a numeric ``<level>_id`` entry to ``row`` for every taxonomy level.

    Parameters
    ----------
    row : mapping-like
        Row holding one entry per name in ``tax_fields`` (e.g. a
        ``pandas.Series``). Modified in place.
    labels : list of list
        ``labels[i]`` is the list of known labels for ``tax_fields[i]``; it
        must provide one list per taxonomy level.

    Returns
    -------
    The same ``row``; each ``<level>_id`` is the index of the row's value in
    the matching label list, or ``-1`` when the value is not a known label.
    """
    for i, field_name in enumerate(tax_fields):
        tax = row[field_name]
        try:
            row[field_name + '_id'] = labels[i].index(tax)
        except ValueError:
            # ``list.index`` reports "not found" with ValueError. Catch only
            # that, so that other problems (e.g. a ``labels`` list that is too
            # short) surface instead of silently producing -1 everywhere.
            row[field_name + '_id'] = -1
    return row

In [18]:
# Attach the taxonomy columns to every spectrum through its InChI. This is an
# inner join (the pandas default), so spectra without a taxonomy entry are
# dropped. The InChI key itself is not kept afterwards.
df_merged = df.merge(tax_df, how='inner', on='InChI', suffixes=('', '_y'))
df_merged = df_merged.drop(columns=['InChI'])

In [19]:
# Row by row, add the numeric <level>_id columns next to the taxonomy names.
df = df_merged.apply(assign_tax_label, axis=1, args=(labels,))
df

Unnamed: 0,spectrum,SMILES,instrument,library,author,publication,structural_key,CASMI,split,id,...,instrument_type_id,precursor_type_id,kingdom,superclass,class,subclass,kingdom_id,superclass_id,class_id,subclass_id
0,52.073152:0.215740 53.039199:0.251984 55.05488...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,...,0,0,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
1,50.179433:0.988081 52.761359:0.667573 53.03928...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,...,0,0,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
2,50.382111:0.657423 52.393542:0.636186 52.67915...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,...,0,0,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
3,51.129190:0.047539 51.138777:0.057363 52.37786...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,...,0,1,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
4,50.399188:0.071114 51.370479:0.060948 51.44307...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,AWMHMGFGCLBSAY-UHFFFAOYSA-N,...,0,1,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125826,119.0492:12.408044100416802 120.0468:1.0420067...,O([C@@H]1O[C@@H]([C@@H](O)[C@H](O)[C@H]1O)CO)c...,,,,,AFYIWKNGSIYXCQ,CASMI2017,test,AFYIWKNGSIYXCQ-VMPITWQZSA-N,...,-1,-1,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Terpene glycosides,1,6,198,446
125827,65.0322:1.6944718944370167 121.028:8.284892169...,C/C=C/C#CC#C/C=C/C=C/C(CCOC(=O)C)OC(=O)C,,,,,PGHCYQUSYHWJAI,CASMI2017,test,PGHCYQUSYHWJAI-NOPLWHKLSA-N,...,-1,-1,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Fatty alcohol esters,1,6,94,176
125828,335.1976:55.84251968503937 353.2096:100.0 354....,C1C(CC2C(C1)(C1C(CC2)C2(C(CC1)(C(CC2)C1=CC(=O)...,,,,,BVEFMGIAGANFEG,CASMI2017,test,BVEFMGIAGANFEG-UHFFFAOYSA-N,...,-1,-1,Organic compounds,Lipids and lipid-like molecules,Steroids and steroid derivatives,Steroid lactones,1,6,238,429
125829,69.0608:0.47073270568972575 69.0767:0.48864101...,[C@H]12[C@H]([C@@H](C(=C(C1)C)[C@H](CCCOC(=O)C...,,,,,QKUFZFLZBUSEHN,CASMI2017,test,QKUFZFLZBUSEHN-UHFFFAOYSA-N,...,-1,-1,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Terpene lactones,1,6,198,447


In [20]:
# Write the preprocessed MoNA table to <project>/.data/MoNA/MoNA_full.csv.
# NOTE(review): to_csv writes the index as an extra unnamed leading column by
# default; presumably the loader expects that layout — confirm before changing.
df.to_csv(utils.get_project_path() / '.data' / 'MoNA' / 'MoNA_full.csv')

## Preprocess HMDB dataset

In [72]:
# All raw HMDB files live under <project>/.data/HMDB.
hmdb_dir = utils.get_project_path() / '.data' / 'HMDB'

# Spectra, already divided into train / validation / test files.
df_train = dt.HMDB.open(hmdb_dir / 'hmdb_cfmid_dataset_train.csv')
df_valid = dt.HMDB.open(hmdb_dir / 'hmdb_cfmid_dataset_valid.csv')
df_test = dt.HMDB.open(hmdb_dir / 'hmdb_cfmid_dataset_test.csv')

# Taxonomy tables: only the HMDB id and the raw taxonomy string are needed.
tax_columns = ['HMDB', 'taxonomy']
train_tax_df = pd.read_csv(hmdb_dir / 'HMDB_train_taxonomy.csv')[tax_columns]
test_tax_df = pd.read_csv(hmdb_dir / 'HMDB_test_taxonomy.csv')[tax_columns]

In [91]:
# Start a fresh metadata dict for HMDB and tag every frame with its split name.
meta = {}
for split_name, split_frame in (('train', df_train), ('valid', df_valid), ('test', df_test)):
    split_frame['split'] = split_name
df_train

Unnamed: 0.1,Unnamed: 0,HMDB,spectrum,collision_energy,ionization mode,HMDB_map,SMILES,split,id
0,0,HMDB31492,29.03912516:0.7243706179 57.03403978:4.5379311...,10,1,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492
1,2,HMDB57717,237.2218405:4.521440358 255.2324052:1.16365746...,10,1,HMDB:HMDB0057717,[H][C@](O)(COP(O)(=O)OC[C@@]([H])(COC(=O)CCCCC...,train,HMDB57717
2,3,HMDB00897,123.0670712:0.0628874007 124.0510868:0.0811684...,10,1,HMDB:HMDB0000897,CN1C=NC2=C1C(=O)N=C(N)N2,train,HMDB00897
3,4,HMDB11742,43.01838972:2.011160033 61.0289544:4.798640999...,10,1,HMDB:HMDB0011742,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O[C@H]1O[C@...,train,HMDB11742
4,5,HMDB48637,219.2112759:1.542293612 223.242576:1.708992194...,10,1,HMDB:HMDB0048637,[H][C@](COC(=O)CCCCCCC\C=C/CCCCCC)(COC(=O)CCC\...,train,HMDB48637
...,...,...,...,...,...,...,...,...,...
235279,241278,HMDB56410,41.00273965:0.9471206446 59.01330434:0.9488144...,40,0,HMDB:HMDB0056410,[H][C@](O)(COP(O)(=O)OC[C@@]([H])(COC(=O)CCCCC...,train,HMDB56410
235280,241279,HMDB40272,25.00782503:5.369222268 27.0234751:0.696378008...,40,0,HMDB:HMDB0040272,CCCCC1=CC=CO1,train,HMDB40272
235281,241280,HMDB06227,43.01838972:2.430949615 83.04968984:5.37402223...,40,0,HMDB:HMDB0006227,C[C@H](\C=C\[C@@](C)(O)C(C)(O)C)[C@@]1([H])CC[...,train,HMDB06227
235282,241282,HMDB50061,41.00273965:3.928861584 43.01838972:2.37429696...,40,0,HMDB:HMDB0050061,[H]C(COC(=O)CCCCCCC\C=C/CCCCCCCC)(COC(=O)CCCCC...,train,HMDB50061


In [74]:
# Gather the three splits and show their (rows, columns) shapes side by side.
frames = [df_train, df_valid, df_test]
tuple(frame.shape for frame in frames)

((235284, 9), (3000, 9), (3000, 9))

In [75]:
# Stack the splits row-wise into one table. Each split keeps its own index
# labels, so index values repeat across splits.
df = pd.concat(frames, axis=0)
df

Unnamed: 0.1,Unnamed: 0,HMDB,spectrum,collision_energy,ionization mode,HMDB_map,SMILES,split,id
0,0,HMDB31492,29.03912516:0.7243706179 57.03403978:4.5379311...,10,1,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492
1,2,HMDB57717,237.2218405:4.521440358 255.2324052:1.16365746...,10,1,HMDB:HMDB0057717,[H][C@](O)(COP(O)(=O)OC[C@@]([H])(COC(=O)CCCCC...,train,HMDB57717
2,3,HMDB00897,123.0670712:0.0628874007 124.0510868:0.0811684...,10,1,HMDB:HMDB0000897,CN1C=NC2=C1C(=O)N=C(N)N2,train,HMDB00897
3,4,HMDB11742,43.01838972:2.011160033 61.0289544:4.798640999...,10,1,HMDB:HMDB0011742,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O[C@H]1O[C@...,train,HMDB11742
4,5,HMDB48637,219.2112759:1.542293612 223.242576:1.708992194...,10,1,HMDB:HMDB0048637,[H][C@](COC(=O)CCCCCCC\C=C/CCCCCC)(COC(=O)CCC\...,train,HMDB48637
...,...,...,...,...,...,...,...,...,...
2995,240837,HMDB58603,41.00273965:0.9909645101 59.01330434:1.4321801...,40,0,HMDB:HMDB0058603,[H][C@@](O)(COP(O)(=O)OC[C@@]([H])(COC(=O)CC\C...,test,HMDB58603
2996,240962,HMDB59910,39.0234751:0.759115769 41.03912516:0.389504837...,40,0,HMDB:HMDB0059910,CC(C)C1CCC(C)C2=CC=C(C)C=C12,test,HMDB59910
2997,241127,HMDB07677,41.00273965:4.990915585 43.01838972:3.01300332...,40,0,HMDB:HMDB0007677,[H][C@](CO)(COC(=O)CCCCC\C=C/C\C=C/C\C=C/C\C=C...,test,HMDB07677
2998,241163,HMDB07663,41.00273965:4.270338041 43.01838972:2.94748694...,40,0,HMDB:HMDB0007663,[H][C@](CO)(COC(=O)CCCCCCCCCCC\C=C/C\C=C/CCCCC...,test,HMDB07663


### Preprocess ionization mode

In [76]:
if 'ionization mode' not in df:
    print('ionization mode doesn\'t exist')
else:
    # The raw column is copied unchanged into both the named column and its
    # id column, then removed under its old (space-separated) name.
    df = df.assign(
        ionization_mode=df['ionization mode'],
        ionization_mode_id=df['ionization mode'],
    ).drop(columns=['ionization mode'])
df

Unnamed: 0.1,Unnamed: 0,HMDB,spectrum,collision_energy,HMDB_map,SMILES,split,id,ionization_mode,ionization_mode_id
0,0,HMDB31492,29.03912516:0.7243706179 57.03403978:4.5379311...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1
1,2,HMDB57717,237.2218405:4.521440358 255.2324052:1.16365746...,10,HMDB:HMDB0057717,[H][C@](O)(COP(O)(=O)OC[C@@]([H])(COC(=O)CCCCC...,train,HMDB57717,1,1
2,3,HMDB00897,123.0670712:0.0628874007 124.0510868:0.0811684...,10,HMDB:HMDB0000897,CN1C=NC2=C1C(=O)N=C(N)N2,train,HMDB00897,1,1
3,4,HMDB11742,43.01838972:2.011160033 61.0289544:4.798640999...,10,HMDB:HMDB0011742,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O[C@H]1O[C@...,train,HMDB11742,1,1
4,5,HMDB48637,219.2112759:1.542293612 223.242576:1.708992194...,10,HMDB:HMDB0048637,[H][C@](COC(=O)CCCCCCC\C=C/CCCCCC)(COC(=O)CCC\...,train,HMDB48637,1,1
...,...,...,...,...,...,...,...,...,...,...
2995,240837,HMDB58603,41.00273965:0.9909645101 59.01330434:1.4321801...,40,HMDB:HMDB0058603,[H][C@@](O)(COP(O)(=O)OC[C@@]([H])(COC(=O)CC\C...,test,HMDB58603,0,0
2996,240962,HMDB59910,39.0234751:0.759115769 41.03912516:0.389504837...,40,HMDB:HMDB0059910,CC(C)C1CCC(C)C2=CC=C(C)C=C12,test,HMDB59910,0,0
2997,241127,HMDB07677,41.00273965:4.990915585 43.01838972:3.01300332...,40,HMDB:HMDB0007677,[H][C@](CO)(COC(=O)CCCCC\C=C/C\C=C/C\C=C/C\C=C...,test,HMDB07677,0,0
2998,241163,HMDB07663,41.00273965:4.270338041 43.01838972:2.94748694...,40,HMDB:HMDB0007663,[H][C@](CO)(COC(=O)CCCCCCCCCCC\C=C/C\C=C/CCCCC...,test,HMDB07663,0,0


In [92]:
# Register the ionization-mode labels, then show which codes actually occur.
meta['ionization_mode_id'] = dict(n_class=2, labels=['negative', 'positive'])
ion_classes = df['ionization_mode'].unique()
ion_classes

array([1, 0], dtype=int64)

### Get taxonomy by matching on HMDB (id) column

In [79]:
# Show the (rows, columns) shape of the train and test taxonomy tables.
train_tax_df.shape, test_tax_df.shape

((14986, 2), (500, 2))

In [80]:
# Stack the train and test taxonomy tables row-wise into a single lookup table.
tax_df = pd.concat((train_tax_df, test_tax_df), axis=0)
tax_df.shape

(15486, 2)

In [81]:
def match_taxonomy_by_id(tdf, df):
    """Select the taxonomy rows for compounds in ``df`` and expand them into rank columns.

    Args:
        tdf: Taxonomy table with an 'HMDB' id column and a raw 'taxonomy' column.
        df: Dataset whose 'id' column lists the compound ids to keep.

    Returns:
        The rows of ``tdf`` whose 'HMDB' value occurs in ``df['id']``, passed
        row by row through ``retrieve_fields`` and with the raw 'taxonomy'
        column removed.
    """
    wanted_ids = df['id'].unique()
    is_wanted = tdf['HMDB'].isin(wanted_ids)
    # retrieve_fields reads row['taxonomy'], so the raw column is dropped only afterwards.
    expanded = tdf[is_wanted].apply(retrieve_fields, axis=1)
    return expanded.drop(['taxonomy'], axis=1)

In [82]:
# Reduce the combined taxonomy table to the compounds present in `df` and expand the
# raw taxonomy string into kingdom / superclass / class / subclass columns.
# NOTE: this rebinds `tax_df` to a frame without the 'taxonomy' column, so the cell
# cannot simply be re-run — rebuild `tax_df` with the concat cell first.
tax_df = match_taxonomy_by_id(tax_df, df)
# test_tax_df = match_taxonomy_by_id(test_tax_df, df)
tax_df

Unnamed: 0,HMDB,kingdom,superclass,class,subclass
0,HMDB31492,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds
1,HMDB57717,Organic compounds,Lipids and lipid-like molecules,Glycerophospholipids,Glycerophosphoglycerophosphoglycerols
2,HMDB00897,Organic compounds,Organoheterocyclic compounds,Imidazopyrimidines,Purines and purine derivatives
3,HMDB11742,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Fatty acyl glycosides
4,HMDB48637,Organic compounds,Lipids and lipid-like molecules,Glycerolipids,Triradylcglycerols
...,...,...,...,...,...
495,HMDB58603,Organic compounds,Lipids and lipid-like molecules,Glycerophospholipids,Glycerophosphoglycerophosphoglycerols
496,HMDB59910,Organic compounds,Lipids and lipid-like molecules,Prenol lipids,Sesquiterpenoids
497,HMDB07677,Organic compounds,Lipids and lipid-like molecules,Glycerolipids,Diradylglycerols
498,HMDB07663,Organic compounds,Lipids and lipid-like molecules,Glycerolipids,Diradylglycerols


### Derive distinct classes

In [93]:
# Build the label collection for every taxonomy rank, in `tax_fields` order, and
# register it in `meta` under '<rank>_id'. `labels` keeps the same order so it can
# be used for the row-labelling step later in the notebook.
labels = []
for field_name in tax_fields:
    # Despite its name, the helper's return value is stored as the label collection;
    # its length is the number of classes.
    field_labels = get_number_of_classes(tax_df, field_name)
    labels.append(field_labels)
    meta[field_name + '_id'] = {'n_class': len(field_labels), 'labels': field_labels}
    print("Number of distinct %s: %d" % (field_name, len(field_labels)))

Number of distinct kingdom: 2
Number of distinct superclass: 22
Number of distinct class: 253
Number of distinct subclass: 405


In [94]:
# Inspect the assembled label metadata (one entry per '<field>_id' column).
meta

{'ionization_mode_id': {'n_class': 2, 'labels': ['negative', 'positive']},
 'kingdom_id': {'n_class': 2,
  'labels': ['Inorganic compounds', 'Organic compounds']},
 'superclass_id': {'n_class': 22,
  'labels': ['Alkaloids and derivatives',
   'Benzenoids',
   'Homogeneous metal compounds',
   'Homogeneous non-metal compounds',
   'Hydrocarbon derivatives',
   'Hydrocarbons',
   'Lignans, neolignans and related compounds',
   'Lipids and lipid-like molecules',
   'Mixed metal/non-metal compounds',
   'Nucleosides, nucleotides, and analogues',
   'Organic 1,3-dipolar compounds',
   'Organic Polymers',
   'Organic acids and derivatives',
   'Organic nitrogen compounds',
   'Organic oxygen compounds',
   'Organic salts',
   'Organohalogen compounds',
   'Organoheterocyclic compounds',
   'Organometallic compounds',
   'Organophosphorus compounds',
   'Organosulfur compounds',
   'Phenylpropanoids and polyketides']},
 'class_id': {'n_class': 253,
  'labels': ["(5'->5')-dinucleotides",
   '2

In [95]:
# Persist the label metadata next to the HMDB data. Wrapping the dict in an object
# array makes np.save pickle it, so readers need
# np.load(path, allow_pickle=True).item() to get the dict back. np.save also
# appends '.npy' to the 'HMDB_meta' file name.
np.save(utils.get_project_path() / '.data' / 'HMDB' / 'HMDB_meta', np.array(meta, dtype=object))

### Classify each row in the dataset and assign a label (the index of its class)

In [86]:
# Attach the taxonomy columns to every spectrum row by joining on the 'HMDB' id,
# then drop the join key. merge() defaults to an inner join, so rows whose id has
# no taxonomy entry are not carried over.
df_merged = (
    df
    .merge(tax_df, on='HMDB', suffixes=('', '_y'))
    .drop(columns=['HMDB'])
)
df_merged

Unnamed: 0,spectrum,collision_energy,HMDB_map,SMILES,split,id,ionization_mode,ionization_mode_id,kingdom,superclass,class,subclass
0,29.03912516:0.7243706179 57.03403978:4.5379311...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds
1,29.03912516:9.587454265 55.01838972:2.54439610...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds
2,27.0234751:6.640847978 29.03912516:6.54806256 ...,40,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds
3,27.0234751:0.03173920628 29.00273965:0.0149975...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds
4,27.0234751:0.3171135208 29.00273965:0.47242358...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds
...,...,...,...,...,...,...,...,...,...,...,...,...
92911,41.03912516:3.321276734 43.05477522:1.41515277...,20,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,1,1,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids
92912,27.0234751:1.818499485 41.03912516:10.00329657...,40,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,1,1,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids
92913,17.00273965:0.1727702035 43.05477522:0.1824458...,10,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,0,0,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids
92914,44.99765427:0.9915595017 59.01330434:5.8014561...,20,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,0,0,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids


In [87]:
# Apply `assign_label` to every merged row; per the output below, this adds one
# '<field>_id' column for each taxonomy rank.
# NOTE(review): the call passes (row, labels), whereas the `assign_label` defined near
# the top of the notebook takes (row, field_name, labels) and returns a single index.
# Presumably a later redefinition is in scope here — confirm before a fresh
# Restart & Run All.
df = df_merged.apply(lambda row: assign_label(row, labels), axis=1)
df

Unnamed: 0,spectrum,collision_energy,HMDB_map,SMILES,split,id,ionization_mode,ionization_mode_id,kingdom,superclass,class,subclass,kingdom_id,superclass_id,class_id,subclass_id
0,29.03912516:0.7243706179 57.03403978:4.5379311...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87
1,29.03912516:9.587454265 55.01838972:2.54439610...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87
2,27.0234751:6.640847978 29.03912516:6.54806256 ...,40,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87
3,27.0234751:0.03173920628 29.00273965:0.0149975...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87
4,27.0234751:0.3171135208 29.00273965:0.47242358...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92911,41.03912516:3.321276734 43.05477522:1.41515277...,20,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,1,1,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137
92912,27.0234751:1.818499485 41.03912516:10.00329657...,40,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,1,1,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137
92913,17.00273965:0.1727702035 43.05477522:0.1824458...,10,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,0,0,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137
92914,44.99765427:0.9915595017 59.01330434:5.8014561...,20,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,0,0,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137


In [88]:
# df = df.drop(['Unnamed: 0'], axis=1)
# The merge cell dropped the 'HMDB' join key; restore it as a copy of the 'id' column.
df['HMDB'] = df['id']
df

Unnamed: 0,spectrum,collision_energy,HMDB_map,SMILES,split,id,ionization_mode,ionization_mode_id,kingdom,superclass,class,subclass,kingdom_id,superclass_id,class_id,subclass_id,HMDB
0,29.03912516:0.7243706179 57.03403978:4.5379311...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
1,29.03912516:9.587454265 55.01838972:2.54439610...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
2,27.0234751:6.640847978 29.03912516:6.54806256 ...,40,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
3,27.0234751:0.03173920628 29.00273965:0.0149975...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
4,27.0234751:0.3171135208 29.00273965:0.47242358...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92911,41.03912516:3.321276734 43.05477522:1.41515277...,20,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,1,1,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137,HMDB12534
92912,27.0234751:1.818499485 41.03912516:10.00329657...,40,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,1,1,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137,HMDB12534
92913,17.00273965:0.1727702035 43.05477522:0.1824458...,10,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,0,0,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137,HMDB12534
92914,44.99765427:0.9915595017 59.01330434:5.8014561...,20,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,0,0,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137,HMDB12534


In [89]:
# Export the fully labelled HMDB dataset.
# NOTE(review): to_csv writes the row index by default, which comes back as an
# 'Unnamed: 0' column on reload — confirm the loader expects it, or pass index=False.
df.to_csv(utils.get_project_path() / '.data' / 'HMDB' / 'HMDB_full.csv')