# Prepare MS/MS dataset
Select n distinct compounds (identified by unique InChIKey) from the specified dataset and export the data to a CSV file.

In [3]:
import os
import numpy as np
import pandas as pd

from matchms.filtering import normalize_intensities, require_minimum_number_of_peaks, select_by_mz, select_by_relative_intensity
from matchms import calculate_scores, Spectrum

import specvae.utils as utils, specvae.dataset as dt
from specvae.dataset import MoNA

ModuleNotFoundError: No module named 'specvae'

In [2]:
def spectrum_processing(s):
    """Run the matchms filter chain on a single spectrum.

    The filters are applied in this order:
      1. normalize_intensities
      2. select_by_mz with mz_from=0, mz_to=2500
      3. require_minimum_number_of_peaks with n_required=10
      4. select_by_relative_intensity with intensity_from=0.01, intensity_to=1.0

    Returns whatever the final filter returns. parse_spectrum treats a falsy
    result as "spectrum rejected" (presumably the filters yield None in that
    case -- confirm against the matchms documentation).
    """
    # NOTE(review): the peak-count requirement runs before the relative-intensity
    # filter and is not re-checked afterwards.
    pipeline = (
        (normalize_intensities, {}),
        (select_by_mz, {'mz_from': 0, 'mz_to': 2500}),
        (require_minimum_number_of_peaks, {'n_required': 10}),
        (select_by_relative_intensity, {'intensity_from': 0.01, 'intensity_to': 1.0}),
    )
    for apply_filter, options in pipeline:
        s = apply_filter(s, **options)
    return s

In [3]:
def parse_spectrum(row):
    """Return ``row`` if its spectrum survives preprocessing, otherwise ``None``.

    The text in ``row['spectrum']`` is split into peaks with
    ``dt.SplitSpectrum``, the peaks are sorted by ascending m/z, wrapped in a
    matchms ``Spectrum`` and passed through ``spectrum_processing``.

    Parameters
    ----------
    row : mapping with a ``'spectrum'`` entry (e.g. a DataFrame row).

    Returns
    -------
    The unchanged ``row`` when the processed spectrum is truthy; ``None`` when
    no peaks could be parsed or when preprocessing rejects the spectrum.
    """
    string = row['spectrum']
    # Materialise the parser output so emptiness can be tested whatever
    # container type SplitSpectrum returns.
    peaks = list(dt.SplitSpectrum()(string))
    if not peaks:
        # With no peaks, `zip(*peaks)` yields nothing and the two-name unpack
        # below would raise ValueError; treat the row as rejected instead.
        return None
    mzs, ints = zip(*peaks)
    mz_values = np.array(mzs)
    # Reorder both arrays by ascending m/z before building the Spectrum.
    order = np.argsort(mz_values)
    s = Spectrum(mz=mz_values[order], intensities=np.array(ints)[order])
    s = spectrum_processing(s)
    return row if s else None

## Load and preprocess dataset
Apply the preprocessing function to the selected subset of molecules:
- normalize peak intensities,
- keep only peaks within the m/z range \[0, 2500\],
- discard spectra with fewer than 10 peaks,
- discard peaks with a relative intensity below 0.01

In [4]:
# Name of the source dataset; the loading cell below handles 'HMDB' and 'MoNA'.
dataset = "MoNA"
# Number of molecules requested from the dataset.
n_molecules = 1_000

In [5]:
# Load the requested dataset and run parse_spectrum over every row.
print("Load and preprocess %s validation data..." % dataset)
if dataset == 'HMDB':
    data_path = utils.get_project_path() / '.data' / 'HMDB' / 'hmdb_cfmid_dataset_train.csv'
    # `HMDB` is never imported by name in this notebook, so the bare name raised
    # NameError. Reach it through the already-imported dataset module instead
    # (assumes specvae.dataset exposes HMDB alongside MoNA -- TODO confirm).
    df = dt.HMDB.get_n_molecules(n_molecules, filepath=data_path)

elif dataset == 'MoNA':
    data_path = utils.get_project_path() / '.data' / 'MoNA' / 'MoNA.csv'
    # Only the first element of the split is used below; the other two are
    # unpacked but not used in this cell.
    df, valid_df, test_df = MoNA.get_by_split(data_path)
    df = MoNA.get_unique(n_molecules, df=df)

else:
    # Fail with a clear message instead of a NameError on `df` further down.
    raise ValueError("Unsupported dataset %r; expected 'HMDB' or 'MoNA'" % dataset)

# parse_spectrum returns the row itself or None, one call per row.
X = df.apply(parse_spectrum, axis=1)
print("Preprocessing done!")

Load and preprocess MoNA validation data...
  exec(code_obj, self.user_global_ns, self.user_ns)
Preprocessing done!


In [6]:
# Display the column labels of the frame produced by the preprocessing cell.
X.columns

Index(['spectrum', 'InChI', 'molecular formula', 'total exact mass', 'SMILES',
       'InChIKey', 'collision energy', 'ionization mode', 'instrument type',
       'instrument', 'precursor m/z', 'precursor type', 'library', 'author',
       'publication', 'structural_key', 'CASMI', 'collision_energy_new', 'id'],
      dtype='object')

In [7]:
# Keep only the rows that still carry a 'spectrum' value (presumably the rows
# for which parse_spectrum returned None come back without one -- verify).
X = X[X['spectrum'].notna()]
X

Unnamed: 0,spectrum,InChI,molecular formula,total exact mass,SMILES,InChIKey,collision energy,ionization mode,instrument type,instrument,precursor m/z,precursor type,library,author,publication,structural_key,CASMI,collision_energy_new,id
0,52.073152:0.215740 53.039199:0.251984 55.05488...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,AWMHMGFGCLBSAY-UHFFFAOYSA-N,35HCD,positive,ESI-QFT,Thermo Q Exactive HF,395.125519,[M+K]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,35.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
1,50.179433:0.988081 52.761359:0.667573 53.03928...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,AWMHMGFGCLBSAY-UHFFFAOYSA-N,45HCD,positive,ESI-QFT,Thermo Q Exactive HF,395.125519,[M+K]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,45.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
2,50.382111:0.657423 52.393542:0.636186 52.67915...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,AWMHMGFGCLBSAY-UHFFFAOYSA-N,65HCD,positive,ESI-QFT,Thermo Q Exactive HF,395.125519,[M+K]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,65.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
3,51.129190:0.047539 51.138777:0.057363 52.37786...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,AWMHMGFGCLBSAY-UHFFFAOYSA-N,35HCD,positive,ESI-QFT,Thermo Q Exactive HF,374.196198,[M+NH4]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,35.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
4,50.399188:0.071114 51.370479:0.060948 51.44307...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,AWMHMGFGCLBSAY-UHFFFAOYSA-N,45HCD,positive,ESI-QFT,Thermo Q Exactive HF,374.196198,[M+NH4]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,45.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125067,151.0042:17.148697 178.9990:8.265202 227.0368:...,InChI=1S/C21H20O11/c1-7-15(26)17(28)18(29)21(3...,C21H20O11,448.100561,C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC2=C(C3=C...,OXGUCUVFOIWWQJ-HQBVPOQASA-N,Ramp 5-60 V,negative,LC-ESI-QTOF,"UPLC Q-Tof Premier, Waters",447.092740,[M-H]-,MassBank,"Matsuda F, Suzuki M, Sawada Y, Plant Science C...",,OXGUCUVFOIWWQJ,,,OXGUCUVFOIWWQJ-HQBVPOQASA-N
125102,71.0507:16.496674 85.0291:21.834812 129.0546:8...,InChI=1S/C21H20O12/c1-6-14(26)17(29)18(30)21(3...,C21H20O12,464.095476,C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC2=C(C3=C...,DCYOADKBABEMIQ-OWMUPTOHSA-N,Ramp 5-60 V,positive,LC-ESI-QTOF,"UPLC Q-Tof Premier, Waters",465.103290,[M+H]+,MassBank,"Matsuda F, Suzuki M, Sawada Y, Plant Science C...",,DCYOADKBABEMIQ,,,DCYOADKBABEMIQ-OWMUPTOHSA-N
125105,73.0298:10.413540 137.0232:3.818349 153.0179:5...,InChI=1S/C20H18O11/c21-8-4-11(24)14-13(5-8)30-...,C20H18O11,434.084911,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O)O)O2)O[C...,PZZRDJXEMZMZFD-IEGSVRCHSA-N,Ramp 5-60 V,positive,LC-ESI-QTOF,"UPLC Q-Tof Premier, Waters",435.092730,[M+H]+,MassBank,"Matsuda F, Suzuki M, Sawada Y, Plant Science C...",,PZZRDJXEMZMZFD,,,PZZRDJXEMZMZFD-IEGSVRCHSA-N
125108,71.0504:16.853244 85.0284:21.963241 121.0277:1...,InChI=1S/C21H20O11/c1-7-15(26)17(28)18(29)21(3...,C21H20O11,448.100561,C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC2=C(C3=C...,OXGUCUVFOIWWQJ-HQBVPOQASA-N,Ramp 5-60 V,positive,LC-ESI-QTOF,"UPLC Q-Tof Premier, Waters",449.108380,[M+H]+,MassBank,"Matsuda F, Suzuki M, Sawada Y, Plant Science C...",,OXGUCUVFOIWWQJ,,,OXGUCUVFOIWWQJ-HQBVPOQASA-N


## Save csv file

In [8]:
# Write the filtered rows to <project>/.data/<dataset>/<dataset>_score.csv
output_dir = utils.get_project_path() / '.data' / dataset
filepath = output_dir / ('%s_score.csv' % dataset)
X.to_csv(filepath)