In [1]:
import os
import numpy as np
import pandas as pd

from matchms.filtering import normalize_intensities, require_minimum_number_of_peaks, select_by_mz, select_by_relative_intensity
from matchms import calculate_scores, Spectrum

import specvae.utils as utils, specvae.dataset as dt
from specvae.dataset import MoNA

In [2]:
def spectrum_processing(s):
    """Run a spectrum through the fixed chain of matchms filters.

    The filters are applied in this order:
      1. normalize_intensities
      2. select_by_mz with mz_from=0, mz_to=2500
      3. require_minimum_number_of_peaks with n_required=10
      4. select_by_relative_intensity with intensity_from=0.01, intensity_to=1.0

    Args:
        s: The spectrum object handed to the first filter.

    Returns:
        Whatever the last filter returns.
    """
    # NOTE(review): the minimum-peak check runs before the relative-intensity
    # filter, so the result may hold fewer than 10 peaks — confirm intended.
    pipeline = (
        normalize_intensities,
        lambda spec: select_by_mz(spec, mz_from=0, mz_to=2500),
        lambda spec: require_minimum_number_of_peaks(spec, n_required=10),
        lambda spec: select_by_relative_intensity(spec, intensity_from=0.01, intensity_to=1.0),
    )
    for step in pipeline:
        s = step(s)
    return s

In [3]:
def parse_spectrum(row):
    """Decide whether a data row carries a spectrum that passes the filters.

    The text in ``row['spectrum']`` is split into peaks with
    ``dt.SplitSpectrum``, the peaks are ordered by ascending m/z, wrapped in
    a matchms ``Spectrum`` and passed through ``spectrum_processing``.

    Args:
        row: Record supporting ``row['spectrum']`` lookup (e.g. a DataFrame row).

    Returns:
        ``row`` unchanged when the spectrum is kept, otherwise ``None``
        (also returned when the spectrum text yields no peaks at all).
    """
    string = row['spectrum']
    # assumes SplitSpectrum yields (m/z, intensity) pairs — TODO confirm
    peaks = list(dt.SplitSpectrum()(string))
    if not peaks:
        # Without this guard, unpacking zip(*peaks) raises ValueError.
        return None
    mzs, ints = zip(*peaks)
    # Convert once, as float arrays, so sorting is numeric.
    mz = np.asarray(mzs, dtype=float)
    intensities = np.asarray(ints, dtype=float)
    order = np.argsort(mz)
    s = Spectrum(mz=mz[order], intensities=intensities[order])
    s = spectrum_processing(s)
    # matchms filters signal rejection by returning None; test that explicitly
    # instead of relying on the truthiness of the spectrum object.
    return row if s is not None else None

In [4]:
# Build the path to the MoNA CSV under the project's .data directory and
# load the three splits returned by MoNA.get_by_split.
data_path = utils.get_project_path() / '.data' / 'MoNA' / 'MoNA.csv'
df, valid_df, test_df = MoNA.get_by_split(data_path)

# Restrict the first split to one instrument / collision energy / instrument type.
df = df.loc[
    (df['instrument'] == 'Thermo Q Exactive HF')
    & (df['collision_energy_new'] == 65)
    & (df['instrument type'] == 'ESI-QFT')
]

# Apply parse_spectrum row by row; it returns the row itself or None.
X = df.apply(parse_spectrum, axis=1)
print("Preprocessing done!")

  exec(code_obj, self.user_global_ns, self.user_ns)


Preprocessing done!


In [5]:
# Keep only the rows whose 'spectrum' value is present, then display the frame.
has_spectrum = X['spectrum'].notna()
X = X.loc[has_spectrum]
X

Unnamed: 0,spectrum,InChI,molecular formula,total exact mass,SMILES,collision energy,ionization mode,instrument type,instrument,precursor m/z,precursor type,library,author,publication,structural_key,CASMI,collision_energy_new,id
2,50.382111:0.657423 52.393542:0.636186 52.67915...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,65HCD,positive,ESI-QFT,Thermo Q Exactive HF,395.125519,[M+K]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,65.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
5,50.983944:0.348803 51.940649:1.119231 52.42125...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,65HCD,positive,ESI-QFT,Thermo Q Exactive HF,374.196198,[M+NH4]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,65.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
8,50.469246:0.453788 50.570254:0.476217 51.79498...,"InChI=1S/C21H24O5/c1-7-20(3,4)15-9-13-8-14-10-...",C21H24O5,356.162374,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,65HCD,positive,ESI-QFT,Thermo Q Exactive HF,357.169647,[M+H]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,65.0,AWMHMGFGCLBSAY-UHFFFAOYSA-N
11,50.152537:0.013816 51.570298:0.006645 51.87599...,InChI=1S/C15H26O2/c1-10-7-12(16)9-15(4)6-5-11(...,C15H26O2,238.193280,OC1CC(=C)C2CC(CCC2(C)C1)C(O)(C)C,65HCD,negative,ESI-QFT,Thermo Q Exactive HF,237.185989,[M-H]-,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,XZXBGGYJQALVAW,,65.0,XZXBGGYJQALVAW-VSUHMYSYSA-N
14,50.248304:0.728947 50.392709:0.896914 51.50882...,InChI=1S/C15H26O2/c1-10-7-12(16)9-15(4)6-5-11(...,C15H26O2,238.193280,OC1CC(=C)C2CC(CCC2(C)C1)C(O)(C)C,65HCD,positive,ESI-QFT,Thermo Q Exactive HF,261.182495,[M+Na]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,XZXBGGYJQALVAW,,65.0,XZXBGGYJQALVAW-VSUHMYSYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124827,50.327864:0.078436 50.337454:0.108352 50.80890...,InChI=1S/C30H22O11/c31-14-5-1-12(2-6-14)28-24(...,C30H22O11,558.116212,O=C1C2=C(O)C=C(O)C(=C2OC(C3=CC=C(O)C=C3)C1O)C4...,65HCD,positive,ESI-QFT,Thermo Q Exactive HF,559.123474,[M+H]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AFDANKUHSLVEBJ,,65.0,AFDANKUHSLVEBJ-MBOHLOIDSA-N
124830,67.017618:1.906251 67.643275:0.050316 67.68749...,InChI=1S/C41H32O27/c42-15-1-10(2-16(43)26(15)5...,C41H32O27,956.113096,O=C(O)CC1C(=O)OC2C(OC(OC(=O)C3=CC(O)=C(O)C(O)=...,65HCD,negative,ESI-QFT,Thermo Q Exactive HF,955.105835,[M-H]-,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,YGVHOSGNOYKRIH,,65.0,YGVHOSGNOYKRIH-UKTHYFLESA-N
124833,68.313860:0.031497 68.335651:0.042430 68.41591...,InChI=1S/C41H32O27/c42-15-1-10(2-16(43)26(15)5...,C41H32O27,956.113096,O=C(O)CC1C(=O)OC2C(OC(OC(=O)C3=CC(O)=C(O)C(O)=...,65HCD,positive,ESI-QFT,Thermo Q Exactive HF,979.102295,[M+Na]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,YGVHOSGNOYKRIH,,65.0,YGVHOSGNOYKRIH-UKTHYFLESA-N
124836,69.007298:0.015418 69.034202:1.744775 69.07054...,InChI=1S/C41H32O27/c42-15-1-10(2-16(43)26(15)5...,C41H32O27,956.113096,O=C(O)CC1C(=O)OC2C(OC(OC(=O)C3=CC(O)=C(O)C(O)=...,65HCD,positive,ESI-QFT,Thermo Q Exactive HF,995.076233,[M+K]+,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,YGVHOSGNOYKRIH,,65.0,YGVHOSGNOYKRIH-UKTHYFLESA-N


In [6]:
# Convert the filtered frame to a NumPy array and display its (rows, columns) shape.
x = X.to_numpy()
x.shape

(8046, 18)