# Prepare MS/MS dataset
Select n distinct compounds by unique InChIKey from the specified dataset and export the data to a CSV file.

In [1]:
import os
import numpy as np
import pandas as pd

from matchms.filtering import normalize_intensities, require_minimum_number_of_peaks, select_by_mz, select_by_relative_intensity
from matchms import calculate_scores, Spectrum

import specvae.utils as utils, specvae.dataset as dt
from specvae.dataset import Spectra

In [2]:
def spectrum_processing(s):
    """Run the matchms filter chain used to clean one spectrum.

    The filters are applied in this order: intensity normalization, an m/z
    window of 0 to 2500, and a minimum-peak requirement of one peak.

    Args:
        s: Spectrum object to filter (built with ``matchms.Spectrum`` by the
            caller in this notebook).

    Returns:
        Whatever the last filter returns. The caller tests the result for
        truthiness, so a rejected spectrum is presumably returned as ``None``
        -- confirm against the matchms filter documentation.
    """
    normalized = normalize_intensities(s)
    windowed = select_by_mz(normalized, mz_from=0, mz_to=2500)
    # The relative-intensity filter is currently disabled:
    # s = select_by_relative_intensity(s, intensity_from=0.001, intensity_to=1.0)
    return require_minimum_number_of_peaks(windowed, n_required=1)

In [3]:
def parse_spectrum(row):
    """Return ``row`` if its spectrum passes preprocessing, otherwise ``None``.

    The ``'spectrum'`` field is split into (m/z, intensity) pairs with
    ``dt.SplitSpectrum``, sorted by ascending m/z, wrapped in a
    ``matchms.Spectrum`` and passed through ``spectrum_processing``.

    Args:
        row: Mapping-like record (a DataFrame row when used with
            ``DataFrame.apply(axis=1)``) that has a ``'spectrum'`` entry.

    Returns:
        The unchanged ``row`` when the processed spectrum is kept, or ``None``
        when the peak list is empty or the filters reject the spectrum.
    """
    peaks = list(dt.SplitSpectrum()(row['spectrum']))
    if len(peaks) == 0:
        # zip(*[]) yields nothing to unpack; treat an empty peak list as a
        # rejected spectrum instead of raising an opaque ValueError.
        return None
    mzs, ints = zip(*peaks)
    mz = np.array(mzs)
    # Order the peaks by m/z and apply the same permutation to the intensities.
    order = np.argsort(mz)
    s = Spectrum(mz=mz[order], intensities=np.array(ints)[order])
    s = spectrum_processing(s)
    # Explicit None check: do not rely on the truthiness of a Spectrum object.
    return row if s is not None else None

## Load and preprocess dataset
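Apply the preprocessing function to the selected subset of molecules:
- normalize peak intensities,
- keep only peaks within the m/z range \[0, 2500\],
- discard spectra with fewer than 1 peak (`n_required=1`),
- the relative-intensity filter (`select_by_relative_intensity`, 0.001–1.0) is currently commented out and therefore not applied.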

In [4]:
# Dataset name; used below to build the path .data/<dataset>/<dataset>_full.csv.
dataset = 'HMDB'
# Passed as the first argument to Spectra.get_unique when loading the data.
n_molecules = 10000

In [5]:
# Load <project>/.data/<dataset>/<dataset>_full.csv through Spectra.get_unique
# (presumably limited to n_molecules distinct compounds -- confirm in
# specvae.dataset) and run parse_spectrum on every row.
print("Load and preprocess %s validation data..." % dataset)
data_path = utils.get_project_path() / '.data' / dataset / ('%s_full.csv' % dataset)
df = Spectra.get_unique(n_molecules, csv_file=data_path)
# parse_spectrum returns the row itself or None, so X carries the same columns as df.
X = df.apply(parse_spectrum, axis=1)
print("Preprocessing done!")

Load and preprocess HMDB validation data...
Preprocessing done!


In [6]:
# Inspect which columns are present after preprocessing.
X.columns

Index(['Unnamed: 0', 'spectrum', 'collision_energy', 'HMDB_map', 'SMILES',
       'split', 'id', 'ionization_mode', 'ionization_mode_id', 'kingdom',
       'superclass', 'class', 'subclass', 'kingdom_id', 'superclass_id',
       'class_id', 'subclass_id', 'HMDB'],
      dtype='object')

## Analyze dataset
Compute the median, mean, maximum, and minimum number of peaks per spectrum across the entire dataset.

In [7]:
def to_num_peaks(row):
    """Return the number of peaks in the row's ``'spectrum'`` entry.

    The spectrum is split into peak entries with ``dt.SplitSpectrum`` and the
    entries are counted directly, so an empty peak list yields ``0`` instead
    of failing while unpacking.

    Args:
        row: Mapping-like record (a DataFrame row when used with
            ``DataFrame.apply(axis=1)``) that has a ``'spectrum'`` entry.

    Returns:
        int: Number of peak entries produced by ``dt.SplitSpectrum``.
    """
    peaks = dt.SplitSpectrum()(row['spectrum'])
    # Count entries without transposing them into separate m/z and intensity tuples.
    return sum(1 for _ in peaks)

In [8]:
# Remove rows whose 'spectrum' value is missing -- presumably the rows for which
# parse_spectrum returned None; confirm how DataFrame.apply represents them.
X = X.dropna(subset=['spectrum'])
X

Unnamed: 0.1,Unnamed: 0,spectrum,collision_energy,HMDB_map,SMILES,split,id,ionization_mode,ionization_mode_id,kingdom,superclass,class,subclass,kingdom_id,superclass_id,class_id,subclass_id,HMDB
0,0,29.03912516:0.7243706179 57.03403978:4.5379311...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
1,1,29.03912516:9.587454265 55.01838972:2.54439610...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
2,2,27.0234751:6.640847978 29.03912516:6.54806256 ...,40,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
3,3,27.0234751:0.03173920628 29.00273965:0.0149975...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
4,4,27.0234751:0.3171135208 29.00273965:0.47242358...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,59995,223.242576:2.727977493 233.2269259:2.281121111...,20,HMDB:HMDB0050033,[H][C@](COC(=O)CCCCCCC\C=C/CCCCCCCC)(COC(=O)CC...,train,HMDB50033,1,1,Organic compounds,Lipids and lipid-like molecules,Glycerophospholipids,Glycerophosphoglycerophosphoglycerols,1,7,88,167,HMDB50033
59996,59996,219.2112759:2.333845237 221.2269259:2.41941652...,40,HMDB:HMDB0050033,[H][C@](COC(=O)CCCCCCC\C=C/CCCCCCCC)(COC(=O)CC...,train,HMDB50033,1,1,Organic compounds,Lipids and lipid-like molecules,Glycerophospholipids,Glycerophosphoglycerophosphoglycerols,1,7,88,167,HMDB50033
59997,59997,233.2269259:0.5569748378 237.2582261:1.3758476...,10,HMDB:HMDB0050033,[H][C@](COC(=O)CCCCCCC\C=C/CCCCCCCC)(COC(=O)CC...,train,HMDB50033,0,0,Organic compounds,Lipids and lipid-like molecules,Glycerophospholipids,Glycerophosphoglycerophosphoglycerols,1,7,88,167,HMDB50033
59998,59998,59.01330434:0.9224031294 233.2269259:1.0072093...,20,HMDB:HMDB0050033,[H][C@](COC(=O)CCCCCCC\C=C/CCCCCCCC)(COC(=O)CC...,train,HMDB50033,0,0,Organic compounds,Lipids and lipid-like molecules,Glycerophospholipids,Glycerophosphoglycerophosphoglycerols,1,7,88,167,HMDB50033


In [9]:
# Number of peaks for every remaining row, indexed like X.
Y = X.apply(to_num_peaks, axis=1)
Y

0         6
1         6
2         6
3        23
4        23
         ..
59995    31
59996    31
59997    31
59998    31
59999    31
Length: 60000, dtype: int64

In [10]:
# NOTE(review): mm is not referenced by any other visible cell -- confirm before removing.
mm = Y.max()
# Summary of the per-spectrum peak counts, in the order (median, mean, max, min).
Y.median(), Y.mean(), Y.max(), Y.min()

(31.0, 27.581766666666667, 31, 1)

In [11]:
# Peak counts as a plain NumPy array (display only; the result is not stored).
Y.to_numpy()

array([ 6,  6,  6, ..., 31, 31, 31], dtype=int64)

In [12]:
# NOTE(review): these imports sit mid-notebook; plot_distribution is only
# referenced by the commented-out call below.
from specvae.visualize import plot_distribution
import plotly.express as px
# plot_distribution(Y.to_numpy(), 'peaks numbers', 'sdc', 'sdc', bins=200)
# Histogram of the per-spectrum peak counts. x=0 presumably selects the default
# column label of the unnamed Series Y -- confirm against the plotly documentation.
px.histogram(Y, x=0)

## Save csv file

In [13]:
# Save file in specified location
# NOTE: the export is currently disabled; uncomment the two lines below to write
# X to <project>/.data/<dataset>/<dataset>_score.csv.
# filepath = utils.get_project_path() / '.data' / dataset / ('%s_score.csv' % dataset)
# X.to_csv(filepath)