# Compute spectra similarities
Given preselected spectra, we compute several similarity scores. Obtained pairwise similarities are saved into file.

In [1]:
import os
import numpy as np
import pandas as pd

from matchms.filtering import normalize_intensities, add_precursor_mz
from matchms import calculate_scores, Spectrum

import specvae.utils as utils, specvae.dataset as dt
from specvae.dataset import MoNA

In [2]:
def parse_spectrum(row, normalize=True):
    string = row['spectrum']
    m = dt.SplitSpectrum()(string)
    mzs, ints = zip(*m)
    idx = np.argsort(np.array(mzs))
    mzs, ints = np.array(mzs)[idx], np.array(ints)[idx]
    pre = mzs.min() if len(mzs) > 0 else 0.01
    s = Spectrum(mz=mzs, intensities=ints, 
            metadata={
                'inchi': str(row['InChI']), 
                'smiles': str(row['SMILES']),
                'precursor_mz': pre if pre > 0 else 0.01 # required for ModifiedCosine score
            })
    if normalize:
        s = normalize_intensities(s)
    s = add_precursor_mz(s)
    return s

## Load data

In [3]:
dataset = 'MoNA'

In [4]:
%%time
print("Load and preprocess %s data..." % dataset)
if dataset == 'HMDB':
    valid_data_path = utils.get_project_path() / '.data' / 'HMDB' / 'hmdb_cfmid_dataset_valid.csv'
    df_valid = pd.read_csv(valid_data_path)

elif dataset == 'MoNA':
    data_path = utils.get_project_path() / '.data' / 'MoNA' / 'MoNA_score.csv'
    df = pd.read_csv(data_path)

X = df.apply(lambda row: parse_spectrum(row, True), axis=1)
X

Load and preprocess MoNA data...
Wall time: 16.2 s


0        <matchms.Spectrum.Spectrum object at 0x000001B...
1        <matchms.Spectrum.Spectrum object at 0x000001B...
2        <matchms.Spectrum.Spectrum object at 0x000001B...
3        <matchms.Spectrum.Spectrum object at 0x000001B...
4        <matchms.Spectrum.Spectrum object at 0x000001B...
                               ...                        
12193    <matchms.Spectrum.Spectrum object at 0x000001B...
12194    <matchms.Spectrum.Spectrum object at 0x000001B...
12195    <matchms.Spectrum.Spectrum object at 0x000001B...
12196    <matchms.Spectrum.Spectrum object at 0x000001B...
12197    <matchms.Spectrum.Spectrum object at 0x000001B...
Length: 12198, dtype: object

In [5]:
spectra = X.to_numpy()

### VAE similarity score

In [12]:
X1 = df.apply(lambda row: parse_spectrum(row, False), axis=1)
X1

0        <matchms.Spectrum.Spectrum object at 0x0000021...
1        <matchms.Spectrum.Spectrum object at 0x0000021...
2        <matchms.Spectrum.Spectrum object at 0x0000021...
3        <matchms.Spectrum.Spectrum object at 0x0000021...
4        <matchms.Spectrum.Spectrum object at 0x0000021...
                               ...                        
12193    <matchms.Spectrum.Spectrum object at 0x0000021...
12194    <matchms.Spectrum.Spectrum object at 0x0000021...
12195    <matchms.Spectrum.Spectrum object at 0x0000021...
12196    <matchms.Spectrum.Spectrum object at 0x0000021...
12197    <matchms.Spectrum.Spectrum object at 0x0000021...
Length: 12198, dtype: object

In [13]:
spectra1 = X1.to_numpy()

In [14]:
import torch
use_cuda = False
cpu_device = torch.device('cpu')
if torch.cuda.is_available() and use_cuda:
    device = torch.device('cuda:0')
    print('GPU device count:', torch.cuda.device_count())
else:
    device = torch.device('cpu')
print('Device in use: ', device)

Device in use:  cpu


In [15]:
import specvae.vae as vae
# Load VAE model
model_name = 'specvae_2500-500-50-500-2500 (20-06-2021_16-39-42)'
print("Load model: %s..." % model_name)
model_path = utils.get_project_path() / '.model' / dataset / model_name / 'model.pth'
model = vae.BaseVAE.load(model_path, device)
model.eval()

Load model: specvae_2500-500-50-500-2500 (20-06-2021_16-39-42)...


SpecVEA(
  (encoder_): Sequential(
    (en_lin_1): Linear(in_features=2500, out_features=500, bias=True)
    (en_lin_batchnorm_1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (en_act_1): ReLU()
  )
  (en_mu): Linear(in_features=500, out_features=50, bias=True)
  (en_mu_batchnorm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (en_log_var): Linear(in_features=500, out_features=50, bias=True)
  (en_log_var_batchnorm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (sample): SampleZ()
  (decoder): Sequential(
    (de_lin_1): Linear(in_features=50, out_features=500, bias=True)
    (de_lin_batchnorm_1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (de_act_1): ReLU()
    (de_lin_2): Linear(in_features=500, out_features=2500, bias=True)
    (de_act_2): ReLUlimit()
  )
)

In [16]:
%%time
from specvae.similarity import VAEScore
eu_a = [1.0, 0.5, 0.2, 0.1, 0.05]
kl_a = 0.0005
score = VAEScore(model)

Wall time: 130 ms


In [17]:
%%time 
d = []
for a in eu_a:
    d.append(score.euclidean(spectra1, spectra1, relative=True, a=a))

Wall time: 6min 32s


In [18]:
# %%time 
# kl = score.kl_divergence(spectra1, spectra1, relative=True, a=kl_a)

In [19]:
print(d)

[array([[1.        , 0.05536823, 0.04458951, ..., 0.04838866, 0.04671478,
        0.04479793],
       [0.05536823, 1.        , 0.07783285, ..., 0.05096755, 0.04965293,
        0.0468361 ],
       [0.04458951, 0.07783285, 1.        , ..., 0.05036995, 0.04945119,
        0.04599875],
       ...,
       [0.04838866, 0.05096755, 0.05036995, ..., 1.        , 0.16473678,
        0.11902787],
       [0.04671478, 0.04965293, 0.04945119, ..., 0.16473678, 1.        ,
        0.16802372],
       [0.04479793, 0.0468361 , 0.04599875, ..., 0.11902787, 0.16802372,
        1.        ]], dtype=float32), array([[1.        , 0.10492684, 0.08537231, ..., 0.09231053, 0.0892598 ,
        0.08575425],
       [0.10492684, 1.        , 0.1444247 , ..., 0.09699167, 0.09460828,
        0.08948126],
       [0.08537231, 0.1444247 , 1.        , ..., 0.09590898, 0.09424201,
        0.08795183],
       ...,
       [0.09231053, 0.09699167, 0.09590898, ..., 1.        , 0.28287384,
        0.21273442],
       [0.0892598 

In [20]:
filepath = utils.get_project_path() / '.data' / dataset / ('%s_vaesim_eu' % dataset)
np.savez(filepath, vae_eu=d)

### Cosine similarity score

In [15]:
%%time
from matchms.similarity import CosineGreedy
sm = CosineGreedy(tolerance=0.005, mz_power=0, intensity_power=1.0)
scores = sm.matrix(spectra, spectra, is_symmetric=True)

Wall time: 3.01 s


In [16]:
scores.shape

(100, 100)

In [17]:
scores

array([[(1.        ,  383), (0.86781405,  183), (0.4104528 ,  107), ...,
        (0.01279384,   27), (0.02088227,   32), (0.05963318,  219)],
       [(0.86781405,  183), (1.        ,  257), (0.72771441,  116), ...,
        (0.01080062,   22), (0.02803177,   32), (0.06915549,  144)],
       [(0.4104528 ,  107), (0.72771441,  116), (1.        ,  241), ...,
        (0.01956091,   23), (0.07163905,   40), (0.05929435,  126)],
       ...,
       [(0.01279384,   27), (0.01080062,   22), (0.01956091,   23), ...,
        (1.        ,  454), (0.8822523 ,   47), (0.13876879,   79)],
       [(0.02088227,   32), (0.02803177,   32), (0.07163905,   40), ...,
        (0.8822523 ,   47), (1.        ,  417), (0.16698591,   69)],
       [(0.05963318,  219), (0.06915549,  144), (0.05929435,  126), ...,
        (0.13876879,   79), (0.16698591,   69), (1.        , 1420)]],
      dtype=[('score', '<f8'), ('matches', '<i4')])

In [18]:
def extract_index(scores, index=0):
    sim_scores = np.zeros(scores.shape)
    for i in range(scores.shape[0]):
        for j in range(scores.shape[1]):
            sim_scores[i, j] = scores[i, j][index]
    return sim_scores

In [19]:
sim_scores = extract_index(scores, index=0)
match_scores = extract_index(scores, index=1)

In [20]:
match_scores

array([[ 383.,  183.,  107., ...,   27.,   32.,  219.],
       [ 183.,  257.,  116., ...,   22.,   32.,  144.],
       [ 107.,  116.,  241., ...,   23.,   40.,  126.],
       ...,
       [  27.,   22.,   23., ...,  454.,   47.,   79.],
       [  32.,   32.,   40., ...,   47.,  417.,   69.],
       [ 219.,  144.,  126., ...,   79.,   69., 1420.]])

In [21]:
# Save file in specified location
filepath = utils.get_project_path() / '.data' / dataset / ('%s_cossim.npy' % dataset)
np.savez(filepath, sim=sim_scores, match=match_scores)

### Modified cosine similarity score

In [22]:
%%time
from matchms.similarity import ModifiedCosine
sm = ModifiedCosine(tolerance=0.005, mz_power=0, intensity_power=1.0)
scores = sm.matrix(spectra, spectra, is_symmetric=True)

Wall time: 904 ms


In [23]:
scores.shape

(100, 100)

In [24]:
scores

array([[(1.        ,  383), (0.86781851,  184), (0.41054337,  110), ...,
        (0.0128058 ,   30), (0.02142101,   40), (0.06393118,  223)],
       [(0.86781851,  184), (1.        ,  257), (0.7277428 ,  118), ...,
        (0.01107618,   29), (0.02815849,   36), (0.07125633,  150)],
       [(0.41054337,  110), (0.7277428 ,  118), (1.        ,  241), ...,
        (0.01958546,   27), (0.07171012,   43), (0.05929805,  129)],
       ...,
       [(0.0128058 ,   30), (0.01107618,   29), (0.01958546,   27), ...,
        (1.        ,  454), (0.88225747,   52), (0.13887608,   97)],
       [(0.02142101,   40), (0.02815849,   36), (0.07171012,   43), ...,
        (0.88225747,   52), (1.        ,  417), (0.16770879,   86)],
       [(0.06393118,  223), (0.07125633,  150), (0.05929805,  129), ...,
        (0.13887608,   97), (0.16770879,   86), (1.        , 1420)]],
      dtype=[('score', '<f8'), ('matches', '<i4')])

In [25]:
sim_scores = extract_index(scores, index=0)
match_scores = extract_index(scores, index=1)

In [38]:
# Save file in specified location
filepath = utils.get_project_path() / '.data' / dataset / ('%s_modcossim.npy' % dataset)
np.savez(filepath, sim=sim_scores, match=match_scores)

### Spec2Vec score

In [6]:
import gensim
from spec2vec import SpectrumDocument, Spec2Vec

# Import pre-trained word2vec model (see code example above)
model_file = str(utils.get_project_path() / '.model' / 'spec2vec' / ("spec2vec_%s.model" % dataset))
model = gensim.models.Word2Vec.load(model_file)

# Define similarity_function
spec2vec_similarity = Spec2Vec(
    model=model, intensity_weighting_power=0.5,
    allowed_missing_percentage=5.0)

In [7]:
%%time
# Select spectrum:
refs, quer = spectra, spectra

# Calculate scores on all combinations of reference spectrums and queries
scores = calculate_scores(refs, quer, spec2vec_similarity)

Wall time: 1min 38s


In [8]:
scores.scores.shape

(12198, 12198)

In [9]:
scores.scores

array([[1.        , 0.83185001, 0.50568573, ..., 0.0476396 , 0.05109908,
        0.04008313],
       [0.83185001, 1.        , 0.7694769 , ..., 0.0122301 , 0.03733944,
        0.01075595],
       [0.50568573, 0.7694769 , 1.        , ..., 0.00651696, 0.03075593,
        0.01066027],
       ...,
       [0.0476396 , 0.0122301 , 0.00651696, ..., 1.        , 0.61333389,
        0.56526084],
       [0.05109908, 0.03733944, 0.03075593, ..., 0.61333389, 1.        ,
        0.94501653],
       [0.04008313, 0.01075595, 0.01066027, ..., 0.56526084, 0.94501653,
        1.        ]])

In [10]:
# Save file in specified location
filepath = utils.get_project_path() / '.data' / dataset / ('%s_spec2vec' % dataset)
np.savez(filepath, sim=scores.scores)