# Compute fingerprint based similarities
Given preprocessed spectra, we derive molecular fingerprints from their structure annotations (InChI/SMILES) and compute all pairwise similarities using the Jaccard metric.

In [1]:
import os
import numpy as np
import pandas as pd
import gensim

from matchms.filtering import normalize_intensities, reduce_to_number_of_peaks, require_minimum_number_of_peaks
from spec2vec import SpectrumDocument, Spec2Vec
from spec2vec.model_building import train_new_word2vec_model
from matchms import calculate_scores, Spectrum

import specvae.utils, specvae.dataset as dt
from specvae.dataset import MoNA

In [2]:
def spectrum_processing(s):
    """Apply the spectrum preprocessing used in this notebook.

    Currently the only step is matchms' ``normalize_intensities``; its
    result is returned unchanged.
    """
    return normalize_intensities(s)

In [3]:
def parse_spectrum(row):
    """Turn one dataset row into a processed matchms ``Spectrum``.

    The row's 'spectrum' string is split by ``dt.SplitSpectrum`` and unzipped
    into m/z values and intensities, which are reordered by ascending m/z.
    The row's 'InChI' and 'SMILES' fields are stored as strings in the
    metadata, and the spectrum is passed through ``spectrum_processing``
    before being returned.
    """
    peaks = dt.SplitSpectrum()(row['spectrum'])
    mz_values, intensity_values = zip(*peaks)

    mz_array = np.array(mz_values)
    intensity_array = np.array(intensity_values)
    # Peaks are handed to Spectrum ordered by ascending m/z.
    order = np.argsort(mz_array)

    annotations = {
        'inchi': str(row['InChI']),
        'smiles': str(row['SMILES']),
    }
    spectrum = Spectrum(
        mz=mz_array[order],
        intensities=intensity_array[order],
        metadata=annotations,
    )
    return spectrum_processing(spectrum)

## Load data

In [4]:
# Name of the dataset to process; the loading cell below handles 'HMDB' and 'MoNA'.
# The same name is also used to build the output path when the scores are saved.
dataset = 'MoNA'

In [5]:
print("Load and preprocess %s data..." % dataset)

# CSV file for each supported dataset, stored under <project>/.data/<dataset>/.
dataset_files = {
    'HMDB': 'hmdb_cfmid_dataset_valid.csv',
    'MoNA': 'MoNA_score.csv',
}
if dataset not in dataset_files:
    # Fail loudly instead of falling through to a NameError (or a stale `df`) below.
    raise ValueError(
        "Unsupported dataset %r; expected one of %s" % (dataset, sorted(dataset_files)))

# `import specvae.utils` binds the name `specvae`, not `utils`, so the helper
# has to be reached through its fully qualified name.
data_path = specvae.utils.get_project_path() / '.data' / dataset / dataset_files[dataset]
# Every dataset is loaded into the same `df` so the parsing step works for all of them.
df = pd.read_csv(data_path)

# Parse each row into a matchms Spectrum (one Spectrum per row).
X = df.apply(parse_spectrum, axis=1)
X

Load and preprocess MoNA validation data...


0        <matchms.Spectrum.Spectrum object at 0x0000020...
1        <matchms.Spectrum.Spectrum object at 0x0000020...
2        <matchms.Spectrum.Spectrum object at 0x0000020...
3        <matchms.Spectrum.Spectrum object at 0x0000020...
4        <matchms.Spectrum.Spectrum object at 0x0000020...
                               ...                        
12193    <matchms.Spectrum.Spectrum object at 0x0000020...
12194    <matchms.Spectrum.Spectrum object at 0x0000020...
12195    <matchms.Spectrum.Spectrum object at 0x0000020...
12196    <matchms.Spectrum.Spectrum object at 0x0000020...
12197    <matchms.Spectrum.Spectrum object at 0x0000020...
Length: 12198, dtype: object

## Add molecular fingerprints


In [6]:
from matchms.filtering.add_fingerprint import add_fingerprint

# Run every parsed spectrum through matchms' add_fingerprint, requesting a
# 2048-bit "daylight" fingerprint, and collect the results in a list.
spectra = list(map(
    lambda spectrum: add_fingerprint(spectrum, fingerprint_type="daylight", nbits=2048),
    X,
))

In [10]:
# Tally spectra that carry no fingerprint at all (no_f) and spectra whose
# fingerprint sums to less than 1, i.e. has no bit set (w_f).
no_f, w_f = 0, 0
for spec in spectra:
    fingerprint = spec.get("fingerprint")
    if fingerprint is None:
        no_f += 1
    elif fingerprint.sum() < 1:
        w_f += 1
print("No fingerprint:", no_f)
print("Weird:", w_f)

No fingerprint: 172
Weird: 0


## Compute similarity score

In [11]:
from matchms.similarity import FingerprintSimilarity

# All-vs-all comparison: the same list of spectra is used as references and queries.
# NOTE(review): spectra for which no fingerprint was found are still passed in
# here; presumably their rows/columns come back as NaN under matchms' default
# `set_empty_scores` setting — confirm before using the matrix downstream.
sm = FingerprintSimilarity(similarity_measure="jaccard")
scores = sm.matrix(references=spectra, queries=spectra)

In [12]:
# Display the pairwise similarity matrix computed in the previous cell.
scores

array([[1.        , 1.        , 1.        , ..., 0.45114213, 0.45363409,
        0.45363409],
       [1.        , 1.        , 1.        , ..., 0.45114213, 0.45363409,
        0.45363409],
       [1.        , 1.        , 1.        , ..., 0.45114213, 0.45363409,
        0.45363409],
       ...,
       [0.45114213, 0.45114213, 0.45114213, ..., 1.        , 0.97419859,
        0.97419859],
       [0.45363409, 0.45363409, 0.45363409, ..., 0.97419859, 1.        ,
        1.        ],
       [0.45363409, 0.45363409, 0.45363409, ..., 0.97419859, 1.        ,
        1.        ]])

In [13]:
# Save the similarity matrix as <project>/.data/<dataset>/<dataset>_sim_jaccard.npy.
# `import specvae.utils` binds the name `specvae`, not `utils`, so the helper is
# reached through its fully qualified name (a bare `utils` is undefined on a fresh kernel).
filepath = specvae.utils.get_project_path() / '.data' / dataset / ('%s_sim_jaccard.npy' % dataset)
np.save(filepath, scores)