# Generate artificail spectra

In [1]:
import numpy as np
import sys, os
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import pandas as pd

import torchvision as tv
import specvae.vae as vae, specvae.utils as utils
import specvae.dataset as dt

In [2]:
use_cuda = False
cpu_device = torch.device('cpu')
if torch.cuda.is_available() and use_cuda:
    device = torch.device('cuda:0')
    print('GPU device count:', torch.cuda.device_count())
else:
    device = torch.device('cpu')
print('Device in use: ', device)

Device in use:  cpu


In [3]:
def generate_spectra(mzs_range, ints_range, n=1):
    if n == 1:
        x = [ints_range if i < n else mzs_range for i in range(2*n)]
        X = np.meshgrid(*x)
        return np.vstack(tuple([Xi.flatten() for Xi in reversed(X)])).T
    else:
        x = [mzs_range if i < n else ints_range for i in range(2*n)]
        X = np.meshgrid(*x)
        return np.vstack(tuple([Xi.flatten() for Xi in X])).T

In [4]:
def spectra2string(spectra, transform):
    db = []
    for i, spectrum in enumerate(spectra):
        db.append(transform(spectrum))
    return np.array(db)

In [5]:
def spectra2csv(X, filepath, n=1):
    # Convert spectra to string format
    revtv = tv.transforms.Compose([
        dt.ToMZIntDeConcat(max_num_peaks=n),
        dt.ToString()
    ])
    db = spectra2string(X, revtv)
    # Save as CSV files
    pd.DataFrame({'spectrum': db}).to_csv(filepath, index_label='id')

## Load model
This part of code prepares the VAE model and defines a method to convert input data to the format suitable for the model.

In [6]:
model_name = 'alt_specvae_2000-1538-30-1538-2000 (28-06-2021_14-05-29)'
dataset = 'MoNA'
root_dir = utils.get_project_path() / '.data' / 'fake'
sroot_dir = utils.get_project_path() / '.data' / 'latent'
max_num_peaks = 1000
min_intensity = 0.1
spec_max_mz = 2500
batch_size = 100000
generate_imgs = True

In [7]:
# Load VAE model
print("Load model: %s..." % model_name)
model_path = utils.get_project_path() / '.model' / dataset / model_name / 'model.pth'
model = vae.BaseVAE.load(model_path, device)
model.eval()

Load model: alt_specvae_2000-1538-30-1538-2000 (28-06-2021_14-05-29)...


SpecAltVEA(
  (encoder_): Sequential(
    (en_lin_1): Linear(in_features=2000, out_features=1538, bias=True)
    (en_lin_batchnorm_1): BatchNorm1d(1538, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (en_act_1): ReLU()
  )
  (en_mu): Linear(in_features=1538, out_features=30, bias=True)
  (en_mu_batchnorm): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (en_log_var): Linear(in_features=1538, out_features=30, bias=True)
  (en_log_var_batchnorm): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (sample): SampleZ()
  (decoder): Sequential(
    (de_lin_1): Linear(in_features=30, out_features=1538, bias=True)
    (de_lin_batchnorm_1): BatchNorm1d(1538, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (de_act_1): ReLU()
    (de_lin_2): Linear(in_features=1538, out_features=2000, bias=True)
    (de_act_2): ReLUlimit()
  )
)

In [8]:
def vae_encode(filepath, n_samples, device):
    # Load and convert data
    # Data processing
#     n_samples = batch_size # -1 if all
    savefig = True
    cols = ['spectrum']

    transform = tv.transforms.Compose([
        dt.SplitSpectrum(),
        dt.FilterPeaks(max_mz=spec_max_mz, min_intensity=min_intensity),
#         dt.Normalize(intensity=True, mass=True, max_mz=spec_max_mz),
        dt.ToMZIntConcatAlt(max_num_peaks=max_num_peaks)
    ])
    revtv = tv.transforms.Compose([])
    
    # Load and transform dataset:
    print("Load data")
    data = dt.Spectra.preload_tensor(
        device=device, filepath=filepath,
        transform=transform, limit=n_samples)

    if data is None:
        print("No dataset specified, script terminates.")
        
    # Set data loaders:
    test_loader = DataLoader(
        dt.Spectra(data=data, device=device, columns=cols),
        batch_size=batch_size, shuffle=False)
    
    print("Encode N=%d spectra from %s dataset..." % (n_samples, dataset))
    spectrum_batch, id_batch = next(iter(test_loader))
    mu, logvar = model.encode(spectrum_batch)
    latent_batch = mu
    X = latent_batch.data.cpu().numpy()
    ids = np.array(id_batch)
    return X, ids

## Generate single peak database
This part of code is responsible for generating spectra with a single peak. The peak is distributed uniformely in given m/z range, per each m/z value we have many intensity values as well.
- csv file with single peal with m/z values in range [mz_from, mz_to, mz_step] and intensity values in range [ints_from, ints_to, ints_step],

### Generate spectra and save csv file

In [9]:
# single peak with m/z values in range [mz_from, mz_to, mz_step] and intensity values in range [ints_from, ints_to, ints_step]
spectra_array = generate_spectra(np.arange(10., 2501., 10.), np.arange(10., 101., 10.), n=1)

In [10]:
spectra_array.shape

(2500, 2)

In [11]:
# Save as CSV files
spectra2csv(spectra_array, root_dir / 'single_peak.csv', n=1)

### Find latent representations and save in file

In [12]:
X, ids = vae_encode(root_dir / 'single_peak.csv', batch_size, device)
X.shape, ids.shape

Load data
Load and transform...
Progress: 5%
Progress: 10%
Progress: 15%
Progress: 20%
Progress: 25%
Progress: 30%
Progress: 35%
Progress: 40%
Progress: 45%
Progress: 50%
Progress: 55%
Progress: 60%
Progress: 65%
Progress: 70%
Progress: 75%
Progress: 80%
Progress: 85%
Progress: 90%
Progress: 95%
Convert data to pytorch tensors...
Encode N=100000 spectra from MoNA dataset...


((2500, 30), (2500,))

In [13]:
print("Export data")
np.savez(sroot_dir / 'single_peak.npz', X=X, ids=ids)
print("Done!")

Export data
Done!


## Generate two peaks spectra database
This part of code is responsible for generating spectra with two peaks. Peaks is distributed uniformely in given m/z range, per each m/z value we have many intensity values as well.
- csv file with all combinations of two peaks in descrete space [mz_from, mz_to, mz_step]

In [27]:
spectra_array = generate_spectra(np.arange(10., 2501, 50.), np.arange(1., 101., 20.), n=2)

In [28]:
spectra_array.shape

(62500, 4)

In [29]:
spectra2csv(spectra_array, root_dir / 'two_peaks.csv', n=2)

In [30]:
X, ids = vae_encode(root_dir / 'two_peaks.csv', batch_size, device)
X.shape, ids.shape

Load data
Load and transform...
Progress: 5%
Progress: 10%
Progress: 15%
Progress: 20%
Progress: 25%
Progress: 30%
Progress: 35%
Progress: 40%
Progress: 45%
Progress: 50%
Progress: 55%
Progress: 60%
Progress: 65%
Progress: 70%
Progress: 75%
Progress: 80%
Progress: 85%
Progress: 90%
Progress: 95%
Convert data to pytorch tensors...
Encode N=100000 spectra from MoNA dataset...


((62500, 30), (62500,))

In [31]:
print("Export data")
np.savez(sroot_dir / 'two_peaks.npz', X=X, ids=ids)
print("Done!")

Export data
Done!


## Generate three peaks spectra database
This part of code is responsible for generating spectra with three peaks. Peaks is distributed uniformely in given m/z range, per each m/z value we have many intensity values as well.
- csv file with all combinations of three peaks in descrete space [mz_from, mz_to, mz_step]

In [19]:
spectra_array = generate_spectra(np.arange(10., 2501, 100.), np.arange(1., 101., 20.), n=3)
spectra2csv(spectra_array, root_dir / 'three_peaks.csv', n=3)
X, ids = vae_encode(root_dir / 'three_peaks.csv', batch_size, device)
X.shape, ids.shape

Load data
Load and transform...
Convert data to pytorch tensors...
Encode N=100000 spectra from MoNA dataset...


((100000, 30), (100000,))

In [20]:
print("Export data")
np.savez(sroot_dir / 'three_peaks.npz', X=X, ids=ids)
print("Done!")

Export data
Done!


## Generate four peaks spectra database
This part of code is responsible for generating spectra with four peaks. Peaks is distributed uniformely in given m/z range, per each m/z value we have many intensity values as well.
- csv file with all combinations of three peaks in descrete space [mz_from, mz_to, mz_step]

In [13]:
spectra_array = generate_spectra(np.arange(10., 1001, 100.), np.arange(10., 101., 30.), n=4)
print(spectra_array.shape)
spectra2csv(spectra_array, root_dir / 'four_peaks.csv', n=4)
X, ids = vae_encode(root_dir / 'four_peaks.csv', 1000000, device)
X.shape, ids.shape

(2560000, 8)
Load data
Load and transform...
Progress: 5%
Progress: 10%
Progress: 15%
Progress: 20%
Progress: 25%
Progress: 30%
Progress: 35%
Convert data to pytorch tensors...
Encode N=1000000 spectra from MoNA dataset...


((100000, 30), (100000,))

In [14]:
print("Export data")
np.savez(sroot_dir / 'four_peaks.npz', X=X, ids=ids)
print("Done!")

Export data
Done!


## Generate five peaks spectra database

In [9]:
spectra_array = generate_spectra(np.arange(10., 2501, 300.), np.arange(10., 101., 50.), n=5)
spectra_array.shape

(1889568, 10)

In [10]:
spectra2csv(spectra_array, root_dir / 'five_peaks.csv', n=5)
X, ids = vae_encode(root_dir / 'five_peaks.csv', 1000000, device)
X.shape, ids.shape

Load data
Load and transform...
Progress: 5%
Progress: 10%
Progress: 15%
Progress: 20%
Progress: 25%
Progress: 30%
Progress: 35%
Progress: 40%
Progress: 45%
Progress: 50%
Convert data to pytorch tensors...
Encode N=1000000 spectra from MoNA dataset...


((100000, 30), (100000,))

In [12]:
print("Export data")
np.savez(sroot_dir / 'five_peaks.npz', X=X, ids=ids)
print("Done!")

Export data
Done!


## Generate six peaks spectra database

In [9]:
spectra_array = generate_spectra(np.arange(10., 2501, 400.), np.arange(10., 101., 50.), n=6)
spectra_array.shape

(7529536, 12)

In [11]:
spectra2csv(spectra_array, root_dir / 'six_peaks.csv', n=6)
X, ids = vae_encode(root_dir / 'six_peaks.csv', 1000000, device)
X.shape, ids.shape

Load data
Load and transform...
Progress: 5%
Progress: 10%
Convert data to pytorch tensors...
Encode N=1000000 spectra from MoNA dataset...


((100000, 30), (100000,))

In [12]:
print("Export data")
np.savez(sroot_dir / 'six_peaks.npz', X=X, ids=ids)
print("Done!")

Export data
Done!
