# Export Latent Data Points
This notebook loads a VAE model from a given path, encodes the input dataset into the latent space of that model, and stores the data in a NumPy file.
If `generate_imgs=True`, the notebook also generates a spectrum plot for each data point and saves the images, as base64-encoded PNG strings, to a separate file (`spectra_images.npz`).

In [78]:
import sys, os
import torch
import numpy as np
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import pandas as pd
import torchvision as tv

In [79]:
from vae import BaseVAE
import specvae.dataset as dt
import specvae.utils as utils

In [80]:
# Select the torch device used for the model and the data. `utils.device`
# returns a pair that is unpacked as (device, cpu).
# NOTE(review): how `use_cuda=True` behaves on a machine without a GPU is not
# visible from this notebook -- confirm it falls back to the CPU.
device, cpu = utils.device(use_cuda=True)

GPU device count: 1
Device in use:  cuda:0


In [81]:
# Parameters
# Dataset name: selects the CSV/metadata paths below and names the exported
# files. The data-loading cell handles 'MoNA' and 'HMDB'.
dataset = "MoNA"
# Name of the trained model's directory under .model/<dataset>/vae_latent_size/.
model_name = "specgvae_100-60-5-60-100 (19-10-2021_18-17-41)"
model_dir = utils.get_project_path() / '.model' / dataset / 'vae_latent_size' / model_name
# When True, the image-generation cells at the end of the notebook do their work.
generate_imgs = True
# Forwarded as the first argument to Spectra.get_unique when loading the data.
n_molecules = 500

In [82]:
# Load the trained model from <model_dir>/model.pth onto the selected device.
print("Load model: %s..." % model_name)
model_path = model_dir / 'model.pth'
model = BaseVAE.load(model_path, device)
# Put the model in evaluation mode (standard torch.nn.Module.eval() semantics
# assumed -- BaseVAE itself is not shown here). As the last expression of the
# cell, the returned value is also displayed.
model.eval()

In [83]:
# Display the configuration stored with the model; the cells below read its
# 'input_columns', 'types' and 'transform' entries.
model.config

## Load data

In [84]:
from specvae.dataset import Spectra

# Metadata columns exported for every spectrum, independent of the model.
columns = ['collision_energy', 'ionization_mode_id', 'kingdom_id', 'superclass_id', 'class_id', 'subclass_id']
# Columns the model consumes, plus 'id' (used later to label the generated images).
input_columns = model.config['input_columns'] + ['id']
input_types = model.config['types']

if dataset == 'MoNA':
    # MoNA is exported with three additional metadata columns.
    columns += ['precursor_type_id', 'total_exact_mass', 'instrument_type_id']
    data_path = utils.get_project_path() / '.data' / 'MoNA' / 'MoNA_full.csv'
    metadata_path = utils.get_project_path() / '.data' / 'MoNA' / 'MoNA_meta.npy'
elif dataset == 'HMDB':
    data_path = utils.get_project_path() / '.data' / 'HMDB' / 'HMDB_full.csv'
    metadata_path = utils.get_project_path() / '.data' / 'HMDB' / 'HMDB_meta.npy'
else:
    # Without this branch an unknown name leaves data_path/metadata_path
    # undefined and the cell dies with an unrelated NameError further down.
    raise ValueError("Unsupported dataset %r; expected 'MoNA' or 'HMDB'" % (dataset,))

columns += ['id']

# The metadata file is optional; `metadata` stays None when it is absent.
metadata = None
if os.path.exists(metadata_path):
    # NOTE: allow_pickle=True unpickles the file's contents, which can execute
    # arbitrary code -- only load metadata files from a trusted source.
    metadata = np.load(metadata_path, allow_pickle=True).item()


## Export all features

In [85]:
# Load the metadata table. `n_molecules` and `columns` are forwarded to
# Spectra.get_unique; its selection semantics are not shown in this notebook.
meta_df = Spectra.get_unique(n_molecules, columns=columns, csv_file=data_path)
meta_df

Unnamed: 0.1,Unnamed: 0,spectrum,SMILES,instrument,library,author,publication,structural_key,CASMI,split,...,instrument_type_id,precursor_type_id,kingdom,superclass,class,subclass,kingdom_id,superclass_id,class_id,subclass_id
0,0,52.073152:0.215740 53.039199:0.251984 55.05488...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,0,0,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
1,1,50.179433:0.988081 52.761359:0.667573 53.03928...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,0,0,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
2,2,50.382111:0.657423 52.393542:0.636186 52.67915...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,0,0,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
3,3,51.129190:0.047539 51.138777:0.057363 52.37786...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,0,1,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
4,4,50.399188:0.071114 51.370479:0.060948 51.44307...,O=C1OC=2C=C3OC(CC3=CC2C=C1C(C=C)(C)C)C(OC(=O)C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,AWMHMGFGCLBSAY,,train,...,0,1,Organic compounds,Phenylpropanoids and polyketides,Coumarins and derivatives,Furanocoumarins,1,18,70,184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8845,8845,50.004555:0.014915 50.093514:0.277573 50.23632...,O=C(OCC1OC(OC(CC(=O)CCC2=CC=C(O)C=C2)CCC3=CC=C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,MHWAHKIDOYBXCN,,train,...,0,0,Organic compounds,Phenylpropanoids and polyketides,Diarylheptanoids,Linear diarylheptanoids,1,18,76,255
8846,8846,50.291130:0.041717 50.455574:0.316375 50.74178...,O=C(OCC1OC(OC(CC(=O)CCC2=CC=C(O)C=C2)CCC3=CC=C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,MHWAHKIDOYBXCN,,train,...,0,0,Organic compounds,Phenylpropanoids and polyketides,Diarylheptanoids,Linear diarylheptanoids,1,18,76,255
8847,8847,50.134292:0.002200 50.330351:0.003480 50.43869...,O=C(OCC1OC(OC(CC(=O)CCC2=CC=C(O)C=C2)CCC3=CC=C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,MHWAHKIDOYBXCN,,train,...,0,1,Organic compounds,Phenylpropanoids and polyketides,Diarylheptanoids,Linear diarylheptanoids,1,18,76,255
8848,8848,50.027900:0.006457 50.046494:0.006555 50.08917...,O=C(OCC1OC(OC(CC(=O)CCC2=CC=C(O)C=C2)CCC3=CC=C...,Thermo Q Exactive HF,Vaniya/Fiehn Natural Products Library,Arpana Vaniya,,MHWAHKIDOYBXCN,,train,...,0,1,Organic compounds,Phenylpropanoids and polyketides,Diarylheptanoids,Linear diarylheptanoids,1,18,76,255


In [86]:
# One NumPy array per exported metadata column, keyed by the column name.
data_np = {name: meta_df[name].to_numpy() for name in columns}
data_np

Index(['Unnamed: 0', 'spectrum', 'SMILES', 'instrument', 'library', 'author',
       'publication', 'structural_key', 'CASMI', 'split', 'id',
       'molecular_formula', 'total_exact_mass', 'collision_energy_old',
       'ionization_mode', 'instrument_type', 'precursor_mz', 'precursor_type',
       'collision_energy', 'ionization_mode_id', 'instrument_id',
       'instrument_type_id', 'precursor_type_id', 'kingdom', 'superclass',
       'class', 'subclass', 'kingdom_id', 'superclass_id', 'class_id',
       'subclass_id'],
      dtype='object')

In [61]:
# Write the metadata arrays to .data/latent/<dataset>_features.npz.
print("Export data")
filename = "%s_features.npz" % dataset
filepath = utils.get_project_path() / '.data' / 'latent' / filename
# np.savez does not create missing directories, so make sure the target
# folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
np.savez(filepath, **data_np)
print("File", filepath, "saved!")

Export data
File  D:\Workspace\SpecVAE\.data\latent\MoNA_features.npz  saved!


## Export model specific data

In [62]:
# Load only the columns the model consumes (plus 'id'), using the same
# `n_molecules` and CSV file as the metadata table above.
# NOTE(review): later cells index this frame and the metadata by the same row
# position -- confirm Spectra.get_unique returns rows in a deterministic order.
df = Spectra.get_unique(n_molecules, columns=input_columns, csv_file=data_path)
df

Unnamed: 0,spectrum,id
0,52.073152:0.215740 53.039199:0.251984 55.05488...,AWMHMGFGCLBSAY-UHFFFAOYSA-N
1,50.179433:0.988081 52.761359:0.667573 53.03928...,AWMHMGFGCLBSAY-UHFFFAOYSA-N
2,50.382111:0.657423 52.393542:0.636186 52.67915...,AWMHMGFGCLBSAY-UHFFFAOYSA-N
3,51.129190:0.047539 51.138777:0.057363 52.37786...,AWMHMGFGCLBSAY-UHFFFAOYSA-N
4,50.399188:0.071114 51.370479:0.060948 51.44307...,AWMHMGFGCLBSAY-UHFFFAOYSA-N
...,...,...
8845,50.004555:0.014915 50.093514:0.277573 50.23632...,MHWAHKIDOYBXCN-FHVVLDLYSA-N
8846,50.291130:0.041717 50.455574:0.316375 50.74178...,MHWAHKIDOYBXCN-FHVVLDLYSA-N
8847,50.134292:0.002200 50.330351:0.003480 50.43869...,MHWAHKIDOYBXCN-FHVVLDLYSA-N
8848,50.027900:0.006457 50.046494:0.006555 50.08917...,MHWAHKIDOYBXCN-FHVVLDLYSA-N


In [63]:
# Build the model inputs from `df` using the transform and column types stored
# in the model configuration, targeting the selected device.
# NOTE(review): limit=-1 presumably means "no row limit" -- confirm in
# Spectra.preload_tensor.
data = Spectra.preload_tensor(
    device=device, 
    data_frame=df, 
    transform=model.config['transform'], 
    limit=-1, 
    types=input_types)

Load and transform...
Progress: 5%
Progress: 10%
Progress: 15%
Progress: 20%
Progress: 25%
Progress: 30%
Progress: 35%
Progress: 40%
Progress: 45%
Progress: 50%
Progress: 55%
Progress: 60%
Progress: 65%
Progress: 70%
Progress: 75%
Progress: 80%
Progress: 85%
Progress: 90%
Progress: 95%
Convert data to pytorch tensors...


In [64]:
# Inspect the preloaded inputs (a mapping with at least the 'spectrum' and 'id'
# entries used below) together with the shape of the id array.
data, data['id'].shape

({'spectrum': tensor([[ 213.0906, 2500.0000,  192.0778,  ...,  172.8315,  133.1010,
            167.1809],
          [ 213.0906, 2500.0000,  192.0779,  ...,  226.5244,   87.0444,
            223.2525],
          [ 134.9962, 2500.0000,  191.0699,  ...,  224.5507,  131.0853,
            217.2379],
          ...,
          [ 147.0442, 2500.0000,  107.0496,  ...,    0.0000,    0.0000,
              0.0000],
          [ 147.0442, 2500.0000,  107.0497,  ...,    0.0000,    0.0000,
              0.0000],
          [ 147.0442, 2500.0000,  119.0496,  ...,    0.0000,    0.0000,
              0.0000]], device='cuda:0'),
  'id': array(['AWMHMGFGCLBSAY-UHFFFAOYSA-N', 'AWMHMGFGCLBSAY-UHFFFAOYSA-N',
         'AWMHMGFGCLBSAY-UHFFFAOYSA-N', ..., 'MHWAHKIDOYBXCN-FHVVLDLYSA-N',
         'MHWAHKIDOYBXCN-FHVVLDLYSA-N', 'MHWAHKIDOYBXCN-FHVVLDLYSA-N'],
        dtype='<U27')},
 (8850,))

In [65]:
# Keep the raw spectrum strings as a flat Python list (one entry per row of
# `df`); the image-generation cells iterate over it. The last line shows its size.
spectra_str = df['spectrum'].to_numpy().flatten().tolist()
len(spectra_str)

8850

In [66]:
print("Encode N=%d compounds from %s dataset..." % (n_molecules, dataset))
X, ids = data['spectrum'], data['id'] # TODO: handle the case for concatenated input
# Inference only: disable autograd so the forward pass over the whole dataset
# does not build and keep a computation graph in (GPU) memory.
with torch.no_grad():
    Xrecon, z, latent_dist = model.forward_(X)
# Show the shape of the latent codes as the cell output.
z.shape

Encode N=500 compounds from MoNA dataset...


torch.Size([8850, 5])

In [67]:
# Collect the model inputs, reconstructions and latent codes as CPU NumPy arrays.
# Note that this rebinds `data_np`; the metadata dict built earlier under the
# same name is no longer reachable through it.
# Adding the raw spectrum strings as well (data_np['spectra_str'] = spectra_str)
# is possible but might take a lot of disk space.
data_np = {
    'X': X.data.cpu().numpy(),
    'Xrecon': Xrecon.data.cpu().numpy(),
    'z': z.data.cpu().numpy(),
}
data_np

{'X': array([[ 213.09058 , 2500.      ,  192.0778  , ...,  172.83145 ,
          133.10095 ,  167.18088 ],
        [ 213.09064 , 2500.      ,  192.07787 , ...,  226.5244  ,
           87.04438 ,  223.25247 ],
        [ 134.99625 , 2500.      ,  191.0699  , ...,  224.55072 ,
          131.08527 ,  217.2379  ],
        ...,
        [ 147.04422 , 2500.      ,  107.04961 , ...,    0.      ,
            0.      ,    0.      ],
        [ 147.04416 , 2500.      ,  107.049675, ...,    0.      ,
            0.      ,    0.      ],
        [ 147.04419 , 2500.      ,  119.04955 , ...,    0.      ,
            0.      ,    0.      ]], dtype=float32),
 'Xrecon': array([[ 194.65103 , 2500.      ,  194.29349 , ...,  140.7937  ,
          178.83199 ,  136.37784 ],
        [ 133.109   , 2500.      ,  130.08397 , ...,  237.51591 ,
          143.11197 ,  230.99094 ],
        [  74.608025, 2500.      ,  105.4842  , ...,  230.41103 ,
           80.23023 ,  223.89334 ],
        ...,
        [ 191.59422 , 25

In [68]:
# Write the model-specific arrays to .data/latent/<dataset>-<model_name>.npz.
print("Export data")
filename = "%s-%s.npz" % (dataset, model_name)
filepath = utils.get_project_path() / '.data' / 'latent' / filename
# np.savez does not create missing directories, so make sure the target
# folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
np.savez(filepath, **data_np)
print("File", filepath, "saved!")

Export data


## Pre-generate image labels

In [16]:
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
import base64, io
import specvae.visualize as vis

In [17]:
def generate_image(spectrum, mode, energy, id, scale=0.5):
    """Render a spectrum plot and return it as a base64-encoded PNG data URI.

    Args:
        spectrum: Spectrum handed to ``vis.plot_spectrum``.
        mode: Value shown as 'ionization mode' in the plot metadata.
        energy: Value shown as 'collision energy' in the plot metadata.
        id: Name handed to ``vis.plot_spectrum`` for the plot.
        scale: Factor applied to the rendered width and height before encoding.

    Returns:
        A ``'data:image/png;base64,...'`` string, or ``None`` when the spectrum
        could not be rendered (the failure is reported on stdout).
    """
    fig = None
    try:
        fig, ax = plt.subplots()
        meta = {
            'collision energy': energy,
            'ionization mode': mode
        }
        vis.plot_spectrum(spectrum, name=id, meta=meta, ax=ax, resolution=0.5, max_mz=2500, figsize=(5, 5))

        # Dump the raw pixel buffer and reshape it to (height, width, channels).
        with io.BytesIO() as raw_buf:
            fig.savefig(raw_buf, format='raw')
            raw_buf.seek(0)
            pixels = np.frombuffer(raw_buf.getvalue(), dtype=np.uint8)
            # Positional shape: the `newshape=` keyword is deprecated in recent NumPy.
            img_arr = np.reshape(pixels, (int(fig.bbox.bounds[3]), int(fig.bbox.bounds[2]), -1))

        image = Image.fromarray(img_arr)
        width, height = image.size
        image = image.resize((int(scale * width), int(scale * height)))
        with io.BytesIO() as png_buf:
            image.save(png_buf, format='png')
            encoded = base64.b64encode(png_buf.getvalue()).decode()
        return 'data:image/png;base64,' + encoded
    except Exception as exc:
        # Best effort: report and continue, but let KeyboardInterrupt/SystemExit
        # through so a long-running loop can still be interrupted.
        print('Unable to vis spectrum:', exc)
        return None
    finally:
        # Always release this figure, including on failure; otherwise figures
        # pile up in memory when the function is called in a loop.
        if fig is not None:
            plt.close(fig)


In [19]:
if generate_imgs:
    # Read the plot metadata from `meta_df`: `data_np` was rebound above to the
    # model outputs (X, Xrecon, z) and no longer holds the metadata columns.
    modes = meta_df['ionization_mode_id'].to_numpy()
    energies = meta_df['collision_energy'].to_numpy()
    # The metadata and the encoded spectra come from two separate loads; make
    # sure they describe the same rows before pairing them by position.
    if list(map(str, meta_df['id'])) != list(map(str, ids)):
        raise ValueError("Metadata rows are not aligned with the encoded spectra")

    ssize = len(spectra_str)
    last = 0
    images = []
    for i, spectrum in enumerate(spectra_str):
        try:
            image = generate_image(spectrum, modes[i], energies[i], ids[i], scale=0.4)
        except Exception as exc:
            print("Error: {0}".format(exc))
            image = None
        # Always append exactly one entry (None on failure) so that images[i]
        # stays aligned with spectrum i.
        images.append(image)
        # Report progress roughly every 5%.
        if (i / ssize - last) > 0.05:
            last = i / ssize
            print("Progress {}%".format(int(last * 100)))
    print("DONE!")

Progress 5%
Progress 10%
Progress 15%
Progress 20%
Progress 25%
Progress 30%
Progress 35%
Progress 40%
Progress 45%
Progress 50%
Progress 55%
Progress 60%
Progress 65%
Progress 70%
Progress 75%
Progress 80%
Progress 85%
Progress 90%
Progress 95%


In [20]:
if generate_imgs:
    filepath = utils.get_project_path() / '.data' / 'latent' / 'spectra_images.npz'
    print("Save file ", filepath)
    # np.savez does not create missing directories, so make sure the target
    # folder exists before writing.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # generate_image returns None on failure, so `images` may mix strings and
    # None; NumPy then stores an object array, which needs allow_pickle=True
    # when the file is loaded again.
    np.savez(filepath, imgs=images)

Save file  D:\Workspace\SpecVAE\.data\latent\spectra_images.npz
