In [None]:
import numpy as np
import torch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Seaborn defaults for any matplotlib figures drawn in this notebook:
# 'white' style, 'notebook' context, and a figure.figsize of (24, 20).
sns.set(style='white', context='notebook', rc={'figure.figsize':(24,20)})

import torchvision as tv
from specvae.dataset import MoNA
import specvae.vae as vae, specvae.utils as utils
import specvae.dataset as dt

import plotly.express as px
import pandas as pd

In [None]:
# Opt-in switch: CUDA is only selected when this flag is True AND a GPU is visible.
use_cuda = False
cpu_device = torch.device('cpu')

# Resolve the working device in a single expression, then report on it.
device = torch.device('cuda:0' if (torch.cuda.is_available() and use_cuda) else 'cpu')
if device.type == 'cuda':
    print('GPU device count:', torch.cuda.device_count())
print('Device in use: ', device)

In [None]:
# Processing parameters:
# `dataset` and `model_name` are joined as "<dataset>-<model_name>.npz" to form
# the name of the latent-representation file loaded from <project>/.data/latent.
dataset = 'MoNA' # NOTE(review): presumably one of 'HMDB' or 'MoNA' — confirm a matching latent file exists
# Model identifier; only used as part of the latent file name in this notebook.
model_name = 'alt_specvae_2000-1538-30-1538-2000 (28-06-2021_14-05-29)'

In [None]:
print("Load data")
# The latent file is named "<dataset>-<model_name>.npz" and lives under
# <project>/.data/latent.
filename = "%s-%s.npz" % (dataset, model_name)
filepath = utils.get_project_path() / '.data' / 'latent' / filename
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
# open. Pull every array out inside a context manager so the handle is closed
# as soon as the arrays are in memory.
with np.load(filepath) as b:
    X, mode, energy, tax, ids, classes = (
        b['X'], b['mode'], b['energy'], b['tax'], b['ids'], b['classes']
    )

In [None]:
# Dimensions of the array stored under key 'X' in the loaded latent file.
X.shape

In [None]:
# Display the 'tax' array; its entries are used as indices into `classes`
# when the `colors` labels are built.
tax

In [None]:
# One label per sample: look up each entry of `tax` in `classes`.
# (Alternative colouring by `mode`: 'negative' where mode == 0, otherwise 'positive'.)
colors = np.array([classes[tax_idx] for tax_idx in tax])
colors

In [None]:
# Number of labels produced; one entry was created per element of `tax`.
colors.shape

In [None]:
def _load_latent_arrays(filename):
    """Read every array of an .npz file in <project>/.data/latent into a dict.

    The archive is opened in a context manager so its file handle is closed
    before returning; the returned dict maps each stored key to its array, so
    it can still be subscripted like the NpzFile it replaces.
    """
    path = utils.get_project_path() / '.data' / 'latent' / filename
    with np.load(path) as archive:
        return {key: archive[key] for key in archive.files}


print("Load fake spectra")
# Each file holds the arrays for one synthetic set; the file names indicate
# how many peaks the synthetic spectra in that set contain.
fake1 = _load_latent_arrays('single_peak.npz')
X_fake1, ids_fake1 = fake1['X'], fake1['ids']

fake2 = _load_latent_arrays('two_peaks.npz')
X_fake2, ids_fake2 = fake2['X'], fake2['ids']

fake3 = _load_latent_arrays('three_peaks.npz')
X_fake3, ids_fake3 = fake3['X'], fake3['ids']

fake4 = _load_latent_arrays('four_peaks.npz')
X_fake4, ids_fake4 = fake4['X'], fake4['ids']

fake5 = _load_latent_arrays('five_peaks.npz')
X_fake5, ids_fake5 = fake5['X'], fake5['ids']

fake6 = _load_latent_arrays('six_peaks.npz')
X_fake6, ids_fake6 = fake6['X'], fake6['ids']

In [None]:
# Shapes of the real-dataset array and the six synthetic-set arrays, side by side.
X.shape, X_fake1.shape, X_fake2.shape, X_fake3.shape, X_fake4.shape, X_fake5.shape, X_fake6.shape

In [None]:
# Display the 'ids' array loaded from two_peaks.npz.
ids_fake2

In [None]:
print("Compute PCA for n_components=2")
red = PCA(2)

# (label, array) pairs in stacking order: the real dataset first, then each
# synthetic set. To look at a subset only, trim this list (and `fake_id_arrays`).
labelled_sets = [
    (dataset, X),
    ('1 peak', X_fake1),
    ('2 peaks', X_fake2),
    ('3 peaks', X_fake3),
    ('4 peaks', X_fake4),
    ('5 peaks', X_fake5),
    ('6 peaks', X_fake6),
]

# Stack all rows into one matrix.
X_c = np.vstack([block for _, block in labelled_sets])
print(X_c.shape)

# One label per stacked row: each set's label repeated once per row of that set.
color_c = np.array([
    label
    for label, block in labelled_sets
    for _ in range(block.shape[0])
])

# Hover ids in the same order; the synthetic ids are cast to str before joining.
fake_id_arrays = (ids_fake1, ids_fake2, ids_fake3, ids_fake4, ids_fake5, ids_fake6)
ids_c = np.hstack([ids] + [fake_ids.astype(str) for fake_ids in fake_id_arrays])

data = red.fit_transform(X_c)
data.shape, color_c.shape

In [None]:
# Scatter of columns 0 and 1 of `data`, coloured by source set, ids on hover.
fig = px.scatter(
    data,
    x=0,
    y=1,
    color=color_c,
    hover_data={'ID': ids_c},
    template='plotly_white',
    width=1200,
    height=1200,
)
fig.show()

In [None]:
# Keep the announced component count and the fitted model in sync: previously
# the message said 3 while the model was built with 5 components. The 3-D
# scatter of this projection only reads columns 0-2.
pca_3d_components = 3
print("Compute PCA for n_components=%d" % pca_3d_components)
red = PCA(pca_3d_components)
data = red.fit_transform(X_c)

In [None]:
# 3-D scatter of columns 0, 1 and 2 of `data`, coloured by source set.
fig = px.scatter_3d(
    data,
    x=0,
    y=1,
    z=2,
    color=color_c,
    hover_data={'ID': ids_c},
    template='plotly_white',
    width=1200,
    height=1200,
)
# Use small markers on marker-mode traces. An optional marker outline
# (line width 1, colour 'DarkSlateGrey') is intentionally left disabled.
fig.update_traces(marker={'size': 3}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Project the stacked matrix onto `pca_comp` principal components; the
# resulting `data` overwrites the earlier projection of the same name.
pca_comp = 10
print("Compute PCA for n_components=%d" % pca_comp)
red = PCA(n_components=pca_comp)
data = red.fit_transform(X_c)

# Report the variance captured by each component (absolute and as a ratio).
print("PCA:")
print(
    "\t      explained_variance:",
    red.explained_variance_,
)
print(
    "\texplained_variance_ratio:",
    red.explained_variance_ratio_,
)

In [None]:
# t-SNE is stochastic: without a fixed seed every run yields a different
# embedding, so the figure cannot be reproduced after a kernel restart.
# Runs on whatever `data` currently holds (the PCA projection from the
# preceding cell when the notebook is executed top to bottom).
n_components = 2
print("Compute tSNE for n_components=%d" % n_components)
r = TSNE(n_components, random_state=42)
tdata = r.fit_transform(data)

print("TSNE:")
print("\t      kl_divergence:", r.kl_divergence_)

In [None]:
# Scatter of the two t-SNE dimensions, coloured by source set, ids on hover.
fig = px.scatter(
    tdata,
    x=0,
    y=1,
    color=color_c,
    hover_data={'ID': ids_c},
    template='plotly_white',
    width=1200,
    height=1200,
)
fig.show()

In [None]:
# t-SNE is stochastic: fix the seed so the 3-D embedding is reproducible
# across kernel restarts. Input is the current `data` array.
n_components = 3
print("Compute tSNE for n_components=%d" % n_components)
r = TSNE(n_components, random_state=42)
tdata = r.fit_transform(data)

print("TSNE:")
print("\t      kl_divergence:", r.kl_divergence_)

In [None]:
# 3-D scatter of the three t-SNE dimensions, coloured by source set.
fig = px.scatter_3d(
    tdata,
    x=0,
    y=1,
    z=2,
    color=color_c,
    hover_data={'ID': ids_c},
    template='plotly_white',
    width=1200,
    height=1200,
)
fig.show()

In [None]:
def draw_umap(n_neighbors=15, min_dist=0.25, n_components=2, metric='euclidean', title=''):
    """Fit UMAP on the notebook-level `data` array and show a plotly scatter.

    Parameters
    ----------
    n_neighbors, min_dist, n_components, metric
        Passed straight through to ``UMAP(...)``.
    title : str
        Title of the resulting figure.

    Notes
    -----
    Reads the module-level names `data`, `color_c` and `ids_c`; the embedding
    has one row per row of `data`, so colours and hover ids must come from the
    arrays that were built alongside it. Nothing is returned. For
    `n_components` other than 1, 2 or 3 the embedding is computed but no
    figure is drawn.
    """
    reducer = UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric
    )
    u = reducer.fit_transform(data)

    # Arguments shared by every plot variant.
    shared = dict(color=color_c, template='plotly_white', hover_data={'ID': ids_c}, title=title)

    if n_components == 1:
        # A 1-D embedding has only column 0; referencing column 1 fails, so the
        # points are laid out along a horizontal line at y = 0 instead.
        fig = px.scatter(u, x=0, y=np.zeros(u.shape[0]), **shared)
    elif n_components == 2:
        fig = px.scatter(u, x=0, y=1, width=1200, height=1200, **shared)
    elif n_components == 3:
        # Use the per-row labels/ids of the stacked matrix. The per-sample
        # arrays of the real dataset alone (colors, mode, energy, ids) do not
        # cover the synthetic rows and are rejected by plotly as a length mismatch.
        fig = px.scatter_3d(u, x=0, y=1, z=2, **shared)
    else:
        return
    fig.show()
    

In [None]:
# Sweep the UMAP neighbourhood size, leaving every other setting at its default.
neighbor_counts = (10, 20, 50, 100, 200)
for neighbor_count in neighbor_counts:
    draw_umap(
        n_neighbors=neighbor_count,
        title='n_neighbors = {}'.format(neighbor_count),
    )

In [None]:
# Sweep the UMAP min_dist setting, leaving every other setting at its default.
min_dist_values = (0.0, 0.1, 0.25, 0.5, 0.8, 0.99)
for min_dist_value in min_dist_values:
    draw_umap(
        min_dist=min_dist_value,
        title='min_dist = {}'.format(min_dist_value),
    )

# Plot by metric type

In [None]:
# Compare distance metrics at a fixed neighbourhood size of 200.
metric_choices = ("euclidean", "correlation", "chebyshev", "minkowski")
for metric_choice in metric_choices:
    # Entries given as strings label themselves; anything else is labelled by its __name__.
    if type(metric_choice) is str:
        metric_label = metric_choice
    else:
        metric_label = metric_choice.__name__
    draw_umap(
        n_components=2,
        n_neighbors=200,
        metric=metric_choice,
        title='metric = {}'.format(metric_label),
    )

In [None]:
# Single 3-D UMAP embedding with the euclidean metric.
draw_umap(
    n_components=3,
    metric='euclidean',
    title='metric = euclidean',
)