In [1]:
# Notebook tooling: load the jupyter_black extension, which formats
# code cells with Black.
import black
import jupyter_black

jupyter_black.load()

In [2]:
import importlib
from dataclasses import dataclass

import matplotlib.pyplot as plt
import numpy as np
import openTSNE as TSNE
import seaborn as sns
import sklearn.datasets
from sklearn.preprocessing import MinMaxScaler

In [41]:
# Project-local helpers.
# Reload the module *before* the from-import: names pulled in with
# `from ... import` keep pointing at the pre-reload objects, so importing
# GoodnessOfFit first would leave it bound to a stale class.
import utils.Utils as Utils

importlib.reload(Utils)
from utils.Utils import GoodnessOfFit

<module 'utils.Utils' from '/Users/a1/Documents/t-SNE-project/utils/Utils.py'>

In [8]:
# Seed NumPy's global RNG so repeated runs start from the same state.
np.random.seed(42)

# Ten logarithmically spaced values between 10**-0.5 and 10**2; they are
# passed as the `dof` argument of the t-SNE runs further down.
ALPHAS = np.logspace(start=-0.5, stop=2, num=10)
print(ALPHAS)

[  0.31622777   0.59948425   1.13646367   2.15443469   4.08423865
   7.74263683  14.67799268  27.82559402  52.74997064 100.        ]


In [9]:
# t-SNE hyper-parameters gathered in a single mapping.
# NOTE(review): the cells visible in this notebook pass their settings
# inline and never read this dict - confirm whether it is still needed.
tsne_kwargs = dict(
    n_iter=1000,
    early_exaggeration=12,
    learning_rate=200,
    n_jobs=-1,
    random_state=42,
    perplexity=50,
)

## Swiss-roll dataset

### N-samples = 500

In [89]:
# Build the 500-point, noise-free swiss roll through the project helper,
# requesting a plotly figure (`plot="plotly"`) with `save_fig=True`.
# `sr_points` / `sr_color` are reassigned by the later sections of this
# notebook, so re-run this cell before re-running the 500-sample cells.
n_samples = 500
sr_points, sr_color = Utils.generate_swiss_roll(
    n_samples=n_samples,
    noise=0.0,
    plot="plotly",
    save_fig=True,
)

In [11]:
# Fit one openTSNE embedding of the current swiss-roll sample per
# degree-of-freedom value in ALPHAS and collect them in order.
tsne_results_sr500 = []
for alpha in ALPHAS:
    embedding = TSNE.TSNE(
        dof=alpha,
        perplexity=50,
        random_state=42,
        n_jobs=-1,
        verbose=True,
    ).fit(sr_points)
    tsne_results_sr500.append(embedding)

--------------------------------------------------------------------------------
TSNE(dof=0.31622776601683794, early_exaggeration=12, n_jobs=-1, perplexity=50,
     random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 150 nearest neighbors using exact search using euclidean distance...
   --> Time elapsed: 0.03 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.03 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=12.00, lr=41.67 for 250 iterations...
Iteration   50, KL divergence 2.1977, 50 iterations in 0.6005 sec
Iteration  100, KL divergence 2.2077, 50 iterations in 0.6093 sec
Iteration  150, KL divergence 2.2068, 50 iterations in 0.6409 sec
Iteration  200, KL divergence 2.2068, 50 iterations in 0.6133 sec
Iteration  250, KL divergence 2.2068, 50 iterations in 0.6775 sec
   --> Time elapsed: 3.14 seconds
===> Running

In [93]:
# Re-import utils/Utils.py so edits made to the helper module take effect
# without restarting the kernel.
importlib.reload(Utils)

<module 'utils.Utils' from '/Users/a1/Documents/t-SNE-project/utils/Utils.py'>

In [86]:
# Plot the 500-sample embeddings with the project's plotly helper,
# passing the raw points, the per-point colours and the dof values.
Utils.plot_TSNE_plotly(
    raw_data=sr_points,
    tsne_results=tsne_results_sr500,
    alphas=ALPHAS,
    labels=sr_color,
)

In [160]:
# Pick up the latest edits to utils/Utils.py, then run the goodness-of-fit
# helper over the 500-sample embeddings (with plotting enabled).
importlib.reload(Utils)
results = Utils.compute_goodness_of_fit(
    raw_data=sr_points,
    tsne_data_list=tsne_results_sr500,
    dofs=ALPHAS,
    plot=True,
)

### N-samples = 1000

In [161]:
# Build the 1000-point, noise-free swiss roll through the project helper,
# requesting a plotly figure (`plot="plotly"`) with `save_fig=True`.
# This overwrites the `sr_points` / `sr_color` of the previous section.
n_samples = 1000
sr_points, sr_color = Utils.generate_swiss_roll(
    n_samples=n_samples,
    noise=0.0,
    plot="plotly",
    save_fig=True,
)

In [162]:
# Fit one openTSNE embedding of the current swiss-roll sample per
# degree-of-freedom value in ALPHAS and collect them in order.
tsne_results_sr1000 = []
for alpha in ALPHAS:
    embedding = TSNE.TSNE(
        dof=alpha,
        perplexity=50,
        random_state=42,
        n_jobs=-1,
        verbose=True,
    ).fit(sr_points)
    tsne_results_sr1000.append(embedding)

--------------------------------------------------------------------------------
TSNE(dof=0.31622776601683794, early_exaggeration=12, n_jobs=-1, perplexity=50,
     random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 150 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 0.22 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.05 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=12.00, lr=83.33 for 250 iterations...
Iteration   50, KL divergence 2.7506, 50 iterations in 1.5230 sec
Iteration  100, KL divergence 2.7543, 50 iterations in 1.3342 sec
Iteration  150, KL divergence 2.7544, 50 iterations in 1.3506 sec
Iteration  200, KL divergence 2.7544, 50 iterations in 1.3084 sec
Iteration  250, KL divergence 2.7544, 50 iterations in 1.4198 sec
   --> Time elapsed: 6.94 seconds


In [167]:
# Refresh the helper module, then plot the 1000-sample embeddings with the
# project's plotly helper, this time with `display_metrics=True`.
importlib.reload(Utils)
Utils.plot_TSNE_plotly(
    raw_data=sr_points,
    labels=sr_color,
    tsne_results=tsne_results_sr1000,
    alphas=ALPHAS,
    display_metrics=True,
)

In [168]:
# Run the goodness-of-fit helper over the 1000-sample embeddings
# (with plotting enabled); this rebinds `results` from the previous section.
results = Utils.compute_goodness_of_fit(
    raw_data=sr_points,
    tsne_data_list=tsne_results_sr1000,
    dofs=ALPHAS,
    plot=True,
)

### N-samples = 5000

In [178]:
# Re-import utils/Utils.py so edits made to the helper module take effect
# without restarting the kernel.
importlib.reload(Utils)

<module 'utils.Utils' from '/Users/a1/Documents/t-SNE-project/utils/Utils.py'>

In [171]:
# Build the 5000-point, noise-free swiss roll through the project helper,
# requesting a plotly figure (`plot="plotly"`).
# The sample size is passed via `n_samples` (as in the 500 / 1000 sections)
# instead of repeating the literal, so the variable and the call cannot drift.
# This overwrites the `sr_points` / `sr_color` of the previous section.
n_samples = 5000
sr_points, sr_color = Utils.generate_swiss_roll(
    n_samples=n_samples,
    noise=0.0,
    plot="plotly",
)

In [179]:
# Embed the 5000-sample swiss roll for every value in ALPHAS through the
# project helper instead of an explicit per-alpha loop.
tsne_results_sr5000 = Utils.compute_tsne_embedding(
    raw_data=sr_points,
    alphas=ALPHAS,
)

Computing the shared affinities...




Computing the PCA initialization...
Computing t-SNE embedding for alpha=0.31622776601683794...




Computing t-SNE embedding for alpha=0.599484250318941...




Computing t-SNE embedding for alpha=1.1364636663857248...




Computing t-SNE embedding for alpha=2.154434690031884...




Computing t-SNE embedding for alpha=4.084238652674522...




Computing t-SNE embedding for alpha=7.742636826811269...




Computing t-SNE embedding for alpha=14.677992676220699...




Computing t-SNE embedding for alpha=27.825594022071257...




Computing t-SNE embedding for alpha=52.749970637026195...




Computing t-SNE embedding for alpha=100.0...


In [180]:
# Plot the 5000-sample embeddings with the project's plotly helper,
# with `display_metrics=True`.
Utils.plot_TSNE_plotly(
    raw_data=sr_points,
    labels=sr_color,
    tsne_results=tsne_results_sr5000,
    alphas=ALPHAS,
    display_metrics=True,
)

In [181]:
# Run the goodness-of-fit helper over the 5000-sample embeddings
# (with plotting enabled); this rebinds `results` from the previous section.
results = Utils.compute_goodness_of_fit(
    raw_data=sr_points,
    tsne_data_list=tsne_results_sr5000,
    dofs=ALPHAS,
    plot=True,
)

## Digits dataset

In [None]:
# Load scikit-learn's digits data restricted to six classes (`n_class=6`)
# and split it into the feature matrix and the target vector.
digits = sklearn.datasets.load_digits(n_class=6)
X = digits.data
y = digits.target

# Seaborn "Spectral" palette as a colormap object, used when plotting digits.
digit_colors = sns.color_palette("Spectral", as_cmap=True)

print(y)
# Note: this rebinds `n_samples`, previously used for the swiss-roll sizes.
n_samples, n_features = X.shape
n_neighbors = 30

In [None]:
# Show the first 100 samples as 8x8 grey-scale images in a 10x10 grid.
fig, axs = plt.subplots(nrows=10, ncols=10, figsize=(6, 6))
for idx, ax in enumerate(axs.ravel()):
    # "gray" is matplotlib's canonical colormap name; the "grey" spelling is
    # not registered in every matplotlib version.
    ax.imshow(X[idx].reshape((8, 8)), cmap="gray")
    ax.axis("off")
# Assign to `_` so the Text object returned by suptitle is not echoed.
_ = fig.suptitle("A selection from the 64-dimensional digits dataset", fontsize=16)

In [23]:
# Rescale each feature with MinMaxScaler (default range [0, 1]) before
# embedding. `X` is overwritten by its scaled version.
X = MinMaxScaler().fit_transform(X)

# Fix the seed, as the swiss-roll runs do, so the embedding does not depend
# on whatever state the global RNG happens to be in when this cell runs.
digits_tsne = TSNE.TSNE(perplexity=30, random_state=42).fit(X)

In [None]:
# Attach the class label as a third column next to the 2-D embedding.
# Only the first two columns are taken before stacking, so re-running this
# cell yields the same (n, 3) array instead of appending another label
# column on every execution.
digits_tsne = np.column_stack([np.asarray(digits_tsne)[:, :2], y])
print(digits_tsne)

In [None]:
# Scatter the labelled embedding (columns 0/1 = coordinates, column 2 =
# digit label), one colour and one legend entry per digit class.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
n_classes = int(digits_tsne[:, 2].max()) + 1
for n in range(n_classes):
    is_digit = digits_tsne[:, 2] == n  # row mask, computed once per class
    ax.scatter(
        digits_tsne[is_digit, 0],
        digits_tsne[is_digit, 1],
        s=50,
        alpha=0.8,
        # scatter ignores `cmap` (with a warning) when no `c` values are
        # given, so sample the colormap explicitly at an evenly spaced
        # position for this class.
        color=digit_colors(n / max(n_classes - 1, 1)),
        label=f"digit {n}",
    )
sns.despine(left=True, bottom=True)
ax.set_title("Digits dataset in TSNE space")
ax.set_xticks([])
ax.set_yticks([])
ax.legend();  # trailing ';' hides the Legend repr in the cell output

In [None]:
# Sweep the t-SNE degrees of freedom on the digits data, one fit per value.
# NOTE(review): the grid starts at dof=0, which looks like a degenerate
# value for the heavy-tailed kernel - confirm how openTSNE treats it or
# start the grid at a small positive number.
dofs = np.linspace(0, 2, 10)
tsne_results_digits = []
for dof in dofs:
    tsne = TSNE.TSNE(
        perplexity=30,
        dof=dof,
        # Fixed seed, as in the swiss-roll sweeps, so every dof starts from
        # the same random state and the sweep is reproducible.
        random_state=42,
        verbose=True,
    )
    tsne_results_digits.append(tsne.fit(X))

In [None]:
# Draw every embedding of the dof sweep into its own panel of a 2x5 grid.
# NOTE(review): `plot_TSNE` is neither defined nor imported in the cells
# visible here - presumably it lives in utils.Utils; confirm before running.
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.flatten()
for panel, dof, embedding in zip(axes, dofs, tsne_results_digits):
    plot_TSNE(
        tsne_result=embedding,
        ax=panel,
        labels=y,
        raw_data=X,
        display_metrics="all",
    )
    panel.set_title(r"$\alpha$" + f"={dof:.2f}")

In [None]:
# Call the optimal-dof helper on the digits sweep and unpack its two
# return values.
# NOTE(review): `compute_optimal_dof` is neither defined nor imported in the
# cells visible here - confirm where it comes from before running.
optimal_dof, optimal_kl = compute_optimal_dof(
    raw_data=X,
    tsne_data_list=tsne_results_digits,
    dofs=dofs,
    metric="all",
    plot=True,
)