In [None]:
import os, sys

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Load Data

In [None]:
# Load a raw formatted DataFrame or a pre-cleaned DataFrame.
# `directory` is the folder containing the pickled DataFrame generated by
# BJ_formatting.ipynb (raw) or by BJ_filter.ipynb (pre-cleaned); `file` is the
# name of that pickle inside the folder. Both are placeholders to be edited.
# NOTE(review): unpickling can execute arbitrary code, so only load files you trust.
directory = 'INSERT DIRECTORY'
file = 'INSERT FILENAME'

all_df = pd.read_pickle(os.path.join(directory, file))
all_df = all_df.reset_index(drop=True)  # discard the stored index, renumber rows from 0
all_df.head()

## Dimensionality Reduction

In [None]:
# Selection of Data Subset
# Keep only the rows whose `trial` column equals 'Trial 2', then stack the
# per-row logG entries on top of each other into a single array.
df = all_df.loc[all_df['trial'].eq('Trial 2')]
traces = np.vstack(df['logG'].to_numpy())
traces.shape

In [None]:
# Selection of Passed BJs or not.
# With `clean` set to True, only rows whose `passed` column equals 1 are kept
# and `traces` is rebuilt from the filtered rows; with False, both `df` and
# `traces` are left exactly as the previous cell produced them.
clean = True
if clean:
    df = df.loc[df['passed'].eq(1)]
    traces = np.vstack(df['logG'].to_numpy())
traces.shape

In [None]:
# PCA DR
# Fit a PCA on the stacked traces and project them onto its components.
# n_components is not set, so scikit-learn's default applies. The scatter plots
# in this notebook only draw columns 0 and 1 of `pca_red`, whereas the
# clustering cells fit on every column of `pca_red`.
pca = PCA(random_state=42)
pca_red = pca.fit_transform(traces)

In [None]:
# Quick look at the first two PCA columns, one small marker per trace.
plt.scatter(x=pca_red[:, 0], y=pca_red[:, 1], s=1)

In [None]:
# t-SNE DR
# `perp` is the t-SNE perplexity; the UMAP cell below also reuses it as its
# n_neighbors value, so changing it here affects both embeddings.
# NOTE(review): scikit-learn expects perplexity to be smaller than the number
# of samples — confirm `traces` has more than 1900 rows for the chosen subset.
perp = 1900
tsne = TSNE(perplexity=perp, random_state=42)
tsne_red = tsne.fit_transform(traces)

In [None]:
# Quick look at the two t-SNE columns, one small marker per trace.
plt.scatter(x=tsne_red[:, 0], y=tsne_red[:, 1], s=1)

In [None]:
# UMAP DR
# Two-component UMAP embedding of the stacked traces. n_neighbors reuses the
# t-SNE perplexity `perp`, so the t-SNE cell must have been run first.
# NOTE: the name `reducer` is rebound to a string in the "Inspect cluster
# labels" cell further down; after that cell runs, this estimator is no longer
# reachable under this name.
reducer = umap.UMAP(n_components=2, n_neighbors=int(perp), min_dist=0.0, random_state=42)
umap_red = reducer.fit_transform(traces)

In [None]:
# Quick look at the two UMAP columns, one small marker per trace.
plt.scatter(x=umap_red[:, 0], y=umap_red[:, 1], s=1)

## Manual Cluster Selection

In [None]:
# Use a Lasso Selector to manually inspect a cluster.
from LassoSelector import SelectFromCollection
%matplotlib qt

data_to_cluster = pca_red
fig, ax = plt.subplots()
pts = ax.scatter(data_to_cluster[:, 0], data_to_cluster[:, 1], s=5)
idxs = []

selector = SelectFromCollection(ax, pts)

def accept(event):
    if event.key == "enter":
        idxs = selector.ind
        passed_traces = np.vstack(df.iloc[idxs].logG.values)
        passed_Zs = np.vstack(df.iloc[idxs].Z.values)
                              
        fig, ax = plt.subplots()
        h = ax.hist2d(passed_Zs.flatten(), passed_traces.flatten(), bins=128)

fig.canvas.mpl_connect('key_press_event', accept)
ax.set_title("Press enter to accept selected points.")
plt.show()

## Clustering

In [None]:
# Switch back to the inline backend so the figures below render inside the
# notebook; the lasso-selection cell above switched to `%matplotlib qt`.
%matplotlib inline

In [None]:
# Settings shared by the clustering cells below:
#   s_size   - scatter plot marker size
#   n_clusts - number of clusters / mixture components requested from each clusterer
s_size, n_clusts = 0.5, 10

In [None]:
# kmeans
# One independently fitted KMeans model per embedding, in the order
# PCA, t-SNE, UMAP; all share the same cluster count and seed.
p_kmeans, t_kmeans, u_kmeans = (
    KMeans(n_clusters=n_clusts, random_state=42).fit(embedding)
    for embedding in (pca_red, tsne_red, umap_red)
)

In [None]:
# k-means labels drawn on each embedding; panels left to right: PCA, t-SNE, UMAP.
# Points are coloured by the label of the model fitted on that same embedding.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(6, 2), dpi=600)
axs[0].scatter(x=pca_red[:, 0], y=pca_red[:, 1], c=p_kmeans.labels_, s=s_size)
axs[1].scatter(x=tsne_red[:, 0], y=tsne_red[:, 1], c=t_kmeans.labels_, s=s_size)
axs[2].scatter(x=umap_red[:, 0], y=umap_red[:, 1], c=u_kmeans.labels_, s=s_size)

In [None]:
# Agglomerative
# One independently fitted AgglomerativeClustering model per embedding, in the
# order PCA, t-SNE, UMAP; all use the same requested cluster count.
p_agg, t_agg, u_agg = (
    AgglomerativeClustering(n_clusters=n_clusts).fit(embedding)
    for embedding in (pca_red, tsne_red, umap_red)
)

In [None]:
# Agglomerative labels drawn on each embedding; panels left to right: PCA, t-SNE, UMAP.
# Points are coloured by the label of the model fitted on that same embedding.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(6, 2), dpi=600)
axs[0].scatter(x=pca_red[:, 0], y=pca_red[:, 1], c=p_agg.labels_, s=s_size)
axs[1].scatter(x=tsne_red[:, 0], y=tsne_red[:, 1], c=t_agg.labels_, s=s_size)
axs[2].scatter(x=umap_red[:, 0], y=umap_red[:, 1], c=u_agg.labels_, s=s_size)

In [None]:
# GMM
# One independently fitted GaussianMixture per embedding, in the order
# PCA, t-SNE, UMAP; all share the same component count and seed.
p_gmm, t_gmm, u_gmm = (
    GaussianMixture(n_components=n_clusts, random_state=42).fit(embedding)
    for embedding in (pca_red, tsne_red, umap_red)
)

In [None]:
# GMM component assignments drawn on each embedding; panels left to right:
# PCA, t-SNE, UMAP. A mixture model stores no labels, so each panel is
# coloured by predicting on the same data its model was fitted to.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(6, 2), dpi=600)
axs[0].scatter(x=pca_red[:, 0], y=pca_red[:, 1], c=p_gmm.predict(pca_red), s=s_size)
axs[1].scatter(x=tsne_red[:, 0], y=tsne_red[:, 1], c=t_gmm.predict(tsne_red), s=s_size)
axs[2].scatter(x=umap_red[:, 0], y=umap_red[:, 1], c=u_gmm.predict(umap_red), s=s_size)

In [None]:
# Merge all cluster labels into a single DataFrame
# One frame per embedding with a row per trace: the embedding name, the trace's
# row position (0..n-1, matching the row order of `traces`), and the label
# assigned by each of the three clusterers. The frames are then stacked and
# indexed by (reducer, trace_idx).
columns = ['reducer', 'trace_idx', 'kmeans', 'agglomerative', 'GMM']

pca_df = pd.DataFrame(
    {
        'reducer': np.repeat('pca', pca_red.shape[0]),
        'trace_idx': list(range(pca_red.shape[0])),
        'kmeans': p_kmeans.labels_,
        'agglomerative': p_agg.labels_,
        'GMM': p_gmm.predict(pca_red),
    },
    columns=columns,
)
tsne_df = pd.DataFrame(
    {
        'reducer': np.repeat('tsne', tsne_red.shape[0]),
        'trace_idx': list(range(tsne_red.shape[0])),
        'kmeans': t_kmeans.labels_,
        'agglomerative': t_agg.labels_,
        'GMM': t_gmm.predict(tsne_red),
    },
    columns=columns,
)
umap_df = pd.DataFrame(
    {
        'reducer': np.repeat('umap', umap_red.shape[0]),
        'trace_idx': list(range(umap_red.shape[0])),
        'kmeans': u_kmeans.labels_,
        'agglomerative': u_agg.labels_,
        'GMM': u_gmm.predict(umap_red),
    },
    columns=columns,
)

clust_df = pd.concat((pca_df, tsne_df, umap_df)).set_index(['reducer', 'trace_idx'])
clust_df

# Save Cluster Labels

In [None]:
# Write the merged cluster labels next to the input data. Replace the
# placeholder with a file name that differs from the input `file`, otherwise
# the loaded pickle in `directory` would be overwritten.
clust_df.to_pickle(
    path=os.path.join(directory, 'INSERT FILENAME'),
)

# Figures

In [None]:
# Inspect cluster labels
reducer = 'tsne'
clusterer = 'kmeans'
labels = [0, 1, 3, 5, 6, 8, 9] # PCA 5 6 TSNE 1
subset = clust_df.loc[reducer, :]
selected = subset[clusterer].isin(labels)
subset = subset.loc[selected]
idxs = subset.index.values
%matplotlib inline
fig, ax = plt.subplots()
if reducer == 'pca':
    ax.scatter(pca_red[:, 0], pca_red[:, 1], c=selected)
if reducer == 'tsne':
    ax.scatter(tsne_red[:, 0], tsne_red[:, 1], c=selected)
if reducer == 'umap':
    ax.scatter(umap_red[:, 0], umap_red[:, 1], c=selected)

## Figures

In [None]:
# Plot the 2D spaces with corresponding kmeans cluster labels
# Three-panel figure (PCA, t-SNE, UMAP from left to right) with thickened
# ticks and frame lines and a bold panel tag above each panel.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(6, 2), dpi=600)

panel_inputs = zip(axs, (pca_red, tsne_red, umap_red), (p_kmeans, t_kmeans, u_kmeans))
for panel, embedding, kmeans_fit in panel_inputs:
    panel.scatter(embedding[:, 0], embedding[:, 1], s=s_size, c=kmeans_fit.labels_)

for panel in axs:
    panel.tick_params(width=1.5)
    for side in ('top', 'right', 'bottom', 'left'):
        panel.spines[side].set_linewidth(1.5)

# NOTE: this rebinds `labels`, which the "Inspect cluster labels" cell uses for
# its list of cluster ids (that cell reassigns it whenever it is run).
labels = ['(a)', '(b)','(c)']
for panel, tag in zip(axs, labels):
    panel.annotate(tag, xy=(-0.25, 1.05), xytext=(0, 0), xycoords='axes fraction', textcoords='offset pixels', weight='bold')

fig.tight_layout()
# fig.savefig('Clean DR.png')

In [None]:
# BJ histogram constants
# Number of bins along the logG axis and along the z axis.
logG_bins, z_bins = 128, 115

# Plotted / binned range of each axis, written as (min, max).
logG_min, logG_max = -6, 0.8
z_min, z_max = -0.3E-3, 2E-3

# Count ceilings: the count-axis limit of the 1D logG histogram and the upper
# end of the colour scale of the 2D histogram.
logG_max_counts, logGz_max_counts = 15000, 1500

In [None]:
# Plot the 2D embedding and selected clusters along with the corresponding BJ histograms
# Inputs read from the kernel namespace (this cell creates none of them):
# `selected_Z`, `selected_logG`, `selected_lens`, `reducer`, `selected`, the
# embeddings, and the BJ histogram constants.
# NOTE(review): make sure `selected_Z`, `selected_logG` and `selected_lens`
# exist before running; `selected_lens` has no definition in this notebook.
fig = plt.figure(figsize=(6, 3), dpi=600)

# Layout: left half holds the embedding scatter (ax1); right half is a 2x2
# grid with the 2D histogram (ax2, top-left), the `selected_lens` histogram
# below it (ax3) and the logG histogram to its right (ax4). The bottom-right
# slot of the grid is left empty.
gs0 = gridspec.GridSpec(1, 2, figure=fig, wspace=0.3)
gs1 = gridspec.GridSpecFromSubplotSpec(2, 2, subplot_spec=gs0[1], width_ratios=(2, 1), height_ratios=(2, 1))
ax1 = fig.add_subplot(gs0[0])
ax2 = fig.add_subplot(gs1[0, 0])
ax3 = fig.add_subplot(gs1[1, 0])
ax4 = fig.add_subplot(gs1[0, 1])

# 2D histogram of Z against logG; the same `norm` drives the colorbar below so
# both share one colour scale capped at logGz_max_counts.
norm = plt.Normalize(0, logGz_max_counts)
h1 = ax2.hist2d(selected_Z.flatten(), selected_logG.flatten(), bins=(z_bins, logG_bins), range=((z_min, z_max), (logG_min, logG_max)), cmap='Blues', norm=norm)

# Colorbar drawn in a small inset axes inside ax2, built from a standalone
# ScalarMappable rather than from the hist2d return value.
sm = plt.cm.ScalarMappable(cmap='Blues', norm=norm)
sm.set_array([])
cax = ax2.inset_axes([0.75, 0.48, 0.05, 0.47])
cbar = plt.colorbar(sm, ax=ax2, cax=cax)
cax.tick_params(axis='y', labelsize=6, right=False, length=-2)#, color='White', )#, labelcolor='White')

# Horizontal 1D histogram of logG, drawn beside the 2D histogram.
# NOTE(review): these bin edges are computed without an explicit range, so they
# span the data range of `selected_logG`, whereas the 2D histogram above bins
# over the fixed (logG_min, logG_max) range — confirm the mismatch is intended.
bin_edges = np.histogram_bin_edges(selected_logG.flatten(), bins=logG_bins)
bin_mids = bin_edges[:-1] + (np.diff(bin_edges) / 2)
counts, _ = np.histogram(selected_logG.flatten(), bins=bin_edges)
ax4.barh(bin_mids, counts, height=np.diff(bin_edges))
ax4.set(xlim=(0, logG_max_counts), ylim=(logG_min, logG_max))

# 1D histogram of `selected_lens` over the fixed z range; null entries are
# dropped before counting.
bin_edges = np.linspace(z_min, z_max, z_bins+1)
bin_mids = bin_edges[:-1] + (np.diff(bin_edges) / 2)
counts, _ = np.histogram(selected_lens[~pd.isnull(selected_lens)], bins=bin_edges)
ax3.bar(bin_mids, counts, width=np.diff(bin_edges))
ax3.set(xlim=(z_min, z_max))

# Embedding scatter coloured by the boolean `selected` mask; `reducer` is the
# string chosen in the "Inspect cluster labels" cell and picks the embedding.
if reducer == 'pca':
    ax1.scatter(pca_red[:, 0], pca_red[:, 1], c=selected, s=2)
if reducer == 'tsne':
    ax1.scatter(tsne_red[:, 0], tsne_red[:, 1], c=selected, s=2)
if reducer == 'umap':
    ax1.scatter(umap_red[:, 0], umap_red[:, 1], c=selected, s=2)


# Hide the tick labels on the axes the histograms share with their neighbours.
ax2.set(xticklabels=[])
ax4.set(xticklabels=[], yticklabels=[], xticks=[])
ax3.set(yticklabels=[], yticks=[])

ax2.set_ylabel('log(G/G0)', weight='bold')
ax3.set_ylabel('Counts', weight='bold')
# chr(956) is the Greek small letter mu, giving a micrometre unit in the label.
ax3.set_xlabel('Displacement /' + chr(956) +  'm', weight='bold')
ax4.set_xlabel('Counts', weight='bold')

# Thicken the frame of every panel.
for axis in ['top', 'right', 'bottom', 'left']:
    ax1.spines[axis].set_linewidth(1.5)
    ax2.spines[axis].set_linewidth(1.5)
    ax3.spines[axis].set_linewidth(1.5)
    ax4.spines[axis].set_linewidth(1.5)

# Panel tags; note the order: (c) goes on ax4 (right) and (d) on ax3 (bottom).
ax1.annotate('(a)', xy=(-0.2, 1.05), xytext=(0, 0), xycoords='axes fraction', textcoords='offset pixels', weight='bold')
ax2.annotate('(b)', xy=(-0.35, 1.05), xytext=(0, 0), xycoords='axes fraction', textcoords='offset pixels', weight='bold')
ax4.annotate('(c)', xy=(-0.3, 1.05), xytext=(0, 0), xycoords='axes fraction', textcoords='offset pixels', weight='bold')
ax3.annotate('(d)', xy=(-0.35, 1.05), xytext=(0, 0), xycoords='axes fraction', textcoords='offset pixels', weight='bold')

# fig.savefig("BPy Clean+ tSNE.png", bbox_inches='tight')