In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
import glob
import os
import seaborn as sns
import tensorflow as tf
import scipy
import glob
import sklearn 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras import layers, models, optimizers
from tensorflow.keras.layers import Input, Activation, Dense, LeakyReLU
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam, SGD
from keras_tuner import BayesianOptimization, HyperParameters

***
### Clustering

In [None]:
# Round-1 input: folder / file name of the pickled latent-space predictions.
folder_name_1 = 'AET_CF_Trial_12'
file_name_1 = '12_LSP_AET_CF_Predictions56'

# Output folder for the per-cluster CSV / PNG / pickle files written below.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
folder_name_2 = 'AET_CF_All_Cluster_Trial_1'
os.makedirs(folder_name_2, exist_ok=True)
file_name_2 = '1_LSP_AET_CF_'

# Folder from which the saved autoencoder model is loaded later in the notebook.
folder_name_3 = 'AET_CF_1_Cluster_Trial_1'

In [None]:
# Load the pickled latent-space frame and map the trajectory labels to integers.
# NOTE(review): read_pickle executes arbitrary code on load; only use trusted files.
trajectory_codes = {'trajectory-0.0': 0, 'trajectory-1.0': 1}
latent = pd.read_pickle(f'{folder_name_1}/{file_name_1}').replace(trajectory_codes)

In [None]:
# Promote the 'index' column to the frame's index and drop the index name.
# Rebinding (instead of inplace=True) plus the column guard keeps this cell
# idempotent: re-running it no longer raises KeyError once 'index' has moved.
if 'index' in latent.columns:
    latent = latent.set_index('index')
latent = latent.rename_axis(None)

In [None]:
# Inspect the latent frame (pandas truncates the printed output for long frames).
print(latent)

In [None]:
# Scatter of the latent coordinates, coloured by the trajectory label in 'z'.
fig, ax = plt.subplots(figsize=(18, 12))
ax.scatter(latent['x'], latent['y'], c=latent['z'], cmap="coolwarm", s=1, alpha=1)
ax.set_title('Filtered, epoch set = 56', fontsize=20)
ax.set_xlabel('x', fontsize=16)
ax.set_ylabel('y', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# To keep a copy on disk: plt.savefig("56_data.png", dpi = 300)

plt.show()

In [None]:
# Feature frame for clustering: everything except the trajectory-identity column 'z'.
latent_n = latent.drop(columns=['z'])
print(latent_n)

In [None]:
def plot_knn_distances(latent_n, k=4):
    """
    Draw the sorted k-distance curve that helps choose an 'eps' value for DBSCAN.

    Parameters:
    - latent_n: The dataset to analyse (numpy array or similar input accepted by
      sklearn's NearestNeighbors).
    - k: Number of neighbors requested from NearestNeighbors; column k-1 of the
      returned distance matrix is plotted (typically the same as 'min_samples'
      in DBSCAN).

    Returns nothing; the figure is shown with plt.show().
    """
    # Fit the neighbor index and query it with the same points.
    neighbor_model = NearestNeighbors(n_neighbors=k)
    neighbor_model.fit(latent_n)
    neighbor_distances = neighbor_model.kneighbors(latent_n)[0]

    # Ascending distances taken from the last requested neighbor column.
    kth_distances = np.sort(neighbor_distances[:, k-1], axis=0)

    plt.figure(figsize=(10, 6))
    plt.plot(kth_distances)
    plt.xlabel(f'Points sorted by distance to the {k}-th nearest neighbor')
    plt.ylabel(f'{k}-th nearest neighbor distance')
    plt.title('K-Nearest Neighbors Distance Plot')
    plt.grid(True)
    plt.show()

# Plot the k-distance curve on the same feature matrix that DBSCAN clusters:
# latent_n, i.e. the latent frame WITHOUT the trajectory-identity column 'z'.
# Passing `latent` would mix the 0/1 label into the distance computation.
# NOTE(review): the DBSCAN cells use min_samples=60; consider matching k to it.
plot_knn_distances(latent_n.values, k=4)  # Adjust 'k' as needed

In [None]:
# Cluster the latent features with DBSCAN; the label -1 marks noise points.
y_pred = DBSCAN(eps=0.015, min_samples=60).fit_predict(latent_n)

plt.rcParams['font.size'] = '18'
plt.figure(figsize=(18,10))

# Scatter plot assigning to a variable so legend handles can be derived from it
scatter = plt.scatter(latent_n.iloc[:,0], latent_n.iloc[:,1], c=y_pred, s=1)

# Build the legend from ALL unique labels. legend_elements(num=None) yields one
# handle per unique value in sorted order (noise included), so handles and
# labels must be generated from the same list; otherwise every label is shifted
# by one whenever noise (-1) is present.
all_labels = np.unique(y_pred)
handles, _ = scatter.legend_elements(num=None)
legend_labels = ['Noise' if lbl == -1 else 'Cluster {}'.format(lbl) for lbl in all_labels]
plt.legend(handles=handles, labels=legend_labels, loc='upper right')

# Real clusters only (noise label removed), used for the count below
unique_labels = all_labels[all_labels != -1]

# Setting labels, titles, and ticks
plt.xlabel('x', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.title('DBSCAN Clustering of the Latent Layer Representation of the Input data')

# Print the number of clusters
print('Number of clusters: {}'.format(len(unique_labels)))

In [None]:
# latent_n: feature frame without 'z'; latent: full frame that still has 'z'.

# Attach the DBSCAN labels to both frames, then bring 'z' back onto latent_n
# (the 'z' Series is aligned on the index).
latent_n = latent_n.assign(cluster=y_pred)
latent = latent.assign(cluster=y_pred)
latent_n = latent_n.assign(z=latent['z'])

# Number of rows per (cluster, trajectory identity) pair.
pair_sizes = latent_n.groupby(['cluster', 'z']).size()
cluster_trajectory_count = pair_sizes.reset_index(name='count')

print(cluster_trajectory_count)

In [None]:
# 0 is WT and 1 is D132H
cluster_trajectory_count['z'] = cluster_trajectory_count['z'].replace({0: 'WT', 1: 'D132H'})

# Create the bar chart: one bar per (cluster, trajectory identity) pair
plt.figure(figsize=(10, 6))
ax = sns.barplot(x='cluster', y='count', hue='z', data=cluster_trajectory_count)

# Setting labels and title
plt.xlabel('Cluster Number')
plt.ylabel('Number of Frames')
plt.title('Number of WT and D132H Frames in Each Cluster')
plt.xticks(rotation=0)
plt.legend(title='Trajectory Identity')

# Annotate each bar with its count. Patches without a positive finite height
# are skipped: a cluster lacking one trajectory can produce a NaN-height patch
# (int(nan) raises ValueError), and some seaborn versions add zero-height
# legend proxy patches that would otherwise be labelled '0'.
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height) or height <= 0:
        continue
    ax.annotate(f'{int(height)}',
                (p.get_x() + p.get_width() / 2., height),
                ha = 'center', va = 'center',
                xytext = (0, 9),
                textcoords = 'offset points')

# Show the plot
plt.show()

In [None]:
# Inspect `latent` after the DBSCAN 'cluster' column has been attached.
print(latent)

In [None]:
# Drop the latent coordinates so only the remaining label columns are kept.
# Rebinding with errors='ignore' keeps the cell idempotent: the in-place drop
# raised KeyError when the cell was executed a second time.
latent = latent.drop(columns=['x', 'y'], errors='ignore')

In [None]:
# Confirm that the 'x' and 'y' columns are no longer present.
print(latent)

In [None]:
# Load the validation targets and the validation / training features from the
# round-1 folder, using the first CSV column as the index.
y_valid_f, X_valid_f, X_train_f = (
    pd.read_csv(f'{folder_name_1}/{csv_stem}.csv', index_col=0)
    for csv_stem in ('y_valid_f', 'X_valid_f', 'X_train_f')
)

In [None]:
# Original (pre-autoencoder) validation features, indexed by the first CSV column.
original_data = pd.read_csv(f'{folder_name_1}/X_valid_f.csv', index_col=0)

# Every label present in latent_n['cluster'], in order of first appearance.
unique_clusters = latent_n['cluster'].unique()

for cluster_number in unique_clusters:
    # Index labels of the rows assigned to this cluster.
    in_cluster = latent_n['cluster'] == cluster_number
    selected_cluster_indices = latent_n.loc[in_cluster].index

    # Label-based lookup of the matching rows in the original data.
    filtered_data = original_data.loc[selected_cluster_indices]

    # One CSV with the original features of this cluster ...
    filtered_data.to_csv(f'{folder_name_2}/Recluster_Latent_{cluster_number}.csv')

    # ... and one with the matching rows of `latent` (requires its 'cluster' column).
    latent.loc[latent['cluster'] == cluster_number].to_csv(
        f'{folder_name_2}/Recluster_Index_{cluster_number}.csv'
    )

# Latent Space of Clusters

In [None]:
# Load the saved autoencoder (per the original note: the model from the second
# round of training).
model_path = f'{folder_name_3}/models/saved_model_1_LSP_AET_CF_Trial_1'
autoencoder = tf.keras.models.load_model(model_path)

# Sub-model that maps the 'ae_input' layer's input to the 'ae_latent' layer's output.
encoder_input = autoencoder.get_layer('ae_input').input
encoder_output = autoencoder.get_layer('ae_latent').output
dr_model = tf.keras.models.Model(inputs=encoder_input, outputs=encoder_output)

In [None]:
clusters = [1, 2, 3]  # Replace with desired clusters

for cluster in clusters:
    # Build the paths with os.path.join: the previous f-strings omitted the
    # separator between folder and file name, so the files were never found.
    latent_file = os.path.join(folder_name_2, f'Recluster_Latent_{cluster}.csv')
    index_file = os.path.join(folder_name_2, f'Recluster_Index_{cluster}.csv')

    # Both CSVs were written by DataFrame.to_csv with a header row and the
    # index as first column. Read them back the same way so the index is not
    # fed to the model as a feature and the header row is not parsed as data.
    latent_data = pd.read_csv(latent_file, index_col=0)
    index_data = pd.read_csv(index_file, index_col=0)

    if len(latent_data) != len(index_data):
        raise ValueError(
            f'Cluster {cluster}: {len(latent_data)} feature rows but '
            f'{len(index_data)} index rows; the two files are out of sync.'
        )

    # Convert the data to numpy array if necessary
    X_batch = latent_data.values

    # Generate latent space representations
    latent_representations = dr_model.predict(X_batch)

    # Create DataFrame for visualization and saving. The label columns are
    # assigned by position: df has a fresh RangeIndex, so label-based alignment
    # against the original frame index would fill the columns with NaN.
    df = pd.DataFrame(latent_representations, columns=['x', 'y'])
    df['z'] = index_data['z'].to_numpy()
    df['cluster'] = index_data['cluster'].to_numpy()

    # Visualization and saving .png file
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x='x', y='y', hue='cluster', data=df, s=10, palette='viridis')
    plt.title(f'Latent Space Representation - Cluster {cluster}')
    plt.legend(title='Cluster')
    png_path = os.path.join(folder_name_2, f'1_LSP_AET_CF_Cluster_{cluster}.png')
    plt.savefig(png_path, dpi=300)
    plt.close()

    # Saving the data in pickle format
    pickle_path = os.path.join(folder_name_2, f'1_LSP_AET_CF_Cluster_{cluster}.pkl')
    df.to_pickle(pickle_path)

print("Processing and saving completed for all specified clusters.")

# Clustering round 2

In [None]:
# Location of the pickled latent-space predictions used for the second clustering round.
folder_name, file_name = 'AET_CF_All_Cluster_Trial_1', '1_LSP_AET_CF_Predictions56'

In [None]:
# NOTE(review): when run top to bottom, `latent` here is still the round-1
# frame; the round-2 data is only loaded in the next cell.
print(latent)

In [None]:
# Load the round-2 latent-space frame and map the trajectory labels to integers.
# NOTE(review): the keys here ('trajectory0' / 'trajectory1') differ from the
# round-1 keys ('trajectory-0.0' / 'trajectory-1.0'); confirm they match this file.
trajectory_codes = {'trajectory0': 0, 'trajectory1': 1}
latent = pd.read_pickle(f'{folder_name}/{file_name}').replace(trajectory_codes)

In [None]:
# Inspect the freshly loaded round-2 latent frame.
print(latent)

In [None]:
# Scatter of the round-2 latent coordinates, coloured by the trajectory label in 'z'.
fig, ax = plt.subplots(figsize=(18, 12))
ax.scatter(latent['x'], latent['y'], c=latent['z'], cmap="coolwarm", s=1, alpha=1)
ax.set_title('Filtered, epoch set = 56', fontsize=20)
ax.set_xlabel('x', fontsize=16)
ax.set_ylabel('y', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# To keep a copy on disk: plt.savefig("56_data.png", dpi = 300)

plt.show()

In [None]:
# Feature frame for clustering: drop the trajectory identity ('z') and the
# 'index' column so only the remaining columns are clustered.
latent_n = latent.drop(columns=['z', 'index'])
print(latent_n)

In [None]:
# DBSCAN on the round-2 latent features; the label -1 is excluded from the count below.
dbscan = DBSCAN(eps=0.015, min_samples=60)
y_pred = dbscan.fit_predict(latent_n)

plt.rcParams['font.size'] = '18'
fig, ax = plt.subplots(figsize=(18, 10))
ax.scatter(latent_n.iloc[:, 0], latent_n.iloc[:, 1], c=y_pred, s=1)
ax.set_xlabel('x', fontsize=16)
ax.set_ylabel('y', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
ax.set_title('DBSCAN Clustering of the Latent Layer Representation of the Input data')

# Count distinct labels other than -1.
n_clusters = np.unique(y_pred[y_pred != -1]).size
print('Number of clusters: {}'.format(n_clusters))

In [None]:
# Attach the DBSCAN labels, then bring the trajectory identity 'z' back
# (the 'z' Series is aligned on the index).
latent_n = latent_n.assign(cluster=y_pred)
latent_n = latent_n.assign(z=latent['z'])

# Number of rows per (cluster, trajectory identity) pair.
pair_sizes = latent_n.groupby(['cluster', 'z']).size()
cluster_trajectory_count = pair_sizes.reset_index(name='count')

print(cluster_trajectory_count)

In [None]:
# 0 is WT and 1 is D132H
cluster_trajectory_count['z'] = cluster_trajectory_count['z'].replace({0: 'WT', 1: 'D132H'})

# Create the bar chart: one bar per (cluster, trajectory identity) pair
plt.figure(figsize=(10, 6))
ax = sns.barplot(x='cluster', y='count', hue='z', data=cluster_trajectory_count)

# Setting labels and title
plt.xlabel('Cluster Number')
plt.ylabel('Number of Frames')
plt.title('Number of WT and D132H Frames in Each Cluster')
plt.xticks(rotation=0)
plt.legend(title='Trajectory Identity')

# Annotate each bar with its count. Patches without a positive finite height
# are skipped: a cluster lacking one trajectory can produce a NaN-height patch
# (int(nan) raises ValueError), and some seaborn versions add zero-height
# legend proxy patches that would otherwise be labelled '0'.
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height) or height <= 0:
        continue
    ax.annotate(f'{int(height)}',
                (p.get_x() + p.get_width() / 2., height),
                ha = 'center', va = 'center',
                xytext = (0, 9),
                textcoords = 'offset points')

# Show the plot
plt.show()