In [1]:
import numpy as np
import umap
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

import os, sys
from tqdm.notebook import tqdm
# Extend the module search path so the project-local `utils` module can be imported.
# NOTE(review): hard-coded, machine-specific absolute path — this cell fails on any
# other machine; consider deriving it from a configurable project root.
sys.path.append(r"C:\Users\chataint\Documents\projet\humanlisbet\lisbet_code")
from utils import load_embedding


In [2]:
# Directory containing the embedding files that are loaded below.
# NOTE(review): hard-coded, machine-specific absolute path — adjust per environment.
data_r = r"C:\Users\chataint\Documents\projet\humanlisbet\results\bet_embedders\bet_embedders\13879972"
# Random seed for the notebook; passed as random_state to the KMeans models.
seed = 42

In [3]:

# Paths to the train and test embedding files inside data_r.
datapath = os.path.join(data_r, "embedding_train.numpy")
# NOTE(review): dataval is not used by any cell visible in this notebook.
dataval = os.path.join(data_r, "embedding_test.numpy")

# Load the training embeddings with the project helper.
# NOTE(review): presumably a pandas DataFrame with a 'video' column plus 128
# feature columns (an earlier note says shape (n_samples, 128)) — TODO confirm.
train_data =  load_embedding(datapath)
# Drop the 'video' column and convert the remaining columns to a NumPy array.
td = train_data.drop(columns='video').to_numpy()


In [None]:

# Step 1: Dimensionality Reduction with UMAP
# Project the embeddings onto 2 components. UMAP is stochastic, so the notebook
# seed is passed as random_state; without it the projection (and every cluster
# result computed on it) changes on each run.
umap_reducer = umap.UMAP(n_components=2, random_state=seed)
data_umap = umap_reducer.fit_transform(td)

In [None]:
# Step 2: K-Means Clustering and Finding the Optimal Number of Clusters
def kmeans_clustering(data, max_clusters=10, min_clusters=2, step=1):
    """Fit K-Means for a range of cluster counts and score every fit.

    One ``KMeans`` model is fitted for each k in
    ``range(min_clusters, max_clusters + 1, step)``, using the notebook-level
    ``seed`` as ``random_state``.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed unchanged to ``KMeans.fit_predict`` and
        ``silhouette_score``.
    max_clusters : int, default 10
        Upper bound (inclusive) of the k values to try.
    min_clusters : int, default 2
        First k value to try.
    step : int, default 1
        Increment between consecutive k values.

    Returns
    -------
    inertia : list
        ``KMeans.inertia_`` of each fit, in the order the k values were tried.
    silhouette : list
        Silhouette score of each fit, in the same order.

    Raises
    ------
    ValueError
        If any requested k is below 2, or if ``step`` is 0.
    """
    cluster_counts = range(min_clusters, max_clusters + 1, step)

    # Fail fast: the silhouette score is undefined for fewer than 2 clusters.
    # Checking up front avoids fitting a model only to have scoring reject it.
    if len(cluster_counts) > 0 and min(cluster_counts[0], cluster_counts[-1]) < 2:
        raise ValueError(
            "every cluster count must be >= 2 for the silhouette score; "
            f"got range({min_clusters}, {max_clusters + 1}, {step})"
        )

    inertia = []
    silhouette = []

    for k in tqdm(cluster_counts):
        kmeans = KMeans(n_clusters=k, random_state=seed)
        labels = kmeans.fit_predict(data)
        inertia.append(kmeans.inertia_)
        silhouette.append(silhouette_score(data, labels))
    return inertia, silhouette

# Run clustering for k = 5, 10, ..., 100 on the 2-component UMAP projection
min_clusters = 5
max_clusters = 100
step=5
inertia, silhouette = kmeans_clustering(data_umap, max_clusters, min_clusters, step)

In [None]:
# Step 3: Plot Inertia and Silhouette Scores
# The x-axis must list exactly the k values that were evaluated
# (min_clusters..max_clusters in increments of step). A fixed
# range(2, max_clusters + 1) has a different length than the score lists and
# makes matplotlib raise a shape-mismatch error.
cluster_counts = list(range(min_clusters, max_clusters + 1, step))

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ax[0].plot(cluster_counts, inertia, marker='o')
ax[0].set_title("Elbow Method (Inertia)")
ax[0].set_xlabel("Number of Clusters")
ax[0].set_ylabel("Inertia")

ax[1].plot(cluster_counts, silhouette, marker='o')
ax[1].set_title("Silhouette Score")
ax[1].set_xlabel("Number of Clusters")
ax[1].set_ylabel("Silhouette Score")

plt.show()

In [None]:
# Step 4: Visualization of Clusters
# Map the position of the best silhouette score back to the k that produced it:
# entry i of `silhouette` was computed for k = min_clusters + i * step.
# (Adding a constant 2 to the index is only right for a sweep starting at 2
# with step 1.)
best_index = silhouette.index(max(silhouette))
optimal_clusters = min_clusters + best_index * step

# Refit with the selected k to obtain the labels used for colouring.
kmeans = KMeans(n_clusters=optimal_clusters, random_state=seed)
labels = kmeans.fit_predict(data_umap)

plt.scatter(data_umap[:, 0], data_umap[:, 1], c=labels, cmap='viridis', s=10)
plt.title(f"K-Means Clustering on UMAP with {optimal_clusters} Clusters")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.colorbar(label="Cluster Label")
plt.show()
