In [None]:
# Importiamo le librerie necessarie
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from hdbscan import HDBSCAN
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import umap

# Load the precomputed embeddings from the cache directory one level above
# the working directory, then report their shape.
EMBEDDINGS_PATH = '../cache/embeddings.npy'
embeddings = np.load(EMBEDDINGS_PATH)
print("Forma degli embeddings:", embeddings.shape)

def cluster_embeddings(embeddings, algorithm, **kwargs):
    """
    Cluster the embeddings with the requested algorithm and return the labels.

    Args:
        embeddings: Data passed unchanged to the clusterer's ``fit_predict``.
        algorithm: One of ``'hdbscan'``, ``'kmeans'`` or ``'agglomerative'``.
        **kwargs: Forwarded to the clusterer's constructor. For ``'hdbscan'``
            and ``'agglomerative'`` a ``metric`` entry overrides the
            ``'euclidean'`` default; ``'kmeans'`` receives kwargs as given.

    Returns:
        Whatever the selected clusterer's ``fit_predict`` returns.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    if algorithm == 'hdbscan':
        # Euclidean is only a default: merging it with kwargs lets the caller
        # supply another metric instead of hitting a duplicate-keyword
        # TypeError.
        clusterer = HDBSCAN(**{'metric': 'euclidean', **kwargs})
    elif algorithm == 'kmeans':
        clusterer = KMeans(**kwargs)
    elif algorithm == 'agglomerative':
        clusterer = AgglomerativeClustering(**{'metric': 'euclidean', **kwargs})
    else:
        raise ValueError(f"Invalid clustering algorithm: {algorithm}")
    return clusterer.fit_predict(embeddings)


In [None]:

# Constructor arguments used for each supported clustering algorithm.
algorithm_params = {
    'hdbscan': {'min_cluster_size': 5, 'min_samples': 1},
    'kmeans': {'n_clusters': 8},
    'agglomerative': {'n_clusters': 5},
}

# The 2-D projections depend only on the embeddings, not on the clustering,
# so they are computed once and reused for every algorithm's plots.
# UMAP is left unseeded here; pass random_state=42 for a reproducible layout.
reducer = umap.UMAP(n_components=2)
embeddings_umap = reducer.fit_transform(embeddings)
print("Forma degli embeddings UMAP:", embeddings_umap.shape)

# The iteration count is left at the library default (1000) rather than
# passed explicitly, so the call does not depend on the keyword's name.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
embeddings_tsne = tsne.fit_transform(embeddings)
print("Forma degli embeddings t-SNE:", embeddings_tsne.shape)

projections = [('t-SNE', embeddings_tsne), ('UMAP', embeddings_umap)]

for algorithm in ['kmeans']:  # other options: 'hdbscan', 'agglomerative'
    print(f"\n --- {algorithm} ---")

    # Unknown names fall through with no kwargs so that cluster_embeddings
    # raises its own ValueError instead of reusing stale labels.
    cluster_labels = cluster_embeddings(
        embeddings, algorithm, **algorithm_params.get(algorithm, {})
    )

    print("Etichette cluster uniche:", np.unique(cluster_labels))

    # Evaluate clustering quality with the silhouette score. Points labelled
    # -1 (HDBSCAN noise) are excluded so they are not scored as a cluster,
    # and the score is skipped when it is undefined (it needs at least 2
    # clusters and fewer clusters than samples).
    clustered = cluster_labels != -1
    n_found = len(np.unique(cluster_labels[clustered]))
    if 2 <= n_found < clustered.sum():
        sil_score = silhouette_score(embeddings[clustered],
                                     cluster_labels[clustered])
        print("Silhouette Score:", sil_score)
    else:
        print("Silhouette Score: non definito (servono almeno 2 cluster)")

    # Side-by-side scatter plots of the same labels on both projections.
    plt.figure(figsize=(12, 6))
    for position, (method, points) in enumerate(projections, start=1):
        plt.subplot(1, 2, position)
        scatter = plt.scatter(points[:, 0], points[:, 1],
                              c=cluster_labels, cmap='viridis', alpha=0.7)
        plt.title(f'Visualizzazione {method} dei Cluster {algorithm}')
        plt.xlabel('Dimensione 1')
        plt.ylabel('Dimensione 2')
        plt.colorbar(scatter, label='Etichetta Cluster')

    plt.tight_layout()
    plt.show()
