Teste simples para avaliar o funcionamento das 4 funções de clusterização (`run_kmeans`, `run_agnes`, `run_pam`, `run_clarans`).

In [None]:
import os
import time

import numpy as np
import pandas as pd
import psutil
import torch
from pyclustering.cluster.agglomerative import agglomerative as agnes
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.clarans import clarans
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils import distance_metric, type_metric
# from torch_kmeans import KMeans
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

In [2]:
def optimize_embeddings(embeddings):
    """Return ``embeddings`` as a float32 CPU tensor detached from autograd.

    Args:
        embeddings: A ``torch.Tensor`` (any dtype or device) or anything
            ``torch.tensor`` accepts, such as a NumPy array or a nested list.

    Returns:
        torch.Tensor: A float32 tensor on the CPU on which ``.numpy()`` can be
        called directly.
    """
    if not isinstance(embeddings, torch.Tensor):
        embeddings = torch.tensor(embeddings)
    # detach() first: Tensor.numpy() raises on tensors that still require
    # grad, and every caller converts the result to NumPy.
    return embeddings.detach().float().cpu()
from sklearn.decomposition import PCA

def smart_reduce_dim(embeddings, variance_threshold=0.95):
    """Reduce embedding dimensionality with PCA.

    Args:
        embeddings: 2-D ``torch.Tensor`` (any device, with or without grad) or
            array-like of row vectors.
        variance_threshold: Value forwarded to ``PCA(n_components=...)``. The
            progress message reports it as a percentage of variance kept, so
            it is expected to be a fraction such as the default ``0.95``.

    Returns:
        torch.Tensor: The PCA-transformed rows, one per input row.
    """
    # Normalise the input to a NumPy matrix. A bare ``.numpy()`` fails on CUDA
    # tensors, on tensors that require grad, and on non-tensor inputs.
    if isinstance(embeddings, torch.Tensor):
        matrix = embeddings.detach().cpu().numpy()
    else:
        matrix = np.asarray(embeddings)

    pca = PCA(n_components=variance_threshold)
    reduced = pca.fit_transform(matrix)
    print(f"Reduzido de {matrix.shape[1]} para {pca.n_components_} componentes (preserva {variance_threshold*100}% da variância)")
    return torch.tensor(reduced)

In [None]:
def run_kmeans(embeddings, n_clusters):
    """Cluster embeddings with scikit-learn's ``KMeans``.

    Args:
        embeddings: Tensor or array-like of row vectors; normalised through
            ``optimize_embeddings`` before fitting.
        n_clusters: Number of clusters to fit.

    Returns:
        tuple: ``(cluster_ids, kmeans)`` - the labels returned by
        ``fit_predict`` and the fitted estimator.
    """
    features = optimize_embeddings(embeddings).numpy()
    # Fixed seed so repeated runs produce the same partition.
    estimator = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    labels = estimator.fit_predict(features)
    return labels, estimator
def run_agnes(embeddings, n_clusters, sample_size=None):
    """Cluster embeddings with pyclustering's agglomerative (AGNES) algorithm.

    When ``sample_size`` is given and is smaller than the number of rows, the
    model is fitted on a random subset drawn without replacement and every
    input row then takes the label of its nearest sampled row. Otherwise the
    model is fitted on all rows.

    Args:
        embeddings: Tensor or array-like of row vectors; normalised through
            ``optimize_embeddings``.
        n_clusters: Number of clusters to request.
        sample_size: Optional number of rows to fit on.

    Returns:
        tuple: ``(cluster_ids, model)`` where ``cluster_ids`` is an integer
        NumPy array with one label per input row and ``model`` is the
        processed pyclustering instance.
    """
    points = optimize_embeddings(embeddings).numpy()

    # Only subsample when it actually shrinks the data; this also decides
    # whether the nearest-neighbour propagation step below is needed.
    use_sample = bool(sample_size) and len(points) > sample_size
    if use_sample:
        picked = np.random.choice(len(points), sample_size, replace=False)
        fitted = points[picked]
    else:
        fitted = points

    model = agnes(fitted.tolist(), n_clusters)
    model.process()

    # get_clusters() yields one list of member indices per cluster. Flatten it
    # into a per-row label array BEFORE looking anything up by row index;
    # indexing the cluster list with a row index returns the wrong thing.
    fitted_labels = np.zeros(len(fitted), dtype=int)
    for cluster_id, members in enumerate(model.get_clusters()):
        fitted_labels[members] = cluster_id

    if not use_sample:
        return fitted_labels, model

    # Propagate labels from the sample to every row via its nearest sample row.
    knn = NearestNeighbors(n_neighbors=1).fit(fitted)
    _, nearest = knn.kneighbors(points)
    return fitted_labels[nearest[:, 0]], model
def run_clarans(embeddings, n_clusters, numlocal=2, maxneighbor=3, sample_frac=0.3):
    """Cluster embeddings with pyclustering's CLARANS, optionally on a subsample.

    When ``sample_frac`` is below 1.0 the model is fitted on a random subset
    drawn without replacement, and every input row then takes the label of its
    nearest sampled row. Otherwise the model is fitted on all rows.

    Args:
        embeddings: Tensor or array-like of row vectors; normalised through
            ``optimize_embeddings``.
        n_clusters: Number of clusters to request.
        numlocal: Forwarded to ``clarans`` as its third positional argument.
        maxneighbor: Forwarded to ``clarans`` as its fourth positional argument.
        sample_frac: Fraction of rows to fit on; values >= 1.0 use every row.

    Returns:
        tuple: ``(cluster_ids, model)`` where ``cluster_ids`` is an integer
        NumPy array with one label per input row and ``model`` is the
        processed ``clarans`` instance.
    """
    points = optimize_embeddings(embeddings).numpy()

    # Never fit on fewer rows than requested clusters, nor on more than exist.
    sample_n = min(len(points), max(n_clusters, int(len(points) * sample_frac)))
    use_sample = sample_frac < 1.0 and sample_n < len(points)
    if use_sample:
        picked = np.random.choice(len(points), sample_n, replace=False)
        fitted = points[picked]
    else:
        fitted = points

    model = clarans(fitted.tolist(), n_clusters, numlocal, maxneighbor)
    model.process()

    # get_clusters() yields one list of member indices per cluster. Flatten it
    # into a per-row label array BEFORE looking anything up by row index;
    # indexing the cluster list with a row index returns the wrong thing.
    fitted_labels = np.zeros(len(fitted), dtype=int)
    for cluster_id, members in enumerate(model.get_clusters()):
        fitted_labels[members] = cluster_id

    if not use_sample:
        return fitted_labels, model

    # Propagate labels from the sample to every row via its nearest sample row.
    knn = NearestNeighbors(n_neighbors=1).fit(fitted)
    _, nearest = knn.kneighbors(points)
    return fitted_labels[nearest[:, 0]], model
def run_pam(embeddings, n_clusters, sample_frac=0.5):
    """Cluster embeddings with pyclustering's k-medoids (PAM), optionally on a subsample.

    When ``sample_frac`` is below 1.0 the model is fitted on a random subset
    drawn without replacement, and every input row then takes the label of its
    nearest sampled row. Otherwise the model is fitted on all rows.

    Args:
        embeddings: Tensor or array-like of row vectors; normalised through
            ``optimize_embeddings``.
        n_clusters: Number of medoids / clusters to request.
        sample_frac: Fraction of rows to fit on; values >= 1.0 use every row.

    Returns:
        tuple: ``(cluster_ids, model)`` where ``cluster_ids`` is an integer
        NumPy array with one label per input row and ``model`` is the
        processed ``kmedoids`` instance.
    """
    points = optimize_embeddings(embeddings).numpy()

    # Never fit on fewer rows than requested medoids, nor on more than exist.
    sample_n = min(len(points), max(n_clusters, int(len(points) * sample_frac)))
    use_sample = sample_frac < 1.0 and sample_n < len(points)
    if use_sample:
        picked = np.random.choice(len(points), sample_n, replace=False)
        fitted = points[picked]
    else:
        fitted = points
    fitted_rows = fitted.tolist()

    # kmedoids takes the *indices* of the initial medoids, so ask the
    # k-means++ initializer for indices rather than centre coordinates.
    initial_medoids = kmeans_plusplus_initializer(fitted_rows, n_clusters).initialize(return_index=True)
    model = kmedoids(fitted_rows, initial_medoids, data_type='points')
    model.process()

    # get_clusters() yields one list of member indices per cluster. Flatten it
    # into a per-row label array BEFORE looking anything up by row index;
    # indexing the cluster list with a row index returns the wrong thing.
    fitted_labels = np.zeros(len(fitted), dtype=int)
    for cluster_id, members in enumerate(model.get_clusters()):
        fitted_labels[members] = cluster_id

    if not use_sample:
        return fitted_labels, model

    # Propagate labels from the sample to every row via its nearest sample row.
    knn = NearestNeighbors(n_neighbors=1).fit(fitted)
    _, nearest = knn.kneighbors(points)
    return fitted_labels[nearest[:, 0]], model

In [4]:
def safe_cluster(method, embeddings, n_clusters, **kwargs):
    """Run a clustering function, falling back to a single cluster on failure.

    For the methods named ``run_agnes`` and ``run_pam`` the embeddings are
    first passed through ``smart_reduce_dim``.

    Args:
        method: Callable invoked as ``method(embeddings, n_clusters, **kwargs)``.
        embeddings: Tensor or array-like of row vectors.
        n_clusters: Number of clusters forwarded to ``method``.
        **kwargs: Extra keyword arguments forwarded to ``method``.

    Returns:
        Whatever ``method`` returns on success. If any exception is raised the
        error is printed and ``(np.zeros(len(embeddings), dtype=int), None)``
        is returned, so a ``None`` model marks the all-zero labels as a
        fallback rather than a real result.
    """
    # functools.partial objects have no __name__; use the wrapped function's
    # name when available so the lookup and the error message cannot raise.
    method_name = getattr(method, '__name__', None) or getattr(
        getattr(method, 'func', None), '__name__', repr(method)
    )
    try:
        prepared = embeddings
        # Dimensionality reduction for the slow methods.
        if method_name in ['run_agnes', 'run_pam']:
            # Normalise to a CPU tensor first: without this, list, ndarray or
            # GPU input fails inside the reduction and silently ends up in
            # the fallback below.
            prepared = smart_reduce_dim(optimize_embeddings(embeddings))

        return method(prepared, n_clusters, **kwargs)
    except Exception as e:
        print(f"Falha no {method_name}: {str(e)}")
        return np.zeros(len(embeddings), dtype=int), None

In [5]:
import time
import psutil

In [6]:
def test_method(name, func, embeddings, k):
    print(f"\nTestando {name} com k={k}...")
    start_time = time.time()
    
    cluster_ids, model = safe_cluster(func, embeddings, k)
    
    elapsed = time.time() - start_time
    num_clusters = len(set(cluster_ids))
    
    print(f"Concluído em {elapsed:.2f}s | Clusters: {num_clusters}")
    print(f"Memória: {psutil.Process().memory_info().rss / 1024 ** 2:.2f} MB")
    
    assert len(cluster_ids) == len(embeddings), "Tamanho inconsistente!"
    return cluster_ids

In [7]:
# Import only the loader. A star import here would overwrite the run_*
# functions defined earlier in this notebook with any same-named objects that
# clusters.clusters exports, so a different implementation would be tested.
# NOTE(review): assumes load_embeddings lives in clusters.clusters; it is not
# defined anywhere else in this notebook - confirm.
from clusters.clusters import load_embeddings

dataset = 'Beauty'
embedding_path = f'../data_preprocessing/{dataset}/{dataset}-similarity-values-thenlper_gte-large.pt'
embeddings = load_embeddings(embedding_path)

In [8]:
# Number of clusters requested from every algorithm in this smoke test.
N_CLUSTERS = 6

# (display name, clustering function) pairs, run in this order.
methods = [
    ("KMeans", run_kmeans),
    ("CLARANS", run_clarans),
    ("PAM", run_pam),
    ("AGNES", run_agnes)
]

# Keep each method's labels so later cells can inspect or compare them,
# instead of discarding what test_method returns.
cluster_results = {}
for name, func in methods:
    cluster_results[name] = test_method(name, func, embeddings, k=N_CLUSTERS)


Testando KMeans com k=6...
Concluído em 3.43s | Clusters: 6
Memória: 543.29 MB

Testando CLARANS com k=6...


KeyboardInterrupt: 