In [112]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
import plotly.express as px
from tqdm import tqdm
from itertools import product
from sklearn.metrics import silhouette_score


In [None]:
metric = 'cosine'
linkage = 'complete'
n_clusters = 3

In [113]:
# 1. Carica il DataFrame
user = pd.read_csv(
    '../datasets/ml-100k/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code']
)

In [114]:
# 2. Prepara “sentences” per Word2Vec
sentences = user[['gender', 'occupation', 'zip_code']].astype(str).values.tolist()

# 3. Allena il modello Word2Vec
w2v = Word2Vec(
    sentences,
    vector_size=8,    # dimensione embedding
    window=2,
    min_count=1,
    epochs=100,
    seed=42
)

In [115]:
# 4. Funzione per embedding medio per riga
def embed_row(row):
    vecs = [
        w2v.wv[row['gender']],
        w2v.wv[row['occupation']],
        w2v.wv[row['zip_code']]
    ]
    return sum(vecs) / len(vecs)

In [116]:
# 5. Calcola gli embedding e componi il DataFrame
embeddings = user.apply(embed_row, axis=1)
df_emb = pd.DataFrame(
    embeddings.tolist(),
    columns=[f'emb_{i}' for i in range(w2v.vector_size)]
)
# Aggiungiamo l’età come ulteriore feature numerica (opzionale)
df_emb['age'] = user['age']

In [None]:
# 6) Ricerca k ottimale via Silhouette Score
sil_scores = {}
for k in range(2, 11):
    agg = AgglomerativeClustering(n_clusters=k,
                                  metric=metric,
                                  linkage=linkage)
    labels = agg.fit_predict(df_emb)
    sil = silhouette_score(df_emb, labels, metric='euclidean')
    sil_scores[k] = sil
    print(f"k = {k} → silhouette = {sil:.4f}")

best_k = max(sil_scores, key=sil_scores.get)
print(f"\n→ k ottimale = {best_k}, silhouette = {sil_scores[best_k]:.4f}\n")

k = 2 → silhouette = 0.3257
k = 3 → silhouette = 0.1872
k = 4 → silhouette = 0.1795
k = 5 → silhouette = 0.3793
k = 6 → silhouette = 0.3465
k = 7 → silhouette = 0.3434
k = 8 → silhouette = 0.3368
k = 9 → silhouette = 0.2734
k = 10 → silhouette = 0.2739

→ k ottimale = 5, silhouette = 0.3793



In [None]:
# 7) Addestra AgglomerativeClustering col k ottimale
agg_final = AgglomerativeClustering(n_clusters=best_k,
                                    metric='cosine',
                                    linkage='complete')
user['cluster_agg'] = agg_final.fit_predict(df_emb)


In [119]:
pca = PCA(n_components=3, random_state=42)
coords3d = pca.fit_transform(df_emb)
df_plot = pd.DataFrame(coords3d, columns=['PC1','PC2','PC3'])
df_plot['cluster'] = user['cluster_agg'].astype(str)

fig = px.scatter_3d(df_plot,
                    x='PC1', y='PC2', z='PC3',
                    color='cluster',
                    title=f'Agglomerative Clustering (k={best_k}) su Word2Vec Embedding')
fig.update_traces(marker=dict(size=4))
fig.show()


In [120]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from itertools import product
from tqdm import tqdm

def agglomerative_grid_search(
    data: pd.DataFrame,
    k_range=range(2, 21),
    linkage_options=('ward', 'average', 'complete'),
    metric_options=('euclidean', 'manhattan', 'cosine')
) -> pd.DataFrame:
    """
    Esegue una ricerca a griglia sugli iperparametri di Agglomerative Clustering
    e restituisce un DataFrame ordinato per silhouette score decrescente.
    """
    # 1) Verifica che 'data' sia numerico
    if not all(dt.kind in 'fi' for dt in pd.DataFrame(data).dtypes):
        raise ValueError("Il DataFrame 'data' deve contenere solo colonne numeriche.")

    results = []

    # 2) Genera tutte le combinazioni valide
    for linkage, metric in tqdm(list(product(linkage_options, metric_options)),
                                desc="Testing linkage/metric"):
        # Il linkage 'ward' richiede metrica euclidea
        if linkage == 'ward' and metric != 'euclidean':
            continue

        for k in k_range:
            try:
                model = AgglomerativeClustering(
                    n_clusters=k,
                    linkage=linkage,
                    metric=metric  
                )
                labels = model.fit_predict(data)

                # Serve almeno 2 cluster distinti per silhouette_score
                if len(set(labels)) < 2:
                    continue

                score = silhouette_score(data, labels, metric=metric)
                results.append({
                    'n_clusters': k,
                    'linkage': linkage,
                    'metric': metric,
                    'silhouette_score': score
                })
            except Exception:
                # Ignora combinazioni non valide o errori interni
                continue

    # 3) Costruisci il DataFrame con i risultati
    result_df = pd.DataFrame(results,
                             columns=['n_clusters', 'linkage', 'metric', 'silhouette_score'])

    # 4) Se non ho risultati validi, restituisco un DataFrame vuoto con le colonne giuste
    if result_df.empty:
        return result_df

    # 5) Ordino per silhouette_score e resetto l’indice
    return (result_df
            .sort_values(by='silhouette_score', ascending=False)
            .reset_index(drop=True))


In [121]:
# Supponendo tu abbia già costruito il DataFrame df_emb (solo colonne numeriche!)
result = agglomerative_grid_search(df_emb)

if result.empty:
    print("Nessuna combinazione valida trovata: controlla i tuoi dati.")
else:
    print(result.head(10))

Testing linkage/metric: 100%|██████████| 9/9 [00:03<00:00,  2.79it/s]

   n_clusters   linkage     metric  silhouette_score
0           2   average     cosine          0.934277
1           2  complete     cosine          0.934277
2           3   average     cosine          0.911290
3           4   average     cosine          0.636708
4           3  complete     cosine          0.634701
5           5   average     cosine          0.632799
6           4  complete     cosine          0.632065
7           2  complete  manhattan          0.606042
8           5  complete     cosine          0.590806
9           2  complete  euclidean          0.588775



