In [10]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.metrics import silhouette_score

In [11]:
# 1. Carica il DataFrame
user = pd.read_csv(
    '../../datasets/ml-100k/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code']
)

In [12]:
# 2. Prepara “sentences” per Word2Vec includendo anche zip_code
sentences = user[['gender', 'occupation', 'zip_code']].astype(str).values.tolist()

# 3. Allena il modello Word2Vec
w2v = Word2Vec(
    sentences,
    vector_size=8,    # dimensione del vettore di embedding
    window=2,
    min_count=1,
    epochs=100,
    seed=42
)

In [13]:
# 4. Definisci funzione per calcolare embedding medio per riga
def embed_row(row):
    vecs = [w2v.wv[row['gender']], 
            w2v.wv[row['occupation']], 
            w2v.wv[row['zip_code']]]
    # media vettoriale
    return sum(vecs) / len(vecs)


In [14]:
# 5. Applica embedding a tutto il DataFrame
embeddings = user.apply(embed_row, axis=1)
df_emb = pd.DataFrame(
    embeddings.tolist(),
    columns=[f'emb_{i}' for i in range(w2v.vector_size)]
)
# Aggiungiamo eventualmente altre feature numeriche (es. age)
df_emb['age'] = user['age']

In [15]:
sil_scores = {}
range_n_clusters = range(2, 30)
for k in range_n_clusters:
    km = KMeans(n_clusters=k, random_state=42)
    labels = km.fit_predict(df_emb)
    sil = silhouette_score(df_emb, labels)
    sil_scores[k] = sil
    print(f"k = {k} → silhouette score = {sil:.4f}")

best_k = max(sil_scores, key=sil_scores.get)
print(f"\nNumero ottimale di cluster: k = {best_k} (silhouette = {sil_scores[best_k]:.4f})")

k = 2 → silhouette score = 0.6278
k = 3 → silhouette score = 0.5485
k = 4 → silhouette score = 0.5237
k = 5 → silhouette score = 0.5299
k = 6 → silhouette score = 0.5314
k = 7 → silhouette score = 0.5213
k = 8 → silhouette score = 0.5218
k = 9 → silhouette score = 0.5160
k = 10 → silhouette score = 0.5196
k = 11 → silhouette score = 0.5205
k = 12 → silhouette score = 0.5169
k = 13 → silhouette score = 0.5203
k = 14 → silhouette score = 0.5114
k = 15 → silhouette score = 0.5076
k = 16 → silhouette score = 0.5121
k = 17 → silhouette score = 0.5207
k = 18 → silhouette score = 0.5300
k = 19 → silhouette score = 0.5448
k = 20 → silhouette score = 0.5495
k = 21 → silhouette score = 0.5323
k = 22 → silhouette score = 0.5307
k = 23 → silhouette score = 0.5364
k = 24 → silhouette score = 0.5358
k = 25 → silhouette score = 0.5323
k = 26 → silhouette score = 0.5544
k = 27 → silhouette score = 0.5644
k = 28 → silhouette score = 0.5737
k = 29 → silhouette score = 0.5831

Numero ottimale di cluster:

In [16]:
# 6. Clustering sui vettori di embedding
kmeans_final = KMeans(n_clusters=best_k, random_state=42)
user['cluster_w2v'] = kmeans_final.fit_predict(df_emb)

In [17]:
# 7. Riduzione a 3D per visualizzazione
pca = PCA(n_components=3, random_state=42)
coords3d = pca.fit_transform(df_emb)
df_plot = pd.DataFrame(coords3d, columns=['PC1', 'PC2', 'PC3'])
df_plot['cluster'] = user['cluster_w2v'].astype(str)

In [18]:
# 8. Grafico 3D interattivo con Plotly
fig = px.scatter_3d(
    df_plot,
    x='PC1', y='PC2', z='PC3',
    color='cluster',
    title='Word2Vec Embedding + KMeans: visualizzazione 3D',
    labels={'cluster':'Cluster'}
)
fig.update_traces(marker=dict(size=4))
fig.show()