In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import BinaryEncoder, TargetEncoder
from sklearn.feature_extraction import FeatureHasher
from kmodes.kprototypes import KPrototypes
import plotly.express as px


In [28]:
ratings = pd.read_csv('../datasets/ml-100k/u.data', 
                      sep='\t', 
                      names=['user_id', 'item_id', 'rating', 'timestamp'])
ratings = ratings.drop(columns='timestamp')

user_item_dense = ratings.pivot(
    index='user_id',
    columns='item_id',
    values='rating'
).fillna(0)

In [29]:
user = pd.read_csv('../datasets/ml-100k/u.user', 
                      sep='|', 
                      names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

merged = user.merge(user_item_dense, on='user_id')

user

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [30]:
best_k

2

In [31]:
best_score

0.03218405460267304

In [32]:
# K-PROTOTYPE
le_gender = LabelEncoder()
le_occ    = LabelEncoder()
le_zip    = LabelEncoder()

user['gender_le']     = le_gender.fit_transform(user['gender'])
user['occupation_le'] = le_occ.fit_transform(user['occupation'])
user['zip_le']        = le_zip.fit_transform(user['zip_code'])


matrix = user[['age', 'zip_le', 'gender_le', 'occupation_le']].to_numpy()


In [33]:
sil_scores = {}
range_n_clusters = range(2, 11)  # prova da 2 a 10 cluster

for k in range_n_clusters:
    # fit k‑Prototypes
    kp = KPrototypes(n_clusters=k, init='Cao', verbose=0, random_state=42)
    labels = kp.fit_predict(matrix, categorical=[1, 2, 3])
    # silhouette score (usiamo distanza euclidea sulla matrice numerica codificata)
    score = silhouette_score(matrix, labels, metric='euclidean')
    sil_scores[k] = score
    print(f"k = {k} → silhouette score = {score:.4f}")

# Trova il k con il silhouette migliore
best_k = max(sil_scores, key=sil_scores.get)
best_score = sil_scores[best_k]
print(f"\nNumero ottimale di cluster: k = {best_k} (silhouette = {best_score:.4f})")

k = 2 → silhouette score = 0.0322
k = 3 → silhouette score = -0.0103
k = 4 → silhouette score = -0.0335
k = 5 → silhouette score = -0.0374
k = 6 → silhouette score = -0.0482
k = 7 → silhouette score = -0.0670
k = 8 → silhouette score = -0.0728
k = 9 → silhouette score = -0.0764
k = 10 → silhouette score = -0.0894

Numero ottimale di cluster: k = 2 (silhouette = 0.0322)


In [34]:
kp = KPrototypes(n_clusters=best_k, init='Cao', verbose=1, random_state=42)
clusters = kp.fit_predict(matrix, categorical=[1, 2, 3])  # zip_code, gender, occupation
user['cluster_kp'] = clusters

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 48, ncost: 51207.88267390665
Run: 1, iteration: 2/100, moves: 12, ncost: 51130.4130074675
Run: 1, iteration: 3/100, moves: 0, ncost: 51130.4130074675
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 33, ncost: 51149.4980275898
Run: 2, iteration: 2/100, moves: 6, ncost: 51130.4130074675
Run: 2, iteration: 3/100, moves: 0, ncost: 51130.4130074675
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/100, moves: 30, ncost: 51245.5194240735
Run: 3, iteration: 2/100, moves: 15, ncost: 51122.87982118437
Run: 3, iteration: 3/100, moves: 0, ncost: 51122.87982118437
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 4, iteration: 1/100, moves: 17, ncost: 51130.413007

In [None]:
# 5. Riduzione a 3 dimensioni (PCA) per la visualizzazione
pca = PCA(n_components=3)
coords_3d = pca.fit_transform(matrix)
df_plot = pd.DataFrame(coords_3d, columns=['PC1', 'PC2', 'PC3'])
df_plot['cluster'] = clusters.astype(str)

In [None]:
# 6. Grafico 3D interattivo con Plotly
fig = px.scatter_3d(
    df_plot,
    x='PC1', y='PC2', z='PC3',
    color='cluster',
    title='Visualizzazione 3D interattiva dei cluster k‑Prototypes',
    labels={'cluster': 'Cluster'}
)
fig.update_traces(marker=dict(size=4))
fig.show()