In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import BinaryEncoder, TargetEncoder
from sklearn.feature_extraction import FeatureHasher
from kmodes.kprototypes import KPrototypes
import plotly.express as px


In [None]:
# Read the tab-separated ml-100k `u.data` ratings file (column names are
# supplied explicitly) and discard the timestamp column right away.
ratings = (
    pd.read_csv(
        '../datasets/ml-100k/u.data',
        sep='\t',
        names=['user_id', 'item_id', 'rating', 'timestamp'],
    )
    .drop(columns='timestamp')
)

# Dense user x item table: one row per user_id, one column per item_id.
# (user, item) pairs without a rating are filled with 0.
user_item_dense = ratings.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

In [None]:
# Read the pipe-separated ml-100k `u.user` file; column names are supplied
# explicitly.
user = pd.read_csv(
    '../datasets/ml-100k/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

# Join each user's attributes with their row of the dense rating table.
merged = pd.merge(user, user_item_dense, on='user_id')

# Show the user table as the cell output.
user

In [None]:
# K-PROTOTYPE
# One LabelEncoder per categorical column, each kept under its own name.
le_gender, le_occ, le_zip = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Add an integer-coded copy of every categorical column to `user`
# (same order as before: gender, occupation, zip).
for encoder, source_col, coded_col in (
    (le_gender, 'gender', 'gender_le'),
    (le_occ, 'occupation', 'occupation_le'),
    (le_zip, 'zip_code', 'zip_le'),
):
    user[coded_col] = encoder.fit_transform(user[source_col])

# Feature matrix handed to k-prototypes: column 0 is age, columns 1-3 are the
# integer codes for zip, gender and occupation.
matrix = user.loc[:, ['age', 'zip_le', 'gender_le', 'occupation_le']].to_numpy()


In [None]:
# Inspect the encoded feature matrix (columns: age, zip_le, gender_le, occupation_le).
matrix

In [None]:
# Model selection: fit k-prototypes for k = 2..10 and keep the k with the
# highest silhouette score.
#
# The silhouette is evaluated on a precomputed dissimilarity matrix built from
# the same kind of mixed measure k-prototypes works with: squared Euclidean
# distance on the numeric columns plus gamma times the number of mismatching
# categorical columns. Plain Euclidean distance on the label-encoded matrix
# would treat the arbitrary integer codes of zip / gender / occupation as
# magnitudes, which makes the score meaningless for categorical features.
categorical_idx = [1, 2, 3]  # positions of zip_le, gender_le, occupation_le in `matrix`
numeric_idx = [i for i in range(matrix.shape[1]) if i not in categorical_idx]

numeric_part = matrix[:, numeric_idx].astype(float)
categorical_part = matrix[:, categorical_idx]

# Weight of the categorical term: half the mean standard deviation of the
# numeric columns.
# NOTE(review): intended to mirror the default kmodes uses when `gamma` is not
# passed to KPrototypes — confirm against the installed kmodes version.
gamma = 0.5 * numeric_part.std(axis=0).mean()

# Pairwise (n_samples x n_samples) dissimilarities via broadcasting.
numeric_dist = ((numeric_part[:, None, :] - numeric_part[None, :, :]) ** 2).sum(axis=2)
mismatch_count = (categorical_part[:, None, :] != categorical_part[None, :, :]).sum(axis=2)
mixed_dist = numeric_dist + gamma * mismatch_count

sil_scores = {}
range_n_clusters = range(2, 11)  # try 2 to 10 clusters

for k in range_n_clusters:
    # Fit k-prototypes for this candidate k.
    kp = KPrototypes(n_clusters=k, init='Cao', verbose=0, random_state=42)
    labels = kp.fit_predict(matrix, categorical=categorical_idx)
    # Silhouette on the mixed dissimilarity, not on raw label codes.
    score = silhouette_score(mixed_dist, labels, metric='precomputed')
    sil_scores[k] = score
    print(f"k = {k} → silhouette score = {score:.4f}")

# Pick the k with the best silhouette.
best_k = max(sil_scores, key=sil_scores.get)
best_score = sil_scores[best_k]
print(f"\nNumero ottimale di cluster: k = {best_k} (silhouette = {best_score:.4f})")

In [None]:
# Refit k-prototypes using the cluster count selected above and store each
# user's cluster label next to their attributes.
kp = KPrototypes(
    n_clusters=best_k,
    init='Cao',
    verbose=1,
    random_state=42,
)
# Columns 1-3 of `matrix` are categorical: zip_code, gender, occupation.
clusters = kp.fit_predict(matrix, categorical=[1, 2, 3])
user['cluster_kp'] = clusters

In [None]:
# 5. Reduce to 3 dimensions (PCA) for visualization.
# PCA centres its input but does not rescale it, so every column is
# standardised first; otherwise the column with the widest numeric range
# would dominate the principal components.
# NOTE(review): the categorical columns are still integer label codes here, so
# this projection is a visual aid only — consider one-hot encoding them.
matrix_scaled = StandardScaler().fit_transform(matrix)
pca = PCA(n_components=3)
coords_3d = pca.fit_transform(matrix_scaled)
df_plot = pd.DataFrame(coords_3d, columns=['PC1', 'PC2', 'PC3'])
# Cluster labels as strings (presumably so the plot colours them as discrete
# categories rather than on a continuous scale).
df_plot['cluster'] = clusters.astype(str)

In [None]:
# 6. Interactive 3D scatter plot with Plotly: one point per row of `df_plot`,
# positioned by the three principal components and coloured by cluster label.
fig = px.scatter_3d(
    data_frame=df_plot,
    x='PC1',
    y='PC2',
    z='PC3',
    color='cluster',
    labels={'cluster': 'Cluster'},
    title='Visualizzazione 3D interattiva dei cluster k‑Prototypes',
)
# Use a marker size of 4 for every trace.
fig.update_traces(marker={'size': 4})
fig.show()