In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import BinaryEncoder, TargetEncoder
from sklearn.feature_extraction import FeatureHasher
from kmodes.kprototypes import KPrototypes
import plotly.express as px


In [13]:
ratings = pd.read_csv('../datasets/ml-100k/u.data', 
                      sep='\t', 
                      names=['user_id', 'item_id', 'rating', 'timestamp'])
ratings = ratings.drop(columns='timestamp')

user_item_dense = ratings.pivot(
    index='user_id',
    columns='item_id',
    values='rating'
).fillna(0)

In [14]:
user = pd.read_csv('../datasets/ml-100k/u.user', 
                      sep='|', 
                      names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

merged = user.merge(user_item_dense, on='user_id')

user

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [15]:
# K-PROTOTYPE
le_gender = LabelEncoder()
le_occ    = LabelEncoder()
le_zip    = LabelEncoder()

user['gender_le']     = le_gender.fit_transform(user['gender'])
user['occupation_le'] = le_occ.fit_transform(user['occupation'])
user['zip_le']        = le_zip.fit_transform(user['zip_code'])


matrix = user[['age', 'zip_le', 'gender_le', 'occupation_le']].to_numpy()
kp = KPrototypes(n_clusters=4, init='Cao', verbose=1, random_state=42)
clusters = kp.fit_predict(matrix, categorical=[1, 2, 3])  # zip_code, gender, occupation
user['cluster_kp'] = clusters

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 225, ncost: 25372.039284658833
Run: 1, iteration: 2/100, moves: 86, ncost: 24715.597095701753
Run: 1, iteration: 3/100, moves: 11, ncost: 24702.73069955004
Run: 1, iteration: 4/100, moves: 0, ncost: 24702.73069955004
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 108, ncost: 24726.16893718197
Run: 2, iteration: 2/100, moves: 30, ncost: 24615.831376146245
Run: 2, iteration: 3/100, moves: 3, ncost: 24614.475029632085
Run: 2, iteration: 4/100, moves: 0, ncost: 24614.475029632085
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/100, moves: 202, ncost: 26940.450766578033
Run: 3, iteration: 2/100, moves: 90, ncost: 25344.09064644475
Run: 3, iteration: 3/100, moves: 50, ncost: 24805.71277979

In [16]:
# 5. Riduzione a 3 dimensioni (PCA) per la visualizzazione
pca = PCA(n_components=3)
coords_3d = pca.fit_transform(matrix)
df_plot = pd.DataFrame(coords_3d, columns=['PC1', 'PC2', 'PC3'])
df_plot['cluster'] = clusters.astype(str)

# 6. Grafico 3D interattivo con Plotly
fig = px.scatter_3d(
    df_plot,
    x='PC1', y='PC2', z='PC3',
    color='cluster',
    title='Visualizzazione 3D interattiva dei cluster k‑Prototypes',
    labels={'cluster': 'Cluster'}
)
fig.update_traces(marker=dict(size=4))
fig.show()

In [None]:
sil_scores = {}
range_n_clusters = range(2, 11)  # prova da 2 a 10 cluster

for k in range_n_clusters:
    # fit k‑Prototypes
    kp = KPrototypes(n_clusters=k, init='Cao', verbose=0, random_state=42)
    labels = kp.fit_predict(matrix, categorical=[1, 2, 3])
    # silhouette score (usiamo distanza euclidea sulla matrice numerica codificata)
    score = silhouette_score(matrix, labels, metric='euclidean')
    sil_scores[k] = score
    print(f"k = {k} → silhouette score = {score:.4f}")

# Trova il k con il silhouette migliore
best_k = max(sil_scores, key=sil_scores.get)
best_score = sil_scores[best_k]
print(f"\nNumero ottimale di cluster: k = {best_k} (silhouette = {best_score:.4f})")

In [None]:
opt = silhouette_scores.index(max(silhouette_scores))
silhouette_scores[opt]

In [None]:
opt

In [None]:
silhouette_scores

In [None]:
# 6. Applica KMeans con il numero ottimale di cluster
#kmeans = KMeans(n_clusters=opt, random_state=42)
#user['cluster'] = kmeans.fit_predict(data_scaled)

In [None]:
pca = PCA(n_components=3)
components = pca.fit_transform(data_scaled)
user[['PCA1','PCA2','PCA3']] = components


In [None]:
import plotly.express as px

# Sostituisci questa parte con il tuo caricamento del DataFrame
# import pandas as pd
# user = pd.read_csv('tuo_file.csv')

fig = px.scatter_3d(
    user,
    x='PCA1',
    y='PCA2',
    z='PCA3',
    color='cluster',
    title='Cluster utenti con K-Means (PCA 3D)',
    labels={'PCA1': 'PCA1', 'PCA2': 'PCA2', 'PCA3': 'PCA3'}
)

fig.update_layout(
    margin=dict(l=0, r=0, b=0, t=40),
    legend_title_text='Cluster'
)

fig.show()


ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['user_id', 'age', 'gender', 'occupation', 'zip_code', 'gender_le', 'occupation_le', 'zip_le', 'cluster_kp'] but received: PCA1