In [129]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

In [130]:
# Read the ml-100k `u.item` file (pipe-separated, Latin-1 encoded; column names
# are supplied here) and immediately discard the identifier/URL columns, keeping
# the release date and the genre flag columns.
movies = pd.read_csv(
    '../../datasets/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=[
        # descriptive columns
        'movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_url',
        # genre flag columns
        'unknown', 'Action', 'Adventure', 'Animation', "Children's",
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
        'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
).drop(columns=['movie_id', 'title', 'video_release_date', 'IMDb_url'])
movies

Unnamed: 0,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,01-Jan-1995,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,01-Jan-1995,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,01-Jan-1995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,01-Jan-1995,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,01-Jan-1995,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,06-Feb-1998,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,06-Feb-1998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,01-Jan-1998,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,01-Jan-1994,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [131]:
# Parse the release date (values that cannot be parsed become NaT), expand it
# into year/month/day columns, then drop the original date column.
movies = (
    movies
    .assign(
        release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce')
    )
    .assign(
        year=lambda frame: frame['release_date'].dt.year,
        month=lambda frame: frame['release_date'].dt.month,
        day=lambda frame: frame['release_date'].dt.day,
    )
    .drop(columns='release_date')
)
movies

Unnamed: 0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,month,day
0,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1995.0,1.0,1.0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1995.0,1.0,1.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1995.0,1.0,1.0
3,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1995.0,1.0,1.0
4,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,1995.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1998.0,2.0,6.0
1678,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,1998.0,2.0,6.0
1679,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,1998.0,1.0,1.0
1680,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1994.0,1.0,1.0


In [132]:
# Count the missing values in each column and print the per-column totals.
missing_per_col = movies.isnull().sum(axis=0)
print(missing_per_col)

unknown        0
Action         0
Adventure      0
Animation      0
Children's     0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film-Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
year           1
month          1
day            1
dtype: int64


In [133]:
# Keep only the rows where year, month and day are all present; take a copy so
# later column assignments work on an independent frame.
movies = movies.loc[movies[['year', 'month', 'day']].notna().all(axis=1)].copy()

In [134]:
# Standardize the data: fit the scaler on `movies`, then transform that same frame.
scaler = StandardScaler().fit(movies)
movies_scaled = scaler.transform(movies)
movies_scaled

array([[-0.0243975 , -0.41895641, -0.29550317, ...,  0.39397757,
        -0.55418687, -0.59271682],
       [-0.0243975 ,  2.38688315,  3.38405849, ...,  0.39397757,
        -0.55418687, -0.59271682],
       [-0.0243975 , -0.41895641, -0.29550317, ...,  0.39397757,
        -0.55418687, -0.59271682],
       ...,
       [-0.0243975 , -0.41895641, -0.29550317, ...,  0.60451364,
        -0.55418687, -0.59271682],
       [-0.0243975 , -0.41895641, -0.29550317, ...,  0.32379888,
        -0.55418687, -0.59271682],
       [-0.0243975 , -0.41895641, -0.29550317, ...,  0.46415626,
         0.10957382,  0.20336797]])

In [135]:
# Fit K-Means for every candidate k and record the silhouette score of the
# resulting labelling, in K_range order.
K_range = range(2, 50)
silhouette_scores = []
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    silhouette_scores.append(
        silhouette_score(movies_scaled, kmeans.fit_predict(movies_scaled))
    )

# Select the k with the highest score (the first one if several tie).
optimal_k = max(zip(K_range, silhouette_scores), key=lambda pair: pair[1])[0]
print(f"Numero ottimale di cluster trovato: {optimal_k} | ")

Numero ottimale di cluster trovato: 45 | 


In [136]:
# Display the recorded silhouette scores, one per candidate k in K_range order.
silhouette_scores

[0.2419172235380778,
 0.2539141563400573,
 0.21169149922049224,
 0.13038451611211752,
 0.15693948695578766,
 0.17579587799177693,
 0.16937765165223093,
 0.19201886328289475,
 0.20391452175293145,
 0.205462022232165,
 0.2181750901139434,
 0.24494402269990756,
 0.2654962443246246,
 0.27320144193825585,
 0.2793620776675372,
 0.2939744365285082,
 0.3121237746926317,
 0.31215807016232516,
 0.32382323620474895,
 0.3350018217779934,
 0.34116168200180025,
 0.34430606386006674,
 0.3450372612693094,
 0.3626860462983191,
 0.36052773815088285,
 0.3725849087834958,
 0.3714431890719524,
 0.3779650045926212,
 0.39500989179069407,
 0.3934144581380979,
 0.3997263715392541,
 0.39551841192914894,
 0.3948907917848962,
 0.3983659744844029,
 0.4296451723137392,
 0.4240395980564541,
 0.4246666343141372,
 0.4230622721294692,
 0.4222682230581753,
 0.42700899656166896,
 0.42878045848568286,
 0.42910988070399936,
 0.4322157855190423,
 0.432956873663026,
 0.43245537299503833,
 0.42415691227516344,
 0.424813743721

In [137]:
# Position of the highest silhouette score (first occurrence), then its value.
opt = max(range(len(silhouette_scores)), key=silhouette_scores.__getitem__)
silhouette_scores[opt]

0.432956873663026

In [138]:
# Refit K-Means with the selected number of clusters and store each row's label.
kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(movies_scaled)
movies['cluster'] = kmeans.labels_

In [139]:
# Project the scaled data onto its first three principal components and store
# one component per column (components.T yields one row per component).
pca = PCA(n_components=3)
components = pca.fit_transform(movies_scaled)
movies['PCA1'], movies['PCA2'], movies['PCA3'] = components.T


In [140]:
import plotly.express as px

# 3-D scatter of the PCA projection, colored by K-Means cluster.
# The integer labels are cast to strings so Plotly treats them as discrete
# categories (one legend entry per cluster) rather than a continuous color
# scale; without this the legend title set below has no legend to apply to.
# The rows are movies, so the title refers to films rather than users.
fig = px.scatter_3d(
    movies.assign(cluster=movies['cluster'].astype(str)),
    x='PCA1',
    y='PCA2',
    z='PCA3',
    color='cluster',
    # Keep legend entries in numeric order instead of first-appearance order.
    category_orders={'cluster': [str(label) for label in sorted(movies['cluster'].unique())]},
    title='Cluster di film con K-Means (PCA 3D)',
    labels={'PCA1': 'PCA1', 'PCA2': 'PCA2', 'PCA3': 'PCA3'}
)

fig.update_layout(
    margin=dict(l=0, r=0, b=0, t=40),
    legend_title_text='Cluster'
)

fig.show()

In [141]:
from kmodes.kmodes import KModes

# Fit K-Modes for every candidate k and record the silhouette score of the
# resulting labelling, in K_range order. Note that this reuses (and therefore
# overwrites) the K_range / silhouette_scores / optimal_k names.
K_range = range(2, 11)
silhouette_scores = []
for k in K_range:
    kmodes = KModes(n_clusters=k, random_state=42)
    silhouette_scores.append(
        silhouette_score(movies_scaled, kmodes.fit_predict(movies_scaled))
    )

# Select the k with the highest score (the first one if several tie).
optimal_k = max(zip(K_range, silhouette_scores), key=lambda pair: pair[1])[0]
print(f"Numero ottimale di cluster trovato: {optimal_k} | ")

Numero ottimale di cluster trovato: 2 | 


In [142]:
# Display the recorded silhouette scores, one per candidate k in K_range order.
silhouette_scores

[0.2782605699916609,
 0.2670845772517278,
 0.10893731503906871,
 0.11081404687260288,
 0.1093400608872219,
 0.07662315994196901,
 0.0705816379539427,
 0.08522040075213756,
 0.10695229494626014]

In [143]:
# Position of the highest silhouette score (first occurrence), then its value.
opt = max(range(len(silhouette_scores)), key=silhouette_scores.__getitem__)
silhouette_scores[opt]

0.2782605699916609

In [144]:
# Fit K-Modes with the selected number of clusters and store each row's label.
# The labels must come from `kmodes`: calling the earlier `kmeans` estimator
# here would store K-Means labels under the K-Modes column name.
kmodes = KModes(n_clusters=optimal_k, random_state=42)
movies['cluster_modes'] = kmodes.fit_predict(movies_scaled)

In [None]:
# Compute the three-component PCA projection of the scaled data and store one
# component per column (components.T yields one row per component).
pca = PCA(n_components=3)
components = pca.fit_transform(movies_scaled)
movies['PCA1'], movies['PCA2'], movies['PCA3'] = components.T

In [None]:
import plotly.express as px

# 3-D scatter of the PCA projection, colored by K-Modes cluster.
# This cell follows the K-Modes fit, so it colors by `cluster_modes` (not the
# K-Means `cluster` column) and the title names K-Modes and films accordingly.
# The integer labels are cast to strings so Plotly treats them as discrete
# categories (one legend entry per cluster) rather than a continuous color scale.
fig = px.scatter_3d(
    movies.assign(cluster_modes=movies['cluster_modes'].astype(str)),
    x='PCA1',
    y='PCA2',
    z='PCA3',
    color='cluster_modes',
    # Keep legend entries in numeric order instead of first-appearance order.
    category_orders={
        'cluster_modes': [str(label) for label in sorted(movies['cluster_modes'].unique())]
    },
    title='Cluster di film con K-Modes (PCA 3D)',
    labels={'PCA1': 'PCA1', 'PCA2': 'PCA2', 'PCA3': 'PCA3'}
)

fig.update_layout(
    margin=dict(l=0, r=0, b=0, t=40),
    legend_title_text='Cluster'
)

fig.show()