In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix


In [5]:
movies = pd.read_csv("../datasets/MovieDS/movies.csv")
ratings = pd.read_csv("../datasets/MovieDS/ratings.csv")
ratings = ratings.drop(columns=("timestamp"))

df_active = ratings.merge(movies, on="movieId")
df_active

Unnamed: 0,userId,movieId,rating,title,genres
0,1,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,Underground (1995),Comedy|Drama|War
4,1,899,3.5,Singin' in the Rain (1952),Comedy|Musical|Romance
...,...,...,...,...,...
25000090,162541,50872,4.5,Ratatouille (2007),Animation|Children|Drama
25000091,162541,55768,2.5,Bee Movie (2007),Animation|Comedy
25000092,162541,56176,2.0,Alvin and the Chipmunks (2007),Children|Comedy
25000093,162541,58559,4.0,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX


In [6]:
# Mantieni solo utenti con almeno 20 rating 
user_counts = df_active['userId'].value_counts()
active_users = user_counts[user_counts >= 20].index
df_active = df_active[df_active['userId'].isin(active_users)].copy()
if len(active_users) > 10000:
    sampled_users = np.random.choice(active_users, 10000, replace=False)
    df_active = df_active[df_active['userId'].isin(sampled_users)]

df_active

Unnamed: 0,userId,movieId,rating,title,genres
3739,21,260,4.0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
3740,21,293,4.0,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller
3741,21,318,5.0,"Shawshank Redemption, The (1994)",Crime|Drama
3742,21,527,5.0,Schindler's List (1993),Drama|War
3743,21,541,5.0,Blade Runner (1982),Action|Sci-Fi|Thriller
...,...,...,...,...,...
24999374,162534,193065,3.0,Roma (2018),Drama
24999375,162534,193944,3.0,The Ballad of Buster Scruggs (2018),Comedy|Drama|Western
24999376,162534,194400,3.0,Widows (2018),Crime|Drama|Thriller
24999377,162534,195163,2.0,Bumblebee (2018),Action|Adventure|Sci-Fi


In [7]:
# Mappatura in indici continui e matrice sparsa
df_active['u_idx'] = df_active['userId'].astype('category').cat.codes
df_active['m_idx'] = df_active['movieId'].astype('category').cat.codes

n_users  = df_active['u_idx'].nunique()
n_movies = df_active['m_idx'].nunique()

row = df_active['u_idx'].values
col = df_active['m_idx'].values
data = df_active['rating'].values
user_movie_sparse = csr_matrix((data, (row, col)), shape=(n_users, n_movies))

In [8]:
# Riduzione dimensionale
svd = TruncatedSVD(n_components=50, random_state=42)
X_reduced = svd.fit_transform(user_movie_sparse)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reduced)

  U = Q @ Uhat
  U = Q @ Uhat
  U = Q @ Uhat


In [9]:
# Agglomerative Clustering
n_clusters = 5
agglo = AgglomerativeClustering(n_clusters=n_clusters)
labels = agglo.fit_predict(X_scaled)

In [10]:
# DataFrame dei cluster
user_map = pd.DataFrame({
    'u_idx': np.arange(n_users),
    'userId': df_active['userId'].astype('category').cat.categories,
    'cluster': labels
})

print(user_map.head())

   u_idx  userId  cluster
0      0      21        0
1      1      41        0
2      2      46        0
3      3      47        4
4      4      61        0


In [11]:
def recommend_for_user_active(user_id, df_active, user_map, top_n=10):
    # cluster di appartenenza
    c = user_map.loc[user_map['userId'] == user_id, 'cluster'].values[0]
    # seleziona tutti gli utenti in quel cluster
    users_in_cluster = user_map.loc[user_map['cluster'] == c, 'userId'].values
    # calcola media rating per movieId
    cluster_ratings = (
        df_active[df_active['userId'].isin(users_in_cluster)]
        .groupby('movieId')['rating']
        .mean()
    )
    # titoli già visti dall'utente
    seen = set(df_active[df_active['userId'] == user_id]['movieId'])
    # top_n non ancora visti
    top_movie_ids = (
        cluster_ratings[~cluster_ratings.index.isin(seen)]
        .sort_values(ascending=False)
        .head(top_n)
        .index
    )
    return (
        df_active[df_active['movieId'].isin(top_movie_ids)]
        [['movieId', 'title']]
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [None]:
user_id = 21
cluster_id = user_map.loc[user_map['userId'] == user_id, 'cluster'].values
if len(cluster_id) == 0:
    raise ValueError(f"Utente {user_id} non trovato tra quelli filtrati.")
cluster_id = cluster_id[0]
print(f"Utente {user_id} è nel cluster {cluster_id}")

ValueError: Utente 1 non trovato tra quelli filtrati.

In [None]:
recs = recommend_for_user_active(user_id, df_active, user_map, top_n=10)
print("Raccomandazioni:")
print(recs)