In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the movie catalogue and the user ratings of the MovieDS dataset.
movies = pd.read_csv("../datasets/MovieDS/movies.csv")
# The timestamp column is dropped immediately after loading the ratings.
ratings = pd.read_csv("../datasets/MovieDS/ratings.csv").drop(columns=["timestamp"])

# Join every rating with the columns of the movie it refers to.
# NOTE(review): despite its name, df_active is still unfiltered at this point.
df_active = pd.merge(ratings, movies, on="movieId")
df_active

In [None]:
# Keep only users with at least 20 ratings. If more than `flag` users qualify,
# keep a random sample of exactly `flag` of them.
flag = 10000
user_counts = df_active['userId'].value_counts()
active_users = user_counts[user_counts >= 20].index
df_active = df_active[df_active['userId'].isin(active_users)].copy()
if len(active_users) > flag:
    # Seeded generator: an unseeded np.random.choice draws a different user
    # subset on every run, which makes clusters and recommendations
    # irreproducible even though the SVD step is pinned with random_state=42.
    sampled_users = np.random.default_rng(42).choice(
        active_users.to_numpy(), flag, replace=False
    )
    # .copy() gives an independent frame, so adding columns to df_active later
    # does not write into a slice of the previous frame.
    df_active = df_active[df_active['userId'].isin(sampled_users)].copy()

df_active

In [None]:
# Map user and movie ids to contiguous integer codes, then build the sparse
# user x movie rating matrix from them.
df_active['u_idx'] = df_active['userId'].astype('category').cat.codes
df_active['m_idx'] = df_active['movieId'].astype('category').cat.codes

# Matrix dimensions: one row per distinct user code, one column per distinct movie code.
n_users, n_movies = (df_active[code_col].nunique() for code_col in ('u_idx', 'm_idx'))

# COO-style triplets: (row index, column index, stored value) for each rating row.
row, col, data = (
    df_active[field].values for field in ('u_idx', 'm_idx', 'rating')
)
user_movie_sparse = csr_matrix((data, (row, col)), shape=(n_users, n_movies))

In [None]:
# Dimensionality reduction: compress each user's sparse rating row into 50
# SVD components (fixed random_state for repeatable results).
svd = TruncatedSVD(
    random_state=42,
    n_components=50,
)
X_reduced = svd.fit_transform(user_movie_sparse)

# Standardize the reduced components before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reduced)

In [None]:
# Shapes at each stage: sparse matrix -> SVD projection -> standardized projection.
print("Shape originale:", user_movie_sparse.shape)
print("Shape dopo SVD:", X_reduced.shape)
print("Shape dopo scaling:", X_scaled.shape)

# Sample values for the first user at each stage. Each call prints the label
# and the values on separate lines (sep="\n").
print("\nEsempio - valori originali (sparsi):", user_movie_sparse[0], sep="\n")
print("\nEsempio - valori ridotti:", X_reduced[0], sep="\n")
print("\nEsempio - valori scalati:", X_scaled[0], sep="\n")

In [None]:
# Agglomerative clustering of the standardized user vectors into a fixed
# number of clusters (all other estimator settings are left at their defaults).
n_clusters = 5
agglo = AgglomerativeClustering(n_clusters=n_clusters)
# Fit, then read the per-user cluster assignment from the fitted estimator.
labels = agglo.fit(X_scaled).labels_

In [None]:
# Cluster lookup table: one row per user with its matrix row index, its
# original userId and the cluster label assigned to it.
user_map = pd.DataFrame(
    dict(
        u_idx=np.arange(n_users),
        # Categories come from the same categorical conversion of userId that
        # produced the u_idx codes, so position i holds the id for code i.
        userId=df_active['userId'].astype('category').cat.categories,
        cluster=labels,
    )
)

print(user_map.head())

In [None]:
def recommend_for_user_active(user_id, df_active, user_map, top_n=10):
    """Recommend unseen movies to a user from the mean ratings of their cluster.

    The user's cluster is looked up in ``user_map``. Ratings from every user
    in that cluster are averaged per movie, movies the user has already rated
    are discarded, and the ``top_n`` remaining movies with the highest mean
    rating are returned, best first.

    Parameters
    ----------
    user_id : scalar
        Value matched against the ``userId`` column of both frames.
    df_active : pandas.DataFrame
        Ratings table; the columns ``userId``, ``movieId``, ``rating`` and
        ``title`` are read.
    user_map : pandas.DataFrame
        User-to-cluster table; the columns ``userId`` and ``cluster`` are read.
    top_n : int, default 10
        Maximum number of movies to recommend.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``title`` with a fresh 0..k-1 index, ordered
        by descending cluster mean rating.

    Raises
    ------
    IndexError
        If ``user_id`` does not appear in ``user_map``.
    """
    # Cluster the user belongs to.
    cluster_of_user = user_map.loc[user_map['userId'] == user_id, 'cluster']
    if cluster_of_user.empty:
        # Same exception type the bare `.values[0]` lookup used to raise,
        # but with a message that names the actual problem.
        raise IndexError(f"user_id {user_id!r} not found in user_map")
    c = cluster_of_user.iloc[0]

    # Every user assigned to that cluster.
    users_in_cluster = user_map.loc[user_map['cluster'] == c, 'userId'].values

    # Mean rating per movie, computed over the cluster's ratings only.
    cluster_ratings = (
        df_active[df_active['userId'].isin(users_in_cluster)]
        .groupby('movieId')['rating']
        .mean()
    )

    # Movies the target user has already rated.
    seen = set(df_active.loc[df_active['userId'] == user_id, 'movieId'])

    # Best-rated unseen movies; the stable sort keeps tie order deterministic.
    top_movie_ids = (
        cluster_ratings[~cluster_ratings.index.isin(seen)]
        .sort_values(ascending=False, kind='stable')
        .head(top_n)
        .index
    )

    # Filtering df_active with isin() yields rows in df_active order, which
    # would discard the ranking, so restore it explicitly before returning.
    rank_of = {movie_id: position for position, movie_id in enumerate(top_movie_ids)}
    recs = (
        df_active.loc[df_active['movieId'].isin(top_movie_ids), ['movieId', 'title']]
        .drop_duplicates()
    )
    return (
        recs.assign(_rank=recs['movieId'].map(rank_of))
        .sort_values('_rank', kind='stable')
        .drop(columns='_rank')
        .reset_index(drop=True)
    )

In [None]:
# Pick one user at random among the distinct userIds left in df_active.
user_id = random_user = df_active['userId'].drop_duplicates().sample(1).iloc[0]

# Look up the cluster label(s) recorded for that user; fail loudly if there is none.
cluster_id = user_map.loc[user_map['userId'] == user_id, 'cluster'].values
if not len(cluster_id):
    raise ValueError(f"Utente {user_id} non trovato tra quelli filtrati.")
cluster_id = cluster_id[0]
print(f"Utente {user_id} è nel cluster {cluster_id}")

In [None]:
# Compute and show the top-10 recommendations for the selected user.
recs = recommend_for_user_active(
    user_id=user_id,
    df_active=df_active,
    user_map=user_map,
    top_n=10,
)
print("Raccomandazioni:", recs, sep="\n")