In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Normalizer


In [None]:
def split_ratings_by_user(rating_df, test_size=0.2, min_ratings=5, random_state=42):
    """Split each user's ratings into a train part and a test part.

    The frame is grouped by ``userId`` and every group is split independently
    with ``train_test_split``, so each retained user contributes rows to both
    outputs.

    Args:
        rating_df: DataFrame with at least a ``userId`` column.
        test_size: Forwarded to ``train_test_split`` for every user.
        min_ratings: Users with fewer rows than this are skipped entirely.
        random_state: Seed forwarded to ``train_test_split`` for every user.

    Returns:
        ``(train, test)`` DataFrames. When no user has enough ratings, both
        are empty frames with the same columns as ``rating_df``.
    """
    train_parts = []
    test_parts = []

    for _, group in rating_df.groupby('userId'):
        if len(group) < min_ratings:
            continue  # skip users with too few ratings

        train, test = train_test_split(group, test_size=test_size, random_state=random_state)
        train_parts.append(train)
        test_parts.append(test)

    if not train_parts:
        # pd.concat([]) raises ValueError; return empty frames that keep the schema.
        empty = rating_df.iloc[0:0]
        return empty.copy(), empty.copy()

    return pd.concat(train_parts), pd.concat(test_parts)

In [None]:
# Upper bound on the number of active users kept for the experiment.
# The name is kept as-is because a later cell reads `flag` too.
flag = 1000
# Fixed seed so the user sub-sample (and everything derived from it) is
# identical on every Restart & Run All.
SAMPLE_SEED = 42

rating_df = pd.read_csv('../datasets/MovieDS/ratings.csv').drop(columns=['timestamp'])

# Keep only users with at least 20 ratings.
user_counts = rating_df['userId'].value_counts()
active_users = user_counts[user_counts >= 20].index
rating_df = rating_df[rating_df['userId'].isin(active_users)].copy()
if len(active_users) > flag:
    # Sample `flag` distinct users with a seeded generator instead of the
    # unseeded global NumPy state.
    sample_rng = np.random.default_rng(SAMPLE_SEED)
    sampled_users = sample_rng.choice(active_users.to_numpy(), flag, replace=False)
    rating_df = rating_df[rating_df['userId'].isin(sampled_users)]


rating_train, rating_test = split_ratings_by_user(rating_df=rating_df)

rating_df

In [None]:
# Build the user x movie rating matrix from the training split only.
# Row and column positions follow the order in which ids first appear in
# rating_train.
user_ids = rating_train['userId'].unique()
movie_ids = rating_train['movieId'].unique()

# id -> matrix position lookups
user_mapper = dict(zip(user_ids, range(len(user_ids))))
movie_mapper = dict(zip(movie_ids, range(len(movie_ids))))

user_idx = rating_train['userId'].map(user_mapper)
movie_idx = rating_train['movieId'].map(movie_mapper)

n_users, n_movies = len(user_ids), len(movie_ids)
sparse_matrix = csr_matrix(
    (rating_train['rating'], (user_idx, movie_idx)),
    shape=(n_users, n_movies),
)

In [None]:
# Normalise the user x movie matrix with scikit-learn's Normalizer before clustering.
# NOTE(review): the original comment labelled this step "dimensionality reduction",
# but only a Normalizer is applied here; per the scikit-learn docs it rescales each
# row (sample) and does not reduce the number of columns — confirm the intent.
normalizer = Normalizer()
ratings_norm = normalizer.fit_transform(sparse_matrix)

In [None]:
# Number of user clusters requested from the agglomerative model.
n_clusters = 5
agglo = AgglomerativeClustering(
    n_clusters=n_clusters,
    linkage='ward'          # Ward linkage: minimises within-cluster variance
)
# AgglomerativeClustering needs dense input, so the sparse matrix is densified here.
# One label per matrix row (i.e. per user) is returned.
user_clusters = agglo.fit_predict(ratings_norm.toarray())

In [None]:
# Matrix row i belongs to the user whose id maps to i, so invert the mapper
# to translate row positions back into user ids.
inv_user_mapper = {v: k for k, v in user_mapper.items()}
cluster_df = pd.DataFrame({
    'userId': [inv_user_mapper[i] for i in range(len(user_clusters))],
    'cluster': user_clusters
})
# Drop any 'cluster' column left by a previous run of this cell. Without this,
# the merge would emit 'cluster_x'/'cluster_y' and the lookup below would raise
# KeyError on re-execution.
rating_df = (
    rating_df
    .drop(columns=['cluster'], errors='ignore')
    .merge(cluster_df, on='userId', how='left')
)

print("Cluster assegnati:", rating_df['cluster'].unique())
rating_df.head()

In [None]:
def recommend_items_df(user_id:int, rating_df: pd.DataFrame, top_n=5):
    """Recommend the best-rated unseen movies from the user's own cluster.

    Args:
        user_id: Id of the user to recommend for.
        rating_df: Ratings with ``userId``, ``movieId``, ``rating`` and
            ``cluster`` columns.
        top_n: Maximum number of movie ids to return.

    Returns:
        Movie ids ordered by descending mean rating within the user's cluster,
        excluding movies the user has already rated. An empty list is returned
        when the user has no rows in ``rating_df`` or no candidate remains.
    """
    user_rows = rating_df[rating_df['userId'] == user_id]
    if user_rows.empty:
        # Unknown user: return nothing instead of raising IndexError on .iloc[0].
        return []

    # Cluster of the user (taken from the user's first row).
    user_cluster = user_rows['cluster'].iloc[0]

    # Movies the user has already rated.
    rated_movies = set(user_rows['movieId'])

    # Ratings from the same cluster, minus movies the user has already rated.
    same_cluster = rating_df['cluster'] == user_cluster
    unseen = ~rating_df['movieId'].isin(rated_movies)
    candidates = rating_df[same_cluster & unseen]

    # Mean rating per candidate movie, best first.
    movie_means = candidates.groupby('movieId')['rating'].mean()
    top_movies = movie_means.sort_values(ascending=False).head(top_n)

    return top_movies.index.tolist()


# Pick one user for the demo; the fixed random_state makes the pick (and the
# table shown below) reproducible across runs.
random_user = rating_df['userId'].drop_duplicates().sample(1, random_state=42).iloc[0]
user_id = random_user

recommended_ids = recommend_items_df(user_id=user_id, rating_df=rating_df)
movies_df = pd.read_csv('../datasets/MovieDS/movies.csv')
# isin() keeps the row order of movies_df, so the table below is not in ranking order.
recommended_movies = movies_df[movies_df['movieId'].isin(recommended_ids)]
recommended_movies.head()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.cluster import MiniBatchKMeans
from scipy.sparse import csr_matrix
from typing import List

### STEP 1 – Load the data
rating_df = pd.read_csv('../datasets/MovieDS/ratings.csv').drop(columns=['timestamp'])

# Keep only users with at least 20 ratings.
user_counts = rating_df['userId'].value_counts()
active_users = user_counts[user_counts >= 20].index
rating_df = rating_df[rating_df['userId'].isin(active_users)].copy()
# `flag` (the cap on sampled users) is defined in an earlier cell of this notebook.
if len(active_users) > flag:
    # Seeded generator so the sampled users, and therefore the reported
    # metrics, are the same on every run.
    sample_rng = np.random.default_rng(42)
    sampled_users = sample_rng.choice(active_users.to_numpy(), flag, replace=False)
    rating_df = rating_df[rating_df['userId'].isin(sampled_users)]


### STEP 2 – Per-user train/test split
def split_ratings_by_user(rating_df: pd.DataFrame, test_size=0.2, min_ratings=5, random_state=42):
    """Split each user's ratings into a train part and a test part.

    Every ``userId`` group is split independently with ``train_test_split``.

    Args:
        rating_df: DataFrame with at least a ``userId`` column.
        test_size: Forwarded to ``train_test_split`` for every user.
        min_ratings: Users with fewer rows than this are skipped entirely.
        random_state: Seed forwarded to ``train_test_split`` for every user.

    Returns:
        ``(train, test)`` DataFrames. When no user has enough ratings, both
        are empty frames with the same columns as ``rating_df``.
    """
    train_parts = []
    test_parts = []
    for _, group in rating_df.groupby('userId'):
        if len(group) < min_ratings:
            continue  # too few ratings to split
        train, test = train_test_split(group, test_size=test_size, random_state=random_state)
        train_parts.append(train)
        test_parts.append(test)

    if not train_parts:
        # pd.concat([]) raises ValueError; return empty frames that keep the schema.
        empty = rating_df.iloc[0:0]
        return empty.copy(), empty.copy()

    return pd.concat(train_parts), pd.concat(test_parts)

# Split the sampled ratings per user, using the function's default test_size.
rating_train, rating_test = split_ratings_by_user(rating_df)

### STEP 3 – Cluster users on the training set
def cluster_users(df: pd.DataFrame, n_clusters=5):
    """Assign every user in ``df`` to a cluster and return ``df`` with the labels.

    A user x movie rating matrix is built from ``df``, passed through
    ``Normalizer`` and clustered with ``MiniBatchKMeans`` (``random_state=42``).

    Args:
        df: Ratings with ``userId``, ``movieId`` and ``rating`` columns.
        n_clusters: Number of clusters requested from ``MiniBatchKMeans``.

    Returns:
        A new DataFrame with the rows of ``df`` plus a ``cluster`` column.
        Any ``cluster`` column already present is replaced. An empty input
        yields an empty frame that still has the ``cluster`` column.
    """
    # A 'cluster' column from an earlier call would make the merge below emit
    # 'cluster_x'/'cluster_y'; drop it so the function can be re-applied safely.
    df = df.drop(columns=['cluster'], errors='ignore')
    if df.empty:
        # Nothing to cluster: keep the schema callers expect.
        return df.assign(cluster=np.empty(0, dtype='int64'))

    user_ids = df['userId'].unique()
    movie_ids = df['movieId'].unique()

    # id -> matrix position, in order of first appearance
    user_mapper = {uid: i for i, uid in enumerate(user_ids)}
    movie_mapper = {mid: i for i, mid in enumerate(movie_ids)}

    user_idx = df['userId'].map(user_mapper)
    movie_idx = df['movieId'].map(movie_mapper)

    sparse_matrix = csr_matrix(
        (df['rating'], (user_idx, movie_idx)),
        shape=(len(user_ids), len(movie_ids))
    )

    normalizer = Normalizer()
    ratings_norm = normalizer.fit_transform(sparse_matrix)

    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    user_clusters = kmeans.fit_predict(ratings_norm)

    # Matrix row i is user_ids[i], so the labels line up with user_ids directly
    # and no inverse mapper is needed.
    cluster_df = pd.DataFrame({'userId': user_ids, 'cluster': user_clusters})

    return df.merge(cluster_df, on='userId', how='left')

# Re-bind rating_train to the frame returned by cluster_users, which carries a 'cluster' column.
rating_train = cluster_users(rating_train, n_clusters=5)

### STEP 4 – Top-N recommendation from the user's cluster
def recommend_items_df(user_id: int, rating_df: pd.DataFrame, top_n=5) -> List[int]:
    """Return up to ``top_n`` movie ids for ``user_id`` based on cluster averages.

    Candidates are movies rated by users in the same cluster that ``user_id``
    has not rated; they are ranked by mean rating, highest first. An empty
    list is returned when ``user_id`` has no rows in ``rating_df``.
    """
    is_target = rating_df['userId'] == user_id
    if not is_target.any():
        return []

    own_rows = rating_df[is_target]
    target_cluster = own_rows['cluster'].iloc[0]
    seen = set(own_rows['movieId'])

    in_cluster = rating_df['cluster'] == target_cluster
    not_seen = ~rating_df['movieId'].isin(seen)
    pool = rating_df[in_cluster & not_seen]

    ranked = pool.groupby('movieId')['rating'].mean().sort_values(ascending=False)
    return ranked.head(top_n).index.tolist()

### STEP 5 – Evaluation: Precision@N and Recall@N
def evaluate_recommender(rating_train: pd.DataFrame, rating_test: pd.DataFrame, top_n=5, threshold=4.0):
    """Average per-user precision and recall of the cluster recommender.

    For each user in ``rating_test`` that also appears in ``rating_train``,
    the relevant set is the user's test movies rated ``>= threshold``.
    Recommendations come from ``recommend_items_df(user_id, rating_train, top_n)``.
    Users with no relevant movies or no recommendations are skipped.

    Args:
        rating_train: Training ratings, passed to ``recommend_items_df``.
        rating_test: Held-out ratings with ``userId``, ``movieId``, ``rating``.
        top_n: Number of recommendations requested per user.
        threshold: Minimum test rating for a movie to count as relevant.

    Returns:
        ``(avg_precision, avg_recall)``. Precision is divided by the number of
        movies actually recommended. Both values are NaN when no user could
        be evaluated.
    """
    # Loop-invariant lookups, computed once instead of re-scanning both frames
    # for every user.
    train_users = set(rating_train['userId'].unique())
    liked = rating_test[rating_test['rating'] >= threshold]
    relevant_by_user = {
        uid: set(movies) for uid, movies in liked.groupby('userId')['movieId']
    }

    precisions = []
    recalls = []

    for user_id in rating_test['userId'].unique():
        if user_id not in train_users:
            continue

        relevant_movies = relevant_by_user.get(user_id)
        if not relevant_movies:
            continue

        recommended = recommend_items_df(user_id, rating_train, top_n)
        if not recommended:
            continue

        recommended_set = set(recommended)
        true_positives = recommended_set & relevant_movies

        precisions.append(len(true_positives) / len(recommended_set))
        recalls.append(len(true_positives) / len(relevant_movies))

    if not precisions:
        # np.mean([]) would also give NaN, but with RuntimeWarnings; be explicit.
        return float('nan'), float('nan')

    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    return avg_precision, avg_recall

### STEP 6 – Run the evaluation
# top_n=5 here must stay in sync with the "@5" in the printed labels below.
precision, recall = evaluate_recommender(rating_train, rating_test, top_n=5)
print(f"Precision@5: {precision:.4f}")
print(f"Recall@5:    {recall:.4f}")
