In [275]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
from functools import lru_cache
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, mean_absolute_error, root_mean_squared_error
from sklearn.decomposition import TruncatedSVD

In [276]:
def load_data(url) -> pd.DataFrame:
    """Load a tab-separated ratings file, discarding the timestamp column.

    Args:
        url: Path, URL or file-like object accepted by ``pd.read_csv``. The
            file has no header row; its fields are read as user_id,
            movie_id, rating, timestamp.

    Returns:
        DataFrame with the columns ``user_id``, ``movie_id`` and ``rating``.
    """
    field_names = ('user_id', 'movie_id', 'rating', 'timestamp')
    ratings = pd.read_csv(url, sep='\t', names=list(field_names))
    # The timestamp is not used anywhere downstream, so it is dropped here.
    return ratings.drop(columns='timestamp')

def create_user_item_matrix(df: pd.DataFrame):
    """Pivot a long ratings table into a dense user x item matrix.

    Args:
        df: DataFrame with ``user_id``, ``movie_id`` and ``rating`` columns.

    Returns:
        DataFrame indexed by ``user_id`` with one column per ``movie_id``;
        user/movie pairs without a rating are filled with 0.
    """
    wide = df.pivot(index='user_id', columns='movie_id', values='rating')
    # Missing (user, movie) pairs come out of the pivot as NaN; 0 marks "not rated".
    return wide.fillna(0)


In [277]:
def find_optimal_k(data: pd.DataFrame, k_min=2, k_max=10, verbose=False):
    """Choose the number of KMeans clusters with the highest silhouette score.

    Every k in ``[k_min, k_max]`` is tried with a fixed random seed; the
    first k reaching the best score wins.

    Args:
        data: Samples to cluster, one row per sample.
        k_min: Smallest number of clusters to try.
        k_max: Largest number of clusters to try (inclusive).
        verbose: When True, print the silhouette score of every candidate k.

    Returns:
        The selected k. The winner and its score are always printed.
    """
    winner, top_silhouette = k_min, -1
    for n_clusters in range(k_min, k_max + 1):
        fitted = KMeans(n_clusters=n_clusters, random_state=42).fit(data)
        current = silhouette_score(data, fitted.labels_)
        if verbose:
            print(f"k={n_clusters:2d} - silhouette={current:.3f}")
        # Strict comparison: on ties the smaller k is kept.
        if current > top_silhouette:
            winner, top_silhouette = n_clusters, current
    print(f"Miglior k: {winner} (silhouette={top_silhouette:.3f})")
    return winner

def cluster_users(data, n_clusters):
    """Cluster the rows of ``data`` with KMeans (fixed seed 42).

    Args:
        data: Samples to cluster, one row per sample.
        n_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(labels, model)``: the cluster label of every row and the
        fitted KMeans estimator.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42)
    model.fit(data)
    return model.labels_, model


In [278]:
def show_cluster(data: pd.DataFrame):
    """Scatter-plot the rows of ``data`` in 2D PCA space, coloured by cluster.

    Args:
        data: DataFrame with a ``cluster`` column holding the cluster label
            of each row; all remaining columns are used as features.
    """
    from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt

    cluster_ids = data['cluster'].values
    features = data.drop('cluster', axis=1).values

    # Project the feature space onto its first two principal components.
    projected = PCA(n_components=2, random_state=42).fit_transform(features)
    first_pc, second_pc = projected[:, 0], projected[:, 1]

    # One point per row, coloured by its cluster label.
    plt.figure(figsize=(8, 6))
    points = plt.scatter(first_pc, second_pc, c=cluster_ids, cmap='tab10', alpha=0.6)
    handles, legend_labels = points.legend_elements()
    plt.legend(handles, legend_labels, title="Cluster")
    plt.title("Utenti proiettati su PCA 2D")


In [279]:
@lru_cache(maxsize=1)
def load_movies(path="../datasets/ml-100k/u.item"):
    """Load the movie catalogue and index it by ``movie_id``.

    The result is memoised (one cache slot), so repeated calls with the same
    ``path`` reuse the already parsed DataFrame instead of reading again.

    Args:
        path: Location of the pipe-separated, latin-1 encoded item file.

    Returns:
        DataFrame indexed by ``movie_id`` with the title, date and URL
        columns followed by one column per genre.
    """
    metadata_cols = ["movie_id", "title", "release_date", "video_release_date", "IMDb_URL"]
    genre_cols = ["unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
                  "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
                  "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
    movies = pd.read_csv(path, sep="|", encoding="latin-1", names=metadata_cols + genre_cols)
    return movies.set_index("movie_id")

def get_item_title(rec):
    """Look up the titles of several movies at once.

    Args:
        rec: List-like of ``movie_id`` values.

    Returns:
        List of titles, in the same order as ``rec``.
    """
    catalogue = load_movies()
    # Label-based selection keeps the order of the requested ids.
    selected = catalogue.loc[rec, "title"]
    return selected.tolist()

In [294]:
def find_similar_user(users: pd.DataFrame, user_id: int) -> pd.Series:
    """Rank all other users by cosine similarity to ``user_id``.

    Args:
        users: One row per user (index = user ids), numeric feature columns.
        user_id: Label of the target user in ``users.index``.

    Returns:
        Series indexed by user id holding the cosine similarity to the
        target user, sorted in descending order; the target user itself is
        excluded.

    Raises:
        KeyError: If ``user_id`` is not in ``users.index``.
    """
    mat = users.values
    # Positional row of the target user.
    i = users.index.get_loc(user_id)
    # Only the target user's row of the similarity matrix is needed, so
    # compare that single row against everyone instead of building the full
    # n x n matrix on every call.
    sims = cosine_similarity(mat[i:i + 1], mat)[0]
    sim_series = pd.Series(sims, index=users.index)
    return sim_series.drop(user_id).sort_values(ascending=False)

def _fallback_rating(ui_matrix: pd.DataFrame, user_id: int) -> float:
    """Return the user's mean rating, or the global mean if the user has none.

    Zeros are treated as "not rated". The ``cluster`` column (when present)
    is a label, not a rating, so it is left out of both means.
    """
    user_row = ui_matrix.loc[user_id].drop(labels='cluster', errors='ignore')
    user_mean = user_row.replace(0, np.nan).mean()
    if not np.isnan(user_mean):
        return user_mean

    # Global mean over the actual ratings only. A plain ``.mean()`` on an
    # array containing NaN would itself be NaN, so filter explicitly.
    values = ui_matrix.drop(columns='cluster', errors='ignore').to_numpy(dtype=float)
    rated = values[(values != 0) & ~np.isnan(values)]
    return rated.mean() if rated.size else np.nan


def predict_rating(ui_matrix: pd.DataFrame, user_id: int, item_id: int, k_sim: int = None, verbose: bool = False) -> float:
    """Predict the rating ``user_id`` would give to ``item_id``.

    Strategy:
      - unknown item: the user's mean rating, else the global mean;
      - otherwise: similarity-weighted mean of the ratings given to the item
        by the (top ``k_sim``) most similar users of the same cluster;
      - if none of those neighbours rated the item: the item's mean rating;
      - if the weights sum to zero: same fallback as for an unknown item.

    Args:
        ui_matrix: User x item rating matrix (0 = not rated) with an
            additional ``cluster`` column holding each user's cluster label.
        user_id: Label of the target user in ``ui_matrix.index``.
        item_id: Column label of the item to predict.
        k_sim: If truthy, only the ``k_sim`` most similar users are used.
        verbose: When True, print the user's cluster and its size.

    Returns:
        The predicted rating (NaN only if the matrix holds no ratings at all).

    Raises:
        KeyError: If ``user_id`` is not in ``ui_matrix.index``.
    """
    # 1) New item -> fall back to the user's mean / the global mean.
    if item_id not in ui_matrix.columns:
        return _fallback_rating(ui_matrix, user_id)

    # 2) Keep only the users of the same cluster (without the label column).
    cluster_id = ui_matrix.at[user_id, 'cluster']
    group = ui_matrix[ui_matrix['cluster'] == cluster_id].drop(columns='cluster')

    if verbose:
        print(f"Utente {user_id} nel cluster {cluster_id} ({group.shape[0]} utenti)")

    # 3) Similarities within the cluster, optionally truncated to the top k.
    sims = find_similar_user(group, user_id)
    if k_sim:
        sims = sims.head(k_sim)

    # 4) Keep only the neighbours that actually rated the item.
    ratings = ui_matrix.loc[sims.index, item_id]
    mask = ratings > 0
    if not mask.any():
        # No neighbour rated it -> mean of all non-zero ratings of the item.
        col = ui_matrix[item_id]
        item_ratings = col[col > 0]
        if not item_ratings.empty:
            return item_ratings.mean()
        # Nobody rated the item at all: avoid returning NaN.
        return _fallback_rating(ui_matrix, user_id)

    # 5) Similarity-weighted mean, guarding against a zero denominator.
    weighted_sum = (sims[mask] * ratings[mask]).sum()
    sim_sum = sims[mask].sum()

    if sim_sum == 0.0:
        return _fallback_rating(ui_matrix, user_id)

    return weighted_sum / sim_sum

def recommend_top_n(ui_matrix: pd.DataFrame, user_id: int, num_recommendations: int = 5, k_sim: int = None) -> list:
    """Return the top-N items recommended to ``user_id``.

    Only items the user has not rated yet (rating == 0) are candidates; each
    one is scored with ``predict_rating``.

    Args:
        ui_matrix: User x item rating matrix (0 = not rated), including the
            ``cluster`` column required by ``predict_rating``.
        user_id: Label of the target user in ``ui_matrix.index``.
        num_recommendations: Maximum number of items to return.
        k_sim: Forwarded to ``predict_rating`` (number of neighbours).

    Returns:
        List of item ids ordered by decreasing predicted rating.
    """
    # The 'cluster' label is metadata, not an item: without dropping it, users
    # in cluster 0 would get 'cluster' as an "unrated item" candidate.
    user_ratings = ui_matrix.loc[user_id].drop(labels='cluster', errors='ignore')
    unrated = user_ratings[user_ratings == 0].index

    preds = {}
    for item in unrated:
        score = predict_rating(ui_matrix, user_id, item, k_sim)
        # NaN scores cannot be ordered reliably, so they are skipped.
        if not pd.isna(score):
            preds[item] = score

    # Keep the num_recommendations items with the highest predicted rating.
    return sorted(preds, key=preds.get, reverse=True)[:num_recommendations]


In [None]:
def recommend_movies(ui_matrix, user_id, num_recommendations=5):
    """Recommend movies to a user from the ratings of similar users.

    Each movie is scored with the similarity-weighted sum of the ratings
    given by all other users; movies the user already rated are excluded.
    Everything is derived from the ``ui_matrix`` argument, so the function
    does not depend on notebook-level globals.

    Args:
        ui_matrix: User x item rating matrix (0 = not rated). A ``cluster``
            column, if present, is ignored.
        user_id: Label of the target user in ``ui_matrix.index``.
        num_recommendations: Maximum number of movies to return.

    Returns:
        Index of movie ids ordered by decreasing score.
    """
    ratings = ui_matrix.drop(columns='cluster', errors='ignore')

    # Cosine similarity to every other user; the user itself is excluded by
    # label rather than by position in the sorted list.
    similar_users = find_similar_user(ratings, user_id)

    # Similarity-weighted sum of the other users' ratings, one score per movie.
    neighbour_ratings = ratings.loc[similar_users.index].to_numpy(dtype=float)
    weighted_ratings = similar_users.to_numpy(dtype=float) @ neighbour_ratings
    scores = pd.Series(weighted_ratings, index=ratings.columns)

    # Only movies the user has not rated yet are eligible, so an already seen
    # movie can never be returned, even when few candidates score above zero.
    unseen = scores[~(ratings.loc[user_id] > 0)]
    return unseen.sort_values(ascending=False).head(num_recommendations).index

In [None]:
def recommend_top_n(ui_matrix, labels, user_id, N):
    """Return the N items with the highest predicted rating for ``user_id``.

    Reuses ``predict_rating`` for every item the user has not rated yet.
    Note: this definition shares its name with another ``recommend_top_n``
    in this notebook (different signature); whichever cell runs last wins.

    Args:
        ui_matrix: User x item rating matrix (0 = not rated).
        labels: Cluster label of each row of ``ui_matrix``. Only used when
            ``ui_matrix`` has no ``cluster`` column yet, in which case the
            labels are attached to a copy so ``predict_rating`` can use them.
        user_id: Label of the target user in ``ui_matrix.index``.
        N: Maximum number of items to return.

    Returns:
        List of item ids ordered by decreasing predicted rating.
    """
    if 'cluster' not in ui_matrix.columns:
        # predict_rating reads the cluster from the matrix itself.
        ui_matrix = ui_matrix.assign(cluster=labels)

    # Select the user's row by label (user ids need not be 1..n in order) and
    # leave out the 'cluster' label, which is not an item.
    user_ratings = ui_matrix.loc[user_id].drop(labels='cluster')
    candidates = user_ratings[~(user_ratings > 0)].index

    # predict_rating expects (ui_matrix, user_id, item_id).
    scores = [(m, predict_rating(ui_matrix, user_id, m)) for m in candidates]
    ranked = sorted(scores, key=lambda x: x[1], reverse=True)
    return [m for m, _ in ranked[:N]]

In [282]:
def evaluate_rating_pred(ui_matrix, test_df: pd.DataFrame, threshold = 4):
    """Score ``predict_rating`` against a held-out set of ratings.

    Args:
        ui_matrix: Matrix handed to ``predict_rating`` for every test row.
        test_df: DataFrame with ``user_id``, ``movie_id`` and ``rating``.
        threshold: Ratings (true and predicted) greater than or equal to this
            value count as the positive class for the classification metrics.

    Returns:
        Tuple ``(rmse, mae, accuracy, precision, recall)``.
    """
    # Walk the three relevant columns as plain tuples, behind a progress bar.
    triples = test_df[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None)
    progress = tqdm(triples, total=len(test_df),
                    desc="Evaluating ratings", unit="row", ncols=80, mininterval=0.5)

    actual, predicted = [], []
    for user, movie, stars in progress:
        user, movie, stars = int(user), int(movie), float(stars)
        predicted.append(predict_rating(ui_matrix=ui_matrix, user_id=user, item_id=movie))
        actual.append(stars)

    y_true = np.array(actual)
    y_pred = np.array(predicted)

    # Regression metrics on the raw ratings.
    regression = (
        root_mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
    )

    # Classification metrics on the thresholded ratings.
    liked_true = (y_true >= threshold).astype(int)
    liked_pred = (y_pred >= threshold).astype(int)
    classification = (
        accuracy_score(liked_true, liked_pred),
        precision_score(liked_true, liked_pred, zero_division=0),
        recall_score(liked_true, liked_pred, zero_division=0),
    )

    return regression + classification


In [None]:

# Load the train / test splits (user_id, movie_id, rating).
train = load_data('../datasets/ml-100k/u1.base')
test = load_data('../datasets/ml-100k/u1.test')

# Build the user x item matrix from the training ratings (0 = not rated).
ui_matrix = create_user_item_matrix(train)

# Reduce the dimensionality and cluster the users in the latent space.
# The SVD is fitted before the 'cluster' column is attached below, so it only
# sees rating columns.
svd = TruncatedSVD(n_components=50, random_state=42)
user_latent = svd.fit_transform(ui_matrix)
# Number of clusters chosen by silhouette score.
optimal_n_cluster = find_optimal_k(user_latent)
user_cluster, _ = cluster_users(user_latent, optimal_n_cluster)
# Attach each user's cluster label as an extra column; predict_rating reads it.
clusters = pd.Series(user_cluster, index=ui_matrix.index, name='cluster')
ui_matrix['cluster'] = clusters
# Optional: visualise the clusters.
# show_cluster(ui_matrix)

# Evaluate on the test split with the default threshold (ratings >= 4 are positive).
rmse, mae, accuracy, precision, recall = evaluate_rating_pred(ui_matrix=ui_matrix, test_df=test)
print(f'MAE {mae:.4f} | RMSE {rmse:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

Miglior k: 2 (silhouette=0.350)
['They Made Me a Criminal (1939)', 'Last Dance (1996)', 'Prefontaine (1997)', 'Marlene Dietrich: Shadow and Light (1996) ', 'Star Kid (1997)']


Evaluating ratings: 100%|████████████████| 20000/20000 [04:02<00:00, 82.60row/s]

MAE 0.8312 | RMSE 1.0434
Accuracy: 0.5456
Precision: 0.7944
Recall: 0.2579





In [284]:
# Same evaluation with ratings >= 3 counted as positive. Only accuracy,
# precision and recall use the threshold; MAE and RMSE are computed on the raw
# ratings. Note that every prediction is recomputed by this call.
rmse, mae, accuracy, precision, recall = evaluate_rating_pred(ui_matrix=ui_matrix, test_df=test, threshold=3)
print(f'MAE {mae:.4f} | RMSE {rmse:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

Evaluating ratings: 100%|████████████████| 20000/20000 [03:58<00:00, 83.84row/s]

MAE 0.8312 | RMSE 1.0434
Accuracy: 0.7993
Precision: 0.8685
Recall: 0.8902



