In [49]:
import pandas as pd
import numpy as np
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split

In [50]:
import random
random.seed(42)

In [51]:
DATA_PATH = "../ml-latest-small"

ratings = pd.read_csv(os.path.join(DATA_PATH, "ratings.csv"))
movies = pd.read_csv(os.path.join(DATA_PATH, "movies.csv"))
tags = pd.read_csv(os.path.join(DATA_PATH, "tags.csv"))

# Unconstrained Matrix Factorisation based Collaborative Filtering (week 7)

In [52]:
df_ecommerce= pd.read_json('../1_ecommerce.jsonl', lines=True)

# for each session, we will create a list of items that the user has clicked on, removing duplicates
clicks_items_list = []
carts_items_list = []
orders_items_list = []

for events in df_ecommerce.events:
    clicks = []
    carts = []
    orders = []
    for e in events:
        if e['type'] == 'clicks':
            clicks.append(e['aid'])
        if e['type'] == 'carts':
            carts.append(e['aid'])
        if e['type'] == 'orders':
            orders.append(e['aid'])

    clicks_items_list.append(list(clicks))
    carts_items_list.append(list(carts))
    orders_items_list.append(list(orders))

df_ecommerce['clicks'] = clicks_items_list
df_ecommerce['carts'] = carts_items_list
df_ecommerce['orders'] = orders_items_list
df = df_ecommerce.drop(columns=["events"], axis=1)

clicks_length = df_ecommerce['clicks'].apply(len)
carts_length = df_ecommerce['carts'].apply(len)
orders_length = df_ecommerce['orders'].apply(len)

# takes only the session with items length higher than 20
df_truncated = df[df['clicks'].apply(lambda x: len(x) > 10)]
df = df_truncated.copy()
# redefine index
df.reset_index(drop=True, inplace=True)

# Explode each column (clicks, carts, orders)
df_clicks = df[['session', 'clicks']].explode('clicks').rename(columns={'clicks': 'item'}).dropna(subset=['item'])
df_carts = df[['session', 'carts']].explode('carts').rename(columns={'carts': 'item'}).dropna(subset=['item'])
df_orders = df[['session', 'orders']].explode('orders').rename(columns={'orders': 'item'}).dropna(subset=['item'])

# Concatenate the exploded dataframes
df_concat = pd.concat([df_clicks, df_carts, df_orders])

# Create a new column for each category indicating whether the item is present in that category
df_concat['click'] = df_concat['item'].isin(df_clicks['item']).astype(int)
df_concat['cart'] = df_concat['item'].isin(df_carts['item']).astype(int)
df_concat['order'] = df_concat['item'].isin(df_orders['item']).astype(int)

# Drop duplicates based on session and item
df_concat = df_concat.drop_duplicates(subset=['session', 'item'])

In [53]:
def sgd_matrix_factorization(df, k=10, alpha=0.01, lambda_reg=0.1, num_epochs=20, w_click=1, w_cart=3, w_order=5, test_size=0.2, validation_size=0.1):
    # Map session (users) and items to consecutive indices
    users = {u: i for i, u in enumerate(df['session'].unique())}
    items = {i: j for j, i in enumerate(df['item'].unique())}

    num_users = len(users)
    num_items = len(items)

    # Initialize U, V, and biases
    U = np.random.rand(num_users, k)
    V = np.random.rand(num_items, k)
    b_u = np.zeros(num_users)
    b_i = np.zeros(num_items)
    b = 0  # Global bias

    # Create (u, i, r_ui) tuples for all interactions
    data = []

    # iter throw all the rows of the dataframe
    # and create a list of tuples (user, item, rating)
    # Assign ratings to interactions
    for _, row in df.iterrows():
        u = users[row['session']]
        i = items[row['item']]

        # if this user has perform an action on this item, we assign a rating based on the action
        # type starting from the most important one
        # (order > cart > click)
        if row['order'] > 0:
            r_ui = w_order
        elif row['cart'] > 0:
            r_ui = w_cart
        elif row['click'] > 0:
            r_ui = w_click
        else:
            continue  # Skip interactions with no recorded action

        data.append((u, i, r_ui))

    # Split the data into training, validation, and test sets
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=42)
    train_data, val_data = train_test_split(train_data, test_size=validation_size, random_state=42)

    # Compute global bias as the mean rating of the training set
    if train_data:
        b = np.mean([r for _, _, r in train_data])

    print("Datasets created! Starting training...")

    # Training using SGD
    losses = []
    val_losses = []

    for epoch in range(num_epochs):
        np.random.shuffle(train_data)  # Shuffle data to improve convergence
        total_loss = 0
        total_val_loss = 0

        U_temp = U.copy()
        V_temp = V.copy()

        # Training phase
        for u, i, r_ui in train_data:
            # Predict the rating using the current model parameters
            # b is the global bias, b_u[u] is the user bias, b_i[i] is the item bias
            pred = b + b_u[u] + b_i[i] + np.dot(U[u, :], V[i, :])

            # Calculate the error
            e_ui = r_ui - pred  # Error

            # Update biases
            # alpha is the learning rate, lambda_reg is the regularization parameter
            b_u[u] += alpha * (e_ui - lambda_reg * b_u[u])
            b_i[i] += alpha * (e_ui - lambda_reg * b_i[i])

            # Update latent factors using temporary matrices
            U_temp[u, :] += alpha * (e_ui * V[i, :] - lambda_reg * U[u, :])
            V_temp[i, :] += alpha * (e_ui * U[u, :] - lambda_reg * V[i, :])

            total_loss += e_ui ** 2  # Sum of squared errors
        total_loss /= len(train_data)
        total_loss = np.sqrt(total_loss)

        # Copy back updated matrices
        U = U_temp
        V = V_temp

        # Validation phase (calculate validation loss)
        for u, i, r_ui in val_data:
            pred = b + b_u[u] + b_i[i] + np.dot(U[u, :], V[i, :])
            e_ui = r_ui - pred  # Error
            total_val_loss += e_ui ** 2
        total_val_loss /= len(val_data)
        total_val_loss = np.sqrt(total_val_loss)

        # Append losses for the current epoch
        losses.append(total_loss)
        val_losses.append(total_val_loss)

        print(f"Epoch {epoch+1}/{num_epochs}, RMSE Train Loss: {total_loss:.4f}, RMSE Val Loss: {total_val_loss:.4f}")

    # Test phase (calculate test loss)
    test_loss = 0
    for u, i, r_ui in test_data:
        pred = b + b_u[u] + b_i[i] + np.dot(U[u, :], V[i, :])
        e_ui = r_ui - pred  # Error
        test_loss += e_ui ** 2
    test_loss /= len(test_data)
    test_loss = np.sqrt(test_loss)

    print(f"Final Test RMSE: {test_loss:.4f}")

    return U, V, b_u, b_i, b, losses, val_losses, test_data

In [54]:
U, V, b_u, b_i, b, _, _, test_data = sgd_matrix_factorization(df_concat, k=20, alpha=0.001, num_epochs=1, w_click=1, w_order=2, w_cart=3)

Datasets created! Starting training...
Epoch 1/1, RMSE Train Loss: 5.0847, RMSE Val Loss: 4.3136
Final Test RMSE: 4.3113


# Naive Bayes CF Model Based (Week6)

Qui ho ridotto a 2 classi in modo da essere uguale alla week8 (content-based)

```python
user_data['label'] = user_data['rating'].apply(lambda r: 1 if r >= 4 else (0 if r <= 2 else None))
```

In [55]:
df_ratings = ratings.copy()

df_ratings.loc[df_ratings['rating'] <= 2, 'rating_ordinal'] = "Don't like"
df_ratings.loc[df_ratings['rating'] >= 4, 'rating_ordinal'] = "Like"

ratings_ordinals = df_ratings['rating_ordinal'].unique()

all_films = df_ratings["movieId"].unique()

In [56]:
# Prendi solo le righe con rating_ordinal noto
df_known = df_ratings.dropna(subset=['rating_ordinal'])

# Fai lo split tra train e test
# df_train, df_test = train_test_split(df_known, test_size=0.05, random_state=42) #3106 tuples
df_train, df_test = train_test_split(df_known, test_size=0.005, random_state=42) #311 tuples ~1min

# Ricrea la matrice user-movie SOLO con i dati del training set
df_train_user_movie = df_train.pivot(index='userId', columns='movieId', values='rating_ordinal')

# Il test sarà una lista di (userId, movieId, rating_ordinal) da prevedere
df_test = df_test[['userId', 'movieId', 'rating_ordinal']]

In [57]:
def test_naive_cf_model_based(df_test, df_train_user_movie):

    df_predictions = df_test.copy()

    for idx, row in enumerate(df_predictions.itertuples()):
        userId = row.userId
        movieId = row.movieId

        # Film visti dall'utente, dai dati di TRAIN
        try:
            films_seen = df_train_user_movie.loc[userId].dropna().index
        except KeyError:
            # Utente non presente nel train (cold start) → skip
            continue

        category_probs = {}

        for category in ratings_ordinals:
            df_current_movie = df_train[(df_train['movieId'] == movieId)]
            df_current_movie_category = df_current_movie[(df_current_movie['rating_ordinal'] == category)]

            if len(df_current_movie) == 0:
                continue  # impossibile stimare, nessuno ha votato il film

            users_with_category = df_current_movie_category['userId'].unique()

            p_cat = len(df_current_movie_category) / len(df_current_movie)
            probs = [p_cat]

            for film_seen in films_seen:
                try:
                    assigned_rating = df_train_user_movie.loc[userId, film_seen]
                except KeyError:
                    continue  # Film non presente per quell’utente

                df_seen_movie = df_train[df_train['movieId'] == film_seen]
                df_seen_movie_filtered = df_seen_movie[df_seen_movie['userId'].isin(users_with_category)]
                df_seen_movie_same_rating = df_seen_movie_filtered[df_seen_movie_filtered['rating_ordinal'] == assigned_rating]

                if len(df_seen_movie_filtered) == 0:
                    continue

                p_cond = len(df_seen_movie_same_rating) / len(df_seen_movie_filtered)
                probs.append(p_cond)

            category_probs[category] = np.prod(probs)

        if category_probs:
            predicted = max(category_probs.items(), key=lambda x: x[1])[0]
            df_predictions.loc[(df_predictions['userId'] == userId) & (df_predictions['movieId'] == movieId), 'predicted'] = predicted

        if idx % 50 == 0:
            print(f"Processed {idx}/{len(df_predictions)}")

    return df_predictions

In [58]:
df_predictions = test_naive_cf_model_based(df_test, df_train_user_movie)

Processed 0/311
Processed 50/311
Processed 100/311
Processed 150/311
Processed 200/311
Processed 250/311
Processed 300/311


In [59]:
df_predictions.head(10)

Unnamed: 0,userId,movieId,rating_ordinal,predicted
66214,426,2797,Like,Like
73518,474,1394,Like,Like
15301,100,11,Like,Like
23465,160,2078,Like,Like
77851,483,79242,Like,Like
22670,155,2959,Like,Like
75461,477,374,Don't like,Like
22474,153,2571,Don't like,Like
90090,586,1784,Like,Like
51118,330,2739,Like,Like


In [60]:
from sklearn.metrics import classification_report

# Skipping the NaN values (where we don't have that user in train / film without ratings)
df_predictions.dropna(subset=['predicted'], inplace=True)

In [61]:
df_predictions['rating_label'] = df_predictions['rating_ordinal'].map({
    "Don't like": 0,
    "Like": 1
})

df_predictions['pred_label'] = df_predictions['predicted'].map({
    "Don't like": 0,
    "Like": 1
})

print(classification_report(df_predictions['rating_label'], df_predictions['pred_label']))

              precision    recall  f1-score   support

           0       0.56      0.08      0.14        60
           1       0.81      0.98      0.89       238

    accuracy                           0.80       298
   macro avg       0.68      0.53      0.52       298
weighted avg       0.76      0.80      0.74       298



# Bayes Classification Content-based (week 8)

In [62]:
## 🧹 Preprocess Movie Metadata
tags_agg = tags.copy().groupby("movieId")["tag"].apply(lambda x: " ".join(x)).reset_index()
movies_df = movies.copy().merge(tags_agg, on="movieId", how="left")
movies_df["tag"] = movies_df["tag"].fillna("")

#### 1. User-Specific Naive Bayes Recommender

In [63]:
# Clean the title by removing the year in parentheses
def clean_title(title):
    return re.sub(r'\s*\(\d{4}\)', '', title)

metadata = movies_df[["movieId", "title", "genres"]]
metadata.loc[:, 'title'] = metadata['title'].apply(clean_title)

mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(metadata['genres'].str.split('|'))
# Create a DataFrame with the encoded genres
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
# Concatenate the original metadata with the encoded genres
metadata = pd.concat([metadata[['movieId', 'title']], genres_df], axis=1)

tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

# Create a preprocessor that transforms the movie metadata:
# - Applies TF-IDF vectorization to the cleaned 'title' column to extract textual features.
# - Passes through the binary genre columns (already transformed by MultiLabelBinarizer).
# - Drops any remaining columns that are not explicitly selected.
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf, 'title'),
        ('genres', 'passthrough', genres_df.columns)
    ],
    remainder='drop'
)

In [64]:
def predict_single_movie(user_id, movie_id, ratings_train):
    ratings = ratings_train.copy()

    # Step 1: Prepare user data
    user_ratings = ratings[ratings['userId'] == user_id]
    user_data = pd.merge(user_ratings, metadata, on='movieId')

    # Step 2: Create binary labels
    user_data['label'] = user_data['rating'].apply(lambda r: 1 if r >= 4 else (0 if r <= 2 else None))
    user_data = user_data.dropna(subset=['label'])
    user_data['label'] = user_data['label'].astype(int)

    if user_data.empty:
        print("User has insufficient data.")
        return None

    # Step 3: Train the model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', MultinomialNB())
    ])

    X_train = user_data.drop(columns=['userId', 'rating', 'label', 'timestamp', 'rating_ordinal'])
    y_train = user_data['label']

    pipeline.fit(X_train, y_train)

    # Step 4: Check if the movie has already been watched
    if movie_id in user_ratings['movieId'].values:
        print("The movie has already been watched by the user.")
        return None

    # Step 5: Extract features of the requested movie
    movie_row = metadata[metadata['movieId'] == movie_id]

    if movie_row.empty:
        print("Movie ID not found in the metadata.")
        return None

    input_cols = list(X_train.columns)
    movie_features = movie_row[input_cols]

    # Step 6: Predict the probability of liking the movie
    probs = pipeline.predict_proba(movie_features)[0]  # P(liked | features)
    if len(probs) < 2:
        print(f"The model for user {user_id} has seen only one class (not like/like).")
        return None
    else:
        prob = probs[1]

    return "Like" if prob >= 0.5 else "Don't like"

In [65]:
chosen_user = random.choice(ratings['userId'].unique())
chosen_film = random.choice(metadata['movieId'].unique())

predict_single_movie(user_id=chosen_user, movie_id=chosen_film, ratings_train=df_train)

'Like'

#### 2. Global Content-Based Recommender (Single Model for All Users)

In [66]:
metadata_with_tags = metadata.merge(movies_df[["movieId", "tag"]], on='movieId', how='left')

preprocessor_global = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf, 'title'),
        ('genres', 'passthrough', genres_df.columns),
        ('tfidf_tag', tfidf, 'tag')
    ],
    remainder='drop'
)

In [67]:
def train_global_model(ratings_train):
    ratings = ratings_train.copy()
    # Step 1: Prepare data
    data = pd.merge(ratings, metadata_with_tags, on='movieId')

    # Step 2: Create binary labels
    data['label'] = data['rating'].apply(lambda r: 1 if r >= 4 else (0 if r <= 2 else None))
    data = data.dropna(subset=['label'])
    data['label'] = data['label'].astype(int)

    # Step 3: Train the model
    pipeline = Pipeline([
        ('preprocessor', preprocessor_global),
        ('classifier', MultinomialNB())
    ])

    X = data.drop(columns=['userId', 'movieId', 'rating', 'label', 'timestamp', 'rating_ordinal'])
    y = data['label']

    model = pipeline.fit(X, y)
    return model, X.columns

model_global, train_columns = train_global_model(df_train)

def recommend_global(user_id, movie_id, ratings_train):
    ratings = ratings_train.copy()

    # Step 4: Check if the movie has already been watched
    user_ratings = ratings[ratings['userId'] == user_id]
    if movie_id in user_ratings['movieId'].values:
        print("The movie has already been watched by the user.")
        return None

    # Step 5: Extract features of the requested movie
    movie_row = metadata_with_tags[metadata_with_tags['movieId'] == movie_id]

    if movie_row.empty:
        print("Movie ID not found in the metadata.")
        return None
    movie_features = movie_row[train_columns]

    # Step 6: Predict the probability of liking the movie
    probs = model_global.predict_proba(movie_features)[0]  # P(liked | features)
    if len(probs) < 2:
        print(f"The model for user {user_id} has seen only one class (not like/like).")
        return None
    else:
        prob = probs[1]

    return "Like" if prob >= 0.5 else "Don't like"

In [68]:
recommend_global(user_id=chosen_user, movie_id=chosen_film, ratings_train=df_train)

'Like'

In [74]:
def test_naive_cf_content_based(df_param, df_train):
    df_tmp = df_param.copy()

    preds_user = []
    preds_global = []
    for idx, row in enumerate(df_tmp.itertuples()):
        userId = row.userId
        movieId = row.movieId

        pred_user = predict_single_movie(userId, movieId, ratings_train=df_train)
        pred_global = recommend_global(userId, movieId, ratings_train=df_train)

        preds_user.append(pred_user)
        preds_global.append(pred_global)
        # df_tmp.loc[(df_predictions['userId'] == userId) & (df_predictions['movieId'] == movieId), 'pred_user'] = pred_user
        # df_tmp.loc[(df_predictions['userId'] == userId) & (df_predictions['movieId'] == movieId), 'pred_glob'] = pred_global

    return preds_user, preds_global

In [75]:
df_predictions.head()

Unnamed: 0,userId,movieId,rating_ordinal,predicted,rating_label,pred_label
66214,426,2797,Like,Like,1,1
73518,474,1394,Like,Like,1,1
15301,100,11,Like,Like,1,1
23465,160,2078,Like,Like,1,1
77851,483,79242,Like,Like,1,1


In [76]:
u, g = test_naive_cf_content_based(df_predictions, df_train=df_train)

The model for user 586 has seen only one class (not like/like).
The model for user 519 has seen only one class (not like/like).
The model for user 74 has seen only one class (not like/like).
The model for user 74 has seen only one class (not like/like).
The model for user 72 has seen only one class (not like/like).
The model for user 60 has seen only one class (not like/like).
The model for user 319 has seen only one class (not like/like).
The model for user 523 has seen only one class (not like/like).
The model for user 169 has seen only one class (not like/like).
The model for user 533 has seen only one class (not like/like).
The model for user 348 has seen only one class (not like/like).
The model for user 43 has seen only one class (not like/like).
The model for user 49 has seen only one class (not like/like).
The model for user 171 has seen only one class (not like/like).
The model for user 142 has seen only one class (not like/like).
The model for user 106 has seen only one class

(['Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  "Don't like",
  None,
  'Like',
  'Like',
  "Don't like",
  'Like',
  None,
  "Don't like",
  'Like',
  "Don't like",
  'Like',
  "Don't like",
  None,
  'Like',
  "Don't like",
  'Like',
  'Like',
  'Like',
  "Don't like",
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  None,
  "Don't like",
  'Like',
  "Don't like",
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  "Don't like",
  'Like',
  'Like',
  None,
  'Like',
  None,
  "Don't like",
  'Like',
  'Like',
  "Don't like",
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  None,
  None,
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  "Don't like",
  'Like',
  'Like',
  None,
  'Like',
  'Like',
  'Like',
  None,
  'Like',
  "Don't like",
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Like',
  'Lik

In [79]:
len(u), len(df_predictions)

(298, 298)

# Evaluation

In [72]:
# Importa le funzioni di valutazione dal modulo esterno
from eval_functions import (
    evaluate_rating_predictions,
    analyze_long_tail_effect,
    plot_avg_error_by_genre,
    plot_avg_error_by_popularity,
    evaluate_ranking_spearman,
    evaluate_topk,
    plot_precision_recall_roc,
    evaluate_random_recommender
)

In [73]:
# Supponiamo che ratings contenga una colonna 'predicted_rating' già calcolata
# In caso contrario, puoi usare un modello qualsiasi per generarla

# ESEMPIO: Accuracy + Long Tail
metrics = evaluate_rating_predictions(ratings['rating'], ratings['predicted_rating'])
print("Valutazione globale:", metrics)

long_tail_errors = analyze_long_tail_effect(ratings, prediction_column='predicted_rating')
print("Errore medio long tail vs head:\n", long_tail_errors)

# ESEMPIO: Visualizzazione errori
plot_avg_error_by_genre(ratings, movies)
plot_avg_error_by_popularity(ratings)

# ESEMPIO: Ranking Spearman per un utente
user_id = ratings['userId'].sample(1).values[0]
user_df = ratings[ratings['userId'] == user_id]
correlation = evaluate_ranking_spearman(user_df['rating'], user_df['predicted_rating'])
print(f"Spearman correlation per l'utente {user_id}: {correlation:.3f}")

# ESEMPIO: Top-k evaluation (supponendo y_true e y_scores binari)
# Qui facciamo un esempio fittizio
true_items = set(user_df[user_df['rating'] >= 4]['movieId'])
all_items = list(user_df['movieId'])
scores = list(user_df['predicted_rating'])

# Costruiamo y_true binario: 1 se è un item rilevante
y_true = [1 if item in true_items else 0 for item in all_items]

topk_result = evaluate_topk(y_true, scores, k=10)
print("Top-K Evaluation:", topk_result)

# Curve Precision-Recall e ROC
plot_precision_recall_roc(y_true, scores)

# ESEMPIO: Benchmark con Random Recommender
random_baseline = evaluate_random_recommender(true_items, all_items, k=10)
print("Random Recommender Benchmark:", random_baseline)

KeyError: 'predicted_rating'

In [None]:
models_outputs = {
    'Global Naive Bayes': df_global_nb,
    'User Naive Bayes': df_user_nb,
    'Matrix Factorization': df_mf,
    'Random': df_random
}

for name, df in models_outputs.items():
    print(f"\n============================")
    print(f"📊 Risultati per: {name}")

    # Valutazione classica
    metrics = evaluate_rating_predictions(df['rating'].dropna(), df['predicted_rating'].dropna())
    print("📌 Accuracy:", metrics)

    # Long Tail
    long_tail = analyze_long_tail_effect(df.dropna(subset=['rating']))
    print("🐍 Long Tail Error:")
    print(long_tail)

    # Top-K Example su un utente qualsiasi (solo se abbiamo ground truth)
    sample_user = df['userId'].iloc[0]
    user_df = df[df['userId'] == sample_user].dropna(subset=['rating'])

    if not user_df.empty:
        # costruiamo y_true e scores
        true_items = set(user_df[user_df['rating'] >= 4]['movieId'])
        movie_ids = user_df['movieId'].tolist()
        scores = user_df['predicted_rating'].tolist()
        y_true = [1 if m in true_items else 0 for m in movie_ids]

        topk = evaluate_topk(y_true, scores, k=10)
        print("🎯 Top-K:", topk)

        # Correlazione ranking
        spearman = evaluate_ranking_spearman(user_df['rating'], user_df['predicted_rating'])
        print(f"🔗 Spearman correlation: {spearman:.3f}")

        # Curve
        plot_precision_recall_roc(y_true, scores)
