### 03 — Baseline Matrix Factorization (SVD / SVD++)
Baseline collaborative filtering using Surprise.

**Goals**
1. Train SVD / SVD++ on `ratings_train.csv`.
2. Evaluate RMSE/MAE on `ratings_valid.csv`.
3. Compute Precision@K / Recall@K via top-10 recommendations.

In [None]:
import os, sys, math, random
from pathlib import Path
import pandas as pd
import numpy as np
from IPython.display import display

BASE = Path.cwd().parent
sys.path.append(str(BASE / 'src'))

from svd_model import (
    SVD,
    SVDpp,
    build_trainset,
    fit_svd,
    predict_pairs,
    rating_metrics,
    grid_search_svd,
    precision_recall_at_k,
    to_prediction_tuples,
    recommend_top_k
)

PROC = BASE / 'data' / 'processed'

train_path = PROC / 'ratings_train.csv'
valid_path = PROC / 'ratings_valid.csv'

assert train_path.exists() and valid_path.exists(), "Run 02_preprocessing.ipynb first to create processed splits."

train_df = pd.read_csv(train_path)
valid_df = pd.read_csv(valid_path)

print('Train shape:', train_df.shape, ' Valid shape:', valid_df.shape)
print('Unique users (train):', train_df.userId.nunique(), ' Unique movies (train):', train_df.movieId.nunique())


Train shape: (31316, 7)  Valid shape: (3669, 7)
Unique users (train): 479  Unique movies (train): 426


In [2]:
# Sample only the most active users for faster experimentation
SAMPLE_USERS = 1000  # set to None to disable sampling

if SAMPLE_USERS:
    active_users = train_df['userId'].value_counts().index[:SAMPLE_USERS]
    # Only get active users's data
    train_df = train_df[train_df['userId'].isin(active_users)]
    valid_df = valid_df[valid_df['userId'].isin(active_users)]
    print('After sampling -> Train:', train_df.shape, ' Valid:', valid_df.shape)

After sampling -> Train: (31316, 7)  Valid: (3669, 7)


In [3]:
# Converts the training DataFrame into a format that the recommendation model can understand.
trainset, reader = build_trainset(train_df)
# Converts the validation DataFrame (valid_df) into a list of tuples used for prediction.
valid_pairs = to_prediction_tuples(valid_df)

print('Trainset size (n_ratings):', trainset.n_ratings)
print('n_users:', trainset.n_users, ' n_items:', trainset.n_items)


Trainset size (n_ratings): 31316
n_users: 479  n_items: 426


In [4]:
# Defines a dictionary of hyperparameters for the SVD model
svd_params = dict(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
algo_svd = fit_svd(trainset, **svd_params)
# Trained SVD model to predict ratings for all (user, item) pairs in the validation dataset
preds_valid_svd = predict_pairs(algo_svd, valid_pairs)
metrics_svd = rating_metrics(preds_valid_svd)
rmse_svd = metrics_svd.rmse
mae_svd = metrics_svd.mae

print(f"SVD validation RMSE: {rmse_svd:.4f}")
print(f"SVD validation MAE: {mae_svd:.4f}")


SVD validation RMSE: 0.8479
SVD validation MAE: 0.6495


In [5]:
# Defines a dictionary of hyperparameters for the SVD++ model
svdpp_params = dict(n_factors=80, n_epochs=15, lr_all=0.005, reg_all=0.02, random_state=42)
algo_svdpp = fit_svd(trainset, algo_cls=SVDpp, **svdpp_params)
# Trained SVD++ model to predict ratings for all (user, item) pairs in the validation dataset
preds_valid_svdpp = predict_pairs(algo_svdpp, valid_pairs)
metrics_svdpp = rating_metrics(preds_valid_svdpp)
rmse_svdpp = metrics_svdpp.rmse
mae_svdpp = metrics_svdpp.mae

print(f"SVD++ validation RMSE: {rmse_svdpp:.4f}")
print(f"SVD++ validation MAE: {mae_svdpp:.4f}")


SVD++ validation RMSE: 0.8324
SVD++ validation MAE: 0.6369


In [6]:
# The grid search will try all possible combinations  2 × 2 × 2 × 2 = 16 different models.
# param_grid = {
#     'n_factors': [50, 100],
#     'n_epochs': [15, 25],
#     'lr_all': [0.003, 0.005],
#     'reg_all': [0.02, 0.05]
# }
param_grid = {
  'n_factors': [32, 64, 96, 128, 160],
  'n_epochs':  [10, 20, 30, 40],
  'lr_all':    [0.001, 0.003, 0.005, 0.008],
  'reg_all':   [0.005, 0.01, 0.02, 0.04]
}

# Do cross validate and get the avg RSME
gs = grid_search_svd(train_df, param_grid, algo_cls=SVD, reader=reader)

print('Best RMSE:', gs.best_score['rmse'])
print('Best params:', gs.best_params['rmse'])

# Use the best hyperparameter to get best model
best_svd = fit_svd(trainset, **gs.best_params['rmse'])
preds_best = predict_pairs(best_svd, valid_pairs)
metrics_best = rating_metrics(preds_best)
rmse_best = metrics_best.rmse
mae_best = metrics_best.mae

print(f"Best-model validation RMSE: {rmse_best:.4f}")
print(f"Best-model validation MAE: {mae_best:.4f}")


Best RMSE: 0.8321661492008577
Best params: {'n_factors': 32, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.04}
Best-model validation RMSE: 0.8379
Best-model validation MAE: 0.6418


In [7]:
gs_svdpp = grid_search_svd(train_df, param_grid, algo_cls=SVDpp, reader=reader)

print('Best RMSE:', gs_svdpp.best_score['rmse'])
print('Best params:', gs_svdpp.best_params['rmse'])

best_svdpp = fit_svd(trainset, **gs_svdpp.best_params['rmse'])
preds_best_svdpp = predict_pairs(best_svdpp, valid_pairs)
metrics_svdpp = rating_metrics(preds_best_svdpp)

print(f"SVD++ validation RMSE: {metrics_svdpp.rmse:.4f}")
print(f"SVD++ validation MAE: {metrics_svdpp.mae:.4f}")

Best RMSE: 0.8241966334206415
Best params: {'n_factors': 64, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.04}
SVD++ validation RMSE: 0.8346
SVD++ validation MAE: 0.6389


In [8]:
K = 10
# ratings ≥ 4.0 are treated as “liked”
THRESHOLD = 4.0

for name, model in [
    ("SVD", algo_svd),
    ("SVD++", algo_svdpp),
    ("BEST SVD", best_svd),
    ("BEST SVD++", best_svdpp),
]:
    p, r = precision_recall_at_k(model, train_df, valid_df, k=K, threshold=THRESHOLD)
    print(f"{name}  Precision@{K}: {p:.4f}, Recall@{K}: {r:.4f}")


SVD  Precision@10: 0.0343, Recall@10: 0.0596
SVD++  Precision@10: 0.0362, Recall@10: 0.0653
BEST SVD  Precision@10: 0.0371, Recall@10: 0.0703
BEST SVD++  Precision@10: 0.0327, Recall@10: 0.0616


In [None]:
movies_df = pd.read_csv(PROC / 'movies_enriched.csv', usecols=['movieId', 'title', 'genres'])

sample_user = int(train_df.userId.sample(1, random_state=42).iloc[0])
print(f"Sample user: {sample_user}")

for name, model in zip(
    ["SVD", "SVD++", "Best SVD", "Best SVD++"],
    [algo_svd, algo_svdpp, best_svd, best_svdpp]
):
    # Get list of movieId
    recs = recommend_top_k(model, sample_user, train_df, 10)

    # Transform to DataFrame and merge movie information
    recs_df = pd.DataFrame(recs, columns=["movieId"])
    recs_df = recs_df.merge(movies_df, on="movieId", how="left")

    print(f"\n=== {name} Top 10 ===")
    display(recs_df[["movieId", "title", "genres"]])


Sample user: 591

=== SVD Top 10 ===


Unnamed: 0,movieId,title,genres
0,318,"Shawshank Redemption, The (1994)",Crime|Drama
1,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War
2,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
3,51255,Hot Fuzz (2007),Action|Comedy|Crime|Mystery
4,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance
5,48516,"Departed, The (2006)",Crime|Drama|Thriller
6,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
7,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
8,1276,Cool Hand Luke (1967),Drama
9,76093,How to Train Your Dragon (2010),Adventure|Animation|Children|Fantasy|IMAX



=== SVD++ Top 10 ===


Unnamed: 0,movieId,title,genres
0,904,Rear Window (1954),Mystery|Thriller
1,318,"Shawshank Redemption, The (1994)",Crime|Drama
2,48516,"Departed, The (2006)",Crime|Drama|Thriller
3,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
4,1653,Gattaca (1997),Drama|Sci-Fi|Thriller
5,1193,One Flew Over the Cuckoo's Nest (1975),Drama
6,2542,"Lock, Stock & Two Smoking Barrels (1998)",Comedy|Crime|Thriller
7,2028,Saving Private Ryan (1998),Action|Drama|War
8,1203,12 Angry Men (1957),Drama
9,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure



=== Best SVD Top 10 ===


Unnamed: 0,movieId,title,genres
0,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
1,318,"Shawshank Redemption, The (1994)",Crime|Drama
2,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
3,912,Casablanca (1942),Drama|Romance
4,1704,Good Will Hunting (1997),Drama|Romance
5,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance
6,2502,Office Space (1999),Comedy|Crime
7,2329,American History X (1998),Crime|Drama
8,6016,City of God (Cidade de Deus) (2002),Action|Adventure|Crime|Drama|Thriller
9,1252,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller



=== Best SVD++ Top 10 ===


Unnamed: 0,movieId,title,genres
0,318,"Shawshank Redemption, The (1994)",Crime|Drama
1,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller
2,858,"Godfather, The (1972)",Crime|Drama
3,527,Schindler's List (1993),Drama|War
4,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
5,1201,"Good, the Bad and the Ugly, The (Buono, il bru...",Action|Adventure|Western
6,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
7,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
8,110,Braveheart (1995),Action|Drama|War
9,912,Casablanca (1942),Drama|Romance
