### 03 — Baseline Matrix Factorization (SVD / SVD++)
Baseline collaborative filtering using Surprise.

**Goals**
1. Train SVD / SVD++ on `ratings_train.csv`.
2. Evaluate RMSE/MAE on `ratings_valid.csv`.
3. Compute Precision@K / Recall@K from top-K recommendations (K = 10).

In [1]:
import os, sys, math, random
from pathlib import Path
import pandas as pd
import numpy as np

# Project root, taken as the parent of the kernel's working directory.
# NOTE(review): this assumes the notebook runs from a folder directly under
# the repository root — confirm if the notebook is relocated.
BASE = Path.cwd().parent

# Append the root and its src/ folder to sys.path, skipping entries that are
# already present, so the project modules imported below can be resolved.
for import_root in map(str, (BASE, BASE / 'src')):
    if import_root not in sys.path:
        sys.path.append(import_root)

from src.svd_model import (
    SVD,
    SVDpp,
    build_trainset,
    fit_svd,
    predict_pairs,
    rating_metrics,
    grid_search_svd,
    precision_recall_at_k,
    to_prediction_tuples,
)

# Directory holding the processed splits; the assertion below points to
# 02_preprocessing.ipynb as the notebook that creates them.
PROC = BASE / 'data' / 'processed'
train_path, valid_path = PROC / 'ratings_train.csv', PROC / 'ratings_valid.csv'

# Fail fast, before any training, if either split has not been generated yet.
assert all(split.exists() for split in (train_path, valid_path)), "Run 02_preprocessing.ipynb first to create processed splits."

train_df, valid_df = pd.read_csv(train_path), pd.read_csv(valid_path)

# Quick sanity check: frame shapes and distinct users/movies in the training split.
print('Train shape:', train_df.shape, ' Valid shape:', valid_df.shape)
print(
    'Unique users (train):', train_df['userId'].nunique(),
    ' Unique movies (train):', train_df['movieId'].nunique(),
)


Train shape: (31316, 7)  Valid shape: (3669, 7)
Unique users (train): 479  Unique movies (train): 426


In [2]:
# Optional speed-up: keep only the SAMPLE_USERS users with the most ratings
# in the training split. If the data has fewer users than the cap, every
# user is kept and the frames are unchanged.
SAMPLE_USERS = 20000  # None switches sampling off

if SAMPLE_USERS is not None:
    # value_counts() lists users from most to least frequent, so the head of
    # its index is the set of most active users.
    active_users = train_df['userId'].value_counts().head(SAMPLE_USERS).index
    # Apply the same user filter to both splits so validation only contains
    # users that remain in training.
    train_df, valid_df = (
        frame.loc[frame['userId'].isin(active_users)]
        for frame in (train_df, valid_df)
    )
    print('After sampling -> Train:', train_df.shape, ' Valid:', valid_df.shape)

After sampling -> Train: (31316, 7)  Valid: (3669, 7)


In [3]:
# Build the training set from the training frame. The returned `reader` is
# kept because the grid-search cell passes it to grid_search_svd().
trainset, reader = build_trainset(train_df)
# Validation rows converted to the form that predict_pairs() takes in the
# model cells below.
valid_pairs = to_prediction_tuples(valid_df)

# Sanity check: the trainset's rating/user/item counts should agree with the
# DataFrame statistics printed when the splits were loaded.
print('Trainset size (n_ratings):', trainset.n_ratings)
print('n_users:', trainset.n_users, ' n_items:', trainset.n_items)


Trainset size (n_ratings): 31316
n_users: 479  n_items: 426


In [4]:
# Baseline SVD configuration. All keys are forwarded to fit_svd() as keyword
# arguments; random_state is presumably the seed that makes the fit
# repeatable — confirm in src/svd_model.py.
svd_params = {
    'n_factors': 100,
    'n_epochs': 20,
    'lr_all': 0.005,
    'reg_all': 0.02,
    'random_state': 42,
}
algo_svd = fit_svd(trainset, **svd_params)

# Predict the held-out validation pairs and summarise the rating error.
preds_valid_svd = predict_pairs(algo_svd, valid_pairs)
metrics_svd = rating_metrics(preds_valid_svd)
rmse_svd, mae_svd = metrics_svd.rmse, metrics_svd.mae

print(
    f"SVD validation RMSE: {rmse_svd:.4f}",
    f"SVD validation MAE: {mae_svd:.4f}",
    sep="\n",
)


SVD validation RMSE: 0.8479
SVD validation MAE: 0.6495


In [5]:
# Baseline SVD++ configuration, using fewer factors and epochs than the
# plain SVD baseline. All keys are forwarded to fit_svd() as keyword
# arguments.
svdpp_params = {
    'n_factors': 80,
    'n_epochs': 15,
    'lr_all': 0.005,
    'reg_all': 0.02,
    'random_state': 42,
}
# algo_cls selects the SVDpp class instead of fit_svd()'s default algorithm.
algo_svdpp = fit_svd(trainset, algo_cls=SVDpp, **svdpp_params)

# Predict the same validation pairs used for the SVD baseline.
preds_valid_svdpp = predict_pairs(algo_svdpp, valid_pairs)
metrics_svdpp = rating_metrics(preds_valid_svdpp)
rmse_svdpp, mae_svdpp = metrics_svdpp.rmse, metrics_svdpp.mae

print(
    f"SVD++ validation RMSE: {rmse_svdpp:.4f}",
    f"SVD++ validation MAE: {mae_svdpp:.4f}",
    sep="\n",
)


SVD++ validation RMSE: 0.8324
SVD++ validation MAE: 0.6369


In [6]:
# Hyperparameter grid for plain SVD: 2 values per parameter, 16 combinations.
param_grid = {
    'n_factors': [50, 100],
    'n_epochs': [15, 25],
    'lr_all': [0.003, 0.005],
    'reg_all': [0.02, 0.05]
}

# Search over the training frame only; the validation split stays untouched
# so it can serve as an independent check of the selected configuration.
gs = grid_search_svd(train_df, param_grid, algo_cls=SVD, reader=reader)

print('Best RMSE:', gs.best_score['rmse'])
print('Best params:', gs.best_params['rmse'])

# Refit the RMSE-optimal configuration on the full trainset. The grid does
# not include random_state, so without a seed this refit would not be
# repeatable. Seed it with the same value as the baseline SVD / SVD++ fits
# so the three validation scores are comparable. A random_state coming from
# the grid (if one is ever added) takes precedence over this default.
best_svd_params = {'random_state': 42, **gs.best_params['rmse']}
best_svd = fit_svd(trainset, **best_svd_params)

# Score the refitted model on the held-out validation pairs.
preds_best = predict_pairs(best_svd, valid_pairs)
metrics_best = rating_metrics(preds_best)
rmse_best = metrics_best.rmse
mae_best = metrics_best.mae

print(f"Best-model validation RMSE: {rmse_best:.4f}")
print(f"Best-model validation MAE: {mae_best:.4f}")


Best RMSE: 0.8346988059776291
Best params: {'n_factors': 50, 'n_epochs': 25, 'lr_all': 0.005, 'reg_all': 0.05}
Best-model validation RMSE: 0.8393
Best-model validation MAE: 0.6427


In [7]:
# Ranking-metric settings: K is the recommendation-list length. THRESHOLD is
# passed to precision_recall_at_k(), presumably as the rating at or above
# which an item counts as relevant — confirm in src/svd_model.py.
K, THRESHOLD = 10, 4.0

# Evaluate both baseline models with identical settings and data.
p_svd, r_svd = precision_recall_at_k(
    algo_svd, train_df, valid_df, k=K, threshold=THRESHOLD
)
p_svdpp, r_svdpp = precision_recall_at_k(
    algo_svdpp, train_df, valid_df, k=K, threshold=THRESHOLD
)

print(
    f"SVD   Precision@{K}: {p_svd:.4f}, Recall@{K}: {r_svd:.4f}",
    f"SVD++ Precision@{K}: {p_svdpp:.4f}, Recall@{K}: {r_svdpp:.4f}",
    sep="\n",
)


SVD   Precision@10: 0.0343, Recall@10: 0.0596
SVD++ Precision@10: 0.0362, Recall@10: 0.0653
