### 06 — Model Evaluation
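Compare the trained NCF and AutoEncoder collaborative-filtering models on rating accuracy (RMSE, MAE) and top-k ranking quality (precision@10, recall@10, NDCG@10) over the validation and test splits.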

In [1]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

# Project directories, all resolved relative to the parent of the kernel's
# working directory (assumes the notebook is started from a sub-folder of the
# project root -- TODO confirm if the notebook is ever moved).
BASE = Path.cwd().parent
PROC = BASE / 'data' / 'processed'
MODELS = BASE / 'models'
REPORTS = BASE / 'reports'
REPORTS.mkdir(exist_ok=True)

# Make the project folders importable. Each entry is appended only when it is
# missing, so re-running this cell never piles up duplicate sys.path entries.
# Order is kept as before: 'src' first, then the project root.
for extra_path in (BASE / 'src', BASE):
    if str(extra_path) not in sys.path:
        sys.path.append(str(extra_path))

from ncf_model import load_ncf_checkpoint
from autoencoder_model import load_autoencoder_checkpoint, encode_dense_splits
from evaluation import (
    predict_ncf,
    predict_autoencoder,
    compute_regression_metrics,
    summarize_ranking,
    recommend_topk_ncf_ids,
    recommend_topk_autoencoder_ids,
)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)


Using device: cpu


In [2]:
# Read the three processed rating splits and the enriched movie table.
train_df, valid_df, test_df, movies = (
    pd.read_csv(PROC / 'ratings_train.csv'),
    pd.read_csv(PROC / 'ratings_valid.csv'),
    pd.read_csv(PROC / 'ratings_test.csv'),
    pd.read_csv(PROC / 'movies_enriched.csv'),
)

# One summary line per split: frame shape plus distinct user and item counts.
for name, df in zip(('train', 'valid', 'test'), (train_df, valid_df, test_df)):
    print(f"{name:>5} -> shape={df.shape} users={df.userId.nunique()} items={df.movieId.nunique()}")

train -> shape=(31316, 7) users=479 items=426
valid -> shape=(3669, 7) users=479 items=426
 test -> shape=(3669, 7) users=479 items=426


In [3]:
def _movies_by_user(ratings):
    """Return a dict mapping each userId in ``ratings`` to the set of its movieIds."""
    return ratings.groupby('userId')['movieId'].apply(set).to_dict()


# encode_dense_splits yields eight values; only the first one and the last two
# (the user and item lookups) are needed in this notebook.
train_mat, _, _, _, _, _, user2idx, item2idx = encode_dense_splits(train_df, valid_df, test_df)

# Inverse of item2idx: swaps its values and its index
# (presumably dense index -> movieId; verify against encode_dense_splits).
idx2item = pd.Series(item2idx.index, index=item2idx.values)

# Per-user movie sets for each split.
train_seen = _movies_by_user(train_df)
valid_truth = _movies_by_user(valid_df)
test_truth = _movies_by_user(test_df)


#### Load Trained Models

In [4]:
# Checkpoint files expected under the models directory.
ncf_path, ae_path = MODELS / 'ncf_best.pth', MODELS / 'autoencoder_best.pth'

# Both loaders are given the vocabulary sizes taken from the encoded splits.
ncf_model = load_ncf_checkpoint(ncf_path, n_users=len(user2idx), n_items=len(item2idx), device=device)
ae_model, ae_meta = load_autoencoder_checkpoint(ae_path, n_items=len(item2idx), device=device)

print('Loaded NCF and AutoEncoder checkpoints.')


Loaded NCF and AutoEncoder checkpoints.


#### Rating Prediction Metrics

In [5]:
# One prediction callable per model; each takes a ratings frame and returns
# the predictions for its rows.
predictors = {
    'NCF': lambda frame: predict_ncf(
        ncf_model,
        frame,
        user2idx=user2idx,
        item2idx=item2idx,
        device=device,
    ),
    'AutoEncoder': lambda frame: predict_autoencoder(
        ae_model,
        frame,
        train_matrix=train_mat,
        user2idx=user2idx,
        item2idx=item2idx,
        device=device,
    ),
}

# Score every model on every held-out split, in the order
# (valid: NCF, AutoEncoder), (test: NCF, AutoEncoder).
results = []
for split_name, df in [('valid', valid_df), ('test', test_df)]:
    for model_name, predict in predictors.items():
        preds = predict(df)
        metrics = compute_regression_metrics(df['rating'].to_numpy(dtype=np.float32), preds)
        results.append(
            {'model': model_name, 'split': split_name, 'rmse': metrics.rmse, 'mae': metrics.mae}
        )

ratings_df = pd.DataFrame(results)
ratings_df


Unnamed: 0,model,split,rmse,mae
0,NCF,valid,0.852934,0.66074
1,AutoEncoder,valid,1.711074,1.380758
2,NCF,test,0.888239,0.687901
3,AutoEncoder,test,1.737334,1.380076


#### Top-K Ranking Metrics


In [6]:
from functools import partial

# Keyword arguments common to both top-k recommenders.
shared_kwargs = dict(
    user2idx=user2idx,
    item2idx=item2idx,
    train_seen=train_seen,
    device=device,
)

# Bind model and lookups now so each recommender only needs the per-call
# arguments (for example a user id and k) later on.
ncf_recommender = partial(recommend_topk_ncf_ids, ncf_model, **shared_kwargs)
ae_recommender = partial(
    recommend_topk_autoencoder_ids,
    ae_model,
    train_matrix=train_mat,
    **shared_kwargs,
)

# One summary row per (model, split); models vary slowest so the row order is
# NCF/valid, NCF/test, AutoEncoder/valid, AutoEncoder/test.
ranking_rows = [
    summarize_ranking(model_label, split_label, truth, recommender)
    for model_label, recommender in (('NCF', ncf_recommender), ('AutoEncoder', ae_recommender))
    for split_label, truth in (('valid', valid_truth), ('test', test_truth))
]

ranking_df = pd.DataFrame(ranking_rows)
ranking_df


Unnamed: 0,model,split,users_evaluated,precision@10,recall@10,ndcg@10
0,NCF,valid,479,0.038205,0.036697,0.04269
1,NCF,test,479,0.03048,0.042247,0.039423
2,AutoEncoder,valid,479,0.02714,0.025714,0.031708
3,AutoEncoder,test,479,0.026931,0.025005,0.032282


#### Consolidated Results

In [7]:
# Join the rating-accuracy and ranking tables on (model, split), then order the
# rows by split and model. The sorted frame is stored back in `combined`
# (it used to be displayed and discarded), so anything that later reads
# `combined` sees the same row order as the table shown here.
combined = (
    ratings_df
    .merge(ranking_df, on=['model', 'split'], how='outer')
    .sort_values(['split', 'model'])
    .reset_index(drop=True)
)
combined

Unnamed: 0,model,split,rmse,mae,users_evaluated,precision@10,recall@10,ndcg@10
0,AutoEncoder,test,1.737334,1.380076,479,0.026931,0.025005,0.032282
1,NCF,test,0.888239,0.687901,479,0.03048,0.042247,0.039423
2,AutoEncoder,valid,1.711074,1.380758,479,0.02714,0.025714,0.031708
3,NCF,valid,0.852934,0.66074,479,0.038205,0.036697,0.04269


In [8]:
# Write the consolidated metrics table to the reports folder (no index column).
results_path = REPORTS / 'results.csv'
combined.to_csv(results_path, index=False)
print('Saved metrics to', results_path)

Saved metrics to /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied_AI_and_Deep_Learning/finalproject/MovieLens-MCRS/reports/results.csv


#### Sample Recommendations

In [9]:
# Only the columns needed to label the recommended movies.
movies_df = pd.read_csv(PROC / 'movies_enriched.csv', usecols=['movieId', 'title', 'genres'])

# Pick one user reproducibly (fixed random_state) from the training ratings.
sample_user = int(train_df.userId.sample(1, random_state=42).iloc[0])
print(f"Sample user: {sample_user}")

# Pair every recommender with its own label. The header used to print a
# leftover `name` variable from an earlier loop, so both tables were
# labelled "test" instead of with the model that produced them.
labelled_recommenders = [
    ('NCF', ncf_recommender),
    ('AutoEncoder', ae_recommender),
]

for model_name, recommender in labelled_recommenders:
    recs = recommender(sample_user, k=10)

    # Turn the recommended ids into a frame and attach title and genres;
    # the left merge keeps the recommender's ranking order.
    recs_df = pd.DataFrame(recs, columns=["movieId"])
    recs_df = recs_df.merge(movies_df, on="movieId", how="left")

    print(f"\n=== {model_name} Top 10 ===")
    display(recs_df[["movieId", "title", "genres"]])

Sample user: 591

=== test Top 10 ===


Unnamed: 0,movieId,title,genres
0,1252,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller
1,1199,Brazil (1985),Fantasy|Sci-Fi
2,318,"Shawshank Redemption, The (1994)",Crime|Drama
3,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
4,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
5,904,Rear Window (1954),Mystery|Thriller
6,912,Casablanca (1942),Drama|Romance
7,541,Blade Runner (1982),Action|Sci-Fi|Thriller
8,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance
9,2329,American History X (1998),Crime|Drama



=== test Top 10 ===


Unnamed: 0,movieId,title,genres
0,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
1,4995,"Beautiful Mind, A (2001)",Drama|Romance
2,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy
3,5459,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
4,5445,Minority Report (2002),Action|Crime|Mystery|Sci-Fi|Thriller
5,2078,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical
6,2700,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical
7,5502,Signs (2002),Horror|Sci-Fi|Thriller
8,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance
9,4308,Moulin Rouge (2001),Drama|Musical|Romance
