### 06 — Model Evaluation
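Compare the NCF and AutoEncoder collaborative-filtering models on rating accuracy (RMSE, MAE) and top-k ranking quality (precision@10, recall@10, NDCG@10) on the validation and test splits.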

In [1]:
from functools import partial
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

# Resolve the project root: when the kernel was started inside `notebooks/`,
# step up one level; otherwise treat the current directory as the root.
_cwd = Path.cwd()
BASE = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Project directories used throughout the notebook.
PROC = BASE / 'data' / 'processed'
MODELS = BASE / 'models'
REPORTS = BASE / 'reports'
REPORTS.mkdir(exist_ok=True)

# Make both the project root and `src/` importable, without adding duplicate
# entries when this cell is re-run.
for _import_root in (str(BASE), str(BASE / 'src')):
    if _import_root not in sys.path:
        sys.path.append(_import_root)

# Project-local imports; these must come after the sys.path setup above.
from src.ncf_model import load_ncf_checkpoint
from src.autoencoder_model import load_autoencoder_checkpoint, encode_dense_splits
from src.evaluation import (
    predict_ncf,
    predict_autoencoder,
    compute_regression_metrics,
    summarize_ranking,
    recommend_topk_ncf_ids,
    recommend_topk_autoencoder_ids,
)

# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print('Using device:', device)


Using device: cpu


In [2]:
# Load the three rating splits and the enriched movie table from PROC.
train_df = pd.read_csv(PROC / 'ratings_train.csv')
valid_df = pd.read_csv(PROC / 'ratings_valid.csv')
test_df = pd.read_csv(PROC / 'ratings_test.csv')
movies = pd.read_csv(PROC / 'movies_enriched.csv')

# Print one summary line per split: frame shape plus distinct user/item counts.
split_frames = {'train': train_df, 'valid': valid_df, 'test': test_df}
for split_label, split_frame in split_frames.items():
    n_users = split_frame['userId'].nunique()
    n_items = split_frame['movieId'].nunique()
    print(f"{split_label:>5} -> shape={split_frame.shape} users={n_users} items={n_items}")

train -> shape=(31316, 7) users=479 items=426
valid -> shape=(3669, 7) users=479 items=426
 test -> shape=(3669, 7) users=479 items=426


In [3]:
def _movies_by_user(ratings):
    """Map each userId in `ratings` to the set of movieIds on that user's rows."""
    return ratings.groupby('userId')['movieId'].apply(set).to_dict()


# The call is unpacked into exactly eight values; only the first one and the
# last two (the user/item index lookups) are kept.
train_mat, _, _, _, _, _, user2idx, item2idx = encode_dense_splits(train_df, valid_df, test_df)

# Inverse of item2idx: swap its index and its values.
# NOTE(review): presumably maps encoded item position -> movieId; confirm
# against encode_dense_splits.
idx2item = pd.Series(data=item2idx.index, index=item2idx.values)

# Per-user item sets: training items are passed to the recommenders as
# `train_seen`; valid/test items are the ground truth for the ranking metrics.
train_seen = _movies_by_user(train_df)
valid_truth = _movies_by_user(valid_df)
test_truth = _movies_by_user(test_df)


#### Load Trained Models

In [4]:
# Checkpoint locations under the project's models/ directory.
ncf_path = MODELS / 'ncf_best.pth'
ae_path = MODELS / 'autoencoder_best.pth'

# Restore the NCF model; it is sized by the number of encoded users and items.
ncf_model = load_ncf_checkpoint(ncf_path, n_users=len(user2idx), n_items=len(item2idx), device=device)

# Restore the AutoEncoder; the loader also returns a second value (`ae_meta`)
# that the visible cells of this notebook do not use.
ae_model, ae_meta = load_autoencoder_checkpoint(ae_path, n_items=len(item2idx), device=device)

print('Loaded NCF and AutoEncoder checkpoints.')


Loaded NCF and AutoEncoder checkpoints.


#### Helper Functions

In [5]:
# Prediction, metric, and top-k recommendation helpers are imported from src.evaluation in the first cell.


#### Rating Prediction Metrics

In [6]:
# Each entry pairs a model label with a callable that predicts ratings for
# every row of a split frame. Insertion order fixes the row order below.
rating_predictors = {
    'NCF': lambda frame: predict_ncf(
        ncf_model,
        frame,
        user2idx=user2idx,
        item2idx=item2idx,
        device=device,
    ),
    'AutoEncoder': lambda frame: predict_autoencoder(
        ae_model,
        frame,
        train_matrix=train_mat,
        user2idx=user2idx,
        item2idx=item2idx,
        device=device,
    ),
}

# Collect one record per (split, model): valid rows first, then test rows,
# with NCF ahead of AutoEncoder inside each split.
results = []
for split_name, split_frame in (('valid', valid_df), ('test', test_df)):
    for model_name, predict_ratings in rating_predictors.items():
        predicted = predict_ratings(split_frame)
        observed = split_frame['rating'].to_numpy(dtype=np.float32)
        scores = compute_regression_metrics(observed, predicted)
        results.append({
            'model': model_name,
            'split': split_name,
            'rmse': scores.rmse,
            'mae': scores.mae,
        })

ratings_df = pd.DataFrame(results)
ratings_df


Unnamed: 0,model,split,rmse,mae
0,NCF,valid,0.854136,0.662013
1,AutoEncoder,valid,1.740214,1.403107
2,NCF,test,0.894004,0.692092
3,AutoEncoder,test,1.7435,1.384846


#### Top-K Ranking Metrics


In [7]:
# Pre-bind the model and lookup arguments so each recommender only needs the
# per-call arguments (later cells call them as `recommender(user_id, k=10)`).
ncf_recommender = partial(
    recommend_topk_ncf_ids,
    ncf_model,
    user2idx=user2idx,
    item2idx=item2idx,
    train_seen=train_seen,
    device=device,
)

ae_recommender = partial(
    recommend_topk_autoencoder_ids,
    ae_model,
    train_matrix=train_mat,
    user2idx=user2idx,
    item2idx=item2idx,
    train_seen=train_seen,
    device=device,
)

# Evaluate every (model, split) pair in one pass instead of four copy-pasted
# calls. Dict insertion order gives rows in the order
# NCF/valid, NCF/test, AutoEncoder/valid, AutoEncoder/test.
recommenders = {'NCF': ncf_recommender, 'AutoEncoder': ae_recommender}
truth_by_split = {'valid': valid_truth, 'test': test_truth}

ranking_rows = [
    summarize_ranking(model_name, split_name, truth, recommender)
    for model_name, recommender in recommenders.items()
    for split_name, truth in truth_by_split.items()
]

ranking_df = pd.DataFrame(ranking_rows)
ranking_df


Unnamed: 0,model,split,users_evaluated,precision@10,recall@10,ndcg@10
0,NCF,valid,479,0.033612,0.034678,0.039639
1,NCF,test,479,0.031942,0.042447,0.040638
2,AutoEncoder,valid,479,0.027349,0.025526,0.030637
3,AutoEncoder,test,479,0.027766,0.02483,0.032209


#### Consolidated Results

In [8]:
# Join rating-accuracy and ranking metrics into one row per (model, split).
# validate='one_to_one' makes the merge raise if either table repeats a
# (model, split) key, instead of silently multiplying rows.
# The sorted frame is stored (not just displayed) so that every later use of
# `combined`, including the CSV export, has the same row order as this table.
combined = (
    ratings_df
    .merge(ranking_df, on=['model', 'split'], how='outer', validate='one_to_one')
    .sort_values(['split', 'model'])
    .reset_index(drop=True)
)
combined

Unnamed: 0,model,split,rmse,mae,users_evaluated,precision@10,recall@10,ndcg@10
0,AutoEncoder,test,1.7435,1.384846,479,0.027766,0.02483,0.032209
1,NCF,test,0.894004,0.692092,479,0.031942,0.042447,0.040638
2,AutoEncoder,valid,1.740214,1.403107,479,0.027349,0.025526,0.030637
3,NCF,valid,0.854136,0.662013,479,0.033612,0.034678,0.039639


In [9]:
# Build the destination once so the write and the log line cannot drift apart.
results_path = REPORTS / 'results.csv'
combined.to_csv(results_path, index=False)

# Report the location relative to the project root so the saved notebook output
# does not embed a machine-specific absolute path (REPORTS lives under BASE).
print('Saved metrics to', results_path.relative_to(BASE))

Saved metrics to /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied AI and Deep Learning/finalproject/MovieLens-MCRS/reports/results.csv


#### Sample Recommendations

In [10]:
# Draw one userId from the training ratings; the fixed random_state keeps the
# pick reproducible across runs.
sample_user = int(train_df['userId'].sample(n=1, random_state=7).iloc[0])
print('Sample user:', sample_user)

# Show each model's ten recommended movieIds for that user.
for model_label, recommend in (('NCF', ncf_recommender), ('AutoEncoder', ae_recommender)):
    print(f'{model_label} top-10:')
    print(recommend(sample_user, k=10))


Sample user: 362
NCF top-10:
[750, 1199, 912, 7361, 1276, 904, 1201, 1197, 1136, 2324]
AutoEncoder top-10:
[595, 5502, 4973, 4308, 5989, 6378, 4027, 5299, 5418, 2700]
