### 06 — Model Evaluation
Compare collaborative filtering models (SVD, NCF, AutoEncoder) on rating accuracy and top-k recommendation quality.

In [None]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import torch
from surprise import dump

# BASE is the parent of the current working directory (assumes the notebook is
# launched from a folder one level below the project root — TODO confirm).
BASE = Path.cwd().parent
PROC = BASE / 'data' / 'processed'
MODELS = BASE / 'models'
REPORTS = BASE / 'reports'
REPORTS.mkdir(exist_ok=True)

# Make the project modules importable. Every entry is guarded, so re-running
# this cell never adds duplicates to sys.path. 'src' is listed first so that
# its modules take precedence over same-named modules directly under BASE.
for _import_root in (BASE / 'src', BASE):
    if str(_import_root) not in sys.path:
        sys.path.append(str(_import_root))

from ncf_model import (
    load_ncf_checkpoint,
    recommend_topk as recommend_topk_ncf_df,
)
from autoencoder_model import (
    load_autoencoder_checkpoint,
    encode_dense_splits,
    recommend_topk as recommend_topk_autoencoder_df,
)
from svd_model import (
    predict_pairs,
    to_prediction_tuples,
    recommend_top_k as recommend_topk_svd,
)
from evaluation import (
    predict_ncf,
    predict_autoencoder,
    compute_regression_metrics,
    summarize_ranking,
    recommend_topk_ncf_ids,
    recommend_topk_autoencoder_ids,
)

# Use the GPU when torch reports one as available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cpu


In [2]:
# Movie metadata plus the three pre-split rating tables, all read from PROC.
movies = pd.read_csv(PROC / 'movies_enriched.csv')
train_df = pd.read_csv(PROC / 'ratings_train.csv')
valid_df = pd.read_csv(PROC / 'ratings_valid.csv')
test_df = pd.read_csv(PROC / 'ratings_test.csv')

# One summary line per split: frame shape and distinct user / item counts.
for split_label, split_frame in zip(('train', 'valid', 'test'), (train_df, valid_df, test_df)):
    n_users = split_frame['userId'].nunique()
    n_items = split_frame['movieId'].nunique()
    print(f"{split_label:>5} -> shape={split_frame.shape} users={n_users} items={n_items}")

train -> shape=(31316, 7) users=479 items=426
valid -> shape=(3669, 7) users=479 items=426
 test -> shape=(3669, 7) users=479 items=426


In [3]:
# encode_dense_splits yields eight values; only the first (train matrix) and
# the last two (user / item id maps) are kept by this notebook.
train_mat, _, _, _, _, _, user2idx, item2idx = encode_dense_splits(train_df, valid_df, test_df)

# Inverse of item2idx: its values become the index and its index the values
# (presumably encoded position -> original movieId; verify against encode_dense_splits).
idx2item = pd.Series(data=item2idx.index, index=item2idx.values)

# userId -> set of movieIds, computed the same way for each split.
train_seen, valid_truth, test_truth = (
    frame.groupby('userId')['movieId'].apply(set).to_dict()
    for frame in (train_df, valid_df, test_df)
)


#### Load Trained Models

In [18]:
# Locations of the trained model artefacts.
svd_dump_path = MODELS / 'svd_baseline.dump'
ncf_path = MODELS / 'ncf_best.pth'
ae_path = MODELS / 'autoencoder_best.pth'

# Held-out splits converted with to_prediction_tuples, keyed by split name.
svd_pairs = {
    split_label: to_prediction_tuples(split_frame)
    for split_label, split_frame in (('valid', valid_df), ('test', test_df))
}

# dump.load returns a pair; the second element is used as the fitted model.
# NOTE(review): Surprise dumps are pickle-based — only load files you produced yourself.
_, svd_baseline = dump.load(str(svd_dump_path))

# Predictions are computed once per split and reused by the metric cells.
svd_predictions = {
    split_label: predict_pairs(svd_baseline, split_pairs)
    for split_label, split_pairs in svd_pairs.items()
}

# Lookups handed to the SVD recommender on every call.
svd_all_items = train_df['movieId'].unique()
svd_user_items = train_df.groupby('userId')['movieId'].apply(set).to_dict()


def _svd_top_k(user_id: int, k: int):
    """Run the SVD top-k query shared by the id and DataFrame wrappers.

    Forwards the loaded model, the training frame and the cached lookups
    (``svd_user_items``, ``svd_all_items``) to ``recommend_topk_svd``.
    """
    return recommend_topk_svd(
        svd_baseline,
        user_id,
        train_df,
        k=k,
        user_items_cache=svd_user_items,
        all_items=svd_all_items,
    )


def recommend_topk_svd_ids(user_id: int, k: int = 10) -> list[int]:
    """Return only the movie ids of the SVD top-``k`` recommendations.

    Args:
        user_id: User to recommend for.
        k: Number of recommendations requested.

    Returns:
        The first element of every ``(movie_id, score)`` pair produced by the
        recommender, in the order the recommender returned them.
    """
    return [movie_id for movie_id, _score in _svd_top_k(user_id, k)]


def recommend_topk_svd_df(user_id: int, k: int = 10) -> pd.DataFrame:
    """Return the SVD top-``k`` recommendations as a DataFrame.

    Args:
        user_id: User to recommend for.
        k: Number of recommendations requested.

    Returns:
        A frame with columns ``movieId`` and ``pred_rating``; an empty frame
        with the same columns when the recommender returns nothing.
    """
    recs = _svd_top_k(user_id, k)
    if not recs:
        return pd.DataFrame(columns=['movieId', 'pred_rating'])
    # Transpose the (movie_id, score) pairs into two parallel columns.
    movie_ids, preds = zip(*recs)
    return pd.DataFrame({'movieId': movie_ids, 'pred_rating': preds})


# Both neural checkpoints are restored with sizes taken from the encoded id maps.
ncf_model = load_ncf_checkpoint(ncf_path, n_users=len(user2idx), n_items=len(item2idx), device=device)

# The autoencoder loader returns the model together with a second value kept as ae_meta.
ae_model, ae_meta = load_autoencoder_checkpoint(ae_path, n_items=len(item2idx), device=device)

print('Loaded SVD, NCF and AutoEncoder checkpoints.')

Loaded SVD, NCF and AutoEncoder checkpoints.


In [19]:
from functools import partial

# Shared helpers that keep rating predictions and recommenders in one place.
def predict_svd_split(split_name: str, _df: pd.DataFrame) -> np.ndarray:
    """Return the cached SVD estimates for ``split_name`` as a float32 array.

    ``_df`` is ignored; it exists so the signature matches the other
    ``predict_*_split`` helpers. Values come from ``svd_predictions``.
    """
    estimates = []
    for prediction in svd_predictions[split_name]:
        estimates.append(prediction.est)
    return np.asarray(estimates, dtype=np.float32)


def predict_ncf_split(split_name: str, interactions: pd.DataFrame) -> np.ndarray:
    """Delegate rating prediction for ``interactions`` to ``predict_ncf``.

    ``split_name`` is unused; it is accepted so every predictor shares the
    ``(split_name, frame)`` call shape.
    """
    lookup_kwargs = {'user2idx': user2idx, 'item2idx': item2idx, 'device': device}
    return predict_ncf(ncf_model, interactions, **lookup_kwargs)


def predict_autoencoder_split(split_name: str, interactions: pd.DataFrame) -> np.ndarray:
    """Delegate rating prediction for ``interactions`` to ``predict_autoencoder``.

    The dense training matrix is passed along with the id maps. ``split_name``
    is unused; it is accepted so every predictor shares the same call shape.
    """
    lookup_kwargs = {
        'train_matrix': train_mat,
        'user2idx': user2idx,
        'item2idx': item2idx,
        'device': device,
    }
    return predict_autoencoder(ae_model, interactions, **lookup_kwargs)


# Keyword arguments that all four recommenders are bound with.
_id_lookup_kwargs = dict(
    user2idx=user2idx,
    item2idx=item2idx,
    train_seen=train_seen,
    device=device,
)

# NCF: one callable for the ranking metrics, one for the display tables.
ncf_rank_recommender = partial(recommend_topk_ncf_ids, ncf_model, **_id_lookup_kwargs)
ncf_display_recommender = partial(recommend_topk_ncf_df, ncf_model, **_id_lookup_kwargs)

# AutoEncoder: same pair, additionally bound to the dense training matrix.
ae_rank_recommender = partial(
    recommend_topk_autoencoder_ids, ae_model, train_matrix=train_mat, **_id_lookup_kwargs
)
ae_display_recommender = partial(
    recommend_topk_autoencoder_df, ae_model, train_matrix=train_mat, **_id_lookup_kwargs
)

# Registry consumed by the evaluation cells: per model, a rating predictor
# ('predict'), an id-list recommender ('rank') and a DataFrame recommender ('display').
model_interfaces = {
    'SVD': dict(
        predict=predict_svd_split,
        rank=recommend_topk_svd_ids,
        display=recommend_topk_svd_df,
    ),
    'NCF': dict(
        predict=predict_ncf_split,
        rank=ncf_rank_recommender,
        display=ncf_display_recommender,
    ),
    'AutoEncoder': dict(
        predict=predict_autoencoder_split,
        rank=ae_rank_recommender,
        display=ae_display_recommender,
    ),
}

#### Rating Prediction Metrics

In [20]:
# Score every model on both held-out splits: one row per (model, split).
splits = [('valid', valid_df), ('test', test_df)]
results = []

for split_name, split_frame in splits:
    # Ground-truth ratings are extracted once per split and shared by all models.
    y_true = split_frame['rating'].to_numpy(dtype=np.float32)
    for model_name, iface in model_interfaces.items():
        scores = compute_regression_metrics(y_true, iface['predict'](split_name, split_frame))
        results.append(
            dict(model=model_name, split=split_name, rmse=scores.rmse, mae=scores.mae)
        )

ratings_df = pd.DataFrame(results)
ratings_df

Unnamed: 0,model,split,rmse,mae
0,SVD,valid,0.835608,0.640157
1,NCF,valid,0.851314,0.657253
2,AutoEncoder,valid,1.716373,1.384759
3,SVD,test,0.867891,0.662377
4,NCF,test,0.886698,0.68324
5,AutoEncoder,test,1.716382,1.363996


#### Top-K Ranking Metrics


In [21]:
# Ranking quality: summarize_ranking is called once per (split, model) pair
# with that split's held-out items and the model's id-list recommender.
split_truth = [('valid', valid_truth), ('test', test_truth)]
ranking_rows = [
    summarize_ranking(model_name, split_name, truth, iface['rank'])
    for split_name, truth in split_truth
    for model_name, iface in model_interfaces.items()
]

ranking_df = pd.DataFrame(ranking_rows)
ranking_df

Unnamed: 0,model,split,users_evaluated,precision@10,recall@10,ndcg@10
0,SVD,valid,479,0.03737,0.042283,0.04567
1,NCF,valid,479,0.033194,0.039636,0.043582
2,AutoEncoder,valid,479,0.04405,0.055251,0.058513
3,SVD,test,479,0.037161,0.044919,0.050311
4,NCF,test,479,0.029854,0.038265,0.042236
5,AutoEncoder,test,479,0.039666,0.052263,0.056412


#### Consolidated Results

In [22]:
# Join rating and ranking metrics on (model, split). validate='one_to_one'
# makes pandas raise if either table holds a duplicated pair instead of
# silently multiplying rows. The sorted frame is assigned back to `combined`
# so that anything saved from it has the same row order as the table shown here.
combined = (
    ratings_df
    .merge(ranking_df, on=['model', 'split'], how='outer', validate='one_to_one')
    .sort_values(['split', 'model'])
    .reset_index(drop=True)
)
combined

Unnamed: 0,model,split,rmse,mae,users_evaluated,precision@10,recall@10,ndcg@10
0,AutoEncoder,test,1.716382,1.363996,479,0.039666,0.052263,0.056412
1,NCF,test,0.886698,0.68324,479,0.029854,0.038265,0.042236
2,SVD,test,0.867891,0.662377,479,0.037161,0.044919,0.050311
3,AutoEncoder,valid,1.716373,1.384759,479,0.04405,0.055251,0.058513
4,NCF,valid,0.851314,0.657253,479,0.033194,0.039636,0.043582
5,SVD,valid,0.835608,0.640157,479,0.03737,0.042283,0.04567


In [23]:
# Persist the consolidated metrics table inside the reports folder.
results_csv = REPORTS / 'results.csv'
combined.to_csv(results_csv, index=False)
print('Saved metrics to', results_csv)

Saved metrics to /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied_AI_and_Deep_Learning/finalproject/MovieLens-MCRS/reports/results.csv


#### Sample Recommendations

In [24]:
# Title / genre columns used to make the recommended movieIds readable.
movies_df = pd.read_csv(PROC / 'movies_enriched.csv', usecols=['movieId', 'title', 'genres'])

# A fixed random_state keeps the showcased user the same on every run.
sample_user = int(train_df['userId'].sample(1, random_state=42).iloc[0])
print(f"Sample user: {sample_user}")

for name, iface in model_interfaces.items():
    recs_df = iface['display'](sample_user, k=10)
    if not recs_df.empty:
        # Left join keeps every recommendation even when metadata is missing.
        recs_df = recs_df.merge(movies_df, on='movieId', how='left')
        print(f"\n=== {name} Top 10 ===")
        display(recs_df[['movieId', 'title', 'genres', 'pred_rating']])
    else:
        print(f"\n=== {name} Top 10 ===\nNo available recommendations.")

Sample user: 591

=== SVD Top 10 ===


Unnamed: 0,movieId,title,genres,pred_rating
0,318,"Shawshank Redemption, The (1994)",Crime|Drama,4.42999
1,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,4.422786
2,6539,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy,4.421438
3,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,4.376853
4,457,"Fugitive, The (1993)",Thriller,4.371925
5,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,4.357474
6,2502,Office Space (1999),Comedy|Crime,4.326093
7,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,4.314548
8,49272,Casino Royale (2006),Action|Adventure|Thriller,4.307695
9,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,4.30295



=== NCF Top 10 ===


Unnamed: 0,movieId,title,genres,pred_rating
0,318,"Shawshank Redemption, The (1994)",Crime|Drama,4.405581
1,1252,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller,4.374725
2,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,4.36079
3,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,4.331543
4,1199,Brazil (1985),Fantasy|Sci-Fi,4.319302
5,904,Rear Window (1954),Mystery|Thriller,4.284191
6,858,"Godfather, The (1972)",Crime|Drama,4.271798
7,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,4.261241
8,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.255569
9,1225,Amadeus (1984),Drama,4.249712



=== AutoEncoder Top 10 ===


Unnamed: 0,movieId,title,genres,pred_rating
0,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,5.0
1,4226,Memento (2000),Mystery|Thriller,5.0
2,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,5.0
3,1213,Goodfellas (1990),Crime|Drama,5.0
4,4011,Snatch (2000),Comedy|Crime|Thriller,5.0
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0
6,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,5.0
7,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,5.0
8,368,Maverick (1994),Adventure|Comedy|Western,5.0
9,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,5.0
