# Evaluator Module
The Evaluator module creates evaluation reports.

Reports contain evaluation metrics depending on models specified in the evaluation config.

In [3]:
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# third parties imports
import numpy as np
import pandas as pd

# -- add new imports here --
from surprise import model_selection
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from collections import defaultdict
from surprise.dataset import Trainset
import surprise
from surprise import Reader
from surprise import Dataset

# local imports
from configs import EvalConfig
from constants import Constant as C
from loaders import export_evaluation_report
from loaders import load_items
from loaders import load_ratings

# -- add new imports here --
from models import get_top_n  


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Model validation functions
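Validation functions train a recommender model on part of the ratings and generate the predictions used to validate it (random train/test split, leave-one-out split, or the full training set).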

In [4]:
def generate_split_predictions(algo, df_ratings, eval_config):
    """Fit ``algo`` on a random train split and predict the held-out test split.

    Args:
        algo: Object exposing ``fit(trainset)`` and ``test(testset)``.
        df_ratings: DataFrame holding at least the ``userId``, ``movieId`` and
            ``rating`` columns.
        eval_config: Evaluation settings. ``test_size`` is read, and
            ``random_state`` is used as the split seed when it is defined.

    Returns:
        Whatever ``algo.test`` returns for the held-out test split.
    """
    # Wrap the ratings in a Surprise dataset (ratings declared on a 0.5-5.0 scale)
    reader = Reader(rating_scale=(0.5, 5.0))
    ratings_dataset = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

    # Seed the split from the config, as generate_loo_top_n does, instead of a
    # hard-coded value; 42 remains the fallback for configs without the attribute.
    split_seed = getattr(eval_config, 'random_state', 42)
    trainset, testset = train_test_split(
        ratings_dataset,
        test_size=eval_config.test_size,
        random_state=split_seed,
    )

    # Train on the training split, then predict the held-out ratings
    algo.fit(trainset)
    predictions = algo.test(testset)

    return predictions
 
 
def generate_loo_top_n(algo, df_ratings, eval_config):
    """Build per-user top-N lists from a single leave-one-out split.

    The algorithm is fitted on the leave-one-out trainset, every pair of the
    trainset's anti-testset is scored, and the ``eval_config.top_n_value``
    best-scored items are kept for each user.

    Returns:
        A ``(top_n, testset)`` tuple: ``top_n`` maps each user id to a list of
        ``(item id, estimated rating)`` pairs sorted by decreasing estimate,
        and ``testset`` is the held-out part of the split.
    """
    reader = Reader(rating_scale=(0.5,5.0))
    surprise_data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

    splitter = LeaveOneOut(n_splits=1, random_state=eval_config.random_state)

    # n_splits=1, so the splitter produces exactly one (trainset, testset) pair
    trainset, testset = next(iter(splitter.split(surprise_data)))

    # Fit, then score every pair of the trainset's anti-testset
    algo.fit(trainset)
    candidate_pairs = trainset.build_anti_testset()
    scored = algo.test(candidate_pairs)

    # Group the (item, estimate) pairs by user
    ranked = defaultdict(list)
    for user, item, _true_rating, estimate, _details in scored:
        ranked[user].append((item, estimate))

    # Keep the N best estimates of each user, best first
    for user in ranked:
        by_estimate = sorted(ranked[user], key=lambda pair: pair[1], reverse=True)
        ranked[user] = by_estimate[:eval_config.top_n_value]

    return ranked, testset

 
def generate_full_top_n(algo, df_ratings, eval_config):
    """Build per-user top-N lists after training on the full training set.

    The algorithm is fitted on the full trainset, every pair of its
    anti-testset is scored, and the ``eval_config.top_n_value`` best-scored
    items are kept for each user.

    Returns:
        A mapping from user id to a list of ``(item id, estimated rating)``
        pairs sorted by decreasing estimate.
    """
    reader = Reader(rating_scale=(0.5,5.0))
    surprise_data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

    # Fit on the full trainset, then score every pair of its anti-testset
    trainset = surprise_data.build_full_trainset()
    algo.fit(trainset)
    candidate_pairs = trainset.build_anti_testset()
    scored = algo.test(candidate_pairs)

    # Group the (item, estimate) pairs by user
    ranked = defaultdict(list)
    for user, item, _true_rating, estimate, _details in scored:
        ranked[user].append((item, estimate))

    # Keep the N best estimates of each user, best first
    for user in ranked:
        by_estimate = sorted(ranked[user], key=lambda pair: pair[1], reverse=True)
        ranked[user] = by_estimate[:eval_config.top_n_value]

    return ranked

def precompute_information(df_items, df_ratings):
    """Precompute lookup tables needed by the full-mode evaluation metrics.

    Args:
        df_items: Not read by the current implementation.
        df_ratings: DataFrame with ``movieId`` and ``rating`` columns.

    Returns:
        A dictionary with one key:
        - "item_to_rank": maps each movie id to its rank when movies are
          ordered by decreasing mean rating (rank 1 = highest mean; tied
          movies share the average of their ranks).
        (-- for your project, add other relevant information here --)
    """
    # Mean rating per movie, indexed by movie id
    mean_by_movie = df_ratings.groupby('movieId')['rating'].mean()

    # Highest mean gets the smallest rank
    rank_by_movie = mean_by_movie.rank(ascending=False)

    return {"item_to_rank": dict(zip(rank_by_movie.index, rank_by_movie))}


def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):
    """Train every configured model and gather its metrics in one table.

    Args:
        eval_config: Provides ``models`` (an iterable of
            ``(model_name, model_class, keyword_arguments)``) and the
            ``split_metrics``, ``loo_metrics`` and ``full_metrics`` name lists.
        sp_ratings: Ratings to evaluate on. A pandas DataFrame is used
            directly. Any other value cannot be consumed by the prediction
            helpers (they select DataFrame columns), so the module-level
            ``df_ratings`` is used instead, as this function always did before.
        precomputed_dict: Must hold ``"item_to_rank"`` when the ``novelty``
            metric is requested.
        available_metrics: Maps ``"split"`` / ``"loo"`` / ``"full"`` to
            ``{metric_name: (function, keyword_arguments)}``.

    Returns:
        DataFrame with one row per model name and one column per metric.

    Raises:
        AssertionError: If a requested metric is not registered in
            ``available_metrics``.
    """
    # Honour the ratings argument when it is usable instead of silently
    # ignoring it; fall back to the module-level frame otherwise.
    ratings = sp_ratings if isinstance(sp_ratings, pd.DataFrame) else df_ratings

    evaluation_dict = {}
    for model_name, model, arguments in eval_config.models:
        print(f'Handling model {model_name}')
        algo = model(**arguments)
        evaluation_dict[model_name] = {}

        # Type 1 : metrics computed on rating predictions of a random split
        if len(eval_config.split_metrics) > 0:
            print('Training split predictions')
            predictions = generate_split_predictions(algo, ratings, eval_config)
            for metric in eval_config.split_metrics:
                print(f'- computing metric {metric}')
                assert metric in available_metrics['split']
                evaluation_function, parameters = available_metrics["split"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters)

        # Type 2 : metrics computed on top-N lists of a leave-one-out split
        if len(eval_config.loo_metrics) > 0:
            print('Training loo predictions')
            anti_testset_top_n, testset = generate_loo_top_n(algo, ratings, eval_config)
            for metric in eval_config.loo_metrics:
                assert metric in available_metrics['loo']
                evaluation_function, parameters = available_metrics["loo"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)

        # Type 3 : metrics computed on top-N lists after training on everything
        if len(eval_config.full_metrics) > 0:
            print('Training full predictions')
            anti_testset_top_n = generate_full_top_n(algo, ratings, eval_config)
            for metric in eval_config.full_metrics:
                assert metric in available_metrics['full']
                evaluation_function, parameters = available_metrics["full"][metric]
                if metric == 'novelty':
                    # Novelty additionally needs the precomputed rank table; the
                    # function itself still comes from the metrics registry.
                    evaluation_dict[model_name][metric] = evaluation_function(
                        anti_testset_top_n,
                        precomputed_dict["item_to_rank"],
                        **parameters
                    )
                else:
                    evaluation_dict[model_name][metric] = evaluation_function(
                        anti_testset_top_n,
                        **parameters
                    )

    # evaluation_dict is {model: {metric: value}}; transpose so models are rows
    return pd.DataFrame.from_dict(evaluation_dict).T
 
 
# Smoke test of the three prediction helpers defined above.
# NOTE(review): the output recorded for this cell is a FileNotFoundError for
# 'data/small/evidence/ratings.csv' -- confirm the data paths before re-running.

# Load the ratings as a DataFrame. The name df_ratings is also read as a
# module-level variable by create_evaluation_report and by the workflow cell.
df_ratings = pd.read_csv(C.EVIDENCE_PATH / C.RATINGS_FILENAME, usecols=['userId', 'movieId', 'rating'])

# Load the ratings through the project loader
# (presumably False selects the non-Surprise, DataFrame format -- TODO confirm)
df_ratings_test = load_ratings(False)

# Create the algorithm instance; the same instance is refit by each helper below
algo = surprise.SVD()

# Test the function generate_split_predictions
# (EvalConfig itself is passed as the configuration object)
predictions_split = generate_split_predictions(algo, df_ratings_test, EvalConfig)
top_n_split = get_top_n(predictions_split, n=EvalConfig.top_n_value)

# Test the function generate_loo_top_n (the returned testset is not needed here)
top_n_loo, _ = generate_loo_top_n(algo, df_ratings_test, EvalConfig)

# Test the function generate_full_top_n
top_n_full = generate_full_top_n(algo, df_ratings_test, EvalConfig)

# Print the results
print("Split predictions:")
print(top_n_split)

print("Leave-One-Out predictions:")
print(top_n_loo)

print("Full predictions:")
print(top_n_full)
 

FileNotFoundError: [Errno 2] No such file or directory: 'data/small/evidence/ratings.csv'

# 2. Evaluation metrics
Implement evaluation metrics either for rating predictions (split metrics) or for top-n recommendations (loo metrics, full metrics).

In [None]:

def calculate_hit_rate(anti_testset_top_n, testset):
    """Calculate the hit rate of top-N recommendations on a held-out test set.

    A test entry counts as a hit when its item appears in the top-N list of
    its user.

    Args:
        anti_testset_top_n: Mapping from user id to a list of
            ``(item id, estimated rating)`` pairs.
        testset: Sized collection of ``(user id, item id, rating)`` triples.

    Returns:
        Number of hits divided by ``len(testset)``, or 0.0 for an empty testset.
    """
    total_entries = len(testset)
    if total_entries == 0:
        return 0.0

    total_hits = 0
    for user_id, held_out_item, _rating in testset:
        # Users without recommendations can never produce a hit
        if user_id not in anti_testset_top_n:
            continue
        # Stop at the first match instead of materialising a list of item ids
        if any(item_id == held_out_item for item_id, _est in anti_testset_top_n[user_id]):
            total_hits += 1

    return total_hits / total_entries
 
def get_novelty(anti_testset_top_n, item_to_rank):
    """Calculate the average novelty of the recommended top-n lists.

    The novelty of one user is the sum of the ranks of the items recommended
    to that user; the result is the mean of that sum over all users.

    Args:
        anti_testset_top_n: Mapping from user id to a list of
            ``(item id, estimated rating)`` pairs.
        item_to_rank: Mapping from item id to its rank. Items missing from
            the mapping contribute a rank of 0.

    Returns:
        The mean per-user rank sum, or 0.0 when there are no users.
    """
    total_users = len(anti_testset_top_n)
    # Guard the division, mirroring calculate_hit_rate's handling of empty input
    if total_users == 0:
        return 0.0

    total_novelty = sum(
        item_to_rank.get(item_id, 0)
        for top_n_recommendations in anti_testset_top_n.values()
        for item_id, _est in top_n_recommendations
    )

    return total_novelty / total_users

# 3. Evaluation workflow
Load data, evaluate models and save the experimental outcomes

In [None]:
# Registry of the metrics that can be requested in the evaluation config.
# Each entry maps a metric name to (function, extra keyword arguments); the
# top-level key is the evaluation mode the metric belongs to.
AVAILABLE_METRICS = {
    "split": {
        # Rating-prediction errors from surprise.accuracy, with printing disabled
        "mae": (accuracy.mae, {'verbose': False}),
        "rmse": (accuracy.rmse, {'verbose': False}),
        
    },
    "loo": {

        # Hit rate of the top-N lists against the leave-one-out testset
        "hit_rate": (calculate_hit_rate, {}),
    },
    "full": {

        # Novelty of the top-N lists computed after training on all ratings
        "novelty": (get_novelty, {}),
    }
}

# Load the ratings in Surprise format and the items table
sp_ratings = load_ratings(surprise_format=True)
df_items = load_items()

# Precompute the information needed by the full-mode metrics.
# df_ratings is defined in the validation cell above, so that cell must have
# been executed before this one.
precomputed_dict = precompute_information(df_items, df_ratings)
 
# Build the evaluation report for the models listed in EvalConfig
evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)
print(evaluation_report)

# Export the report to a file
export_evaluation_report(evaluation_report)



Handling model contentBased-random_forest-year
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model contentBased-lasso_regression-year
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model contentBased-ridge_regression-year
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model contentBased-linear_regression-year
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model contentBased-gradient_boosting-year
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
                                          mae      rmse  hit_rate  \
contentBased-random_forest-year      0.862402  1.125898  