# Evaluator Module
The Evaluator module creates evaluation reports.

Each report contains the evaluation metrics computed for every model specified in the evaluation config.

In [1]:
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# third parties imports
import numpy as np 
import pandas as pd
# -- add new imports here --
from surprise import model_selection
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from collections import defaultdict
from surprise.dataset import Trainset
import surprise
from surprise import Reader
from surprise import Dataset
from surprise import PredictionImpossible
# local imports
from configs import EvalConfig
from constants import Constant as C
from loaders import export_evaluation_report
from loaders import load_ratings, load_items
# -- add new imports here --
from models import get_top_n  # Importez la fonction get_top_n depuis votre fichier models.py

# 1. Model validation functions
Validation functions perform cross-validation on recommender system models.

In [2]:
def generate_split_predictions(algo, df_ratings, eval_config):
    """Fit `algo` on a random train split and predict the held-out ratings.

    Args:
        algo: Object exposing `fit(trainset)` and `test(testset)`.
        df_ratings: DataFrame providing the 'userId', 'movieId' and 'rating' columns.
        eval_config: Configuration object; only `test_size` is read here.

    Returns:
        A list of 5-tuples `(uid, iid, true_rating, estimate, details)`, one per
        test rating, where a missing (`None`) estimate is replaced by 0.
    """
    reader = Reader(rating_scale=(0.5, 5.0))
    rating_columns = df_ratings[['userId', 'movieId', 'rating']]
    surprise_data = Dataset.load_from_df(rating_columns, reader)

    # NOTE: the seed is hard-coded here, whereas generate_loo_top_n reads it
    # from eval_config.random_state.
    train_part, test_part = train_test_split(
        surprise_data,
        test_size=eval_config.test_size,
        random_state=42,
    )

    algo.fit(train_part)
    raw_predictions = algo.test(test_part)

    # Rebuild every prediction as a plain tuple, substituting the fallback
    # value whenever the estimate is missing.
    fallback_estimate = 0
    cleaned_predictions = []
    for uid, iid, true_rating, estimate, details in raw_predictions:
        if estimate is None:
            estimate = fallback_estimate
        cleaned_predictions.append((uid, iid, true_rating, estimate, details))

    return cleaned_predictions

 
def generate_loo_top_n(algo, df_ratings, eval_config):
    """Compute per-user top-n recommendations from a leave-one-out split.

    Args:
        algo: Object exposing `fit(trainset)` and `test(testset)`.
        df_ratings: DataFrame providing the 'userId', 'movieId' and 'rating' columns.
        eval_config: Configuration object; `random_state` and `top_n_value` are read.

    Returns:
        A pair `(top_n, testset)`: `top_n` maps each user id to a list of at most
        `eval_config.top_n_value` `(item_id, estimate)` tuples sorted by decreasing
        estimate, and `testset` is the held-out part produced by the split.
    """
    reader = Reader(rating_scale=(0.5,5.0))
    surprise_data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

    splitter = LeaveOneOut(n_splits=1, random_state=eval_config.random_state)

    # A single split is requested, so this loop body runs once.
    for trainset, testset in splitter.split(surprise_data):
        algo.fit(trainset)

        # Score the (user, item) pairs produced by the trainset's anti-testset.
        candidate_predictions = algo.test(trainset.build_anti_testset())

        # Group the (item, estimate) pairs by user.
        top_n_by_user = defaultdict(list)
        for user, item, _true_rating, estimate, _details in candidate_predictions:
            top_n_by_user[user].append((item, estimate))

        # Keep only the best-scored items of each user (keys are unchanged,
        # so re-assigning values while iterating is safe).
        for user in top_n_by_user:
            ranked = sorted(top_n_by_user[user], key=lambda pair: pair[1], reverse=True)
            top_n_by_user[user] = ranked[:eval_config.top_n_value]

    return top_n_by_user, testset

 
def generate_full_top_n(algo, df_ratings, eval_config):
    """Compute per-user top-n recommendations after fitting on every rating.

    Args:
        algo: Object exposing `fit(trainset)` and `test(testset)`.
        df_ratings: DataFrame providing the 'userId', 'movieId' and 'rating' columns.
        eval_config: Configuration object; only `top_n_value` is read here.

    Returns:
        A mapping from user id to a list of at most `eval_config.top_n_value`
        `(item_id, estimate)` tuples sorted by decreasing estimate.
    """
    reader = Reader(rating_scale=(0.5,5.0))
    surprise_data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

    # Fit on the full trainset, then score its anti-testset.
    full_trainset = surprise_data.build_full_trainset()
    algo.fit(full_trainset)
    candidate_predictions = algo.test(full_trainset.build_anti_testset())

    # Group the (item, estimate) pairs by user.
    top_n_by_user = defaultdict(list)
    for user, item, _true_rating, estimate, _details in candidate_predictions:
        top_n_by_user[user].append((item, estimate))

    # Keep only the best-scored items of each user (keys are unchanged,
    # so re-assigning values while iterating is safe).
    for user in top_n_by_user:
        ranked = sorted(top_n_by_user[user], key=lambda pair: pair[1], reverse=True)
        top_n_by_user[user] = ranked[:eval_config.top_n_value]

    return top_n_by_user


def precompute_information(df_items, df_ratings):
    """Precompute the lookup tables needed by the full-mode metrics.

    The popularity rank of a movie is derived from how many ratings it
    received (rank 1 = most rated). Ranking by *mean* rating, as was done
    before, made a movie rated once with 5.0 look like the most popular one,
    which distorts any novelty measure built on the rank.

    Args:
        df_items: Items DataFrame. Currently unused; kept for interface
            compatibility.
        df_ratings: DataFrame providing at least the 'movieId' and 'rating'
            columns.

    Returns:
        dict: `{"item_to_rank": {movie_id: popularity_rank}}`. Ranks are floats;
        movies with the same number of ratings share the average of their ranks.
    """
    precomputed_dict = {}

    # Number of (non-missing) ratings received by each movie.
    rating_counts = df_ratings.groupby('movieId')['rating'].count()

    # Most-rated movie gets rank 1; ties share their average rank.
    popularity_rank = rating_counts.rank(ascending=False)

    precomputed_dict["item_to_rank"] = popularity_rank.to_dict()

    # Other precomputed information can be added to the dictionary here.
    return precomputed_dict


def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):
    evaluation_dict = {}
    for model_name, model, arguments in eval_config.models:
        print(f'Handling model {model_name}')
        algo = model(**arguments)
        evaluation_dict[model_name] = {}
       
        # Type 1 : Evaluate split evaluations
        if len(eval_config.split_metrics) > 0:
            print('Training split predictions')
            predictions = generate_split_predictions(algo, df_ratings, eval_config)
            for metric in eval_config.split_metrics:
                print(f'- computing metric {metric}')
                assert metric in available_metrics['split']
                evaluation_function, parameters =  available_metrics["split"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters)
 
        # Type 2 : Evaluate loo evaluations
        if len(eval_config.loo_metrics) > 0:
            print('Training loo predictions')
            anti_testset_top_n, testset = generate_loo_top_n(algo, df_ratings, eval_config)
            for metric in eval_config.loo_metrics:
                assert metric in available_metrics['loo']
                evaluation_function, parameters = available_metrics["loo"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)
       
        #Type 3 : Evaluate full evaluations
        if len(eval_config.full_metrics) > 0:
            print('Training full predictions')
            anti_testset_top_n = generate_full_top_n(algo, df_ratings, eval_config)
            for metric in eval_config.full_metrics:
                assert metric in available_metrics['full']
                if metric == 'novelty':
                    evaluation_dict[model_name][metric] = get_novelty(anti_testset_top_n, precomputed_dict["item_to_rank"])
                else:
                    evaluation_function, parameters =  available_metrics["full"][metric]
                    evaluation_dict[model_name][metric] = evaluation_function(
                        anti_testset_top_n,
                        **parameters
    )
       
    return pd.DataFrame.from_dict(evaluation_dict).T
 
 
# Load the ratings from the CSV file, keeping only the userId, movieId and rating columns
df_ratings = pd.read_csv(C.EVIDENCE_PATH / C.RATINGS_FILENAME, usecols=['userId', 'movieId', 'rating'])
 
# Load the ratings through the project loader
# (presumably the positional False is `surprise_format` — confirm in loaders.load_ratings)
df_ratings_test = load_ratings(False)
 
# Initialize an instance of the algorithm (the same instance is re-fitted by each call below)
algo = surprise.SVD()
 
# Test the function generate_split_predictions
predictions_split = generate_split_predictions(algo, df_ratings_test, EvalConfig)
top_n_split = get_top_n(predictions_split, n=EvalConfig.top_n_value)
 
# Test the function generate_loo_top_n (the returned testset is not needed here)
top_n_loo, _ = generate_loo_top_n(algo, df_ratings_test, EvalConfig)
 
# Test the function generate_full_top_n
top_n_full = generate_full_top_n(algo, df_ratings_test, EvalConfig)
 
# Print the results
# NOTE(review): each of these prints a whole per-user dictionary, which produces very long output
print("Split predictions:")
print(top_n_split)
 
print("Leave-One-Out predictions:")
print(top_n_loo)
 
print("Full predictions:")
print(top_n_full)

Split predictions:
defaultdict(<class 'list'>, {646: [(1304, 5.0), (260, 4.846360574879733), (1196, 4.812730321830206), (356, 4.711459202679803), (595, 4.536728910116012), (1028, 4.4967378761943175), (2010, 4.472020531445065), (3255, 4.469366483095488), (1019, 4.435524888648039), (1029, 4.36694106817878), (1265, 4.364925581261103), (3033, 4.324916830041628), (2090, 4.2081415040110794), (368, 4.191581462758218), (2747, 4.180153139580532), (2394, 4.175644965363322), (2012, 4.145383337194421), (1014, 4.1363938014304455), (2011, 4.11062036422398), (2013, 4.0982743329486935), (2424, 4.011700194051805), (1018, 3.986976944554266), (2170, 3.9708876424559953), (1858, 3.965679542934195), (1021, 3.9308382785417613), (2805, 3.914648159454813), (1380, 3.8721017979508896), (813, 3.837768895892422), (1862, 3.8313964556671913), (1372, 3.796561355670266), (3, 3.7081374468448804), (1855, 3.682134913079911), (519, 3.0489410711843017), (1831, 2.9553982286695883), (3354, 2.9152958539833946), (2054, 2.88505

# 2. Evaluation metrics
Implement evaluation metrics for rating predictions (split metrics) and for top-n recommendations (loo and full metrics).

In [3]:
def get_hit_rate(anti_testset_top_n, testset):
    """Compute the hit rate of the top-n recommendations (loo metric).

    A held-out (user, movie) pair counts as a hit when the movie appears in
    that user's top-n list, and as a miss otherwise.

    Args:
        anti_testset_top_n: Mapping from user id to a list of
            `(movie_id, estimated_rating)` tuples.
        testset: Sequence of `(user_id, movie_id, rating)` tuples.

    Returns:
        float: Number of hits divided by `len(testset)`, or 0.0 when the
        testset is empty.
    """
    # Index the held-out movies by user so that each look-up is a set operation.
    held_out_by_user = {}
    for uid, iid, _ in testset:
        held_out_by_user.setdefault(uid, set()).add(iid)

    nothing_held_out = frozenset()
    hit_count = 0
    for uid, recommendations in anti_testset_top_n.items():
        recommended_ids = {iid for iid, _ in recommendations}
        held_out = held_out_by_user.get(uid, nothing_held_out)
        hit_count += len(recommended_ids & held_out)

    if testset:
        return hit_count / len(testset)
    return 0.0


def get_novelty(anti_testset_top_n, item_to_rank):
    """
    Average, over the users, the mean popularity rank of their recommended items.

    Args:
    anti_testset_top_n (dict): Top-n recommendations per user. Keys are user IDs,
                               values are lists of tuples (item_id, estimated_rating).
    item_to_rank (dict): Mapping from item IDs to popularity ranks. An item
                         missing from the mapping contributes a rank of 0.

    Returns:
    float: The average novelty; 0 when there are no users. A user with an empty
           recommendation list contributes 0 to the average.
    """
    user_count = len(anti_testset_top_n)

    novelty_sum = 0
    for recommendations in anti_testset_top_n.values():
        if not recommendations:
            # Nothing recommended: this user adds nothing to the sum.
            continue
        rank_total = sum(item_to_rank.get(item_id, 0) for item_id, _ in recommendations)
        novelty_sum += rank_total / len(recommendations)

    if user_count > 0:
        return novelty_sum / user_count
    return 0

# 3. Evaluation workflow
Load data, evaluate models and save the experimental outcomes

In [5]:
# Registry of the metrics that create_evaluation_report may compute, grouped by
# evaluation type. Each entry maps a metric name to a pair
# (metric function, extra keyword arguments passed to that function).
AVAILABLE_METRICS = {
    "split": {
        "mae": (accuracy.mae, {'verbose': False}),
        "rmse": (accuracy.rmse, {'verbose': False}),
        
    },
    "loo": {

        # Hit rate of the leave-one-out top-n recommendations
        "hit_rate": (get_hit_rate, {}),
    },
    "full": {

        # Novelty of the top-n recommendations computed on the full trainset
        "novelty": (get_novelty, {}),
    }
}

# Load the ratings (in surprise format and as a DataFrame) and the items
sp_ratings = load_ratings(surprise_format=True)
df_ratings = load_ratings(surprise_format=False)
df_items = load_items()

# Quick look at what was loaded
print(sp_ratings)
print(df_items)

# Precompute the information needed by the metrics (item popularity ranks)
precomputed_dict = precompute_information(df_items, df_ratings)
 
# Build the evaluation report for every model listed in EvalConfig
evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)
print(evaluation_report)

# Export the report to a file
# NOTE(review): the recorded run of this cell ended with
# "OSError: Cannot save file into a non-existent directory: 'data/small/evaluations'";
# the output directory apparently has to exist before exporting — confirm in
# loaders.export_evaluation_report.
export_evaluation_report(evaluation_report)

<surprise.dataset.DatasetAutoFolds object at 0x7c1da0277150>
                                                     title  \
movieId                                                      
1                                         Toy Story (1995)   
2                                           Jumanji (1995)   
3                                  Grumpier Old Men (1995)   
4                                 Waiting to Exhale (1995)   
5                       Father of the Bride Part II (1995)   
...                                                    ...   
162672                                 Mohenjo Daro (2016)   
163056                                Shin Godzilla (2016)   
163949   The Beatles: Eight Days a Week - The Touring Y...   
164977                            The Gay Desperado (1936)   
164979                               Women of '69, Unboxed   

                                              genres  
movieId                                               
1        Adventure|Ani

OSError: Cannot save file into a non-existent directory: 'data/small/evaluations'