# Packages

In [16]:
%load_ext autoreload
%autoreload 2
# External import
import numpy as np
import pandas as pd
import random as rd
from surprise import AlgoBase
from surprise import PredictionImpossible
from sklearn.linear_model import LinearRegression
from surprise import model_selection
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from collections import defaultdict
from surprise.dataset import Trainset
import surprise
from surprise import Reader
from surprise import Dataset
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import NearestNeighbors

# Local import
from loaders import load_ratings
from loaders import load_items
from constants import Constant as C
from configs import EvalConfig
from loaders import export_evaluation_report
from models import get_top_n  





The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Explore and select content features

In [17]:
# Load the items data and the ratings data
df_items = load_items()
df_ratings = load_ratings()


Unnamed: 0_level_0,n_character_title
movieId,Unnamed: 1_level_1
3,23
15,23
34,11
59,44
64,20


# Build a content-based model
When ready, move the following class into the *models.py* script

In [18]:
class ContentBased(AlgoBase):
    """Content-based recommender that learns one rating model per user.

    Items are described by a table of content features (see
    ``create_content_features``). ``fit`` trains, for every user of the
    trainset, a regressor mapping the features of the items the user rated to
    the ratings given. ``estimate`` scores an item by feeding its features to
    the user's regressor.

    Supported ``regressor_method`` values:
        'random_score', 'random_sample', 'linear_regression',
        'ridge_regression', 'lasso_regression', 'random_forest',
        'gradient_boosting'.
    Supported ``features_method`` values:
        None, 'title_length', 'release_year', 'genres'.
    """

    # Year used for titles that do not contain a "(YYYY)" pattern.
    DEFAULT_RELEASE_YEAR = 1990.0

    # One factory per learnable regressor. A single table guarantees that
    # fit() and estimate() agree on which methods rely on content features.
    _REGRESSOR_FACTORIES = {
        'linear_regression': lambda: LinearRegression(fit_intercept=True),
        'ridge_regression': lambda: Ridge(alpha=1),
        'lasso_regression': lambda: Lasso(alpha=0.1),
        'random_forest': lambda: RandomForestRegressor(
            n_estimators=5, random_state=42
        ),
        'gradient_boosting': lambda: GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
        ),
    }

    def __init__(self, features_method, regressor_method):
        """Build the item feature table and pre-register the known users.

        Args:
            features_method: name of the content features to build
                (see ``create_content_features``), or None.
            regressor_method: name of the per-user model (see class docstring).
        """
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.content_features = self.create_content_features(features_method)

        # Kept from the original implementation: the full rating set is loaded
        # so that explain() already knows every user before fit() is called.
        # fit() replaces both attributes with the trainset it receives.
        sp_ratings = load_ratings(surprise_format=True)
        self.trainset = sp_ratings.build_full_trainset()
        self.user_profile_explain = {u: None for u in self.trainset.all_users()}

    def create_content_features(self, features_method):
        """Content Analyzer: build the item feature table.

        Args:
            features_method: None, 'title_length', 'release_year' or 'genres'.

        Returns:
            A DataFrame indexed like the items table with one column per
            feature, or None when ``features_method`` is None.

        Raises:
            NotImplementedError: if ``features_method`` is not supported.
        """
        if features_method is None:
            return None

        df_items = load_items()

        # title_length feature
        if features_method == "title_length":
            return df_items[C.LABEL_COL].apply(len).to_frame('n_character_title')

        # release year feature, extracted from the "(YYYY)" part of the title
        if features_method == "release_year":
            release_year = (
                df_items['title']
                .str.extract(r'\((\d{4})\)', expand=False)
                .astype(float)
            )
            return release_year.fillna(self.DEFAULT_RELEASE_YEAR).to_frame('release_year')

        # genres feature
        if features_method == "genres":
            return self._genre_rating_features(df_items)

        raise NotImplementedError(f'Feature method {features_method} not yet implemented')

    def _genre_rating_features(self, df_items):
        """Return genre indicators weighted by the average rating of each genre.

        A movie gets, for every genre it belongs to, the mean of all ratings
        given to movies of that genre, and 0 for genres it does not belong to.
        Genres without any rating get an average of 0.
        """
        genre_flags = df_items['genres'].str.get_dummies(sep='|')
        genre_columns = list(genre_flags.columns)
        df_ratings = load_ratings(surprise_format=False)

        # Attach the genre indicators to each rating so per-genre means can be
        # computed in one vectorised pass instead of rescanning the ratings
        # for every (genre, movie) pair.
        rated = df_ratings[['movieId', 'rating']].merge(
            genre_flags, how='inner', left_on='movieId', right_index=True
        )
        rating_sums = rated[genre_columns].multiply(rated['rating'], axis=0).sum()
        rating_counts = rated[genre_columns].sum()
        # 0 / 0 (genre never rated) yields NaN, which is mapped to 0.
        genre_avg_rating = (rating_sums / rating_counts).fillna(0)

        # Multiply instead of overwrite: overwriting would give every movie
        # the same value in each column and erase the genre membership.
        return genre_flags.mul(genre_avg_rating, axis='columns')

    def _build_user_frame(self, trainset, u):
        """Return the ratings of inner user ``u`` joined with the item features.

        The frame has the columns 'item_id' (raw item id), 'user_ratings' and
        one column per content feature. Features of items missing from the
        feature table are filled with 0 so every regressor can be trained.
        """
        rows = [
            {'item_id': trainset.to_raw_iid(inner_iid), 'user_ratings': rating}
            for inner_iid, rating in trainset.ur[u]
        ]
        df_user = pd.DataFrame(rows, columns=['item_id', 'user_ratings'])
        df_user = df_user.merge(
            self.content_features,
            how='left',
            left_on='item_id',
            right_index=True,
        )
        return df_user.fillna(0)

    def fit(self, trainset):
        """Profile Learner: train one model per user of ``trainset``.

        Args:
            trainset: Surprise trainset to learn from.

        Returns:
            self, following the convention of Surprise's ``AlgoBase.fit``.

        Raises:
            ValueError: if a learnable regressor is requested while no content
                features were built (``features_method`` was None).
        """
        AlgoBase.fit(self, trainset)
        self.trainset = trainset

        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}
        self.user_profile_explain = {u: None for u in trainset.all_users()}

        if self.regressor_method == 'random_sample':
            # The profile is simply the list of ratings the user already gave.
            for u in self.user_profile:
                self.user_profile[u] = [rating for _, rating in trainset.ur[u]]

        elif self.regressor_method in self._REGRESSOR_FACTORIES:
            if self.content_features is None:
                raise ValueError(
                    f'Regressor {self.regressor_method} requires content features'
                )
            make_regressor = self._REGRESSOR_FACTORIES[self.regressor_method]
            for u in self.user_profile:
                df_user = self._build_user_frame(trainset, u)

                # Train on every feature column, in the column order of
                # content_features, so estimate() feeds the same layout.
                feature_columns = [
                    col for col in df_user.columns
                    if col not in ('item_id', 'user_ratings')
                ]
                X = df_user[feature_columns].values
                y = df_user['user_ratings'].values

                regressor = make_regressor()
                regressor.fit(X, y)

                self.user_profile[u] = regressor
                self.user_profile_explain[u] = self.calculate_feature_importance(
                    df_user, regressor
                )

        # 'random_score' (and any unknown method) needs no training.
        return self

    def estimate(self, u, i):
        """Scoring component used for item filtering.

        Args:
            u: inner user id.
            i: inner item id.

        Returns:
            The estimated rating of item ``i`` for user ``u``.

        Raises:
            PredictionImpossible: if the user or item is unknown, if the item
                has no content features, or if the regressor method is not
                supported.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        # The random generator is deliberately not re-seeded here: re-seeding
        # on every call would override any seed set by the caller.
        if self.regressor_method == 'random_score':
            return rd.uniform(0.5, 5)

        if self.regressor_method == 'random_sample':
            return rd.choice(self.user_profile[u])

        if self.regressor_method in self._REGRESSOR_FACTORIES:
            user_regressor = self.user_profile[u]
            raw_item_id = self.trainset.to_raw_iid(i)
            if raw_item_id not in self.content_features.index:
                raise PredictionImpossible('No content features for this item.')

            # A list selector always yields a 2-D block; keep one row only.
            item_features = self.content_features.loc[[raw_item_id]].values[:1]
            return user_regressor.predict(item_features)[0]

        # Returning None here would break Surprise's rating clipping.
        raise PredictionImpossible(
            f'Regressor method {self.regressor_method} is not supported.'
        )

    def calculate_feature_importance(self, df_user, regressor):
        """Calculate the relative importance of each feature for one user.

        Args:
            df_user: per-user frame with 'item_id', 'user_ratings' and the
                feature columns the regressor was trained on.
            regressor: the fitted model of the user.

        Returns:
            A dict mapping each feature name to its share of the total
            importance. Shares are normalised by the sum of absolute values,
            so linear models keep the sign of their coefficients and the
            absolute shares sum to 1. All shares are 0 when the model carries
            no importance information.
        """
        features = df_user.drop(columns=['item_id', 'user_ratings']).columns

        # features importances for linear_regression, ridge regression, lasso regression
        if isinstance(regressor, (LinearRegression, Ridge, Lasso)):
            importance = np.ravel(regressor.coef_)
        # features importances for random forest regression and gradient boosting
        elif isinstance(regressor, (RandomForestRegressor, GradientBoostingRegressor)):
            importance = np.ravel(regressor.feature_importances_)
        # features importances for regressor not mentioned
        else:
            importance = np.zeros(len(features))

        # Normalising by the signed sum lets opposite-signed coefficients
        # cancel out, so the absolute total is used.
        total = np.abs(importance).sum()
        if total == 0:
            return {feature: 0 for feature in features}
        return {
            feature: float(value / total)
            for feature, value in zip(features, importance)
        }

    def explain(self, u):
        """Explain the importance of each feature for a given user.

        Args:
            u: inner user id (the key type used by the trainset).

        Returns:
            The feature-importance dict computed during ``fit``, or None when
            no model was trained for this user.

        Raises:
            ValueError: if the user is unknown.
        """
        if u not in self.user_profile_explain:
            raise ValueError("User profile not found")

        return self.user_profile_explain[u]


The following script tests the ContentBased class

In [21]:
def test_contentbased_class(feature_method, regressor_method):
    """Test the ContentBased class.

    Fits the model on the full trainset, predicts the first (user, item)
    tuple of the anti-testset, then prints the prediction and the feature
    explanation of that user.

    Args:
        feature_method: content features passed to ``ContentBased``.
        regressor_method: regressor name passed to ``ContentBased``.
    """
    sp_ratings = load_ratings(surprise_format=True)
    print(type(sp_ratings))
    train_set = sp_ratings.build_full_trainset()
    content_algo = ContentBased(feature_method, regressor_method)
    content_algo.fit(train_set)

    # Anti-testset entries are (raw user id, raw item id, fill value).
    raw_uid, raw_iid = train_set.build_anti_testset()[0][:2]
    prediction = content_algo.predict(raw_uid, raw_iid)

    # Print prediction
    print(prediction)

    # explain() is keyed by inner user ids, so the raw id must be converted;
    # passing the raw id could show the profile of a different user.
    explanation = content_algo.explain(train_set.to_inner_uid(raw_uid))
    print("Explanation for user:", raw_uid)
    print(explanation)

# Run the test with the release-year feature and the ridge regressor
test_contentbased_class("release_year", "ridge_regression")

'''The random-sample baseline gives a better RMSE than the random-score baseline. This is expected: random sample draws from the ratings the user has already given, whereas random score is based on nothing and returns a purely random rating for each prediction.'''
'''Fitting with fit_intercept=True gives a better RMSE. This is expected: the model can then learn the best intercept term, whereas with fit_intercept=False the regression line is forced through the origin (0, 0).'''

<class 'surprise.dataset.DatasetAutoFolds'>


user: 15         item: 942        r_ui = None   est = 4.47   {'was_impossible': False}
Explanation for user: 15
{'release_year': 1.0}
