# <ins>Filtrage collaboratif : approche hybride et approche modèle</ins>

Il se trouve que l’une des méthodes les plus connues des systèmes de recommandation à filtrage collaboratif est la Factorisation Matricielle, qui trouve son origine dans le concours Netflix par l’équipe ayant fini en 3ème place

Le but de ce notebook est d'entrer dans les détails de la Factorisation Matricielle

In [1]:
import pandas as pd

In [2]:
ratings = pd.read_csv('/home/antoine/PROJET_MLOPS_RECO_MOVIES/data/raw/silver/processed_ratings.csv', dtype={'userid': str, 'movieid': str, 'rating': float})
movies = pd.read_csv('/home/antoine/PROJET_MLOPS_RECO_MOVIES/data/raw/silver/processed_movies.csv', dtype={'movieid': str, 'title': str, 'genres': str})
df = pd.merge(ratings, movies, on='movieid')
df.head()

Unnamed: 0,userid,movieid,rating,timestamp,bayesian_mean,title,genres,year
0,1,2,3.5,1112486027,3.21,Jumanji,"Adventure, Children, Fantasy",1995
1,5,2,3.0,851527569,3.21,Jumanji,"Adventure, Children, Fantasy",1995
2,13,2,3.0,849082742,3.21,Jumanji,"Adventure, Children, Fantasy",1995
3,29,2,3.0,835562174,3.21,Jumanji,"Adventure, Children, Fantasy",1995
4,34,2,3.0,846509384,3.21,Jumanji,"Adventure, Children, Fantasy",1995


In [3]:
# Étape 1 : Compter le nombre de films notés par utilisateur
user_counts = df['userid'].value_counts()
users_with_more_than_10_ratings = user_counts[user_counts > 10].index

# Étape 2 : Compter le nombre de notes par film
movie_counts = df['movieid'].value_counts()
movies_with_at_least_5_ratings = movie_counts[movie_counts >= 5].index

# Étape 3 : Filtrer le DataFrame
df= df[(df['userid'].isin(users_with_more_than_10_ratings)) & (df['movieid'].isin(movies_with_at_least_5_ratings))]

df.shape

(19984024, 8)

In [4]:
df = df.sample(frac=0.05, random_state=42).reset_index(drop=True)

In [5]:
mat_ratings = pd.pivot_table(data = df, values = 'rating', columns = 'title', index = 'userid')

mat_ratings = mat_ratings +1 # On ajoute 1 à toutes les notes pour éviter les problèmes de division par 0

mat_ratings = mat_ratings.fillna(0)

mat_ratings.head()

title,"""Great Performances"" Cats",$ (Dollars),$5 a Day,$9.99,'71,'Hellboy': The Seeds of Creation,'Neath the Arizona Skies,'R Xmas,'Round Midnight,'Salem's Lot,...,[REC]³ 3 Génesis,a/k/a Tommy Chong,eXistenZ,iSteve,loudQUIETloud: A Film About the Pixies,xXx,xXx: State of the Union,¡Three Amigos!,À nos amours,À nous la liberté (Freedom for Us)
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from scipy.sparse import csr_matrix

sparse_ratings = csr_matrix(mat_ratings)

user_ids = mat_ratings.index.tolist()

titles = mat_ratings.columns.tolist()

print(sparse_ratings)

  (0, 476)	5.0
  (0, 9743)	4.5
  (1, 611)	5.0
  (1, 5043)	5.0
  (1, 8744)	4.0
  (1, 11646)	5.0
  (1, 12547)	5.0
  (2, 2960)	5.0
  (2, 7358)	5.0
  (3, 10448)	6.0
  (4, 198)	4.0
  (4, 603)	3.0
  (4, 657)	6.0
  (4, 12985)	4.0
  (5, 10155)	4.0
  (5, 11528)	5.0
  (6, 2529)	4.0
  (6, 6267)	5.0
  (6, 11528)	4.0
  (7, 426)	6.0
  (7, 5304)	5.0
  (7, 9816)	6.0
  (7, 10324)	6.0
  (7, 11341)	6.0
  (7, 11487)	2.0
  :	:
  (125692, 7198)	4.0
  (125692, 9679)	3.0
  (125692, 9713)	3.0
  (125692, 10048)	4.0
  (125692, 11342)	4.0
  (125692, 12691)	4.0
  (125692, 12913)	4.0
  (125692, 13526)	4.0
  (125693, 11523)	5.0
  (125694, 1014)	5.0
  (125694, 1149)	4.0
  (125694, 3768)	4.0
  (125694, 6312)	5.5
  (125694, 9560)	3.0
  (125695, 6303)	5.0
  (125696, 8966)	4.5
  (125696, 11646)	5.5
  (125697, 3851)	4.0
  (125697, 4044)	5.0
  (125697, 6147)	4.0
  (125697, 7106)	5.0
  (125697, 8008)	5.0
  (125697, 10282)	5.5
  (125697, 12668)	5.0
  (125697, 12801)	5.0


In [14]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=20)

ratings_red = svd.fit_transform(sparse_ratings.T)

ratings_red.shape

(13970, 20)

In [15]:
# new_ratings_red = svd.inverse_transform(ratings_red)

from sklearn.metrics.pairwise import cosine_similarity

item_similarity = cosine_similarity(ratings_red)

# item_similarity_2 = cosine_similarity(new_ratings_red)

item_similarity = pd.DataFrame(item_similarity, index = titles, columns = titles)

# item_similarity_2 = pd.DataFrame(item_similarity_2, index = titles, columns = titles)

item_similarity.head()

Unnamed: 0,"""Great Performances"" Cats",$ (Dollars),$5 a Day,$9.99,'71,'Hellboy': The Seeds of Creation,'Neath the Arizona Skies,'R Xmas,'Round Midnight,'Salem's Lot,...,[REC]³ 3 Génesis,a/k/a Tommy Chong,eXistenZ,iSteve,loudQUIETloud: A Film About the Pixies,xXx,xXx: State of the Union,¡Three Amigos!,À nos amours,À nous la liberté (Freedom for Us)
"""Great Performances"" Cats",1.0,0.052147,0.017169,0.210788,0.121287,-0.05826,0.141581,0.03316,0.089811,0.036598,...,0.029101,-0.071382,0.293858,0.009912,0.023548,0.211133,0.13767,0.1734,0.103382,0.08228
$ (Dollars),0.052147,1.0,0.139031,0.518306,0.636881,-0.151537,0.856964,0.231393,0.73604,0.303124,...,0.40296,0.155638,0.66677,0.338482,0.622212,0.623149,0.605975,0.755591,0.47294,0.783517
$5 a Day,0.017169,0.139031,1.0,0.304779,0.33653,0.132147,0.331147,0.15131,0.309278,0.16638,...,0.178439,-0.012922,0.409166,0.08278,0.255733,0.46786,0.31238,0.380589,0.207051,0.360255
$9.99,0.210788,0.518306,0.304779,1.0,0.871817,0.418146,0.803163,0.451161,0.586342,0.260824,...,0.633638,0.004886,0.807008,0.0494,0.5436,0.806287,0.875528,0.7201,0.664493,0.645562
'71,0.121287,0.636881,0.33653,0.871817,1.0,0.332734,0.821548,0.443527,0.575649,0.297635,...,0.646489,0.071436,0.824763,0.079855,0.649711,0.880007,0.88809,0.812484,0.589137,0.640762


In [9]:
import numpy as np
import pickle
import joblib

def pred_item(mat_ratings, item_similarity, k, user_id):
    # Sélectionner dans mat_ratings les films qui n'ont pas été encore lu par le user
    to_predict = mat_ratings.loc[user_id][mat_ratings.loc[user_id]==0]
    # Itérer sur tous ces films
    for i in to_predict.index:
        #Trouver les k films les plus similaires en excluant le film lui-même
        similar_items = item_similarity.loc[i].sort_values(ascending=False)[1:k+1]
        # Calcul de la norme du vecteur similar_items
        norm = np.sum(np.abs(similar_items))
        # Récupérer les notes données par l'utilisateur aux k plus proches voisins
        ratings = mat_ratings[similar_items.index].loc[user_id]
        # Calculer le produit scalaire entre ratings et similar_items
        scalar_prod = np.dot(ratings,similar_items)
        #Calculer la note prédite pour le film i
        pred = scalar_prod / norm
        # Remplacer par la prédiction
        to_predict[i] = pred
    return to_predict

In [10]:
userID = '35022'

# notations au dessus de 4
user_rating_sup4 = df.loc[(df['userid'] == userID) & (df['rating'] > 2)]

user_preferences = user_rating_sup4.sort_values(by = 'rating', ascending = False).head(10)

user_preferences

Unnamed: 0,userid,movieid,rating,timestamp,bayesian_mean,title,genres,year
950736,35022,1289,4.5,1072661795,3.77,Koyaanisqatsi (a.k.a. Koyaanisqatsi: Life Out ...,Documentary,1983
539714,35022,3033,4.0,991938077,3.39,Spaceballs,"Comedy, Sci-Fi",1987
555664,35022,1061,4.0,977526043,3.6,Sleepers,Thriller,1996
551937,35022,562,4.0,977285853,3.72,Welcome to the Dollhouse,"Comedy, Drama",1995
27209,35022,1760,4.0,977288348,2.07,Spice World,Comedy,1997
143659,35022,1671,4.0,977526447,3.13,Deceiver,"Crime, Drama, Thriller",1997
519892,35022,924,4.0,982363377,3.93,2001: A Space Odyssey,"Adventure, Drama, Sci-Fi",1968
362332,35022,432,3.0,977287525,2.76,City Slickers II: The Legend of Curly's Gold,"Adventure, Comedy, Western",1994
396687,35022,107,3.0,977509509,3.21,Muppet Treasure Island,"Adventure, Children, Comedy, Musical",1996
420310,35022,799,3.0,977285365,3.35,"Frighteners, The","Comedy, Horror, Thriller",1996


In [16]:
print("10 recommandations avec item_similarity")
print(pred_item(mat_ratings, item_similarity, 10, userID).sort_values(ascending = False).head(10), end = "\n\n")


10 recommandations avec item_similarity
title
Four Weddings and a Funeral    1.100023
Bad Company                    0.902382
Bait                           0.900458
Heaven & Earth                 0.898015
Crow: City of Angels, The      0.895983
Back to the Future Part III    0.800067
Corrina, Corrina               0.704100
It Could Happen to You         0.702808
Moll Flanders                  0.700675
Mission: Impossible II         0.700237
Name: 35022, dtype: float64



In [13]:
reco_user1 = pred_item(mat_ratings, item_similarity, 10, '501').sort_values(ascending=False).head(10)
print(reco_user1)


title
If Lucy Fell                  0.502727
Dragonheart                   0.500524
Dead Presidents               0.500247
Postman, The (Postino, Il)    0.499885
Mulholland Falls              0.499591
Blown Away                    0.499203
Pinocchio                     0.498937
20,000 Years in Sing Sing     0.498257
Over the Hedge                0.000000
Over the Top                  0.000000
Name: 501, dtype: float64


In [37]:
print("Vos 10 meilleurs notations :", end="\n\n")
best_rating = df[(df['userid'] == '35022') & (df['rating'] > 1)]
best_rating = best_rating.sort_values(
by='rating', ascending=False).head(10)
print(best_rating.title, end="\n\n")
print("Voici nos 10 recommandations pour vous :", end="\n\n")
best_recomm = pred_item(mat_ratings, item_similarity,  5, '35022')
best_recomm = best_recomm.sort_values(ascending = False).head(10)
print(best_recomm)

Vos 10 meilleurs notations :

477746                                               Avatar
962713              Grave of the Fireflies (Hotaru no haka)
58939                                               Sabrina
295937                                           District 9
418084                                               Powder
600372                                     Jungle Book, The
474124                                    Never Been Kissed
34003                                 2001: A Space Odyssey
131487    Adventures of Priscilla, Queen of the Desert, The
172700                                Africa: The Serengeti
Name: title, dtype: object

Voici nos 10 recommandations pour vous :

title
Fair Game                                         1.199763
Purge, The                                        1.199728
Doulos, Le                                        1.199666
Grand Hotel                                       1.199458
Wet Hot American Summer                           1.199042

## <ins>UTILISATION DE SURPRISE SVD</ins>

In [47]:
from surprise import Reader
from surprise import Dataset

reader = Reader(rating_scale = (0, 5))

df_surprise = Dataset.load_from_df(df[["userid", "title", "rating"]], reader=reader)

In [48]:
user_id = '35022'

In [40]:
from surprise import NormalPredictor
from surprise.model_selection import cross_validate

normpred = NormalPredictor()

cross_validate(normpred, df_surprise, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4380  1.4359  1.4328  1.4370  1.4365  1.4360  0.0018  
MAE (testset)     1.1473  1.1462  1.1422  1.1454  1.1454  1.1453  0.0017  
Fit time          1.04    1.31    1.29    1.28    1.26    1.24    0.10    
Test time         1.81    1.49    1.42    1.46    1.44    1.53    0.15    


{'test_rmse': array([1.43803031, 1.43594969, 1.43276611, 1.43698179, 1.43651469]),
 'test_mae': array([1.14730282, 1.14620217, 1.14222682, 1.14536291, 1.14540409]),
 'fit_time': (1.0430779457092285,
  1.3093020915985107,
  1.2891581058502197,
  1.2826848030090332,
  1.255103588104248),
 'test_time': (1.8144383430480957,
  1.4862446784973145,
  1.4241828918457031,
  1.463242769241333,
  1.4431817531585693)}

In [49]:
# Construire le jeu d'entraînement complet à partir du DataFrame df_surprise
train_set = df_surprise.build_full_trainset()

# Initialiser une liste vide pour stocker les paires (utilisateur, movie) pour le jeu "anti-testset"
anti_testset = []

# Convertir l'ID de l'utilisateur externe en l'ID interne utilisé par Surprise
targetUser = train_set.to_inner_uid(user_id)

# Obtenir la valeur de remplissage à utiliser (moyenne globale des notes du jeu d'entraînement)
moyenne = train_set.global_mean

# Obtenir les évaluations de l'utilisateur cible pour les movies

user_note = train_set.ur[targetUser]

# Extraire la liste des movies notés par l'utilisateur
user_movie = [item for (item,_) in (user_note)]

# Obtenir toutes les notations du jeu d'entraînement
ratings = train_set.all_ratings()


# Boucle sur tous les items du jeu d'entraînement
for movie in train_set.all_items():

# Si l'item n'a pas été noté par l'utilisateur
    if movie not in user_movie:

        # Ajouter la paire (utilisateur, movie, valeur de remplissage) à la liste "anti-testset"
        anti_testset.append((user_id, train_set.to_raw_iid(movie), moyenne))

In [42]:
# Effectuer les prédictions sur l'ensemble "anti-testset_user" en utilisant le modèle (non spécifié dans le code)

predictions = normpred.test(anti_testset)

# Convertir les prédictions en un DataFrame pandas
predictions = pd.DataFrame(predictions)

# Trier les prédictions par la colonne 'est' (estimation) en ordre décroissant
predictions.sort_values(by=['est'], inplace=True, ascending=False)

# Afficher les 10 meilleures prédictions
predictions.head(10)

Unnamed: 0,uid,iid,r_ui,est,details
3011,35022,Sullivan's Travels,3.5271,5.0,{'was_impossible': False}
13663,35022,Tanguy,3.5271,5.0,{'was_impossible': False}
7106,35022,Class Reunion,3.5271,5.0,{'was_impossible': False}
3990,35022,Hey Arnold! The Movie,3.5271,5.0,{'was_impossible': False}
7085,35022,Cool as Ice,3.5271,5.0,{'was_impossible': False}
14510,35022,Main Street,3.5271,5.0,{'was_impossible': False}
12311,35022,Futureworld,3.5271,5.0,{'was_impossible': False}
9206,35022,Ruby,3.5271,5.0,{'was_impossible': False}
3989,35022,Monty Python Live at the Hollywood Bowl,3.5271,5.0,{'was_impossible': False}
9217,35022,Certified Copy (Copie conforme),3.5271,5.0,{'was_impossible': False}


In [50]:
from surprise import SVD

svd = SVD()

cross_validate(svd, df_surprise, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7875  0.7876  0.7882  0.7885  0.7878  0.7879  0.0004  
MAE (testset)     0.5991  0.5994  0.5997  0.6000  0.5994  0.5995  0.0003  
Fit time          663.22  633.28  597.24  559.28  574.26  605.46  38.17   
Test time         91.50   67.61   47.83   51.68   72.88   66.30   15.71   


{'test_rmse': array([0.78748295, 0.78756427, 0.78823107, 0.78852963, 0.78780605]),
 'test_mae': array([0.59911483, 0.59935023, 0.59973963, 0.60003934, 0.5994472 ]),
 'fit_time': (663.2172005176544,
  633.2800345420837,
  597.2385222911835,
  559.281792640686,
  574.259927034378),
 'test_time': (91.4967794418335,
  67.61434936523438,
  47.83231854438782,
  51.676172971725464,
  72.87640762329102)}

In [51]:
predictionsSVD = svd.test(anti_testset)

# Convertir les prédictions en un DataFrame pandas
predictionsSVD = pd.DataFrame(predictions)

# Trier les prédictions par la colonne 'est' (estimation) en ordre décroissant
predictionsSVD.sort_values(by=['est'], inplace=True, ascending=False)

# Afficher les 10 meilleures prédictions
predictionsSVD.head(10)

Unnamed: 0,uid,iid,r_ui,est,details
3011,35022,Sullivan's Travels,3.5271,5.0,{'was_impossible': False}
2113,35022,Wild Hogs,3.5271,5.0,{'was_impossible': False}
11641,35022,Women Robbers (Diebinnen),3.5271,5.0,{'was_impossible': False}
11639,35022,Gone,3.5271,5.0,{'was_impossible': False}
2114,35022,Revenge of the Nerds,3.5271,5.0,{'was_impossible': False}
3020,35022,Moonrise Kingdom,3.5271,5.0,{'was_impossible': False}
11636,35022,"Monster Club, The",3.5271,5.0,{'was_impossible': False}
11635,35022,Castle Keep,3.5271,5.0,{'was_impossible': False}
11688,35022,"Superweib, Das",3.5271,5.0,{'was_impossible': False}
5528,35022,Carriers,3.5271,5.0,{'was_impossible': False}


In [52]:
import pickle
# Entraîner le modèle SVD
svd = SVD()
trainset = df_surprise.build_full_trainset()
svd.fit(trainset)

# Sauvegarder le modèle avec pickle
with open('/home/antoine/PROJET_MLOPS_RECO_MOVIES/data/models/svd_model.pkl', 'wb') as f:
    pickle.dump(svd, f)