# <ins>Filtrage collaboratif : approche hybride et approche modèle</ins>

Il se trouve que l’une des méthodes les plus connues des systèmes de recommandation à filtrage collaboratif est la Factorisation Matricielle, qui trouve son origine dans le concours Netflix par l’équipe ayant fini en 3ème place

Le but de ce notebook est d'entrer dans les détails de la Factorisation Matricielle ainsi que de vous guider au cours de son implémentation.

In [1]:
import pandas as pd

In [45]:
ratings = pd.read_csv('/home/antoine/PROJET_MLOPS_RECO_MOVIES/data/raw/silver/processed_ratings.csv', dtype={'userid': str, 'movieid': str, 'rating': float})
movies = pd.read_csv('/home/antoine/PROJET_MLOPS_RECO_MOVIES/data/raw/silver/processed_movies.csv', dtype={'movieid': str, 'title': str, 'genres': str})
df = pd.merge(ratings, movies, on='movieid')
df.head()

Unnamed: 0,userid,movieid,rating,timestamp,bayesian_mean,title,genres,year
0,1,2,3.5,1112486027,3.21,Jumanji,"Adventure, Children, Fantasy",1995
1,5,2,3.0,851527569,3.21,Jumanji,"Adventure, Children, Fantasy",1995
2,13,2,3.0,849082742,3.21,Jumanji,"Adventure, Children, Fantasy",1995
3,29,2,3.0,835562174,3.21,Jumanji,"Adventure, Children, Fantasy",1995
4,34,2,3.0,846509384,3.21,Jumanji,"Adventure, Children, Fantasy",1995


In [46]:
# Étape 1 : Compter le nombre de films notés par utilisateur
user_counts = df['userid'].value_counts()
users_with_more_than_10_ratings = user_counts[user_counts > 10].index

# Étape 2 : Compter le nombre de notes par film
movie_counts = df['movieid'].value_counts()
movies_with_at_least_2_ratings = movie_counts[movie_counts >= 2].index

# Étape 3 : Filtrer le DataFrame
df= df[(df['userid'].isin(users_with_more_than_10_ratings)) & (df['movieid'].isin(movies_with_at_least_2_ratings))]

df.shape

(19996291, 8)

In [29]:
df = df.sample(frac=0.05, random_state=42).reset_index(drop=True)

In [32]:
df[df['userid'] == '35022'].head(20)

Unnamed: 0,userid,movieid,rating,timestamp,bayesian_mean,title,genres,year
0,35022,4039,3.0,982969076,3.18,Annie,"Children, Musical",1982
34003,35022,924,4.0,982363377,3.93,2001: A Space Odyssey,"Adventure, Drama, Sci-Fi",1968
58939,35022,7,4.0,982968367,3.35,Sabrina,"Comedy, Romance",1995
131487,35022,345,4.0,991716714,3.61,"Adventures of Priscilla, Queen of the Desert, The","Comedy, Drama",1994
164088,35022,4823,3.0,1024248311,3.28,Serendipity,"Comedy, Romance",2001
172700,35022,4458,3.5,1061510005,3.28,Africa: The Serengeti,"Documentary, IMAX",1994
295937,35022,70286,4.0,1263693176,3.81,District 9,"Mystery, Sci-Fi, Thriller",2009
350425,35022,3510,3.0,977880212,3.53,Frequency,"Drama, Thriller",2000
386393,35022,1644,2.0,977526865,2.54,I Know What You Did Last Summer,"Horror, Mystery, Thriller",1997
418084,35022,24,4.0,977620732,3.19,Powder,"Drama, Sci-Fi",1995


In [9]:
mat_ratings = pd.pivot_table(data = df, values = 'rating', columns = 'title', index = 'userid')

mat_ratings = mat_ratings +1

mat_ratings = mat_ratings.fillna(0)

mat_ratings.head()

title,'Round Midnight,'Salem's Lot,'Til There Was You,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,"...All the Marbles (California Dolls, The)",...And Justice for All,"1, 2, 3, Sun (Un, deuz, trois, soleil)",...,Zulu,[REC],[REC]²,a/k/a Tommy Chong,eXistenZ,loudQUIETloud: A Film About the Pixies,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:

from scipy.sparse import csr_matrix

sparse_ratings = csr_matrix(mat_ratings)

user_ids = mat_ratings.index.tolist()

titles = mat_ratings.columns.tolist()

print(sparse_ratings)

  (0, 275)	5.0
  (0, 5745)	4.5
  (1, 1643)	5.0
  (2, 1499)	6.0
  (3, 1623)	4.0
  (4, 593)	4.0
  (5, 632)	1.5
  (5, 2492)	5.5
  (5, 4511)	5.5
  (5, 7541)	6.0
  (6, 2007)	6.0
  (7, 2560)	4.0
  (7, 5704)	5.0
  (8, 1403)	4.5
  (8, 3819)	4.5
  (9, 125)	4.5
  (9, 2962)	4.5
  (9, 5128)	3.0
  (9, 6851)	4.0
  (10, 158)	6.0
  (10, 1067)	5.0
  (10, 3215)	5.0
  (10, 4160)	3.0
  (10, 6083)	3.0
  (10, 7175)	3.0
  :	:
  (52204, 4989)	4.0
  (52204, 5041)	5.0
  (52205, 6669)	4.0
  (52205, 6896)	6.0
  (52206, 2032)	5.0
  (52206, 7944)	5.0
  (52207, 4983)	2.0
  (52208, 2739)	4.5
  (52208, 6519)	5.0
  (52209, 3971)	5.0
  (52210, 1544)	3.0
  (52211, 7334)	5.0
  (52212, 3808)	6.0
  (52213, 727)	4.0
  (52213, 5702)	5.5
  (52213, 6126)	5.5
  (52213, 6189)	5.5
  (52213, 8035)	5.0
  (52214, 235)	5.0
  (52214, 557)	4.0
  (52214, 2816)	3.0
  (52214, 4135)	5.0
  (52215, 6597)	6.0
  (52216, 1333)	5.0
  (52216, 7455)	5.0


In [11]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=12)

ratings_red = svd.fit_transform(sparse_ratings.T)

ratings_red.shape

(8183, 12)

In [12]:
new_ratings_red = svd.inverse_transform(ratings_red)

from sklearn.metrics.pairwise import cosine_similarity

item_similarity = cosine_similarity(ratings_red)

item_similarity_2 = cosine_similarity(new_ratings_red)

item_similarity = pd.DataFrame(item_similarity, index = titles, columns = titles)

item_similarity_2 = pd.DataFrame(item_similarity_2, index = titles, columns = titles)

item_similarity.head()

Unnamed: 0,'Round Midnight,'Salem's Lot,'Til There Was You,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,"...All the Marbles (California Dolls, The)",...And Justice for All,"1, 2, 3, Sun (Un, deuz, trois, soleil)",...,Zulu,[REC],[REC]²,a/k/a Tommy Chong,eXistenZ,loudQUIETloud: A Film About the Pixies,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
'Round Midnight,1.0,0.34133,-0.157915,-0.64184,-0.512502,-0.129918,-0.509195,-0.538189,-0.074259,-0.588295,...,-0.712704,0.256875,-0.28866,-0.556862,0.272821,-0.447906,-0.13411,-0.212307,-0.076301,0.738794
'Salem's Lot,0.34133,1.0,-0.056977,0.106324,-0.402607,0.453242,0.119097,-0.501862,0.333672,-0.374412,...,-0.060097,0.101423,0.17198,-0.431329,-0.09769,0.456567,-0.197975,0.209965,0.205204,0.653944
'Til There Was You,-0.157915,-0.056977,1.0,0.559907,-0.000785,0.646268,0.194223,0.164527,0.712861,-0.41546,...,0.415786,0.202352,0.774429,0.200481,0.531889,-0.060353,0.3644,0.557076,0.179497,-0.077726
"'burbs, The",-0.64184,0.106324,0.559907,1.0,0.075531,0.750641,0.659492,0.434242,0.606146,-0.115571,...,0.931975,-0.33485,0.825394,0.342563,0.160055,0.593549,0.442944,0.770448,0.296456,-0.169018
'night Mother,-0.512502,-0.402607,-0.000785,0.075531,1.0,-0.286015,0.20719,0.494215,-0.174089,0.646046,...,0.303255,-0.154668,-0.139413,0.17404,-0.490405,0.387868,0.056219,-0.026881,0.255433,-0.387793


In [13]:
import numpy as np

def pred_item(mat_ratings, item_similarity, k, user_id):

# Sélectionner dans mat_ratings les livres qui n'ont pas été encore lu par le user
    to_predict = mat_ratings.loc[user_id][mat_ratings.loc[user_id]==0]
    # Itérer sur tous ces livres
    for i in to_predict.index:
        #Trouver les k livres les plus similaires en excluant le livre lui-même
        similar_items = item_similarity.loc[i].sort_values(ascending=False)[1:k+1]
        # Calcul de la norme du vecteur similar_items
        norm = np.sum(np.abs(similar_items))
        # Récupérer les notes données par l'utilisateur aux k plus proches voisins
        ratings = mat_ratings[similar_items.index].loc[user_id]
        # Calculer le produit scalaire entre ratings et similar_items
        scalar_prod = np.dot(ratings,similar_items)
        #Calculer la note prédite pour le film i
        pred = scalar_prod / norm
        # Remplacer par la prédiction
        to_predict[i] = pred
    return to_predict

In [33]:
userID = '35022'

# notations au dessus de 4
user_rating_sup4 = df.loc[(df['userid'] == userID) & (df['rating'] > 2)]

user_preferences = user_rating_sup4.sort_values(by = 'rating', ascending = False).head(10)

user_preferences

Unnamed: 0,userid,movieid,rating,timestamp,bayesian_mean,title,genres,year
962713,35022,5690,5.0,1038443147,3.91,Grave of the Fireflies (Hotaru no haka),"Animation, Drama, War",1988
477746,35022,72998,5.0,1263693006,3.73,Avatar,"Action, Adventure, Sci-Fi, IMAX",2009
600372,35022,362,4.0,977509354,3.37,"Jungle Book, The","Adventure, Children, Romance",1994
58939,35022,7,4.0,982968367,3.35,Sabrina,"Comedy, Romance",1995
131487,35022,345,4.0,991716714,3.61,"Adventures of Priscilla, Queen of the Desert, The","Comedy, Drama",1994
295937,35022,70286,4.0,1263693176,3.81,District 9,"Mystery, Sci-Fi, Thriller",2009
418084,35022,24,4.0,977620732,3.19,Powder,"Drama, Sci-Fi",1995
34003,35022,924,4.0,982363377,3.93,2001: A Space Odyssey,"Adventure, Drama, Sci-Fi",1968
474124,35022,2581,4.0,1137901274,3.06,Never Been Kissed,"Comedy, Romance",1999
172700,35022,4458,3.5,1061510005,3.28,Africa: The Serengeti,"Documentary, IMAX",1994


In [34]:
print("10 recommandations avec item_similarity")
print(pred_item(mat_ratings, item_similarity, 4, userID).
sort_values(ascending = False).head(10), end = "\n\n")
print("10 recommandations avec item_similarity_2")
print(pred_item(mat_ratings, item_similarity_2, 4, userID).sort_values(ascending = False).head(10), end = "\n\n")

10 recommandations avec item_similarity
title
Fair Game                                         1.499479
Doulos, Le                                        1.499477
Wet Hot American Summer                           1.497774
Terrorist, The (a.k.a. Malli) (Theeviravaathi)    1.383298
Downfall (Untergang, Der)                         1.367051
Reservoir Dogs                                    1.362721
Garbage Pail Kids Movie, The                      1.266540
Under the Rainbow                                 1.252091
Just Go with It                                   1.243377
Waiting for Superman                              1.226985
Name: 35022, dtype: float64

10 recommandations avec item_similarity_2
title
Fair Game                                         1.499479
Doulos, Le                                        1.499477
Wet Hot American Summer                           1.497774
Terrorist, The (a.k.a. Malli) (Theeviravaathi)    1.383298
Downfall (Untergang, Der)                         1

In [36]:
reco_user1 = pred_item(mat_ratings, item_similarity, 3, '35022').sort_values(ascending=False).head(10)
print(reco_user1)


reco_user1bis = pred_item(mat_ratings, item_similarity_2, 3, '35022').sort_values(ascending=False).head(10)
print(reco_user1bis)

title
Fair Game                                         1.998873
Terrorist, The (a.k.a. Malli) (Theeviravaathi)    1.840799
Downfall (Untergang, Der)                         1.815160
Reservoir Dogs                                    1.810857
Garbage Pail Kids Movie, The                      1.686004
Under the Rainbow                                 1.663698
Just Go with It                                   1.653862
Waiting for Superman                              1.617960
Book of Life, The                                 1.617482
Warrior                                           1.616641
Name: 35022, dtype: float64
title
Fair Game                                         1.998873
Terrorist, The (a.k.a. Malli) (Theeviravaathi)    1.840799
Downfall (Untergang, Der)                         1.815160
Reservoir Dogs                                    1.810857
Garbage Pail Kids Movie, The                      1.686004
Under the Rainbow                                 1.663698
Just Go with It 

In [37]:
print("Vos 10 meilleurs notations :", end="\n\n")
best_rating = df[(df['userid'] == '35022') & (df['rating'] > 1)]
best_rating = best_rating.sort_values(
by='rating', ascending=False).head(10)
print(best_rating.title, end="\n\n")
print("Voici nos 10 recommandations pour vous :", end="\n\n")
best_recomm = pred_item(mat_ratings, item_similarity,  5, '35022')
best_recomm = best_recomm.sort_values(ascending = False).head(10)
print(best_recomm)

Vos 10 meilleurs notations :

477746                                               Avatar
962713              Grave of the Fireflies (Hotaru no haka)
58939                                               Sabrina
295937                                           District 9
418084                                               Powder
600372                                     Jungle Book, The
474124                                    Never Been Kissed
34003                                 2001: A Space Odyssey
131487    Adventures of Priscilla, Queen of the Desert, The
172700                                Africa: The Serengeti
Name: title, dtype: object

Voici nos 10 recommandations pour vous :

title
Fair Game                                         1.199763
Purge, The                                        1.199728
Doulos, Le                                        1.199666
Grand Hotel                                       1.199458
Wet Hot American Summer                           1.199042

## <ins>UTILISATION DE SURPRISE SVD</ins>

In [47]:
from surprise import Reader
from surprise import Dataset

reader = Reader(rating_scale = (0, 5))

df_surprise = Dataset.load_from_df(df[["userid", "title", "rating"]], reader=reader)

In [48]:
user_id = '35022'

In [40]:
from surprise import NormalPredictor
from surprise.model_selection import cross_validate

normpred = NormalPredictor()

cross_validate(normpred, df_surprise, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4380  1.4359  1.4328  1.4370  1.4365  1.4360  0.0018  
MAE (testset)     1.1473  1.1462  1.1422  1.1454  1.1454  1.1453  0.0017  
Fit time          1.04    1.31    1.29    1.28    1.26    1.24    0.10    
Test time         1.81    1.49    1.42    1.46    1.44    1.53    0.15    


{'test_rmse': array([1.43803031, 1.43594969, 1.43276611, 1.43698179, 1.43651469]),
 'test_mae': array([1.14730282, 1.14620217, 1.14222682, 1.14536291, 1.14540409]),
 'fit_time': (1.0430779457092285,
  1.3093020915985107,
  1.2891581058502197,
  1.2826848030090332,
  1.255103588104248),
 'test_time': (1.8144383430480957,
  1.4862446784973145,
  1.4241828918457031,
  1.463242769241333,
  1.4431817531585693)}

In [49]:
# Construire le jeu d'entraînement complet à partir du DataFrame df_surprise
train_set = df_surprise.build_full_trainset()

# Initialiser une liste vide pour stocker les paires (utilisateur, movie) pour le jeu "anti-testset"
anti_testset = []

# Convertir l'ID de l'utilisateur externe en l'ID interne utilisé par Surprise
targetUser = train_set.to_inner_uid(user_id)

# Obtenir la valeur de remplissage à utiliser (moyenne globale des notes du jeu d'entraînement)
moyenne = train_set.global_mean

# Obtenir les évaluations de l'utilisateur cible pour les movies

user_note = train_set.ur[targetUser]

# Extraire la liste des movies notés par l'utilisateur
user_movie = [item for (item,_) in (user_note)]

# Obtenir toutes les notations du jeu d'entraînement
ratings = train_set.all_ratings()


# Boucle sur tous les items du jeu d'entraînement
for movie in train_set.all_items():

# Si l'item n'a pas été noté par l'utilisateur
    if movie not in user_movie:

# Ajouter la paire (utilisateur, movie, valeur de remplissage) à la liste "anti-testset"
        anti_testset.append((user_id, train_set.to_raw_iid(movie), moyenne))

In [42]:
# Effectuer les prédictions sur l'ensemble "anti-testset_user" en utilisant le modèle (non spécifié dans le code)

predictions = normpred.test(anti_testset)

# Convertir les prédictions en un DataFrame pandas
predictions = pd.DataFrame(predictions)

# Trier les prédictions par la colonne 'est' (estimation) en ordre décroissant
predictions.sort_values(by=['est'], inplace=True, ascending=False)

# Afficher les 10 meilleures prédictions
predictions.head(10)

Unnamed: 0,uid,iid,r_ui,est,details
3011,35022,Sullivan's Travels,3.5271,5.0,{'was_impossible': False}
13663,35022,Tanguy,3.5271,5.0,{'was_impossible': False}
7106,35022,Class Reunion,3.5271,5.0,{'was_impossible': False}
3990,35022,Hey Arnold! The Movie,3.5271,5.0,{'was_impossible': False}
7085,35022,Cool as Ice,3.5271,5.0,{'was_impossible': False}
14510,35022,Main Street,3.5271,5.0,{'was_impossible': False}
12311,35022,Futureworld,3.5271,5.0,{'was_impossible': False}
9206,35022,Ruby,3.5271,5.0,{'was_impossible': False}
3989,35022,Monty Python Live at the Hollywood Bowl,3.5271,5.0,{'was_impossible': False}
9217,35022,Certified Copy (Copie conforme),3.5271,5.0,{'was_impossible': False}


In [50]:
from surprise import SVD

svd = SVD()

cross_validate(svd, df_surprise, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7875  0.7876  0.7882  0.7885  0.7878  0.7879  0.0004  
MAE (testset)     0.5991  0.5994  0.5997  0.6000  0.5994  0.5995  0.0003  
Fit time          663.22  633.28  597.24  559.28  574.26  605.46  38.17   
Test time         91.50   67.61   47.83   51.68   72.88   66.30   15.71   


{'test_rmse': array([0.78748295, 0.78756427, 0.78823107, 0.78852963, 0.78780605]),
 'test_mae': array([0.59911483, 0.59935023, 0.59973963, 0.60003934, 0.5994472 ]),
 'fit_time': (663.2172005176544,
  633.2800345420837,
  597.2385222911835,
  559.281792640686,
  574.259927034378),
 'test_time': (91.4967794418335,
  67.61434936523438,
  47.83231854438782,
  51.676172971725464,
  72.87640762329102)}

In [51]:
predictionsSVD = svd.test(anti_testset)

# Convertir les prédictions en un DataFrame pandas
predictionsSVD = pd.DataFrame(predictions)

# Trier les prédictions par la colonne 'est' (estimation) en ordre décroissant
predictionsSVD.sort_values(by=['est'], inplace=True, ascending=False)

# Afficher les 10 meilleures prédictions
predictionsSVD.head(10)

Unnamed: 0,uid,iid,r_ui,est,details
3011,35022,Sullivan's Travels,3.5271,5.0,{'was_impossible': False}
2113,35022,Wild Hogs,3.5271,5.0,{'was_impossible': False}
11641,35022,Women Robbers (Diebinnen),3.5271,5.0,{'was_impossible': False}
11639,35022,Gone,3.5271,5.0,{'was_impossible': False}
2114,35022,Revenge of the Nerds,3.5271,5.0,{'was_impossible': False}
3020,35022,Moonrise Kingdom,3.5271,5.0,{'was_impossible': False}
11636,35022,"Monster Club, The",3.5271,5.0,{'was_impossible': False}
11635,35022,Castle Keep,3.5271,5.0,{'was_impossible': False}
11688,35022,"Superweib, Das",3.5271,5.0,{'was_impossible': False}
5528,35022,Carriers,3.5271,5.0,{'was_impossible': False}


In [52]:
import pickle
# Entraîner le modèle SVD
svd = SVD()
trainset = df_surprise.build_full_trainset()
svd.fit(trainset)

# Sauvegarder le modèle avec pickle
with open('/home/antoine/PROJET_MLOPS_RECO_MOVIES/data/models/svd_model.pkl', 'wb') as f:
    pickle.dump(svd, f)