# Construction d'un système de recommandation

Nous avons décidé d'orienter notre projet sur la recommendation de films.
En effet durant ce confinement, nous avons eu le temps de visionner beaucoup de films,
mais nous nous sommes rendus compte que nous passions quasiment autant de temps
à choisir le film qu'à le regarder. D'où la nécessité de créer un système de re-
commendations afin d'optimiser notre temps de visionnage.
Nous avons chercher une base de données assez exploitable afin de mener à bien
notre projet. Nous nous sommes basés sur la base de données de 'The Movies Dataset'.


In [137]:
import numpy as np
import pandas as pd
import math

## Fetching data

In [195]:
movies = pd.read_csv("movies_metadata.csv")
movies.head()
print(movies.shape)

(45466, 24)


In [171]:
movies = movies[~movies.id.duplicated()]

In [200]:
movies = movies.astype({'id': 'int64'})

ValueError: invalid literal for int() with base 10: '2012-09-29'

In [85]:
ratings = pd.read_csv("ratings_small.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [86]:
# ne pas rerun !
ratings = ratings.drop(columns=['timestamp'])
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [89]:
print(min(ratings.rating), max(ratings.rating))
ratings.describe()
ratings.dtypes

0.5 5.0


userId       int64
movieId      int64
rating     float64
dtype: object

In [94]:
ratings = ratings.astype({'movieId': 'str'})
ratings.dtypes

userId       int64
movieId     object
rating     float64
dtype: object

In [172]:
movies_cleaned = movies.loc[:, ['id', 'title']]
movies_cleaned = movies_cleaned.loc[movies_cleaned['id'].isin(ratings.movieId)]

In [173]:
print(movies_cleaned.shape)
movies_cleaned.head()
movies = movies_cleaned

(2830, 2)


In [174]:
movies.tail()

Unnamed: 0,id,title
45318,80831,Sang Pemimpi
45353,3104,Frankenstein Created Woman
45403,64197,Travelling with Pets
45406,98604,Cinderella
45450,49280,The One-Man Band


In [131]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [246]:
nbPers = len(ratings.userId.unique())
nbMovi = len(ratings.movieId.unique())

## User Based

In [147]:
def cor(su, sv):
    return np.dot(su, sv) / math.sqrt(np.dot(su, su) * np.dot(sv, sv))

In [186]:
def mean_rat(uid):
    n = ratings.loc[ratings['userId'] == uid].count().loc['userId']
    s = ratings.loc[ratings['userId'] == uid].sum().loc['rating']
    return s / n

In [281]:
mean = ratings.loc[:, ['userId']]
mean = mean[~mean.userId.duplicated()]
mean['mu'] = mean['userId'].map(lambda uid : mean_rat(uid))

faire que userId soit l' index, acces plus efficace

In [282]:
mean = mean.set_index('userId')

In [283]:
mean.head()

Unnamed: 0_level_0,mu
userId,Unnamed: 1_level_1
1,2.55
2,3.486842
3,3.568627
4,4.348039
5,3.91


In [292]:
def get_mean(x):
    return mean.at[x, 'mu']

In [293]:
def normalize(df):
    df['rating_norm'] = df[['userId', 'rating']].apply(lambda row : row['rating'] / get_mean(row['userId']), axis=1)

In [294]:
test = ratings.head(n = 30)
test

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
5,1,1263,2.0
6,1,1287,2.0
7,1,1293,2.0
8,1,1339,3.5
9,1,1343,2.0


In [295]:
normalize(test)
mean.at[1, 'mu']

ValueError: ('At based indexing on an integer index can only have integer indexers', 'occurred at index 0')

In [249]:
normalize(ratings)

KeyboardInterrupt: 

In [248]:
cor_matrix = np.array((nbPers, nbPers))
for u in ratings.userId:
    for v in ratings.userId:
        su = ratings.loc[ratings['userId'] == u].loc[:, 'rating_norm']
        sv = ratings.loc[ratings['userId'] == v].loc[:, 'rating_norm']
        cor_matrix[u-1, v-1] = cor(su, sv)

IndexError: too many indices for array