# Construction d'un système de recommandation

Nous avons décidé d'orienter notre projet sur la recommendation de films.
En effet durant ce confinement, nous avons eu le temps de visionner beaucoup de films,
mais nous nous sommes rendus compte que nous passions quasiment autant de temps
à choisir le film qu'à le regarder. D'où la nécessité de créer un système de re-
commendations afin d'optimiser notre temps de visionnage.
Nous avons chercher une base de données assez exploitable afin de mener à bien
notre projet. Nous nous sommes basés sur la base de données de 'The Movies Dataset'.


In [137]:
import numpy as np
import pandas as pd
import math

## Fetching data

In [296]:
movies = pd.read_csv("movies_metadata.csv")
movies.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [317]:
movies.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [316]:
movies = movies[~movies.id.duplicated()
i = movies[movies.id.str.contains("-")].index
movies.drop(i, inplace=True)
movies = movies.astype({'id': 'int64'})

In [85]:
ratings = pd.read_csv("ratings_small.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [86]:
# ne pas rerun !
ratings = ratings.drop(columns=['timestamp'])
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [326]:
ratings[(ratings['userId'] == 1) & (ratings['movieId'] == 31)]

Unnamed: 0,userId,movieId,rating


In [89]:
print(min(ratings.rating), max(ratings.rating))
ratings.describe()
ratings.dtypes

0.5 5.0


userId       int64
movieId      int64
rating     float64
dtype: object

In [327]:
movies_cleaned = movies.loc[:, ['id', 'title']]
movies_cleaned = movies_cleaned.loc[movies_cleaned['id'].isin(ratings.movieId)]

In [328]:
print(movies_cleaned.shape)
movies_cleaned.head()
movies = movies_cleaned

(2831, 2)


In [246]:
nbPers = len(ratings.userId.unique())
nbMovi = len(ratings.movieId.unique())

## User Based

In [147]:
def cor(su, sv):
    return np.dot(su, sv) / math.sqrt(np.dot(su, su) * np.dot(sv, sv))

In [186]:
def mean_rat(uid):
    n = ratings.loc[ratings['userId'] == uid].count().loc['userId']
    s = ratings.loc[ratings['userId'] == uid].sum().loc['rating']
    return s / n

In [281]:
mean = ratings.loc[:, ['userId']]
mean = mean[~mean.userId.duplicated()]
mean['mu'] = mean['userId'].map(lambda uid : mean_rat(uid))

In [282]:
mean = mean.set_index('userId')

In [283]:
mean.head()

Unnamed: 0_level_0,mu
userId,Unnamed: 1_level_1
1,2.55
2,3.486842
3,3.568627
4,4.348039
5,3.91


In [376]:
ratings[ratings['userId'] == None]
mean.loc[3]['mu']

3.5686274509803924

In [334]:
mean.at[4, 'mu']

4.348039215686274

In [378]:
def normalize(df):
    df['rating_norm'] = df[['userId', 'rating']].apply(lambda row : row['rating'] / mean.loc[int(row['userId'])]['mu'], axis=1)

In [343]:
test = ratings.head(n = 30)
test.dtypes

userId       int64
movieId     object
rating     float64
dtype: object

In [350]:
normalize(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [351]:
test

Unnamed: 0,userId,movieId,rating,rating_norm
0,1,31,2.5,0.980392
1,1,1029,3.0,1.176471
2,1,1061,3.0,1.176471
3,1,1129,2.0,0.784314
4,1,1172,4.0,1.568627
5,1,1263,2.0,0.784314
6,1,1287,2.0,0.784314
7,1,1293,2.0,0.784314
8,1,1339,3.5,1.372549
9,1,1343,2.0,0.784314


In [352]:
normalize(ratings)

In [387]:
cor_matrix = np.zeros((nbPers, nbPers))
for u in ratings.userId:
    for v in ratings.userId:
        su = np.array(ratings[ratings['userId'] == u]['rating_norm'])
        sv = np.array(ratings[ratings['userId'] == v]['rating_norm'])
        cor_matrix[u-1, v-1] = cor(su, sv)

ValueError: shapes (20,) and (76,) not aligned: 20 (dim 0) != 76 (dim 0)