# Construction d'un système de recommandation

Nous avons décidé d'orienter notre projet sur la recommendation de films.
En effet durant ce confinement, nous avons eu le temps de visionner beaucoup de films,
mais nous nous sommes rendus compte que nous passions quasiment autant de temps
à choisir le film qu'à le regarder. D'où la nécessité de créer un système de re-
commendations afin d'optimiser notre temps de visionnage.
Nous avons chercher une base de données assez exploitable afin de mener à bien
notre projet. Nous nous sommes basés sur la base de données de 'The Movies Dataset'.


In [1]:
import numpy as np
import pandas as pd
import math
import re

## Fetching data

In [2]:
movies = pd.read_csv("movies_metadata.csv")
movies.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
movies.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [4]:
def filter_correct_id(word):
    if re.fullmatch(r'[0-9]+', word):
        return word
    return "wrong_id"

In [5]:
# don't re-run
movies = movies[~movies.id.duplicated()]
movies.id = movies.id.apply(filter_correct_id)
movies = movies[movies.id != "wrong_id"]
movies.id = movies.id.astype('int64')

In [6]:
movies.id

0           862
1          8844
2         15602
3         31357
4         11862
5           949
6         11860
7         45325
8          9091
9           710
10         9087
11        12110
12        21032
13        10858
14         1408
15          524
16         4584
17            5
18         9273
19        11517
20         8012
21         1710
22         9691
23        12665
24          451
25        16420
26         9263
27        17015
28          902
29        37557
          ...  
45436     45527
45437    455661
45438    327237
45439     84710
45440     39562
45441     14008
45442     44330
45443     49279
45444     44333
45445     49277
45446     49271
45447     44324
45448    122036
45449     14885
45450     49280
45451    106807
45452    276895
45453    404604
45454    420346
45455     67179
45456     84419
45457    390959
45458    289923
45459    222848
45460     30840
45461    439050
45462    111109
45463     67758
45464    227506
45465    461257
Name: id, Length: 45433,

In [7]:
ratings = pd.read_csv("ratings_small.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [8]:
# ne pas re-run !
ratings = ratings.drop(columns=['timestamp'])
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [9]:
ratings[(ratings['userId'] == 1) & (ratings['movieId'] == 31)]

Unnamed: 0,userId,movieId,rating
0,1,31,2.5


In [10]:
print(min(ratings.rating), max(ratings.rating))
ratings.describe()
ratings.dtypes

0.5 5.0


userId       int64
movieId      int64
rating     float64
dtype: object

In [11]:
movies_cleaned = movies[['id', 'title']]
movies_cleaned = movies_cleaned.loc[movies_cleaned['id'].isin(ratings.movieId)]

In [12]:
print(movies_cleaned.shape)
movies_cleaned.head()
movies = movies_cleaned

(2830, 2)


In [13]:
nbPers = len(ratings.userId.unique())
nbMovi = len(ratings.movieId.unique())

## User Based

In [14]:
def cor(su, sv):
    return np.dot(su, sv) / math.sqrt(np.dot(su, su) * np.dot(sv, sv))

In [15]:
def mean_rating(uid):
    n = ratings.loc[ratings['userId'] == uid].count().loc['userId']
    s = ratings.loc[ratings['userId'] == uid].sum().loc['rating']
    return s / n

In [94]:
def normalize(df):
    mean = ratings.loc[:, ['userId']].drop_duplicates()
    mean['mu'] = mean['userId'].map(lambda uid : mean_rating(uid))
    mean = mean.set_index('userId')
    df['rating_norm'] = df[['userId', 'rating']].apply(lambda row : row['rating'] -  mean.loc[int(row['userId'])]['mu'], axis=1)

In [96]:
normalize(ratings)

In [105]:
def corelation_matrix():
    cor_matrix = np.zeros((nbPers, nbPers))
    for u in ratings.userId:
        for v in ratings.userId:
            Iu = ratings.loc[ratings['userId'] == u, ['movieId']]
            Iv = ratings.loc[ratings['userId'] == v, ['movieId']]
            Iuv = Iu.join(Iv.set_index('movieId'), on='movieId', how='inner')
            su = ratings.loc[(ratings['userId'] == u) & (ratings['movieId'].isin(Iuv['movieId'])), ['rating_norm']]
            sv = ratings.loc[(ratings['userId'] == v) & (ratings['movieId'].isin(Iuv['movieId'])), ['rating_norm']]
            su = np.array(su.rating_norm)
            sv = np.array(sv.rating_norm)
            if Iuv.size :
                cor_matrix[u-1, v-1] = cor(su, sv)
    return cor_matrix

In [None]:
cm = corelation_matrix()

  


In [None]:
print(cm)