In [None]:
import pandas as pd
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
ratings = pd.read_csv('rating.csv')
movies = pd.read_csv('movie.csv')

In [3]:
df = pd.merge(ratings, movies, on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
 4   title      object 
 5   genres     object 
dtypes: float64(1), int64(2), object(3)
memory usage: 915.5+ MB


In [5]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

## Je vais procéder au système de recommandation en fonction des films (filtrage collaboratif basé sur les items)

In [None]:
# Je crée une table avec les utilisateurs en lignes, les titres en colonnes, et les notes comme valeurs
user_movie_matrix = df.pivot_table(index='userId', columns='title', values='rating')
user_movie_matrix.head()

  user_movie_matrix = df.pivot_table(index='userId', columns='title', values='rating')


title,#chicagoGirl: The Social Network Takes on a Dictator (2013),$ (Dollars) (1971),$5 a Day (2008),$9.99 (2008),$ellebrity (Sellebrity) (2012),'71 (2014),'Hellboy': The Seeds of Creation (2004),"'Human' Factor, The (Human Factor, The) (1975)",'Neath the Arizona Skies (1934),'R Xmas (2001),...,¡Qué hacer! (1970),¡Three Amigos! (1986),À l'aventure (2008),À nos amours (1983),À nous la liberté (Freedom for Us) (1931),À propos de Nice (1930),Árido Movie (2005),Åsa-Nisse - Wälkom to Knohult (2011),Üvegtigris (2001),貞子3D (2012)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [8]:
user_movie_matrix['$ (Dollars) (1971)'].value_counts()

$ (Dollars) (1971)
2.5    5
3.0    5
3.5    4
4.0    3
0.5    3
4.5    2
2.0    2
Name: count, dtype: int64

In [None]:
# Je vais trouver les films qui sont les plus similaires à un film spécifique
# Par exemple, je vais utiliser "Toy Story (1995)" comme film de référence

target_movie = "Toy Story (1995)"

# Je trouve les films qui ont une corrélation de notes avec ce film
similar_movies = user_movie_matrix.corrwith(user_movie_matrix[target_movie])
similar_movies = similar_movies.dropna()  # Enlever les NaN

# Je le convertis en DataFrame
corr_df = pd.DataFrame(similar_movies, columns=['correlation'])

# J'affiche les fils les plus similaires à "Toy Story (1995)"
corr_df.sort_values('correlation', ascending=False).head(10)


  c /= stddev[:, None]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[None, :]


Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
"Human Tornado, The (1976)",1.0
Craig Ferguson: Does This Need to Be Said? (2011),1.0
"Pit, The (1981)",1.0
Big Pun: The Legacy (2008),1.0
Flu Bird (2008),1.0
White God (Fehér isten) (2014),1.0
Dead Silence (1997),1.0
The Boy Who Cried Werewolf (2010),1.0
Now and Forever (1934),1.0
Bad Girl Island (Sirens of Eleuthera) (Sirens of the Caribbean) (2007),1.0


In [23]:
# Je compte le nombre de notes par film
ratings_count = df.groupby('title')['rating'].count()

# Je l'ajoute ça à mon tableau
corr_df['num_ratings'] = ratings_count

# Je garde les films avec au moins 50 notes pour éviter les biais, les films avec peu de notes peuvent avoir des corrélations élevées par hasard
filtered_corr = corr_df[corr_df['num_ratings'] > 50]

# Trier par corrélation
filtered_corr.sort_values('correlation', ascending=False).head(50)


Unnamed: 0_level_0,correlation,num_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1.0,49695
Toy Story 2 (1999),0.739854,22770
Paint It Yellow (Rang De Basanti) (2006),0.687487,55
Tangled Ever After (2012),0.67927,51
Paddington (2014),0.663923,51
Dinner with Friends (2001),0.660082,62
"Lone Ranger, The (1956)",0.632808,74
"Butterfly, The (Papillon, Le) (2002)",0.626741,56
Macao (1952),0.603604,57
George Harrison: Living in the Material World (2011),0.601604,68


Maintenant que j'ai un système de recommandation, je vais le généralisé en fonnction du film dont t'a besoin d'avoir une similarité

In [24]:
def recommend_movies(movie_title, min_ratings=50):
    movie_ratings = user_movie_matrix[movie_title]
    similar = user_movie_matrix.corrwith(movie_ratings).dropna()
    corr_df = pd.DataFrame(similar, columns=['correlation'])
    corr_df['num_ratings'] = df.groupby('title')['rating'].count()
    result = corr_df[corr_df['num_ratings'] > min_ratings].sort_values('correlation', ascending=False)
    return result


In [26]:
df['title']

0                                              Jumanji (1995)
1           City of Lost Children, The (Cité des enfants p...
2                   Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
3                                 Seven (a.k.a. Se7en) (1995)
4                                  Usual Suspects, The (1995)
                                  ...                        
20000258                                            Up (2009)
20000259           Transformers: Revenge of the Fallen (2009)
20000260                Ice Age: Dawn of the Dinosaurs (2009)
20000261                                    District 9 (2009)
20000262        Coco Before Chanel (Coco avant Chanel) (2009)
Name: title, Length: 20000263, dtype: object

In [27]:
me = recommend_movies("Twelve Monkeys (a.k.a. 12 Monkeys) (1995)").head(5)
print(me)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


                                           correlation  num_ratings
title                                                              
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)     1.000000        44980
Paint It Yellow (Rang De Basanti) (2006)      0.830679           55
49th Parallel (1941)                          0.668334           51
Mad Detective (Sun taam) (2007)               0.664658           55
Stars and Bars (1988)                         0.662522           65


Je vais essayer une autre méthode de système de recommandation

## Recommander des films similaires à ceux qu’un utilisateur aime, en se basant sur les genres ou les tags

Étapes du filtrage par contenu (simple avec les genres)

In [None]:
# Séparer les genres
movies['genres'] = movies['genres'].str.split('|')

# Créer un vecteur pour chaque genre (One-hot encoding)
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genres'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=movies['title'])
genre_df.head()


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Je calculer la similarité entre films (cosine similarity)

cosine_sim = cosine_similarity(genre_df)

In [32]:
# Je crée une série d'index des titres
indices = pd.Series(range(len(genre_df)), index=genre_df.index)
indices.head()

title
Toy Story (1995)                      0
Jumanji (1995)                        1
Grumpier Old Men (1995)               2
Waiting to Exhale (1995)              3
Father of the Bride Part II (1995)    4
dtype: int64

In [None]:
# Je recommande des films similaires basés sur les genres. Cette fonction prend un titre de film et retourne les n films les plus similaires en se basant sur la similarité des genres.
# Elle exclut le film lui-même de la liste des recommandations. n est le nombre de recommandations à retourner (par défaut 5).
def recommend_by_genre(title, n=5):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]  
    movie_indices = [i[0] for i in sim_scores]
    return genre_df.iloc[movie_indices].index.tolist()



In [35]:
# Exemple d'utilisation de la fonction
recommend_by_genre("Jumanji (1995)")

['Kids of the Round Table (1995)',
 'Indian in the Cupboard, The (1995)',
 'NeverEnding Story III, The (1994)',
 'Escape to Witch Mountain (1975)',
 "Darby O'Gill and the Little People (1959)"]