In [20]:
import matplotlib
matplotlib.use("Agg")  # OBLIGATOIRE sur Vertex

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Pour test de modèles
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy


In [21]:
# 1. Paths

# Absolute form of the directory the kernel is currently running in.
BASE_DIR = Path().resolve()
# Every figure saved by this notebook goes into a "figures" folder inside that
# directory; it is created on first run and reused afterwards.
FIG_DIR = BASE_DIR.joinpath("figures")
FIG_DIR.mkdir(parents=False, exist_ok=True)

In [4]:
# Load the movie catalogue and the training ratings from the sibling data folder.
# Per the previews printed further down, movies carries movieId, title, genres,
# year and genres_list; ratings carries userId, movieId, rating, timestamp,
# user_idx and movie_idx.
movies = pd.read_csv(filepath_or_buffer="../data/movies.csv")
ratings = pd.read_csv(filepath_or_buffer="../data/train_ratings.csv")

In [22]:
# Preview the first five rows of the ratings table.
print(ratings.head(n=5))

   userId  movieId  rating   timestamp  user_idx  movie_idx
0     143      150     5.0   841221776       142        126
1     575     7947     2.5  1085090436       574       5521
2     296     1370     3.5  1085798578       295       1111
3     426     1301     4.0   973997536       425       1059
4     473      344     2.0   836972430       472        304


In [6]:
# Preview the first five rows of the movies table.
print(movies.head(n=5))

   movieId                             title              genres    year  \
0   126929              Li'l Quinquin (    )  (no genres listed)     NaN   
1   135460                      Pablo (2012)  (no genres listed)  2012.0   
2   138863  The Big Broadcast of 1936 (1935)  (no genres listed)  1935.0   
3   141305       Round Trip to Heaven (1992)  (no genres listed)  1992.0   
4   141472       The 50 Year Argument (2014)  (no genres listed)  2014.0   

              genres_list  
0  ['(no genres listed)']  
1  ['(no genres listed)']  
2  ['(no genres listed)']  
3  ['(no genres listed)']  
4  ['(no genres listed)']  


In [7]:

# 2. Exploratory analysis

# Distinct movies in the catalogue and distinct users in the ratings table.
print(
    f"Nombre de films: {movies['movieId'].nunique()}",
    f"Nombre d'utilisateurs: {ratings['userId'].nunique()}",
    sep="\n",
)

Nombre de films: 10329
Nombre d'utilisateurs: 668


In [9]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() appended a stray "None" line.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84271 entries, 0 to 84270
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     84271 non-null  int64  
 1   movieId    84271 non-null  int64  
 2   rating     84271 non-null  float64
 3   timestamp  84271 non-null  int64  
 4   user_idx   84271 non-null  int64  
 5   movie_idx  84271 non-null  int64  
dtypes: float64(1), int64(5)
memory usage: 3.9 MB
None


In [10]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() appended a stray "None" line.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movieId      10329 non-null  int64  
 1   title        10329 non-null  object 
 2   genres       10329 non-null  object 
 3   year         10326 non-null  float64
 4   genres_list  10329 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 403.6+ KB
None


In [23]:
# Number of ratings given by each user, keyed by user_idx.
user_counts = ratings["rating"].groupby(ratings["user_idx"]).count()
print(user_counts)

user_idx
0        95
1        25
2        59
3       101
4        56
       ... 
663      55
664     190
665     172
666      58
667    4508
Name: rating, Length: 668, dtype: int64


In [24]:
# Number of ratings received by each movie, keyed by movie_idx.
movie_counts = ratings["rating"].groupby(ratings["movie_idx"]).count()
print(movie_counts)

movie_idx
0        188
1         69
2         49
3          8
4         53
        ... 
10320      1
10321      1
10322      1
10323      2
10324      1
Name: rating, Length: 9565, dtype: int64


In [16]:
# List a few of the distinct values found in the "genres" column.

# Each value is the raw pipe-separated string (e.g. "Action|Adventure"), i.e. a
# combination of genres rather than a single genre.
modalités_genre_film = list(movies["genres"].unique())


print("voici les genres possibles pour les films:")

# Slice rather than index over range(10): with fewer than ten distinct values
# the slice simply yields what exists instead of raising IndexError.
for genre_combo in modalités_genre_film[:10]:
    # The explicit "\n" leaves a blank line between entries.
    print(f"{genre_combo}\n")
    
    



voici les genres possibles pour les films:
(no genres listed)

Action

Action|Adventure

Action|Adventure|Animation

Action|Adventure|Animation|Children

Action|Adventure|Animation|Children|Comedy

Action|Adventure|Animation|Children|Comedy|Fantasy

Action|Adventure|Animation|Children|Comedy|IMAX

Action|Adventure|Animation|Children|Comedy|Romance

Action|Adventure|Animation|Children|Comedy|Sci-Fi



In [31]:

# ---- Distribution of the rating values
# Histogram of the raw ratings over 20 bins, written to FIG_DIR and then closed
# so the figure is not kept open.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(ratings["rating"], bins=20, kde=False, ax=ax)
ax.set(title="Distribution des ratings", xlabel="Rating", ylabel="Nombre")
fig.tight_layout()
fig.savefig(FIG_DIR / "distribution_ratings.png")
plt.close(fig)


In [32]:
# ---- Ratings per user and ratings per movie
# Row counts per userId / movieId; both series stay available for later cells.
ratings_per_user = ratings.groupby("userId").size()
ratings_per_movie = ratings.groupby("movieId").size()

# One 50-bin histogram per series: (data, title, y label, output file name).
histogram_specs = (
    (
        ratings_per_user,
        "Nombre de ratings par utilisateur",
        "Nombre d'utilisateurs",
        "ratings_per_user.png",
    ),
    (
        ratings_per_movie,
        "Nombre de ratings par film",
        "Nombre de films",
        "ratings_per_movie.png",
    ),
)

for counts, title, ylabel, filename in histogram_specs:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(counts, bins=50, kde=False, ax=ax)
    ax.set(title=title, xlabel="Nombre de ratings", ylabel=ylabel)
    fig.tight_layout()
    fig.savefig(FIG_DIR / filename)
    # Close each figure once saved so figures do not accumulate.
    plt.close(fig)



In [38]:
# Per-column count of missing values in the ratings table.
print("Missing values (ratings):", ratings.isna().sum(), sep="\n")




Missing values (ratings):
userId       0
movieId      0
rating       0
timestamp    0
user_idx     0
movie_idx    0
dtype: int64


In [39]:
# Per-column count of missing values in the movies table.
# The header now names the frame that is actually inspected; it previously
# read "ratings" although the counts come from movies.
print("Missing values (movies):")
print(movies.isna().sum())

Missing values (ratings):
movieId        0
title          0
genres         0
year           3
genres_list    0
dtype: int64


In [40]:
# Count the movies whose genres field is the "(no genres listed)" placeholder.
nb_no_genres = movies["genres"].eq("(no genres listed)").sum()
print(f"Nombre de films sans genres renseignés : {nb_no_genres}")


Nombre de films sans genres renseignés : 7


In [42]:
# Dimensions of the user x movie interaction matrix, counting only users and
# movies that appear in the ratings table.
n_users, n_movies = (ratings[column].nunique() for column in ("userId", "movieId"))
n_ratings = len(ratings.index)

# Sparsity = share of (user, movie) pairs for which no rating exists.
sparsity = 1 - (n_ratings / (n_users * n_movies))

print(
    f"Users   : {n_users}",
    f"Movies  : {n_movies}",
    f"Ratings : {n_ratings}",
    f"Sparsity: {sparsity:.4%}",
    sep="\n",
)


Users   : 668
Movies  : 9565
Ratings : 84271
Sparsity: 98.6811%


In [43]:
# Popularity of a movie = number of ratings it received.
ratings_per_movie = ratings.groupby("movieId").size()

# 100-bin histogram with both axes on a log scale to expose the long tail.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(ratings_per_movie, bins=100, log_scale=(True, True), ax=ax)
ax.set(
    title="Long tail – popularité des films (log-log)",
    xlabel="Nb ratings par film",
    ylabel="Nb films",
)
fig.tight_layout()
fig.savefig(FIG_DIR / "long_tail_movies.png")
plt.close(fig)


In [44]:
# Activity of a user = number of ratings they gave.
ratings_per_user = ratings.groupby("userId").size()

# 100-bin histogram with both axes on a log scale.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(ratings_per_user, bins=100, log_scale=(True, True), ax=ax)
ax.set(
    title="Activité utilisateurs (log-log)",
    xlabel="Nb ratings par utilisateur",
    ylabel="Nb utilisateurs",
)
fig.tight_layout()
fig.savefig(FIG_DIR / "user_activity.png")
plt.close(fig)


In [45]:
# Per-user mean and standard deviation of the ratings.
# fillna(0) replaces missing statistics with 0 (pandas returns a NaN standard
# deviation for a group that holds a single rating).
user_stats = ratings.groupby("userId")["rating"].agg(["mean", "std"]).fillna(0)

plt.figure(figsize=(6,4))
sns.scatterplot(x=user_stats["mean"], y=user_stats["std"], alpha=0.3)
# The y axis is the standard deviation, so the title says "écart-type" to match
# the axis label; it previously announced "variance", which is not what is plotted.
plt.title("Biais utilisateurs : moyenne vs écart-type")
plt.xlabel("Moyenne des ratings")
plt.ylabel("Écart-type")
plt.tight_layout()
plt.savefig(FIG_DIR / "user_bias.png")
plt.close()


In [46]:
# Per-movie average rating and number of ratings.
movie_stats = ratings.groupby("movieId")["rating"].agg(["mean", "count"])

# Scatter of average rating against popularity, with popularity on a log axis.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x=movie_stats["count"], y=movie_stats["mean"], alpha=0.3, ax=ax)
ax.set_xscale("log")
ax.set(
    title="Popularité vs moyenne des films",
    xlabel="Nb ratings (log)",
    ylabel="Rating moyen",
)
fig.tight_layout()
fig.savefig(FIG_DIR / "movie_bias.png")
plt.close(fig)


In [47]:
# Share of users / movies with fewer than five ratings.
# Relies on ratings_per_user and ratings_per_movie computed in earlier cells.
cold_users = ratings_per_user.lt(5).mean()
cold_movies = ratings_per_movie.lt(5).mean()

print(
    f"Cold users (<5 ratings): {cold_users:.2%}",
    f"Cold movies (<5 ratings): {cold_movies:.2%}",
    sep="\n",
)


Cold users (<5 ratings): 0.00%
Cold movies (<5 ratings): 65.15%


In [48]:
# Number of genres attached to each movie, derived from the pipe-separated
# "genres" string.
# "(no genres listed)" is a placeholder, not a genre: a plain split counts it as
# one genre, so those movies are explicitly set to 0.
has_genres = movies["genres"] != "(no genres listed)"
movies["genres_count"] = movies["genres"].str.split("|").str.len().where(has_genres, 0)

plt.figure(figsize=(6,4))
# The counts are integers: discrete=True draws one bar per integer value, whereas
# a fixed bins=10 could merge two neighbouring counts into a single bar.
sns.histplot(movies["genres_count"], discrete=True)
plt.title("Nombre de genres par film")
plt.xlabel("Nb genres")
plt.ylabel("Nb films")
plt.tight_layout()
plt.savefig(FIG_DIR / "genres_per_movie.png")
plt.close()
