In [72]:
import os
import math
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [15]:
data_path = 'data/'

### Dataset Movie Lens

Load dataset from source

In [16]:
users = pd.read_csv(
    data_path + "ml-1m/users.dat",
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    engine="python",
)

ratings = pd.read_csv(
    data_path + "ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    engine="python",
)

movies = pd.read_csv(
    data_path + "ml-1m/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    engine="python",
    encoding="latin-1",
)

In [17]:
mv = movies.copy()
us = users.copy()
rt = ratings.copy()

In [38]:
movies = mv.copy()
users = us.copy()
ratings = rt.copy()


## Preprocess

users:
* change la valeur str de la colonne sex en 1 ou 0 pour que le modèle comprenne la colonne
* drop la colonne zip_code car peu intéressante pour la tâche

movies:
* sectionne la partie année des titres pour créer une nouvelle colonne année
* supprime la partie (année) des titres pour qu'ils soient possible de les comparer avec le dataset imdb

ratings:
* passe la colonne rating de str a float
* drop la colonne "unix_timestamp" car inutile

In [39]:
# users["user_id"] = users["user_id"].apply(lambda x: f"user_{x}")
# users["age_group"] = users["age_group"].apply(lambda x: f"group_{x}")
# users["occupation"] = users["occupation"].apply(lambda x: f"occupation_{x}")

users['sex'] = users['sex'].apply(lambda x: 1 if(x == 'M') else 0)
users.drop(columns="zip_code", inplace=True)

# movies["movie_id"] = movies["movie_id"].apply(lambda x: f"movie_{x}")
movies["year"] = movies["title"].apply(lambda x : int(x[-5:-1]))
movies["title"] = movies["title"].apply(lambda x : x[:-7].lower())

# ratings["movie_id"] = ratings["movie_id"].apply(lambda x: f"movie_{x}")
# ratings["user_id"] = ratings["user_id"].apply(lambda x: f"user_{x}")
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

ratings.drop(columns="unix_timestamp", inplace=True)


Certains titres sont sous la forme: "aaaa, the" au lieu de "the aaaa". Je modifie donc la colonne title avec une regex, du pattern matching avec groupe, pour résoudre ce problème.
Après résolution du problème les titres pourront être comparé avec ceux de imdb.

In [43]:
pattern = re.compile(r'^(.*), (the|a|an)$', re.IGNORECASE)
def rearrange_title(title):
    match = pattern.match(title)
    if match:
        return f"{match.group(2)} {match.group(1)}"
    return title

movies['title'] = movies['title'].apply(rearrange_title)

Change le genre Children's avec Family pour correspondre au même genre que l'autre jeu de donnée (imdb)

In [42]:
movies['genres'] = movies['genres'].str.replace("Children's", "Family")

In [46]:
### pas besoin y en a pas
movies.drop_duplicates(inplace=True)

### dataset imdb

Data Loading

In [25]:
# name_basics = pd.read_csv(
#     data_path + r"imdb/name.basics.tsv",
#     sep='\t',
#     engine="python",
#     # low_memory=False
# )

title_basics = pd.read_csv(
    data_path + r"imdb/title.basics.tsv",
    sep='\t',
    # engine="python",
    low_memory=False
)

title_ratings = pd.read_csv(
    data_path + r"imdb/title.ratings.tsv",
    sep='\t',
    # engine="python",
    low_memory=False
)


In [26]:
unique_type = title_basics['titleType'].unique()
print("Valeurs différentes de la colonne 'title Type':", unique_type)

Valeurs différentes de la colonne 'title Type': ['short' 'movie' 'tvShort' 'tvMovie' 'tvSeries' 'tvEpisode' 'tvMiniSeries'
 'tvSpecial' 'video' 'videoGame' 'tvPilot']


In [142]:
movietype = ["movie", "tvMovie"]
title_preprocess = title_basics[title_basics["titleType"].isin(movietype)].copy()
title_preprocess['titleType'].unique()

array(['movie', 'tvMovie'], dtype=object)

In [114]:
print(f'before : {len(title_basics)} after: {len(title_preprocess)}')

before : 10906503 after: 833594


In [143]:
### on drope endYear car la colonne ne fait sens que pour les TV series
title_preprocess.drop(columns="endYear", inplace=True)

Toutes les colonnes sont de type object, to_numeric c'est quand des valeurs sont a Nan

In [148]:
title_preprocess["primaryTitle"] = title_preprocess["primaryTitle"].apply(str.lower)
title_preprocess["originalTitle"] = title_preprocess["originalTitle"].apply(str.lower)
title_preprocess["isAdult"] = title_preprocess["isAdult"].apply(int)
title_preprocess['startYear'] = pd.to_numeric(title_preprocess['startYear'], errors="coerce")
title_preprocess['runtimeMinutes'] = pd.to_numeric(title_preprocess['runtimeMinutes'], errors="coerce")

title_ratings["averageRating"] = title_ratings['averageRating'].apply(float)
title_ratings["numVotes"] = title_ratings['numVotes'].apply(int)

title_preprocess.replace(r'\N', np.nan, inplace=True)
# title_preprocess.drop(columns='tconst', inplace=True)

merge les descriptions des films avec leur rating celon tconst, leur id

In [151]:
title_merged = pd.merge(title_preprocess, title_ratings, on="tconst", how="left")
title_merged.drop(columns="tconst", inplace=True)
title_merged.head()

Unnamed: 0,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes
0,movie,miss jerry,miss jerry,0,1894.0,45.0,Romance,5.4,212.0
1,movie,the corbett-fitzsimmons fight,the corbett-fitzsimmons fight,0,1897.0,100.0,"Documentary,News,Sport",5.2,518.0
2,movie,bohemios,bohemios,0,1905.0,100.0,,4.4,17.0
3,movie,the story of the kelly gang,the story of the kelly gang,0,1906.0,70.0,"Action,Adventure,Biography",6.0,914.0
4,movie,the prodigal son,l'enfant prodigue,0,1907.0,90.0,Drama,5.4,27.0


On récupère tous ceux qui

In [194]:
merged_df1 = pd.merge(movies, title_merged, left_on=["title", "year"], right_on=["primaryTitle", "startYear"], how="left")
merged_df2 = pd.merge(movies, title_merged, left_on=["title", "year"], right_on=["originalTitle", "startYear"], how="inner")
final_df = pd.concat([merged_df2, merged_df1], ignore_index=True).drop_duplicates()
# merged_df3 = pd.merge(movies, final_df, on='movie_id', how="outer")
# final_df = pd.concat([final_df, merged_df3], ignore_index=True).drop_duplicates()

j'ai deux cas soit title != primarytitle et j'obtiens un doublon a cause de left (je suis obligé sinon je perd des films), il y a deux films avec le meme titre et je prend le plus connue => qui est le plus plosible d'etre dans movielens.

In [197]:
def custom_rule(group):
    return group.sort_values(by="numVotes", ascending=False).iloc[0]

final_df = final_df.groupby("movie_id").apply(custom_rule).reset_index(drop=True)

merge les colonnes de genres de imdb et movie lens

In [199]:
def merge_genres(row):
    genres1 = row['genres_x'].split('|') if pd.notna(row['genres_x']) else []
    genres2 = row['genres_y'].split(',') if pd.notna(row['genres_y']) else []
    merged_genres = list(set(genres1 + genres2))  # Fusionner et supprimer les doublons
    return '|'.join(merged_genres)  # Rejoindre avec le délimiteur '|'

# Appliquer la fonction à chaque ligne du DataFrame
final_df['genres'] = final_df.apply(merge_genres, axis=1)

drop toutes les colonnes inutiles pour le model

In [201]:
final_df.drop(columns=["primaryTitle", "originalTitle", "genres_x", "genres_y", "startYear"], inplace=True)

ceux qui n'avaient pas de type dans imdb sont forcément des films vu qu'ils sont dans movie lens

In [202]:
final_df["titleType"] = final_df["titleType"].fillna("movie")

In [206]:
final_df.to_csv("dataset.csv", index=False)

In [4]:
df = pd.read_csv("dataset.csv")

In [5]:
df.drop(columns="startYear", inplace=True)

In [6]:
genres = set("|".join(df['genres'].unique()).split('|'))
for genre in genres:
    df[genre] = df['genres'].apply(lambda x: 1 if genre in x.split('|') else 0)
df["titleType"] = df["titleType"].apply(lambda x: 1 if x == "movie" else 0)

In [12]:
df.head()

Unnamed: 0,movie_id,title,year,titleType,isAdult,runtimeMinutes,averageRating,numVotes,genres,Thriller,...,Biography,Drama,Horror,Sport,War,Action,Animation,Western,Documentary,Crime
0,1,toy story,1995,1,0.0,81.0,8.3,1076189.0,Comedy|Family|Animation|Adventure,0,...,0,0,0,0,0,0,1,0,0,0
1,2,jumanji,1995,1,0.0,104.0,7.1,380088.0,Fantasy|Family|Adventure|Comedy,0,...,0,0,0,0,0,0,0,0,0,0
2,3,grumpier old men,1995,1,0.0,101.0,6.6,29874.0,Comedy|Romance,0,...,0,0,0,0,0,0,0,0,0,0
3,4,waiting to exhale,1995,1,0.0,124.0,6.0,12310.0,Comedy|Romance|Drama,0,...,0,1,0,0,0,0,0,0,0,0
4,5,father of the bride part ii,1995,1,0.0,106.0,6.1,41943.0,Comedy|Romance|Family,0,...,0,0,0,0,0,0,0,0,0,0


On remplit les averageRating avec ceux qu'on a dans MovieLens

In [28]:
df[df["averageRating"].isna()]

Unnamed: 0,movie_id,title,year,titleType,isAdult,runtimeMinutes,averageRating,numVotes,genres,Thriller,...,Biography,Drama,Horror,Sport,War,Action,Animation,Western,Documentary,Crime
281,284,new york cop,1996,1,,,,0.0,Action|Crime,0,...,0,0,0,0,0,1,0,0,0,1
282,285,beyond bedlam,1993,1,,,,0.0,Horror|Drama,0,...,0,1,1,0,0,0,0,0,0,0
391,395,desert winds,1995,1,,,,0.0,Drama,0,...,0,1,0,0,0,0,0,0,0,0
399,403,two crimes,1995,1,,,,0.0,Comedy|Crime|Drama,0,...,0,1,0,0,0,0,0,0,0,1
600,604,criminals,1996,1,,,,0.0,Documentary,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3520,3589,"kill, baby... kill! (operazione paura)",1966,1,,,,0.0,Horror,0,...,0,0,1,0,0,0,0,0,0,0
3561,3630,"house of exorcism, the (la casa dell'esorcismo)",1974,1,,,,0.0,Horror,0,...,0,0,1,0,0,0,0,0,0,0
3581,3650,anguish (angustia),1986,1,,,,0.0,Horror,0,...,0,0,1,0,0,0,0,0,0,0
3786,3856,autumn heart,1999,1,,,,0.0,Drama,0,...,0,1,0,0,0,0,0,0,0,0


In [26]:
for id in df[df["averageRating"].isna()]["movie_id"].unique():
    r_list = ratings[ratings["movie_id"] == id]
    if len(r_list) == 0:
        df.loc[df["movie_id"] == id, ["numVotes"]] = [0]
    else:
        df.loc[df["movie_id"] == id, ["averageRating","numVotes"]] = [r_list["rating"].values.mean(), len(r_list)]

In [27]:
df[df["averageRating"].isna()]

Unnamed: 0,movie_id,title,year,titleType,isAdult,runtimeMinutes,averageRating,numVotes,genres,Thriller,...,Biography,Drama,Horror,Sport,War,Action,Animation,Western,Documentary,Crime
281,284,new york cop,1996,1,,,,0.0,Action|Crime,0,...,0,0,0,0,0,1,0,0,0,1
282,285,beyond bedlam,1993,1,,,,0.0,Horror|Drama,0,...,0,1,1,0,0,0,0,0,0,0
391,395,desert winds,1995,1,,,,0.0,Drama,0,...,0,1,0,0,0,0,0,0,0,0
399,403,two crimes,1995,1,,,,0.0,Comedy|Crime|Drama,0,...,0,1,0,0,0,0,0,0,0,1
600,604,criminals,1996,1,,,,0.0,Documentary,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3520,3589,"kill, baby... kill! (operazione paura)",1966,1,,,,0.0,Horror,0,...,0,0,1,0,0,0,0,0,0,0
3561,3630,"house of exorcism, the (la casa dell'esorcismo)",1974,1,,,,0.0,Horror,0,...,0,0,1,0,0,0,0,0,0,0
3581,3650,anguish (angustia),1986,1,,,,0.0,Horror,0,...,0,0,1,0,0,0,0,0,0,0
3786,3856,autumn heart,1999,1,,,,0.0,Drama,0,...,0,1,0,0,0,0,0,0,0,0


In [33]:
Y = ratings.pivot(index='user_id', columns='movie_id', values='rating')
Y = Y.fillna(0)
Y

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
users.head()

Unnamed: 0,user_id,sex,age_group,occupation
0,1,0,1,10
1,2,1,56,16
2,3,1,25,15
3,4,1,45,7
4,5,1,25,20


In [40]:
scaler = StandardScaler()

users.set_index('user_id', inplace=True)
users_norm = scaler.fit_transform(users)
users_norm

array([[-1.59192668, -2.29852514,  0.29280287],
       [ 0.62816964,  1.96672895,  1.2408218 ],
       [ 0.62816964, -0.43732336,  1.08281865],
       ...,
       [-1.59192668,  1.96672895, -1.12922554],
       [-1.59192668,  1.11367813, -1.28722869],
       [ 0.62816964, -0.43732336, -0.33920976]])

In [48]:
data = df.set_index("movie_id").copy()
data_norm = scaler.fit_transform(data.drop(columns=["title", "genres"]))
len(data_norm[0])

28

In [51]:
def getR(Y):
    R = Y.copy()
    R[R != 0] = 1
    return R

In [58]:

Yarray = np.array(Y)

In [61]:
Y_train, Y_test = split_ratings(Yarray, val_size=0.2) 

R_train = getR(Y_train)
R_test = getR(Y_test)

In [83]:
Y.T

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
scalerY = MinMaxScaler((-1,1))
y_train = scalerY.fit_transform(Y)
y_train, y_test = train_test_split(y_train, test_size=0.2, random_state=42)

user_train, user_test = train_test_split(users, test_size=0.2, random_state=42)

item_train, item_test = train_test_split(data_norm, test_size=0.2, random_state=42)

In [86]:
user_test

Unnamed: 0_level_0,sex,age_group,occupation
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5530,0,18,4
711,1,25,18
4924,0,35,0
2154,1,25,12
1273,1,35,2
...,...,...,...
1216,1,25,2
2081,1,18,17
5674,1,35,7
3936,0,35,12


In [None]:
num_user_features = 3
num_item_features = 28

In [79]:
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential(
    [
        ### START CODE HERE ###
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(num_outputs),
        ### END CODE HERE ###
    ]
)

item_NN = tf.keras.models.Sequential(
    [
        ### START CODE HERE ###
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(num_outputs),
        ### END CODE HERE ###
    ]
)

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

NameError: name 'num_user_features' is not defined

In [77]:
y_train.reshape(-1,1).shape

(17907392, 1)

In [87]:
import pandas as pd

# Exemple de DataFrame des utilisateurs (users_df)
users_data = {
    'user_id': [1, 2, 3]
}

users_df = pd.DataFrame(users_data)

# Exemple de DataFrame des évaluations (ratings_df)
ratings_data = {
    'user_id': [1, 1, 2, 2, 3],
    'movie_id': [101, 102, 101, 103, 102],
    'rating': [4.0, 5.0, 3.0, 2.0, 4.5]
}

ratings_df = pd.DataFrame(ratings_data)

# Exemple de DataFrame des genres de films (movies_df)
movies_data = {
    'movie_id': [101, 102, 103, 104],
    'genres': ["Animation|Family|Comedy", "Drama|Romance", "Action|Thriller", "Comedy|Drama"]
}

movies_df = pd.DataFrame(movies_data)


In [90]:
# usercopy = users.copy()
users = usercopy.copy()

In [91]:
genres = set("|".join(df['genres'].unique()).split('|'))
for genre in genres:
    users[genre] = 0
# Ajouter la colonne 'nrating'
users['nrating'] = 0


In [106]:
all_genres

['Drama']

In [98]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


In [107]:
# Fusionner les évaluations avec les genres
all_genres = set("|".join(df['genres'].unique()).split('|'))
merged_df = ratings.merge(df, on='movie_id', how="left")
users_df = users.copy().reset_index()

# Initialiser un dictionnaire pour stocker les sommes et les comptes des genres
genre_sum_count = {genre: {'sum': {}, 'count': {}} for genre in all_genres}

# Parcourir les lignes de merged_df pour calculer les sommes et les comptes
for _, row in merged_df.iterrows():
    user_id = row['user_id']
    rating = row['rating']
    genres = row['genres'].split('|')
    
    users_df.loc[users_df['user_id'] == user_id, 'nrating'] += 1
    
    for genre in genres:
        if user_id in genre_sum_count[genre]['sum']:
            genre_sum_count[genre]['sum'][user_id] += rating
            genre_sum_count[genre]['count'][user_id] += 1
        else:
            genre_sum_count[genre]['sum'][user_id] = rating
            genre_sum_count[genre]['count'][user_id] = 1

# Calculer les moyennes pour chaque genre et mettre à jour users_df
for genre in all_genres:
    for user_id in genre_sum_count[genre]['sum']:
        users_df.loc[users_df['user_id'] == user_id, genre] = (
            genre_sum_count[genre]['sum'][user_id] / genre_sum_count[genre]['count'][user_id]
        )

# Afficher le DataFrame résultant
print(users_df)
