In [2]:
import os
import math
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

In [3]:
data_path = 'data/'

### Dataset Movie Lens

Load dataset from source

In [4]:
users = pd.read_csv(
    data_path + "ml-1m/users.dat",
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    engine="python",
)

ratings = pd.read_csv(
    data_path + "ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    engine="python",
)

movies = pd.read_csv(
    data_path + "ml-1m/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    engine="python",
    encoding="latin-1",
)

In [5]:
mv = movies.copy()
us = users.copy()
rt = ratings.copy()

In [6]:
movies = mv.copy()
users = us.copy()
ratings = rt.copy()


## Preprocess

users:
* change la valeur str de la colonne sex en 1 ou 0 pour que le modèle comprenne la colonne
* drop la colonne zip_code car peu intéressante pour la tâche

movies:
* sectionne la partie année des titres pour créer une nouvelle colonne année
* supprime la partie (année) des titres pour qu'ils soient possible de les comparer avec le dataset imdb

ratings:
* passe la colonne rating de str a float
* drop la colonne "unix_timestamp" car inutile

In [7]:
# users["user_id"] = users["user_id"].apply(lambda x: f"user_{x}")
# users["age_group"] = users["age_group"].apply(lambda x: f"group_{x}")
# users["occupation"] = users["occupation"].apply(lambda x: f"occupation_{x}")

users['sex'] = users['sex'].apply(lambda x: 1 if(x == 'M') else 0)
users.drop(columns="zip_code", inplace=True)

# movies["movie_id"] = movies["movie_id"].apply(lambda x: f"movie_{x}")
movies["year"] = movies["title"].apply(lambda x : int(x[-5:-1]))
movies["title"] = movies["title"].apply(lambda x : x[:-7].lower())

# ratings["movie_id"] = ratings["movie_id"].apply(lambda x: f"movie_{x}")
# ratings["user_id"] = ratings["user_id"].apply(lambda x: f"user_{x}")
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

ratings.drop(columns="unix_timestamp", inplace=True)


Certains titres sont sous la forme: "aaaa, the" au lieu de "the aaaa". Je modifie donc la colonne title avec une regex, du pattern matching avec groupe, pour résoudre ce problème.
Après résolution du problème les titres pourront être comparé avec ceux de imdb.

In [8]:
pattern = re.compile(r'^(.*), (the|a|an)$', re.IGNORECASE)
def rearrange_title(title):
    match = pattern.match(title)
    if match:
        return f"{match.group(2)} {match.group(1)}"
    return title

movies['title'] = movies['title'].apply(rearrange_title)

Change le genre Children's avec Family pour correspondre au même genre que l'autre jeu de donnée (imdb)

In [9]:
movies['genres'] = movies['genres'].str.replace("Children's", "Family")

In [10]:
### pas besoin y en a pas
movies.drop_duplicates(inplace=True)

### dataset imdb

Data Loading

In [11]:
# name_basics = pd.read_csv(
#     data_path + r"imdb/name.basics.tsv",
#     sep='\t',
#     engine="python",
#     # low_memory=False
# )

title_basics = pd.read_csv(
    data_path + r"imdb/title.basics.tsv",
    sep='\t',
    # engine="python",
    low_memory=False
)

title_ratings = pd.read_csv(
    data_path + r"imdb/title.ratings.tsv",
    sep='\t',
    # engine="python",
    low_memory=False
)


je sélectionne que les critiques de films ou de tvmovie car movielens ne contient que ce genre d'oeuvre

In [12]:
movietype = ["movie", "tvMovie"]
title_preprocess = title_basics[title_basics["titleType"].isin(movietype)].copy()
title_preprocess['titleType'].unique()

array(['movie', 'tvMovie'], dtype=object)

on drope endYear car la colonne ne fait sens que pour les TV series, et on drope isAdult car il n'y a que la valeur 0 pour les films qui sont aussi dans movielens donc la colonne est inutile.

In [13]:
title_preprocess.drop(columns=["endYear", "isAdult"], inplace=True)

Toutes les colonnes sont de type object, to_numeric c'est quand des valeurs sont a Nan

In [14]:
title_preprocess["primaryTitle"] = title_preprocess["primaryTitle"].apply(str.lower)
title_preprocess["originalTitle"] = title_preprocess["originalTitle"].apply(str.lower)
title_preprocess['startYear'] = pd.to_numeric(title_preprocess['startYear'], errors="coerce")
title_preprocess['runtimeMinutes'] = pd.to_numeric(title_preprocess['runtimeMinutes'], errors="coerce")

title_ratings["averageRating"] = title_ratings['averageRating'].apply(float)
title_ratings["numVotes"] = title_ratings['numVotes'].apply(int)

title_preprocess.replace(r'\N', np.nan, inplace=True)
# title_preprocess.drop(columns='tconst', inplace=True)

merge les descriptions des films avec leur rating celon tconst, leur id

In [15]:
title_merged = pd.merge(title_preprocess, title_ratings, on="tconst", how="left")
title_merged.drop(columns="tconst", inplace=True)
title_merged.head()

Unnamed: 0,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes
0,movie,miss jerry,miss jerry,1894.0,45.0,Romance,5.4,212.0
1,movie,the corbett-fitzsimmons fight,the corbett-fitzsimmons fight,1897.0,100.0,"Documentary,News,Sport",5.2,518.0
2,movie,bohemios,bohemios,1905.0,100.0,,4.4,17.0
3,movie,the story of the kelly gang,the story of the kelly gang,1906.0,70.0,"Action,Adventure,Biography",6.0,914.0
4,movie,the prodigal son,l'enfant prodigue,1907.0,90.0,Drama,5.4,27.0


on merge les caractéristiques des films de imdb et movielens, selont les titres et l'année de parutions.

In [16]:
merged_df1 = pd.merge(movies, title_merged, left_on=["title", "year"], right_on=["primaryTitle", "startYear"], how="left")
merged_df2 = pd.merge(movies, title_merged, left_on=["title", "year"], right_on=["originalTitle", "startYear"], how="inner")
final_df = pd.concat([merged_df2, merged_df1], ignore_index=True).drop_duplicates()
# merged_df3 = pd.merge(movies, final_df, on='movie_id', how="outer")
# final_df = pd.concat([final_df, merged_df3], ignore_index=True).drop_duplicates()

j'ai deux cas soit title != primarytitle et j'obtiens un doublon a cause de left (je suis obligé sinon je perd des films), il y a deux films avec le meme titre et je prend le plus connue => qui est le plus plosible d'etre dans movielens.

In [17]:
def custom_rule(group):
    return group.sort_values(by="numVotes", ascending=False).iloc[0]

final_df = final_df.groupby("movie_id").apply(custom_rule).reset_index(drop=True)

merge les colonnes de genres de imdb et movie lens

In [18]:
def merge_genres(row):
    genres1 = row['genres_x'].split('|') if pd.notna(row['genres_x']) else []
    genres2 = row['genres_y'].split(',') if pd.notna(row['genres_y']) else []
    merged_genres = list(set(genres1 + genres2))  # Fusionner et supprimer les doublons
    return '|'.join(merged_genres)  # Rejoindre avec le délimiteur '|'

# Appliquer la fonction à chaque ligne du DataFrame
final_df['genres'] = final_df.apply(merge_genres, axis=1)

drop toutes les colonnes inutiles pour le model

In [19]:
final_df.drop(columns=["primaryTitle", "originalTitle", "genres_x", "genres_y", "startYear"], inplace=True)

ceux qui n'avaient pas de type dans imdb sont forcément des films vu qu'ils sont dans movie lens

In [20]:
final_df["titleType"] = final_df["titleType"].fillna("movie")

In [21]:
genres = set("|".join(final_df['genres'].unique()).split('|'))
for genre in genres:
    final_df[genre] = final_df['genres'].apply(lambda x: 1 if genre in x.split('|') else 0)
final_df["titleType"] = final_df["titleType"].apply(lambda x: 1 if x == "movie" else 0)

On remplit les averageRating avec ceux qu'on a dans MovieLens

In [22]:
for id in final_df[final_df["averageRating"].isna()]["movie_id"].unique():
    r_list = ratings[ratings["movie_id"] == id]
    if len(r_list) == 0:
        final_df.loc[final_df["movie_id"] == id, ["numVotes"]] = [0]
    else:
        final_df.loc[final_df["movie_id"] == id, ["averageRating","numVotes"]] = [r_list["rating"].values.mean(), len(r_list)]

In [23]:
final_df.to_csv("dataset.csv", index=False)

In [24]:
df = pd.read_csv("dataset.csv")

In [25]:
# df.drop(columns="startYear", inplace=True)
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0
...,...,...,...
1000204,6040,1091,1.0
1000205,6040,1094,5.0
1000206,6040,562,5.0
1000207,6040,1096,4.0


In [26]:
Y = ratings.pivot(index='movie_id', columns='user_id', values='rating')
Y = Y.fillna(0)
Y

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Crée un dataframe users avec la note moyenne pour chaque genre de film, 0 si l'utilisateur n'a jamais noté le genre.

In [27]:


genres = set("|".join(df['genres'].unique()).split('|'))
# Ajoute les colonne genres au df users
for genre in genres:
    users[genre] = 0
# Ajouter la colonne 'nrating'
users['nrating'] = 0


In [29]:
# Fusionner les évaluations avec les genres
all_genres = set("|".join(df['genres'].unique()).split('|'))
merged_df = ratings.merge(df, on='movie_id', how="left")
users_df = users.copy().reset_index()

# Initialiser un dictionnaire pour stocker les sommes et les comptes des genres
genre_sum_count = {genre: {'sum': {}, 'count': {}} for genre in all_genres}

# Parcourir les lignes de merged_df pour calculer les sommes et les comptes
for _, row in merged_df.iterrows():
    user_id = row['user_id']
    rating = row['rating']
    genres = row['genres'].split('|')
    
    users_df.loc[users_df['user_id'] == user_id, 'nrating'] += 1
    
    for genre in genres:
        if user_id in genre_sum_count[genre]['sum']:
            genre_sum_count[genre]['sum'][user_id] += rating
            genre_sum_count[genre]['count'][user_id] += 1
        else:
            genre_sum_count[genre]['sum'][user_id] = rating
            genre_sum_count[genre]['count'][user_id] = 1

# Calculer les moyennes pour chaque genre et mettre à jour users_df
for genre in all_genres:
    for user_id in genre_sum_count[genre]['sum']:
        users_df.loc[users_df['user_id'] == user_id, genre] = (
            genre_sum_count[genre]['sum'][user_id] / genre_sum_count[genre]['count'][user_id]
        )

users_df.head()


Unnamed: 0,index,user_id,sex,age_group,occupation,Documentary,Romance,Family,Drama,Music,...,History,Biography,Thriller,War,Action,Sci-Fi,Adventure,Film-Noir,Mystery,nrating
0,0,1,0,1,10,0.0,3.714286,4.2,4.290323,5.0,...,5.0,4.6,3.666667,5.0,4.166667,4.333333,4.2,0.0,4.0,53
1,1,2,1,56,16,0.0,3.774194,5.0,3.817204,4.25,...,3.666667,4.0,3.511628,3.75,3.483871,3.588235,3.677419,4.0,3.125,129
2,2,3,1,25,15,0.0,3.833333,4.25,3.894737,0.0,...,0.0,5.0,3.714286,4.0,4.0,4.0,3.965517,0.0,3.0,51
3,3,4,1,45,7,0.0,4.0,4.0,4.25,0.0,...,0.0,0.0,4.0,3.333333,4.157895,3.555556,4.0,0.0,5.0,21
4,4,5,1,25,20,3.666667,3.142857,3.833333,3.090909,2.9,...,3.285714,3.285714,2.782609,3.142857,2.710526,3.176471,3.2,4.0,3.111111,198


In [55]:
list_user = np.array(users_df.sort_values("nrating", ascending=False)[:200]["user_id"].copy())

In [73]:
coupled = []
couplelist = []
i = 0
for user in list_user:
    i += 1
    if user in coupled:
        continue
    movie_rated1 = set(ratings[ratings["user_id"] == user]["movie_id"])
    max_nb_movies_commun = 0
    for j in range(i, len(list_user)):
        if(list_user[j] in coupled):
            continue
        movie_rated2 = set(ratings[ratings["user_id"] == list_user[j]]["movie_id"])
        nb_movies_commun = len(movie_rated1.intersection(movie_rated2))
        if nb_movies_commun > max_nb_movies_commun:
            user_couple = j
            nb_movies_commun_couple = nb_movies_commun
    if max_nb_movies_commun > 100:
        couplelist.append((user, user_couple))
        coupled.append(user)
        coupled.append(user_couple) 

In [75]:
# len(couplelist)
print(couplelist)

[(4169, 2), (1680, 3), (4277, 4), (1941, 4), (1181, 6), (889, 9), (3618, 15), (2063, 10), (1150, 9), (1015, 12), (5795, 13), (4344, 21), (1980, 18), (2909, 15), (1449, 16), (4510, 19), (424, 21), (4227, 24), (5831, 21), (3841, 24), (3391, 23), (4508, 23), (1088, 30), (5367, 25), (3808, 25), (549, 26), (1285, 29), (3224, 30), (3539, 32), (4543, 31), (5643, 53), (1448, 32), (752, 39), (3032, 38), (3824, 36), (524, 39), (4448, 49), (4064, 45), (4725, 52), (1010, 68), (5954, 43), (678, 49), (1447, 61), (4447, 49), (2116, 71), (550, 49), (1605, 77), (3526, 57), (1698, 55), (1880, 73), (1912, 55), (3778, 57), (4808, 57), (3292, 60), (4425, 73), (1019, 76), (3507, 67), (3311, 89), (2181, 108), (881, 149), (6016, 105), (2015, 69), (2106, 77), (2820, 93), (5812, 92), (4647, 76), (6036, 93), (710, 68), (4312, 82), (1647, 78), (4979, 97), (2777, 84), (5026, 95), (4386, 107), (2304, 110), (352, 102), (5046, 88), (531, 84), (1676, 90), (1051, 84), (2507, 81), (3163, 138), (3626, 89), (5100, 137), (

In [63]:
ratings[ratings["user_id"] == list_user[0]]["movie_id"]

695642    3789
695643     571
695644     574
695645     575
695646     577
          ... 
697951    3784
697952    3785
697953    2047
697954    3788
697955    2049
Name: movie_id, Length: 2314, dtype: int64

# Méthode 1 



In [30]:
from typing import Tuple, Union
import random

def split_ratings(ratings: np.ndarray, val_size: Union[float, int]):
    rows, cols = np.where(ratings > 0)
    val_ratings = np.zeros(ratings.shape)
    train_ratings = ratings.copy()

    if val_size < 1:  # Hence it is a percentage
        val_size = int(len(rows) * val_size)

    val_ids = random.sample(range(len(rows)), val_size)

    for row, col in zip(rows[val_ids], cols[val_ids]):
        train_ratings[row, col] = 0
        val_ratings[row, col] = ratings[row, col]

    return train_ratings, val_ratings

In [31]:
movies = df.drop(columns=["title", "genres"]).copy()
Y1 = ratings.pivot(index='movie_id', columns='user_id', values='rating').fillna(0)
Y2 = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
movies_c = movies[movies['movie_id'].isin(Y1.index)].copy()
Y1 = np.array(Y1)
Y2 = np.array(Y2)

In [32]:
meantimes = movies_c['runtimeMinutes'].mean()
movies_c['runtimeMinutes'] = movies_c['runtimeMinutes'].fillna(meantimes)

In [None]:
# movies_c.set_index("movie_id", inplace=True)
# users_df.set_index("user_id", inplace=True)

: 

In [33]:
scaler = StandardScaler()
scalerY = MinMaxScaler((-1,1))

movie_feat = scaler.fit_transform(movies_c)
user_feat = scaler.fit_transform(users_df)
# y_train = scalerY.fit_transform(y_train)

In [34]:
# calcule le masque de Y
def getR(Y):
    R = Y.copy()
    R[R != 0] = 1
    return R

def normalizeRatings(Y: np.ndarray, R: np.ndarray, axis: int = 1):
    Ymean = (np.sum(Y * R, axis=axis) / (np.sum(R, axis=axis) + 1e-12)).reshape(-1, 1)

    if axis == 0:
        Ynorm = Y.T - np.multiply(Ymean, R.T)
    else:
        Ynorm = Y - np.multiply(Ymean, R)

    return (Ynorm, Ymean)

R1 = getR(Y1)
Ynorm1, Ymean1 = normalizeRatings(Y1, R1)
R2 = getR(Y2)
Ynorm2, Ymean2 = normalizeRatings(Y2, R2)

In [35]:
#  Useful Values
num_movies, num_users = Y.shape
num_features = 100

b1 = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float64), name="b1")
b2 = tf.Variable(tf.random.normal((1, num_movies), dtype=tf.float64), name="b2")

W1 = tf.Variable(tf.random.normal((num_users, movie_feat.shape[1]), dtype=tf.float64), name="W1")
W2 = tf.Variable(tf.random.normal((num_movies, user_feat.shape[1]), dtype=tf.float64), name="W2")

X1 = tf.convert_to_tensor(movie_feat)
X2 = tf.convert_to_tensor(user_feat)

# Instantiate an optimizer.
optimizer1 = tf.keras.optimizers.Adam(learning_rate=1)
optimizer2 = tf.keras.optimizers.Adam(learning_rate=1)

In [56]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
      j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y) * R
      J = 0.5 * tf.reduce_sum(j**2) + (lambda_ / 2) * (tf.reduce_sum(W**2))
      return J


def train(X,W,b,Ynorm, R, optimizer, iterations=200, lambda_=1):
    for iter in range(iterations):
        # Use TensorFlow’s GradientTape
        # to record the operations used to compute the cost
        with tf.GradientTape() as tape:
            # Compute the cost (forward pass included in cost)
            cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

        # Use the gradient tape to automatically retrieve
        # the gradients of the trainable variables with respect to the loss
        grads = tape.gradient(cost_value, [W,b])

        # Run one step of gradient descent by updating
        # the value of the variables to minimize the loss.
        optimizer.apply_gradients(zip(grads, [W,b]))

        # Log periodically.
        if iter % 20 == 0:
            print(f"Training loss at iteration {iter}: {cost_value:0.1f}")
    return W,b

In [None]:
W2, b2 = train(X2,W2, b2, Ynorm2, R2,optimizer2, 300)

: 

In [37]:
W1, b1 = train(X1,W1, b1, Ynorm1, R1,optimizer1, 300)

Training loss at iteration 0: 19761023.9
Training loss at iteration 20: 1447650.5
Training loss at iteration 40: 514406.5
Training loss at iteration 60: 380724.9
Training loss at iteration 80: 362720.0
Training loss at iteration 100: 360313.8
Training loss at iteration 120: 360039.6
Training loss at iteration 140: 359963.8
Training loss at iteration 160: 360043.6
Training loss at iteration 180: 360307.6
Training loss at iteration 200: 360374.4
Training loss at iteration 220: 360678.9
Training loss at iteration 240: 361229.2
Training loss at iteration 260: 361702.4
Training loss at iteration 280: 362169.3


In [None]:
p = np.matmul(X1.numpy(), np.transpose(W1.numpy())) + b1.numpy()

# restore the mean
pm = p + Ymean1
my_predictions = pm[:, 0]

# sort predictions
ix = tf.argsort(my_predictions, direction="DESCENDING")
ix

: 

In [None]:
# Make a prediction using trained weights and biases
p = np.matmul(X2.numpy(), np.transpose(W2.numpy())) + b2.numpy()

# restore the mean
pm = p #+ Ymean2
pt = pm.T
my_predictions = pt[:, 0]

# sort predictions
ix = tf.argsort(my_predictions, direction="DESCENDING")
ix
# for i in range(17):
#     j = ix[i]
#     if j not in my_rated:
#         print(f"Predicting rating {my_predictions[j]:0.2f} for movie {movieList[j]}")

# print("\n\nOriginal vs Predicted ratings:\n")
# for i in range(len(my_ratings)):
#     if my_ratings[i] > 0:
#         print(
#             f"Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {movieList[i]}"
        # )


: 

In [None]:
my_predictions[3552]

: 

# Méthode 2


split le dataset en train et test

In [None]:
movies = df.drop(columns=["title", "genres"]).copy()

user_train, user_test = train_test_split(users_df, test_size=0.2, random_state=42)

movie_train, movie_test = train_test_split(movies, test_size=0.2, random_state=42)

: 

Je supprime les utilisateurs et films, venant du dataset de train, du dataset de test et vis versa. De cette manière je suis sûr que le modèle n'aura pas vu les donnés de test.

Je dois ainsi mettre à jour les deux ratings pour que mes pivots correspondent bien au dataset de train et test.

In [None]:
user_id_train = np.array(user_train["user_id"])
movie_id_train = np.array(movie_train["movie_id"])

user_id_test = np.array(user_test["user_id"])
movie_id_test = np.array(movie_test["movie_id"])

ratings_train = ratings[ratings['user_id'].isin(user_id_train)].copy()
ratings_test = ratings[ratings['user_id'].isin(user_id_test)].copy()

ratings_train = ratings_train[ratings_train['movie_id'].isin(movie_id_train)]
ratings_test = ratings_test[ratings_test['movie_id'].isin(movie_id_test)]

# y_train = Y.drop(columns=[user_id_train])

: 

Je crée les matrices d'interactions utilisateurs-film

In [None]:
y_train = ratings_train.pivot(index='movie_id', columns='user_id', values='rating')
y_train = y_train.fillna(0)

y_test = ratings_test.pivot(index='movie_id', columns='user_id', values='rating')
y_test = y_test.fillna(0)


: 

In [None]:
# transforme un df en passant le column en index et en applicant la transformation trans
def transforme(df, column, trans):
    df.set_index(column, inplace=True)
    return trans.fit_transform(df)

# calcule le masque de Y
def getR(Y):
    R = Y.copy()
    R[R != 0] = 1
    return R

: 

Je normalise les différents dataset

StandardScaler pour les valeurs des vecteurs de features et un minmaxscaler pour les matrices d'intéractions

In [None]:
scaler = StandardScaler()

user_train = transforme(user_train, "user_id", scaler)
user_test = transforme(user_test, "user_id", scaler)

movie_train = transforme(movie_train, "movie_id", scaler)
movie_test = transforme(movie_test, "movie_id", scaler)

: 

In [None]:
scalerY = MinMaxScaler((-1,1))

R_train = getR(y_train)
R_test = getR(y_test)

y_train = np.array(y_train)
y_test = np.array(y_test)

y_train = scalerY.fit_transform(y_train.reshape(-1, 1))
y_test = scalerY.fit_transform(y_test.reshape(-1, 1))

: 

In [None]:
num_user_features = user_train.shape[1]
num_item_features = movie_train.shape[1]

: 

### Model

Réseau de neurone basique vu en cours

In [None]:
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(num_outputs),
    ]
)

item_NN = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(num_outputs),
    ]
)

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

: 

In [None]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss=cost_fn)

: 

In [None]:
user_train.shape

: 

In [None]:
tf.random.set_seed(1)
model.fit(
    [user_train, movie_train],
    y_train,
    epochs=2,
    validation_data=([user_test, movie_test], y_test),
)

: 