# Load Data

run save data cells in projet.ipynb before

In [23]:
import pandas as pd

movies = pd.read_csv('../data/save/movies.csv')
ratings = pd.read_csv('../data/save/ratings.csv')
users = pd.read_csv('../data/save/users.csv')

In [24]:
display(movies.head())
display(ratings.head())
display(users.head())

Unnamed: 0,MovieId,Title,Year,isAdult,runtimeMinutes,averageRating,numVotes,Action,Adult,Adventure,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,1,toy story,1995,0,81,4.15,1073870,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,jumanji,1995,0,104,3.55,379231,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,grumpier old men,1995,0,101,3.3,29839,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,waiting to exhale,1995,0,124,3.0,12277,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,father of the bride part ii,1995,0,106,3.05,41877,0,0,0,...,0,0,0,0,1,0,0,0,0,0


Unnamed: 0,UserId,MovieId,Rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


Unnamed: 0,UserId,Gender,Age
0,1,1,1
1,2,0,56
2,3,0,25
3,4,0,45
4,5,0,25


# NNE --

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Flatten, Concatenate, Input
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [5]:
# full_data
full_data = pd.merge(pd.merge(ratings, users, on='UserId'), movies, on='MovieId')

# Sélectionner les colonnes pour item_train et user_train
item_features = movies.columns.difference(['Title', 'Year', 'runtimeMinutes', 'MovieId']).tolist()
item_train = full_data[item_features].values
user_train = full_data[['Gender', 'Age']].values
y_train = full_data['Rating'].values

# Définir les dimensions des entrées utilisateur et item
num_user_features = user_train.shape[1]
num_item_features = item_train.shape[1]

# Sauvegarder les données non scalées
item_train_unscaled = item_train.copy()
user_train_unscaled = user_train.copy()
y_train_unscaled = y_train.copy()

In [6]:
# Appliquer le scaling
scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = scalerUser.transform(user_train)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))

# Inverser les transformations pour vérification
inverse_item = scalerItem.inverse_transform(item_train)
inverse_user = scalerUser.inverse_transform(user_train)

# Vérifier que les scalers peuvent inverser correctement les transformations
assert np.allclose(item_train_unscaled, inverse_item), "Something is wrong with the item scaler"
assert np.allclose(user_train_unscaled, inverse_user), "Something is wrong with the user scaler"

print("\033[92m" + "All tests passed!" + "\033[0m")

[92mAll tests passed![0m


In [7]:
# Split le dataset
user_item_data = np.concatenate([user_train, item_train], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    user_item_data, y_train, train_size=0.80, shuffle=True, random_state=1
)

user_train = X_train[:, :num_user_features]
item_train = X_train[:, num_user_features:]
user_test = X_test[:, :num_user_features]
item_test = X_test[:, num_user_features:]

print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (727000, 27)
movie/item test data shape: (181750, 27)


In [8]:
# Définir le modèle
num_outputs = 32
tf.random.set_seed(1)

user_NN = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(num_outputs),
    ]
)

item_NN = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(num_outputs),
    ]
)

# creer le input des utilisateurs et calculer la norme L2
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vu)

# Creer le input des films et calculer la norme L2
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm)

# Calculer le produit scalaire
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# Créer le model
model = tf.keras.Model([input_user, input_item], output)

model.summary()




In [10]:
# Compiler le modèle
model.compile(optimizer='adam', loss='mean_squared_error')

# Entraîner le modèle
history = model.fit([user_train, item_train], y_train, epochs=3, batch_size=32, validation_split=0.1)

# Évaluer le modèle
loss = model.evaluate([user_test, item_test], y_test)
print(f'Test Loss: {loss}')


Epoch 1/3
[1m20447/20447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 2ms/step - loss: 0.2504 - val_loss: 0.2456
Epoch 2/3
[1m20447/20447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 2ms/step - loss: 0.2444 - val_loss: 0.2442
Epoch 3/3
[1m20447/20447[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 2ms/step - loss: 0.2428 - val_loss: 0.2435
[1m5680/5680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 0.2429
Test Loss: 0.2406751662492752


In [11]:
def predict_for_user(user_id, model):
    # Extraire les informations de l'utilisateur
    user_data = users[users['UserId'] == user_id].iloc[:, 1:].values
    user_data = scalerUser.transform(user_data)

    movie_data = movies.drop(columns=['Title', 'Year', 'runtimeMinutes', 'MovieId']).values

    movie_data = scalerItem.transform(movie_data)

    user_data = np.repeat(user_data, movie_data.shape[0], axis=0)

    # Faire les prédictions
    predictions = model.predict([user_data, movie_data])

    # Inverser la transformation des prédictions
    predictions = scalerTarget.inverse_transform(predictions)

    # Créer un DataFrame avec les résultats
    results = pd.DataFrame({
        'MovieId': movies['MovieId'],
        'Title': movies['Title'],
        'PredictedRating': predictions.flatten()
    })

    return results

In [12]:
def get_Movies_Name(movies, recommended_movie_ids):
    names = []
    for movie_id in recommended_movie_ids:
        movie_name = movies.loc[movies['MovieId'] == movie_id, 'Title'].values[0]
        names.append(movie_name)
    return names

def get_recommended_movies(user1_id, user2_id, nb_recommendation):
    user1_ratings = predict_for_user(user1_id, model)[['MovieId', 'PredictedRating']]
    user2_ratings = predict_for_user(user2_id, model)[['MovieId', 'PredictedRating']]

    # Merge les deux dataframes
    merged_ratings = pd.merge(user1_ratings, user2_ratings, on='MovieId', suffixes=('_user1', '_user2'))
    merged_ratings['AveragePredictedRating'] = merged_ratings[['PredictedRating_user1', 'PredictedRating_user2']].mean(axis=1)
    # Ne conserver que les colonnes 'MovieId' et 'AveragePredictedRating'
    average_ratings = merged_ratings[['MovieId', 'AveragePredictedRating']]

    # Trier les films par 'AveragePredictedRating' et récupérer les 5 premiers
    recommended_movies = average_ratings.sort_values(by='AveragePredictedRating', ascending=False).head(nb_recommendation)

    # Récupérer les noms des films recommandés
    recommended_movie_ids = recommended_movies['MovieId'].values

    user1_top_ranking = filtered_ratings = user1_ratings[user1_ratings['MovieId'].isin(recommended_movie_ids)]
    user2_top_ranking = filtered_ratings = user2_ratings[user2_ratings['MovieId'].isin(recommended_movie_ids)]


    recommended_movie_names = get_Movies_Name(movies, recommended_movie_ids)

    return recommended_movie_names, user1_top_ranking, user2_top_ranking

In [13]:
user1 = 1
user2 = 2
nb_recommendation = 5
recommended_movie_names, user1_top_ranking, user2_top_ranking = get_recommended_movies(user1, user2, nb_recommendation)

[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [14]:
print(f"Recommended movies for user 1 and user 2:\n{recommended_movie_names}")
print(f"User {user1} ratings for recommended movies are:\n{user1_top_ranking}")
print(f"User {user2} ratings for recommended movies are:\n{user2_top_ranking}")

Recommended movies for user 1 and user 2:
['shawshank redemption', 'fight club', 'pulp fiction', 'matrix', 'godfather']
User 1 ratings for recommended movies are:
      MovieId  PredictedRating
245       296         3.575713
265       318         3.575718
684       858         3.575710
2060     2571         3.575713
2373     2959         3.575715
User 2 ratings for recommended movies are:
      MovieId  PredictedRating
245       296         3.499168
265       318         3.499176
684       858         3.499163
2060     2571         3.499167
2373     2959         3.499170
