In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [26]:
#moovielens 
movies = pd.read_csv('movies.csv') 
links = pd.read_csv('links.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

tags['tag'] = tags['tag'].fillna('')

#concatener les tags
movies_with_links = pd.merge(movies, links, on='movieId', how='left')
movie_tags = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# merger les dataframes movies_with_links et movie_tags
movies_with_ratings = pd.merge(movies_with_links, ratings, on='movieId', how='left')

# Calculer la moyenne des ratings
mean_ratings = movies_with_ratings.groupby('movieId')['rating'].mean().reset_index()

# merger les dataframes
data = pd.merge(movies_with_links, movie_tags, on='movieId', how='left')
data = pd.merge(data, mean_ratings, on='movieId', how='left')

#ajouter une colone user_id
data['userId'] = np.random.randint(1, 86537, data.shape[0])

#ajouter une colonne popularity
data['popularity'] = data['rating'].groupby(data['movieId']).transform('count')

data.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,tag,rating,userId,popularity
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,animation friendship toys animation Disney Pix...,3.893508,55016,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,animals based on a book fantasy magic board ga...,3.278179,6130,1
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,sequel moldy old old age old men wedding old p...,3.171271,51696,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,characters chick flick girl movie characters c...,2.868395,58797,1
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,family pregnancy wedding 4th wall aging baby d...,3.076957,85288,1


In [27]:
#nombre de doublons
print(data.duplicated().sum())   #pas de doublos 

# ============= pas de valeurs manquantes ni de doublons =============
data.dropna(subset=['tmdbId'], inplace=True)
#rempcer les valeurs manquantespar ""
data['tag'] = data['tag'].fillna('')
data['rating'] = data['rating'].fillna(data['rating'].mean())

#====netoyyer les colonnes =================================================
data['genres'] = data['genres'].str.replace('|', ' ')
data['rating'] = pd.to_numeric(data['rating'], errors='coerce')
data['title'] = data['title'].str.strip() 
data['tag'] = data['tag'].str.lower()



0


In [51]:
#=================mapping pour le modele NCF=========================
#mapping des userId
user_id_mapping = {id:i for i, id in enumerate(data['userId'].unique())}
data['userId'] = data['userId'].map(user_id_mapping)
#mapping des movieId
movie_id_mapping = {id:i for i, id in enumerate(data['movieId'].unique())}
data['movieId'] = data['movieId'].map(movie_id_mapping)

#==Normaliser les colonnes rating et popularity =======================
scaler = StandardScaler()
data['rating'] = scaler.fit_transform(data['rating'].values.reshape(-1, 1))

data = data[(data['rating'] >= -1) & (data['rating'] <= 1)] 
data.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,tag,rating,userId,popularity
1,1,Jumanji (1995),Adventure Children Fantasy,113497,8844.0,animals based on a book fantasy magic board ga...,0.342364,1,0.199106
2,2,Grumpier Old Men (1995),Comedy Romance,113228,15602.0,sequel moldy old old age old men wedding old p...,0.204515,2,0.199106
3,3,Waiting to Exhale (1995),Comedy Drama Romance,114885,31357.0,characters chick flick girl movie characters c...,-0.186019,3,0.199106
4,4,Father of the Bride Part II (1995),Comedy,113041,11862.0,family pregnancy wedding 4th wall aging baby d...,0.082905,4,0.199106
6,6,Sabrina (1995),Comedy Romance,114319,11860.0,based on a play harrison ford paris romance si...,0.463777,6,0.199106


In [52]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout # type: ignore
from tensorflow.keras.models import Model # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore

# Normalisation des notes entre 0 et 1
scaler = MinMaxScaler()
data['rating'] = scaler.fit_transform(data['rating'].values.reshape(-1, 1))

# Division des données
x = data[['userId', 'movieId']]
y = data['rating']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Construction du modèle
user_input = Input(shape=(1,))
movie_input = Input(shape=(1,))

user_embedding = Embedding(len(user_id_mapping), 32)(user_input)
movie_embedding = Embedding(len(movie_id_mapping), 32)(movie_input)

user_flatten = Flatten()(user_embedding)
movie_flatten = Flatten()(movie_embedding)

conc = Concatenate()([user_flatten, movie_flatten])

dense = Dense(128, activation='relu')(conc)
dense = Dropout(0.5)(dense)
dense = Dense(64, activation='relu')(dense)
dense = Dropout(0.5)(dense)
dense = Dense(32, activation='relu')(dense)
sortie = Dense(1, activation='sigmoid')(dense)  

model = Model(inputs=[user_input, movie_input], outputs=sortie)
model.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')

# Entraînement du modèle
model.fit([x_train['userId'], x_train['movieId']], y_train, batch_size=64, epochs=10, verbose=1)

# Prédiction et inversion de la normalisation
y_pred_normalized = model.predict([x_test['userId'], x_test['movieId']])
y_pred = scaler.inverse_transform(y_pred_normalized)

# Calcul de la loss sur les notes originales
y_test_original = scaler.inverse_transform(y_test.values.reshape(-1, 1))
loss = mean_squared_error(y_test_original, y_pred)
print("Loss sur les notes originales :", loss)

Epoch 1/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 54ms/step - loss: 0.0648
Epoch 2/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 53ms/step - loss: 0.0640
Epoch 3/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 53ms/step - loss: 0.0629
Epoch 4/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 54ms/step - loss: 0.0594
Epoch 5/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 52ms/step - loss: 0.0438
Epoch 6/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 50ms/step - loss: 0.0282
Epoch 7/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 56ms/step - loss: 0.0189
Epoch 8/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 63ms/step - loss: 0.0132
Epoch 9/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 55ms/step - loss: 0.0101
Epoch 10/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45

In [49]:
#evaluation du modele
model.evaluate([x_test['userId'], x_test['movieId']], y_test)


[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.1022


1.1147539615631104