In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
#moovielens 
movies = pd.read_csv('movies.csv') 
links = pd.read_csv('links.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

tags['tag'] = tags['tag'].fillna('')

#concatener les tags
movies_avec_links = pd.merge(movies, links, on='movieId', how='left')  #merge le dasaset movies et links
movie_tags = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index() #groupby le dataset tags par movieId et concatener les tags

# merger les dataframes movies_with_links et movie_tags
movies_with_ratings = pd.merge(movies_avec_links, ratings, on='movieId', how='left') #merge le dataset movies_with_links et ratings

# Calculer la moyenne des ratings
mean_ratings = movies_with_ratings.groupby('movieId')['rating'].mean().reset_index() #groupby le dataset movies_with_ratings par movieId et calculer la moyenne des ratings

# merger les dataframes
data = pd.merge(movies_avec_links, movie_tags, on='movieId', how='left')
data = pd.merge(data, mean_ratings, on='movieId', how='left')

#ajouter une colone user_id
data['userId'] = np.random.randint(1, 86537, data.shape[0])

#ajouter une colonne popularity
data['populaire'] = data['rating'].groupby(data['movieId']).transform('count')

data.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,tag,rating,userId,popularity
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,animation friendship toys animation Disney Pix...,3.893508,80450,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,animals based on a book fantasy magic board ga...,3.278179,72515,1
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,sequel moldy old old age old men wedding old p...,3.171271,84638,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,characters chick flick girl movie characters c...,2.868395,18411,1
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,family pregnancy wedding 4th wall aging baby d...,3.076957,15657,1


In [8]:
#nombre de doublons
print(data.duplicated().sum())   #pas de doublos 

# ============= pas de valeurs manquantes ni de doublons =============
data.dropna(subset=['tmdbId'], inplace=True)
#rempcer les valeurs manquantespar ""
data['tag'] = data['tag'].fillna('')
data['rating'] = data['rating'].fillna(data['rating'].mean())

#====netoyyer les colonnes =================================================
data['genres'] = data['genres'].str.replace('|', ' ')
data['rating'] = pd.to_numeric(data['rating'], errors='coerce')
data['title'] = data['title'].str.strip() 
data['tag'] = data['tag'].str.lower()



0


In [None]:
#=================mapping pour le modele NCF=========================
#mapping des userId
user_id_mapping = {id:i for i, id in enumerate(data['userId'].unique())} #créer un dictionnaire avec les userId et leur index
data['userId'] = data['userId'].map(user_id_mapping)
#mapping des movieId
movie_id_mapping = {id:i for i, id in enumerate(data['movieId'].unique())}
data['movieId'] = data['movieId'].map(movie_id_mapping)

#==Normaliser les colonnes rating et popularity =======================
scaler = StandardScaler()
data['rating'] = scaler.fit_transform(data['rating'].values.reshape(-1, 1))

data = data[(data['rating'] >= -1) & (data['rating'] <= 1)] 
data.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,tag,rating,userId,popularity
1,1,Jumanji (1995),Adventure Children Fantasy,113497,8844.0,animals based on a book fantasy magic board ga...,0.342364,1,1
2,2,Grumpier Old Men (1995),Comedy Romance,113228,15602.0,sequel moldy old old age old men wedding old p...,0.204515,2,1
3,3,Waiting to Exhale (1995),Comedy Drama Romance,114885,31357.0,characters chick flick girl movie characters c...,-0.186019,3,1
4,4,Father of the Bride Part II (1995),Comedy,113041,11862.0,family pregnancy wedding 4th wall aging baby d...,0.082905,4,1
6,6,Sabrina (1995),Comedy Romance,114319,11860.0,based on a play harrison ford paris romance si...,0.463777,6,1


In [10]:
from sklearn.metrics import mean_squared_error # type: ignore
from sklearn.preprocessing import MinMaxScaler # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout # type: ignore
from tensorflow.keras.models import Model , load_model  # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore

# Normalisation des notes entre 0 et 1
scaler = MinMaxScaler()
data['rating'] = scaler.fit_transform(data['rating'].values.reshape(-1, 1))

# Division des données
x = data[['userId', 'movieId']]
y = data['rating']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Construction du modèle
user_input = Input(shape=(1,)) #créer une couche d'entrée pour les userId
movie_input = Input(shape=(1,)) #créer une couche d'entrée pour les movieId

user_embedding = Embedding(len(user_id_mapping), 32)(user_input) #créer une couche d'embedding pour les userId avec 32 neurones
movie_embedding = Embedding(len(movie_id_mapping), 32)(movie_input) #créer une couche d'embedding pour les movieId 

user_flatten = Flatten()(user_embedding) #applatir la couche d'embedding des userId
movie_flatten = Flatten()(movie_embedding) #applatir la couche d'embedding des movieId

conc = Concatenate()([user_flatten, movie_flatten]) #concaténer les couches d'embedding

dense = Dense(128, activation='relu')(conc) #créer une couche dense de 128 neurones avec une fonction d'activation relu
dense = Dropout(0.5)(dense) #ajouter une couche de dropout avec un taux de 0.5 pour éviter le surapprentissage
dense = Dense(64, activation='relu')(dense) #créer une couche dense de 64 neurones avec une fonction d'activation relu
dense = Dropout(0.5)(dense)
dense = Dense(32, activation='relu')(dense) #créer une couche dense de 32 neurones avec une fonction d'activation relu
sortie = Dense(1, activation='sigmoid')(dense)  #créer une couche de sortie avec une fonction d'activation sigmoid

model = Model(inputs=[user_input, movie_input], outputs=sortie)
model.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')

# Entraînement du modèle
model.fit([x_train['userId'], x_train['movieId']], y_train, batch_size=64, epochs=10, verbose=1)

# Prédiction et inversion de la normalisation
y_pred_normalized = model.predict([x_test['userId'], x_test['movieId']])
y_pred = scaler.inverse_transform(y_pred_normalized)




Epoch 1/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 54ms/step - loss: 0.0657
Epoch 2/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 53ms/step - loss: 0.0643
Epoch 3/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 58ms/step - loss: 0.0632
Epoch 4/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 48ms/step - loss: 0.0585
Epoch 5/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 46ms/step - loss: 0.0403
Epoch 6/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 51ms/step - loss: 0.0251
Epoch 7/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 51ms/step - loss: 0.0169
Epoch 8/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 53ms/step - loss: 0.0123
Epoch 9/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 52ms/step - loss: 0.0094
Epoch 10/10
[1m708/708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37

In [None]:
import numpy as np # type: ignore
from tensorflow.keras.models import load_model # type: ignore
from sklearn.preprocessing import MinMaxScaler # type: ignore

# Charger le modèle
model = load_model('model_NCF.h5')


def top_5_recommandation(user, model, movie_ids, valid_movie_ids, scaler):
    # Filtrer les IDs pour qu'ils soient valides
    movie_ids = [movie for movie in movie_ids if movie in valid_movie_ids]  # Ne garder que les IDs valides

    # Prédictions
    user_array = np.array([user] * len(movie_ids))
    movie_array = np.array(movie_ids)
    predictions = model.predict([user_array, movie_array])
    predictions = scaler.inverse_transform(predictions)

    # Trier les résultats
    indices = np.argsort(predictions.flatten())[::-1]
    top_5 = indices[:5]
    top_5_movies = movie_array[top_5]
    top_5_ratings = predictions.flatten()[top_5]

    return top_5_movies, top_5_ratings

# Exemple : Recommandations pour un utilisateur
user_id = 1  # ID de l'utilisateur
movie_ids = data['movieId'].unique()  # Liste des films disponibles
movies_df = pd.read_csv('movies.csv')  # Charger le fichier movies.csv
valid_movie_ids = movies_df['movieId'].values  # IDs dans movies.csv

# Obtenir les recommandations
top_5_movies, top_5_ratings = top_5_recommandation(user_id, model, movie_ids, valid_movie_ids, scaler)

# Mapper les `movieId` recommandés aux titres
nom_movies = movies_df[movies_df['movieId'].isin(top_5_movies)]['title'].values



# Afficher les résultats
print("Top 5 Movies (IDs):", top_5_movies)
print("Top 5 Ratings:", top_5_ratings)
print("Top 5 Movies (Titles):", nom_movies)










[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Top 5 Movies (IDs): [64285 27716 34292  4585  6428]
Top 5 Ratings: [0.900386   0.88814247 0.88744164 0.8863874  0.8735525 ]
Top 5 Movies (Titles): ['Dream Team, The (1989)' 'Two Mules for Sister Sara (1970)'
 'Green Butchers, The (Grønne slagtere, De) (2003)' 'Hardware (1990)'
 "Wallace and Gromit in 'A Matter of Loaf and Death' (2008)"]
(5,)


In [62]:
#evaluer le modele 
from sklearn.metrics import precision_score, recall_score, mean_squared_error # type: ignore
import numpy as np # type: ignore

# Fonction pour évaluer la précision et le rappel
def evaluate_model(model, x_test, y_test, threshold=0.5):
    predictions = model.predict(x_test)
    predictions = predictions.flatten()
    y_pred = (predictions > threshold).astype(int)
    y_true = (y_test > threshold).astype(int)
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    
    return precision, recall

# Fonction pour calculer le RMSE
def calculate_rmse(model, x_test, y_test):
    predictions = model.predict(x_test)
    predictions = predictions.flatten()
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    
    return rmse

# Évaluer le modèle
precision, recall = evaluate_model(model, [x_test['userId'], x_test['movieId']], y_test)
rmse = calculate_rmse(model, [x_test['userId'], x_test['movieId']], y_test)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"RMSE: {rmse}") #

[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Precision: 0.5189518787079763
Recall: 0.6262304862284975
RMSE: 0.30809455379201206
