In [28]:
import os
import pandas as pd

# Chemin vers la racine du projet
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))  # remonte d'un dossier
data_path = os.path.join(project_root, 'data')

print("Chemin vers data :", data_path)


Chemin vers data : /home/jupyter/Fatma_Aziz/movie_recommender/data


In [29]:
# Charger train, test et movies
train_df = pd.read_csv(os.path.join(data_path, 'train_ratings.csv'))
test_df = pd.read_csv(os.path.join(data_path, 'test_ratings.csv'))
df_movies = pd.read_csv(os.path.join(data_path, 'movies.csv'))

# Vérifier
print("Train shape :", train_df.shape)
print("Test shape  :", test_df.shape)
print("Movies shape:", df_movies.shape)


Train shape : (80, 6)
Test shape  : (20, 6)
Movies shape: (10000, 5)


In [30]:
import pickle

with open(os.path.join(data_path, 'user_encoder.pkl'), 'rb') as f:
    user_encoder = pickle.load(f)

with open(os.path.join(data_path, 'movie_encoder.pkl'), 'rb') as f:
    movie_encoder = pickle.load(f)


In [31]:
# from lightfm import LightFM
# from lightfm.data import Dataset

# dataset = Dataset()
# dataset.fit(train_df['user_idx'], train_df['movie_idx'])

# (interactions, weights) = dataset.build_interactions(
#     [(row['user_idx'], row['movie_idx'], row['rating']) for _, row in train_df.iterrows()]
# )


In [32]:
reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
data = Dataset.load_from_df(train_df[['user_idx', 'movie_idx', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)
for pred in predictions[:5]:
    print(f"user: {pred.uid}, movie: {pred.iid}, actual: {pred.r_ui}, predicted: {pred.est:.2f}")

user: 5, movie: 40, actual: 0.5, predicted: 0.50
user: 11, movie: 29, actual: 0.5, predicted: 0.50
user: 15, movie: 69, actual: 0.5, predicted: 0.50
user: 21, movie: 24, actual: 0.5, predicted: 0.50
user: 13, movie: 51, actual: 0.5, predicted: 0.50


In [33]:
# Exemple : user réel
real_user_id = 1

# Convertir userId réel en user_idx utilisé par le modèle
user_idx = user_encoder.transform([real_user_id])[0]


In [34]:
# Tous les movieIds
all_movie_ids = df_movies['movieId'].values

# Garder uniquement ceux vus pendant l'entraînement
all_movie_ids = [mid for mid in all_movie_ids if mid in movie_encoder.classes_]

movies_watched = train_df[train_df['user_idx'] == user_idx]['movie_idx'].values


In [35]:
predictions_for_user = []

for movie_id in all_movie_ids:
    movie_idx = movie_encoder.transform([movie_id])[0]
    
    if movie_idx not in movies_watched:
        pred = algo.predict(user_idx, movie_idx)
        predictions_for_user.append((movie_id, pred.est))


In [36]:
# Trier par note prédite décroissante
predictions_for_user.sort(key=lambda x: x[1], reverse=True)

# Top 5 recommandations
top_n = 5
top_recommendations = predictions_for_user[:top_n]

print("Top recommandations pour l'utilisateur", real_user_id)
for movie_id, est_rating in top_recommendations:
    title = df_movies[df_movies['movieId'] == movie_id]['title'].values[0]
    print(f"{title} - note prédite: {est_rating:.2f}")


Top recommandations pour l'utilisateur 1
Spy Kids 3-D: Game Over (2003) - note prédite: 0.50
Batman Forever (1995) - note prédite: 0.50
Charlie's Angels: Full Throttle (2003) - note prédite: 0.50
Pirates of the Caribbean: The Curse of the Black Pearl (2003) - note prédite: 0.50
Three Musketeers, The (1993) - note prédite: 0.50


In [37]:
import numpy as np
import pandas as pd

# Liste des utilisateurs et films
all_user_ids = train_df['user_idx'].unique()
all_movie_ids = train_df['movie_idx'].unique()

# Matrice vide
pred_matrix = pd.DataFrame(index=all_user_ids, columns=all_movie_ids)

# Remplir avec les prédictions
for uid in all_user_ids:
    for iid in all_movie_ids:
        pred_matrix.loc[uid, iid] = algo.predict(uid, iid).est

pred_matrix = pred_matrix.astype(float)


In [38]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarité utilisateur × utilisateur
user_similarity = cosine_similarity(pred_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=all_user_ids, columns=all_user_ids)


In [39]:
user_idx = 0  # exemple
closest_users = user_similarity_df[user_idx].sort_values(ascending=False)[1:15]  # top 5
print(closest_users)


19    1.0
20    1.0
17    1.0
23    1.0
12    1.0
6     1.0
2     1.0
4     1.0
0     1.0
21    1.0
18    1.0
10    1.0
15    1.0
1     1.0
Name: 0, dtype: float64
