In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_absolute_error

# Load dataset MovieLens
ratings = pd.read_csv("dataset/ratings.csv")  # Đảm bảo có file này
movies = pd.read_csv("dataset/movies.csv")

rating = pd.merge(movies,ratings).drop(['genres', 'timestamp'], axis=1)

# Chia tập dữ liệu thành train (80%) và test (20%)
train_data, test_data = train_test_split(rating, test_size=0.2, random_state=42)

#Tạo ma trận User-Movie
train_matrix = train_data.pivot(index='userId', columns='movieId', values='rating')
test_matrix = test_data.pivot(index='userId', columns='movieId', values='rating')

# Chuẩn hóa dữ liệu User-Based CF (trừ đi trung bình của từng user)
user_mean_train = train_matrix.mean(axis=1)
normalized_train = train_matrix.sub(user_mean_train, axis=0)

# Chuẩn hóa dữ liệu Item-Based CF (trừ đi trung bình của từng phim)
movie_mean_train = train_matrix.mean(axis=0)
normalized_train_item = train_matrix.sub(movie_mean_train, axis=1)

# Điền NaN thành 0 để tránh lỗi
normalized_train = normalized_train.fillna(0)
normalized_train_item = normalized_train_item.fillna(0)

# Chuyển thành sparse matrix
normalized_train_sparse = csr_matrix(normalized_train.values)
normalized_train_item_sparse = csr_matrix(normalized_train_item.values.T)


item-item


In [2]:
item_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
item_knn.fit(normalized_train_item_sparse)

In [3]:
def predict_item_based_rating(user_id, movie_id, k=5):
    if movie_id not in train_matrix.columns:
        return np.nan  # Phim không có trong tập train
    
    movie_idx = train_matrix.columns.get_loc(movie_id)
    distances, indices = item_knn.kneighbors(normalized_train_item_sparse[movie_idx], n_neighbors=k+1)
    
    similar_movies = indices.flatten()[1:]
    weights = 1 - distances.flatten()[1:]
    
    num = sum(weights[i] * train_matrix.at[user_id, train_matrix.columns[sim_movie]]
              for i, sim_movie in enumerate(similar_movies) if train_matrix.at[user_id, train_matrix.columns[sim_movie]] > 0)
    den = sum(abs(weights[i]) for i, sim_movie in enumerate(similar_movies))
    
    pred = num / den if den != 0 else 0
    return pred + movie_mean_train[movie_id]

user-user

In [4]:
user_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
user_knn.fit(normalized_train_sparse)


In [5]:
def predict_user_based_rating(user_id, movie_id, k=5):
    if movie_id not in train_matrix.columns:
        return np.nan  # Phim không có trong tập train
    
    user_idx = train_matrix.index.get_loc(user_id)
    distances, indices = user_knn.kneighbors(normalized_train_sparse[user_idx], n_neighbors=k+1)
    
    similar_users = indices.flatten()[1:]
    weights = 1 - distances.flatten()[1:]
    
    num = sum(weights[i] * normalized_train.at[train_matrix.index[sim_user], movie_id]
              for i, sim_user in enumerate(similar_users) if train_matrix.at[train_matrix.index[sim_user], movie_id] > 0)
    den = sum(abs(weights[i]) for i, sim_user in enumerate(similar_users))
    
    pred = num / den if den != 0 else 0
    return pred + user_mean_train[user_id]

In [6]:
actual_ratings, predicted_ratings_user, predicted_ratings_item = [], [], []
for _, row in test_data.iterrows():
    user_id, movie_id, actual_rating = row['userId'], row['movieId'], row['rating']
    predicted_rating_user = predict_user_based_rating(user_id, movie_id, k=5)
    predicted_rating_item = predict_item_based_rating(user_id, movie_id, k=5)
    
    if not np.isnan(predicted_rating_user) and not np.isnan(predicted_rating_item):
        actual_ratings.append(actual_rating)
        predicted_ratings_user.append(predicted_rating_user)
        predicted_ratings_item.append(predicted_rating_item)

In [7]:
mae_user = mean_absolute_error(actual_ratings, predicted_ratings_user)
mae_item = mean_absolute_error(actual_ratings, predicted_ratings_item)
print(f"📌 MAE (User-Based CF) = {mae_user:.4f}")
print(f"📌 MAE (Item-Based CF) = {mae_item:.4f}")

📌 MAE (User-Based CF) = 0.6975
📌 MAE (Item-Based CF) = 0.9072


In [8]:
example_user = 5
example_movie = 10
predicted_user_based = predict_user_based_rating(example_user, example_movie, k=5)
predicted_item_based = predict_item_based_rating(example_user, example_movie, k=5)
print(f"🔹 Dự đoán User-Based CF cho user {example_user}, movie {example_movie}: {predicted_user_based:.4f}")
print(f"🔹 Dự đoán Item-Based CF cho user {example_user}, movie {example_movie}: {predicted_item_based:.4f}")

🔹 Dự đoán User-Based CF cho user 5, movie 10: 3.6578
🔹 Dự đoán Item-Based CF cho user 5, movie 10: 3.5472


In [9]:
def recommend_movies(user_id, k=5):
    predictions = {movie: predict_user_based_rating(user_id, movie, k) for movie in train_matrix.columns if pd.isna(train_matrix.at[user_id, movie])}
    top_movies = sorted(predictions, key=predictions.get, reverse=True)[:5]
    return movies[movies['movieId'].isin(top_movies)]

In [10]:
recommended_movies = recommend_movies(example_user, k=5)
print(f"🎬 Gợi ý phim cho user {example_user}:\n", recommended_movies)

🎬 Gợi ý phim cho user 5:
      movieId                             title                       genres
43        47       Seven (a.k.a. Se7en) (1995)             Mystery|Thriller
249      288       Natural Born Killers (1994)        Action|Crime|Thriller
257      296               Pulp Fiction (1994)  Comedy|Crime|Drama|Thriller
395      454                  Firm, The (1993)               Drama|Thriller
510      593  Silence of the Lambs, The (1991)        Crime|Horror|Thriller
