In [3]:
import os
import sys
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (
    map, ndcg_at_k, precision_at_k, recall_at_k
)
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.utils.notebook_utils import store_metadata

In [4]:
## top k items to recommend
TOP_K = 10

# Model parameters
EPOCHS = 20
BATCH_SIZE = 256

SEED = DEFAULT_SEED # Set None for non-deterministic results

In [5]:
# Đọc dữ liệu từ file CSV
# Đọc movies.dat và đổi tên cột MovieID -> ItemID
movies_df = pd.read_csv("/Users/chi.nguyenth/Documents/DoAn_63133022_NguyenThiHaChi/dataset/1m/movies.dat", sep="::", engine="python", 
                     names=["itemID", "title", "genres"], encoding="latin1")

# Đọc ratings.dat và đổi tên cột MovieID -> ItemID
df = pd.read_csv("/Users/chi.nguyenth/Documents/DoAn_63133022_NguyenThiHaChi/dataset/1m/ratings.dat", sep="::", engine="python", 
                      names=["userID", "itemID", "rating", "timestamp"])

print(movies_df.head())
print(df.head())


   itemID                               title                        genres
0       1                    Toy Story (1995)   Animation|Children's|Comedy
1       2                      Jumanji (1995)  Adventure|Children's|Fantasy
2       3             Grumpier Old Men (1995)                Comedy|Romance
3       4            Waiting to Exhale (1995)                  Comedy|Drama
4       5  Father of the Bride Part II (1995)                        Comedy
   userID  itemID  rating  timestamp
0       1    1193       5  978300760
1       1     661       3  978302109
2       1     914       3  978301968
3       1    3408       4  978300275
4       1    2355       5  978824291


In [6]:
df = pd.merge(df, movies_df[['itemID', 'title', 'genres']], how='left', on='itemID')  # Chỉ merge title, genre, không ảnh hưởng đến itemID
print(df.head())


   userID  itemID  rating  timestamp                                   title  \
0       1    1193       5  978300760  One Flew Over the Cuckoo's Nest (1975)   
1       1     661       3  978302109        James and the Giant Peach (1996)   
2       1     914       3  978301968                     My Fair Lady (1964)   
3       1    3408       4  978300275                  Erin Brockovich (2000)   
4       1    2355       5  978824291                    Bug's Life, A (1998)   

                         genres  
0                         Drama  
1  Animation|Children's|Musical  
2               Musical|Romance  
3                         Drama  
4   Animation|Children's|Comedy  


In [7]:
print(df.columns)


Index(['userID', 'itemID', 'rating', 'timestamp', 'title', 'genres'], dtype='object')


In [8]:
print(df[df["title"].isna()])


Empty DataFrame
Columns: [userID, itemID, rating, timestamp, title, genres]
Index: []


In [9]:
df["title"].fillna("Unknown Title", inplace=True)
df["genres"].fillna("Unknown Genres", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["title"].fillna("Unknown Title", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["genres"].fillna("Unknown Genres", inplace=True)


In [10]:
df = df[["userID", "itemID", "rating", "timestamp"]]
print(df.head())


   userID  itemID  rating  timestamp
0       1    1193       5  978300760
1       1     661       3  978302109
2       1     914       3  978301968
3       1    3408       4  978300275
4       1    2355       5  978824291


In [11]:
train, test = python_chrono_split(df, 0.75)

In [12]:
test = test[test["userID"].isin(train["userID"].unique())]
test = test[test["itemID"].isin(train["itemID"].unique())]

In [13]:
leave_one_out_test = test.groupby("userID").last().reset_index()

In [14]:
train_file = "/Users/chi.nguyenth/Documents/DoAn_63133022_NguyenThiHaChi/test_train_1m/train_1m.csv"
test_file = "/Users/chi.nguyenth/Documents/DoAn_63133022_NguyenThiHaChi/test_train_1m/test_1m.csv"
leave_one_out_test_file = "/Users/chi.nguyenth/Documents/DoAn_63133022_NguyenThiHaChi/test_train_1m/leave_one_out_test_1m.csv"
#train.to_csv(train_file, index=False)
#test.to_csv(test_file, index=False)
#leave_one_out_test.to_csv(leave_one_out_test_file, index=False)

In [15]:
# Đọc file CSV huấn luyện
df_train = pd.read_csv(train_file)

# Kiểm tra dữ liệu
print(df_train.head())

# Tách dữ liệu thành đầu vào và nhãn
user_input = df_train["userID"].values
item_input = df_train["itemID"].values
labels = df_train["rating"].values  # Nếu cần nhị phân hóa: labels = (labels > 0).astype(int)

print(user_input.shape, item_input.shape, labels.shape)

   userID  itemID  rating  timestamp
0       1    3186       4  978300019
1       1    1270       5  978300055
2       1    1721       4  978300055
3       1    1022       5  978300055
4       1    2340       3  978300103
(750121,) (750121,) (750121,)


In [16]:
df_train.dtypes

userID       int64
itemID       int64
rating       int64
timestamp    int64
dtype: object

In [17]:
print(df_train.isna().sum())


userID       0
itemID       0
rating       0
timestamp    0
dtype: int64


In [18]:
print(df_train.duplicated(subset=["userID", "itemID"]).sum())


0


In [19]:
print(df_train["rating"].describe())


count    750121.000000
mean          3.626138
std           1.108921
min           1.000000
25%           3.000000
50%           4.000000
75%           4.000000
max           5.000000
Name: rating, dtype: float64


In [20]:
df_train = df_train[(df_train["rating"] >= 1) & (df_train["rating"] <= 5)]


In [21]:
data = NCFDataset(train_file=train_file, test_file=leave_one_out_test_file, seed=SEED, overwrite_test_file_full=True)


INFO:recommenders.models.ncf.dataset:Indexing /Users/chi.nguyenth/Documents/DoAn_63133022_NguyenThiHaChi/test_train_1m/train_1m.csv ...
INFO:recommenders.models.ncf.dataset:Indexing /Users/chi.nguyenth/Documents/DoAn_63133022_NguyenThiHaChi/test_train_1m/leave_one_out_test_1m.csv ...
INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file /Users/chi.nguyenth/Documents/DoAn_63133022_NguyenThiHaChi/test_train_1m/leave_one_out_test_1m_full.csv ...
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, 

In [22]:
model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=8,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

I0000 00:00:1743577026.575665 2024271 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled


In [23]:
with Timer() as train_time: model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [26.78s]: train_loss = 0.254602 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [24.72s]: train_loss = 0.247605 


Took 480.8448787909874 seconds for training.


In [24]:
predictions = [[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
               for (_, row) in test.iterrows()]


predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,1,1545,0.017816
1,1,527,0.948629
2,1,595,0.964207
3,1,2687,0.527584
4,1,745,0.263206


In [25]:
with Timer() as test_time:

    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user in train.userID.unique():
        user = [user] * len(item) 
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time.interval))

Took 53.13307320899912 seconds for prediction.


In [26]:
eval_map = map(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.026746
NDCG:	0.158219
Precision@K:	0.146623
Recall@K:	0.059185


In [27]:
k = TOP_K

ndcgs = []
hit_ratio = []

for b in data.test_loader():
    user_input, item_input, labels = b
    output = model.predict(user_input, item_input, is_list=True)

    output = np.squeeze(output)
    rank = sum(output >= output[0])
    if rank <= k:
        ndcgs.append(1 / np.log(rank + 1))
        hit_ratio.append(1)
    else:
        ndcgs.append(0)
        hit_ratio.append(0)

eval_ndcg = np.mean(ndcgs)
eval_hr = np.mean(hit_ratio)

print("HR:\t%f" % eval_hr)
print("NDCG:\t%f" % eval_ndcg)

HR:	0.560265
NDCG:	0.444592


In [28]:
model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="GMF",
    n_factors=8,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)



In [29]:
with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

model.save(dir_name=".pretrain/GMF")

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [18.55s]: train_loss = 0.268890 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [19.75s]: train_loss = 0.265000 


Took 380.5392462500022 seconds for training.


In [30]:
model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="MLP",
    n_factors=8,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

In [31]:
with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

model.save(dir_name=".pretrain/MLP")

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [21.28s]: train_loss = 0.292714 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [21.18s]: train_loss = 0.285702 


Took 415.57524566701613 seconds for training.


In [32]:
model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=8,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

model.load(gmf_dir=".pretrain/GMF", mlp_dir=".pretrain/MLP", alpha=0.5)

In [33]:
with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [27.99s]: train_loss = 0.249214 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [27.95s]: train_loss = 0.245537 


Took 504.2612461660174 seconds for training.


In [34]:
with Timer() as test_time:

    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user in train.userID.unique():
        user = [user] * len(item) 
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time.interval))

Took 53.47270949999802 seconds for prediction.


In [35]:
eval_map2 = map(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_ndcg2 = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_precision2 = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall2 = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map2,
      "NDCG:\t%f" % eval_ndcg2,
      "Precision@K:\t%f" % eval_precision2,
      "Recall@K:\t%f" % eval_recall2, sep='\n')

MAP:	0.026104
NDCG:	0.156184
Precision@K:	0.145596
Recall@K:	0.059080


In [36]:


model_path = "/Users/chi.nguyenth/Documents/DoAn_63133022_NguyenThiHaChi/model/ncf_model"
model.save(model_path)  # Sẽ tạo thư mục chứa toàn bộ mô hình


In [37]:
# Nối dữ liệu gợi ý với bảng chứa tên phim
all_predictions = all_predictions.merge(movies_df, on="itemID", how="left")

# Sắp xếp theo điểm dự đoán cao nhất
all_predictions = all_predictions.sort_values(by=["userID", "prediction"], ascending=[True, False])

# Xem dữ liệu sau khi nối
print(all_predictions.head())


      userID  itemID  timestamp  prediction                             title  \
2584       1    2858        NaN    0.982442            American Beauty (1999)   
509        1     527        NaN    0.974312           Schindler's List (1993)   
306        1     318        NaN    0.972040  Shawshank Redemption, The (1994)   
2145       1    2396        NaN    0.971348        Shakespeare in Love (1998)   
33         1      34        NaN    0.963188                       Babe (1995)   

                       genres  
2584             Comedy|Drama  
509                 Drama|War  
306                     Drama  
2145           Comedy|Romance  
33    Children's|Comedy|Drama  


In [38]:
# Nhóm theo userID và hiển thị top phim gợi ý
top_n = 10  # Số lượng phim muốn hiển thị

for user, group in all_predictions.groupby("userID"):
    print(f"\n🎬 Recommended movies for User {user}:")
    for i, row in group.head(top_n).iterrows():
        print(f"   ⭐ {row['title']} (Predicted rating: {row['prediction']:.2f})")



🎬 Recommended movies for User 1:
   ⭐ American Beauty (1999) (Predicted rating: 0.98)
   ⭐ Schindler's List (1993) (Predicted rating: 0.97)
   ⭐ Shawshank Redemption, The (1994) (Predicted rating: 0.97)
   ⭐ Shakespeare in Love (1998) (Predicted rating: 0.97)
   ⭐ Babe (1995) (Predicted rating: 0.96)
   ⭐ Star Wars: Episode V - The Empire Strikes Back (1980) (Predicted rating: 0.96)
   ⭐ Lady and the Tramp (1955) (Predicted rating: 0.96)
   ⭐ Toy Story (1995) (Predicted rating: 0.95)
   ⭐ Jungle Book, The (1967) (Predicted rating: 0.95)
   ⭐ Beauty and the Beast (1991) (Predicted rating: 0.95)

🎬 Recommended movies for User 2:
   ⭐ Schindler's List (1993) (Predicted rating: 0.99)
   ⭐ Fargo (1996) (Predicted rating: 0.98)
   ⭐ Godfather, The (1972) (Predicted rating: 0.98)
   ⭐ Godfather: Part II, The (1974) (Predicted rating: 0.96)
   ⭐ Casablanca (1942) (Predicted rating: 0.96)
   ⭐ Star Wars: Episode IV - A New Hope (1977) (Predicted rating: 0.95)
   ⭐ Pulp Fiction (1994) (Predicte