### Import Dependences

In [231]:
import pandas as pd
import io
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib
import matplotlib.pyplot as plt 
import numpy as np


##### Import Dataset

In [None]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']

data_path_train = '../data/ml-100k/ua.base'  # Đảm bảo thư mục này chứa dữ liệu u.data
train_data = pd.read_csv(data_path_train, sep='\t', names=columns, encoding='latin-1')

data_path_test = '../data/ml-100k/ua.test'  # Đảm bảo thư mục này chứa dữ liệu u.data
test_data = pd.read_csv(data_path_test, sep='\t', names=columns, encoding='latin-1')

movies_set_raw = pd.read_csv("../data/ml-100k/u.item", encoding="latin-1", sep="|", names=["movie_id", "movie_name", "col3", "col4", "col5", "col6", "col7", "col8", "col9", "col10", "col11", "col12", "col13", "col14", "col15", "col16", "col17", "col18", "col19", "col20", "col21", "col22", "col23", "col24"])
movies_set = movies_set_raw.iloc[:,:2]

# Tạo từ điển ánh xạ movie_id -> movie_name
movie_dict = pd.Series(movies_set['movie_name'].values, index=movies_set['movie_id']).to_dict()
title_to_id = pd.Series(movies_set['movie_id'].values, index=movies_set['movie_name']).to_dict()
index_movieId_map = { index:k for index,k in enumerate(sorted(movie_dict.keys())) }
movieId_index_map = { index_movieId_map[k]:k for k in index_movieId_map.keys() }

Analyze Dataset

In [233]:
len(train_data), len(test_data)

(90570, 9430)

In [234]:
train_data.head(5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [235]:
train_data.nunique()

user_id        943
item_id       1680
rating           5
timestamp    46638
dtype: int64

In [236]:
# convert into train_matrix, test_matrix
train_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
test_matrix = test_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

In [237]:
train_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [238]:
test_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1591,1592,1600,1612,1617,1646,1653,1656,1662,1664
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [239]:
# Scale test_matrix to have the same dimension as train_matrix
test_matrix = test_matrix.reindex(columns=train_matrix.columns, fill_value=0)

In [240]:
test_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
941,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [241]:
# Convert to ndarray to train
train_arr = train_matrix.to_numpy()
test_arr = test_matrix.to_numpy()

## Now let's train using SVD
1. Import dependences

In [242]:
from scipy.sparse.linalg import svds

2. Define functions

In [243]:
#apply svd for different latent dims
def apply_svd_with_dim(latent_dim):
  user_f, features, f_movie = svds(train_arr, k = latent_dim)
  feature_diag_matrix = np.diag(features) if features.ndim == 1 else np.diag(features[0])
  return user_f, feature_diag_matrix, f_movie, features

#for the generated matrixes generate predictions
def generate_predictions(user_f, feature_diag_matrix, f_movie):
  predicted_ratings = np.dot(np.dot(user_f, feature_diag_matrix), f_movie)    
  preds_df = pd.DataFrame(predicted_ratings, columns=train_matrix.columns, index=train_matrix.index)
  return preds_df

#utility method to apply svd and generate a dataframe with it
def get_svd(dim_latent):
  user_feature, feature_diag_matrix, movie_feature, features = apply_svd_with_dim(dim_latent)
  df_predictions = generate_predictions(user_feature, feature_diag_matrix, movie_feature)
  df_movie_feature = pd.DataFrame()
  df_movie_feature["movie_feature"] = movie_feature.T.tolist()
  return user_feature, feature_diag_matrix, df_movie_feature, df_predictions

def get_svd_test(dim_latent):
  user_feature, feature_diag_matrix, movie_feature, features = apply_svd_with_dim_test(dim_latent)
  df_predictions = generate_predictions(user_feature, feature_diag_matrix, movie_feature)
  df_movie_feature = pd.DataFrame()
  df_movie_feature["movie_feature"] = movie_feature.T.tolist()
  return user_feature, feature_diag_matrix, df_movie_feature, df_predictions

def apply_svd_with_dim_test(latent_dim):
  user_f, features, f_movie = svds(test_arr, k = latent_dim)
  feature_diag_matrix = np.diag(features) if features.ndim == 1 else np.diag(features[0])
  return user_f, feature_diag_matrix, f_movie, features


3. Fit

In [244]:
user_feature, feature_diag_matrix, movie_feature, df_predictions = get_svd(15)

In [245]:
# Prediction
df_predictions

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.391476,1.823049,1.303307,3.513321,0.419413,0.481078,4.936361,2.557280,3.227556,1.703640,...,-0.021500,0.002114,-0.002947,-0.001965,0.012314,-0.004967,-0.014900,-0.009934,0.024885,0.063479
2,1.971771,-0.042178,0.055236,0.440683,-0.113997,0.269776,1.262312,0.710196,2.235888,0.618198,...,-0.009854,-0.031754,-0.000136,-0.000091,0.002836,0.007232,0.021696,0.014464,0.004694,-0.031543
3,-0.110717,-0.035977,0.114590,-0.077224,-0.013527,0.033224,0.078510,0.113632,-0.083584,0.026078,...,0.004489,-0.003708,0.016113,0.010742,-0.003000,0.011197,0.033592,0.022395,-0.004395,-0.008616
4,0.051727,-0.065854,-0.012787,-0.072675,0.011861,-0.047576,0.289204,-0.212252,-0.084951,0.039614,...,0.000804,-0.004854,0.001221,0.000814,-0.001124,0.006046,0.018137,0.012091,0.002613,-0.001452
5,2.948045,1.024330,0.342823,1.609379,0.289591,-0.069278,2.349748,1.245255,-0.356102,0.439410,...,-0.016359,0.006607,-0.027369,-0.018246,-0.017258,0.000110,0.000329,0.000220,-0.009767,-0.012757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,2.100430,0.107343,0.227802,0.177400,0.282919,0.079525,1.366954,0.398160,1.839877,0.219944,...,-0.000545,0.003540,0.010438,0.006958,0.014032,-0.005066,-0.015197,-0.010131,0.000960,-0.004969
940,1.909966,0.255450,-0.149992,1.276699,0.330616,0.111717,1.763220,1.320167,0.798162,0.210924,...,0.004956,0.018433,0.004930,0.003286,0.008707,0.009398,0.028195,0.018797,-0.002743,-0.010351
941,0.924743,0.055198,0.145480,0.039599,-0.088868,-0.045113,0.674411,0.286289,0.218750,0.114507,...,-0.006826,-0.001687,0.007722,0.005148,0.006487,0.000893,0.002679,0.001786,0.006091,-0.004413
942,1.177026,0.249120,-0.374430,0.040242,-0.288469,-0.042861,-0.289987,1.421431,-0.111289,0.014550,...,0.015253,0.027409,0.024631,0.016421,0.001990,0.010312,0.030937,0.020625,0.007061,-0.023212


In [246]:
# Original Data
train_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [247]:
# unrated_items

In [248]:
# Recommend for a specific user
utility_matrix = train_matrix

prediction_matrix = df_predictions.to_numpy()

# 1. Get predicted ratings for user 1 (first row of prediction_matrix)
user_id = 0 
predicted_ratings_user1 = prediction_matrix[user_id]

# 2. Get items already rated by user 1 (first row of utility_matrix)
rated_items_user1 = utility_matrix.iloc[user_id] > 0

# 3. Filter out the items that user 1 has already rated
unrated_items = np.where(rated_items_user1 == 0)[0]

# 4. Get predicted ratings for unrated items
predicted_unrated = predicted_ratings_user1[unrated_items]

# 5. Sort the predicted ratings of unrated items in descending order
sorted_unrated_items = unrated_items[np.argsort(predicted_unrated)[::-1]]

# 6. Map the sorted unrated items to their predicted ratings
top_recommended_items = sorted_unrated_items
top_recommended_ratings = predicted_unrated[np.argsort(predicted_unrated)[::-1]]

# Output the recommended items and their predicted ratings for user 1
print("Top recommended items for user 1:")
for item, rating in zip(top_recommended_items, top_recommended_ratings):
    print(f"Item {item + 1} with predicted rating: {rating:.2f}")


Top recommended items for user 1:
Item 475 with predicted rating: 3.78
Item 408 with predicted rating: 3.42
Item 318 with predicted rating: 3.39
Item 433 with predicted rating: 3.34
Item 202 with predicted rating: 3.34
Item 403 with predicted rating: 3.24
Item 655 with predicted rating: 3.19
Item 474 with predicted rating: 3.15
Item 423 with predicted rating: 3.05
Item 357 with predicted rating: 2.92
Item 286 with predicted rating: 2.86
Item 276 with predicted rating: 2.85
Item 275 with predicted rating: 2.74
Item 431 with predicted rating: 2.68
Item 117 with predicted rating: 2.65
Item 265 with predicted rating: 2.64
Item 455 with predicted rating: 2.63
Item 515 with predicted rating: 2.59
Item 405 with predicted rating: 2.57
Item 483 with predicted rating: 2.49
Item 568 with predicted rating: 2.45
Item 285 with predicted rating: 2.45
Item 582 with predicted rating: 2.45
Item 385 with predicted rating: 2.44
Item 273 with predicted rating: 2.41
Item 435 with predicted rating: 2.35
Item

# Evaluating

In [249]:
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, recall_score


### 1. For Train Set

In [250]:
# 6. Tính toán RMSE, Accuracy, F1, Recall
# Ta sẽ tính RMSE giữa ma trận gốc và ma trận dự đoán của test set
y_true = train_matrix.values.flatten()
# y_pred = np.dot(test_matrix_pred, svd.components_).flatten()
y_pred = df_predictions.values.flatten()

# Loại bỏ các giá trị 0 (chưa đánh giá) trong cả y_true và y_pred
mask = y_true != 0
y_true = y_true[mask]
y_pred = y_pred[mask]

# Tính RMSE
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

# Tính MAE
mae = np.mean(np.abs(y_true - y_pred))
# Evaluate Precision, recall
def precision_recall_at_k(test_matrix, predicted_matrix, k=10):
    # Precision@K, Recall@K
    precision_at_k = 0
    recall_at_k = 0
    num_users = test_matrix.shape[0]
    
    for user_id in range(num_users):
        # Lấy các chỉ số của người dùng trong test_matrix và predicted_matrix
        actual_ratings = test_matrix.iloc[user_id, :].values
        predicted_ratings = predicted_matrix[user_id, :]
        
        # Lấy top-K sản phẩm gợi ý dựa trên predicted_ratings
        recommended_items = np.argsort(predicted_ratings)[::-1][:k]
        
        # Tính Precision@K và Recall@K
        relevant_items = actual_ratings[recommended_items] > 0  # Các sản phẩm đã được đánh giá
        precision_at_k += np.sum(relevant_items) / k
        recall_at_k += np.sum(relevant_items) / np.sum(actual_ratings > 0)  # Các sản phẩm đã thực sự đánh giá
    
    precision_at_k /= num_users
    recall_at_k /= num_users
    
    return precision_at_k, recall_at_k

# Tính Precision@K và Recall@K cho K = 10
precision, recall = precision_recall_at_k(train_matrix, df_predictions.values, k=10)

# Evaluate F1
# Tính F1-Score
f1_score = 2 * (precision * recall) / (precision + recall)

# Evaluate hit rate
def hit_rate_at_k(test_matrix, predicted_matrix, k=10):
    hit_rate = 0
    num_users = test_matrix.shape[0]
    
    for user_id in range(num_users):
        actual_ratings = test_matrix.iloc[user_id, :].values
        predicted_ratings = predicted_matrix[user_id, :]
        
        # Lấy top-K sản phẩm gợi ý
        recommended_items = np.argsort(predicted_ratings)[::-1][:k]
        
        # Kiểm tra nếu có ít nhất một sản phẩm đúng trong top-K
        if np.sum(actual_ratings[recommended_items] > 0) > 0:
            hit_rate += 1
    
    hit_rate /= num_users
    return hit_rate

# Tính Hit Rate cho K = 10
hit_rate = hit_rate_at_k(train_matrix, df_predictions.values, k=10)

# Đánh giá mô hình (các metric)


print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'Precision@10: {precision}')
print(f'Recall@10: {recall}')
print(f'F1 Score: {f1_score}')
print(f'Hit Rate@10: {hit_rate}')

RMSE: 2.2446275554008377
MAE: 1.9304135308061756
Precision@10: 0.7098621420996801
Recall@10: 0.15020176687591297
F1 Score: 0.24794098873114814
Hit Rate@10: 0.9989395546129375


### 2. For Test Set

In [251]:
_, _, _, pred_test= get_svd_test(700)

In [252]:
# 6. Tính toán RMSE, Accuracy, F1, Recall
# Ta sẽ tính RMSE giữa ma trận gốc và ma trận dự đoán của test set
y_true = test_matrix.values.flatten()
# y_pred = np.dot(test_matrix_pred, svd.components_).flatten()
y_pred = df_predictions.values.flatten()

# Loại bỏ các giá trị 0 (chưa đánh giá) trong cả y_true và y_pred
mask = y_true != 0
y_true = y_true[mask]
y_pred = y_pred[mask]

# Tính RMSE
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
# Tính MAE
mae = np.mean(np.abs(y_true - y_pred))
# Evaluate Precision, recall
def precision_recall_at_k(test_matrix, predicted_matrix, k=10):
    # Precision@K, Recall@K
    precision_at_k = 0
    recall_at_k = 0
    num_users = test_matrix.shape[0]
    
    for user_id in range(num_users):
        # Lấy các chỉ số của người dùng trong test_matrix và predicted_matrix
        actual_ratings = test_matrix.iloc[user_id, :].values
        predicted_ratings = predicted_matrix[user_id, :]
        
        # Lấy top-K sản phẩm gợi ý dựa trên predicted_ratings
        recommended_items = np.argsort(predicted_ratings)[::-1][:k]
        
        # Tính Precision@K và Recall@K
        relevant_items = actual_ratings[recommended_items] > 0  # Các sản phẩm đã được đánh giá
        precision_at_k += np.sum(relevant_items) / k
        recall_at_k += np.sum(relevant_items) / np.sum(actual_ratings > 0)  # Các sản phẩm đã thực sự đánh giá
    
    precision_at_k /= num_users
    recall_at_k /= num_users
    
    return precision_at_k, recall_at_k

# Tính Precision@K và Recall@K cho K = 10
precision, recall = precision_recall_at_k(test_matrix, df_predictions.values, k=10)

# Evaluate F1
# Tính F1-Score
f1_score = 2 * (precision * recall) / (precision + recall)

# Evaluate hit rate
def hit_rate_at_k(test_matrix, predicted_matrix, k=10):
    hit_rate = 0
    num_users = test_matrix.shape[0]
    
    for user_id in range(num_users):
        actual_ratings = test_matrix.iloc[user_id, :].values
        predicted_ratings = predicted_matrix[user_id, :]
        
        # Lấy top-K sản phẩm gợi ý
        recommended_items = np.argsort(predicted_ratings)[::-1][:k]
        
        # Kiểm tra nếu có ít nhất một sản phẩm đúng trong top-K
        if np.sum(actual_ratings[recommended_items] > 0) > 0:
            hit_rate += 1
    
    hit_rate /= num_users
    return hit_rate

# Tính Hit Rate cho K = 10
hit_rate = hit_rate_at_k(test_matrix, df_predictions.values, k=10)

# Đánh giá mô hình (các metric)


print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'Precision@10: {precision}')
print(f'Recall@10: {recall}')
print(f'F1 Score: {f1_score}')
print(f'Hit Rate@10: {hit_rate}')

RMSE: 2.8261783357298937
MAE: 2.5495542624004046
Precision@10: 0.1010604453870626
Recall@10: 0.1010604453870626
F1 Score: 0.1010604453870626
Hit Rate@10: 0.559915164369035


##### Recommend Films by title using cosine_similarity

In [253]:
from sklearn.metrics.pairwise import cosine_similarity


def get_similarity_for_movie(features_df,title,top_n=10,dump_results=False):
  
  features_numpy = np.array( [ np.array(x) for x in  features_df[ "movie_feature"].to_numpy() ])
  movieId = title_to_id[title]
  all_cosines = cosine_similarity(features_numpy[movieId - 1].reshape(1,-1),features_numpy)
  all_cosines = all_cosines.flatten()
  top_indexes = all_cosines.argsort()[-1 * top_n:][::-1]
  #print(top_indexes)
  similar_results = []
  for top_index in top_indexes:
    #top_index+1 because movie ids begin from 0
    if dump_results: 
      print( f"{index_movieId_map[top_index]} ){all_cosines[top_index]} -> {movie_dict[index_movieId_map[top_index]]}")
    movie_name = movie_dict[index_movieId_map[top_index]]
    similar_results.append( [index_movieId_map[top_index],all_cosines[top_index],movie_name]  )
  return similar_results    

In [254]:
sim = get_similarity_for_movie(movie_feature,"Toy Story (1995)")
sim

[[1, 0.9999999999999999, 'Toy Story (1995)'],
 [993, 0.6756344121990079, 'Hercules (1997)'],
 [596, 0.6634940885568659, 'Hunchback of Notre Dame, The (1996)'],
 [151, 0.6267800010935755, 'Willy Wonka and the Chocolate Factory (1971)'],
 [473, 0.6234669159941061, 'James and the Giant Peach (1996)'],
 [95, 0.6230755300640446, 'Aladdin (1992)'],
 [257, 0.6075565945298429, 'Men in Black (1997)'],
 [408, 0.6061298716916153, 'Close Shave, A (1995)'],
 [1060, 0.5915031563249317, 'Adventures of Pinocchio, The (1996)'],
 [588, 0.581424460736398, 'Beauty and the Beast (1991)']]

In [255]:
df_predictions

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.391476,1.823049,1.303307,3.513321,0.419413,0.481078,4.936361,2.557280,3.227556,1.703640,...,-0.021500,0.002114,-0.002947,-0.001965,0.012314,-0.004967,-0.014900,-0.009934,0.024885,0.063479
2,1.971771,-0.042178,0.055236,0.440683,-0.113997,0.269776,1.262312,0.710196,2.235888,0.618198,...,-0.009854,-0.031754,-0.000136,-0.000091,0.002836,0.007232,0.021696,0.014464,0.004694,-0.031543
3,-0.110717,-0.035977,0.114590,-0.077224,-0.013527,0.033224,0.078510,0.113632,-0.083584,0.026078,...,0.004489,-0.003708,0.016113,0.010742,-0.003000,0.011197,0.033592,0.022395,-0.004395,-0.008616
4,0.051727,-0.065854,-0.012787,-0.072675,0.011861,-0.047576,0.289204,-0.212252,-0.084951,0.039614,...,0.000804,-0.004854,0.001221,0.000814,-0.001124,0.006046,0.018137,0.012091,0.002613,-0.001452
5,2.948045,1.024330,0.342823,1.609379,0.289591,-0.069278,2.349748,1.245255,-0.356102,0.439410,...,-0.016359,0.006607,-0.027369,-0.018246,-0.017258,0.000110,0.000329,0.000220,-0.009767,-0.012757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,2.100430,0.107343,0.227802,0.177400,0.282919,0.079525,1.366954,0.398160,1.839877,0.219944,...,-0.000545,0.003540,0.010438,0.006958,0.014032,-0.005066,-0.015197,-0.010131,0.000960,-0.004969
940,1.909966,0.255450,-0.149992,1.276699,0.330616,0.111717,1.763220,1.320167,0.798162,0.210924,...,0.004956,0.018433,0.004930,0.003286,0.008707,0.009398,0.028195,0.018797,-0.002743,-0.010351
941,0.924743,0.055198,0.145480,0.039599,-0.088868,-0.045113,0.674411,0.286289,0.218750,0.114507,...,-0.006826,-0.001687,0.007722,0.005148,0.006487,0.000893,0.002679,0.001786,0.006091,-0.004413
942,1.177026,0.249120,-0.374430,0.040242,-0.288469,-0.042861,-0.289987,1.421431,-0.111289,0.014550,...,0.015253,0.027409,0.024631,0.016421,0.001990,0.010312,0.030937,0.020625,0.007061,-0.023212


In [256]:
test_matrix_for_user1 = test_matrix.iloc[0]
test_matrix_for_user1 = test_matrix_for_user1[test_matrix_for_user1 != 0]
test_matrix_for_user1

item_id
20     4.0
33     4.0
61     4.0
117    3.0
155    2.0
160    4.0
171    5.0
189    3.0
202    5.0
265    4.0
Name: 1, dtype: float64

In [257]:
pred_1 = df_predictions.iloc[0]

matched_values = pred_1.loc[test_matrix_for_user1.index]
selected_pred = pred_1.loc[test_matrix_for_user1.index]
result = selected_pred[selected_pred.index.isin(test_matrix_for_user1.index)]

result

item_id
20     1.537723
33     1.410779
61     1.380730
117    2.648297
155    0.794935
160    1.163817
171    1.710727
189    2.049628
202    3.336566
265    2.640828
Name: 1, dtype: float64