## 这是基于`BPR:Bayesian Personalized Ranking`实现的`Matrix Factorization`的协同过滤算法来进行的书籍评分的预测
### BPR是一种 `pair-wise`的方法，在个性化排序的任务上表现很好，所以我们使用`BPR`来与其他方法进行对比


In [53]:
#导入所需库
import torch 
from torch import nn
import pandas as pd
import numpy as np
import random
import torch.functional as F
#sklearn库
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [54]:
#读取book_score_csv
file_path = ('data/book_score.csv')
read_data = pd.read_csv(file_path)
print(read_data)

           User     Book  Rate                       Time         Tag
0       1398478  1467022     0  2011-03-29T12:48:35+08:00         NaN
1       1398478  1777823     0  2011-02-02T21:58:55+08:00         NaN
2       1398478  1902628     0  2011-01-31T15:57:58+08:00         NaN
3       1398478  1878708     0  2011-01-26T11:27:59+08:00         NaN
4       1398478  4238362     0  2011-01-21T13:04:15+08:00         NaN
...         ...      ...   ...                        ...         ...
637249  4507957  1125186     4  2009-07-04T08:02:13+08:00  张爱玲,半生缘,爱情
637250  4507957  1002299     5  2009-07-04T08:01:28+08:00  金庸,武侠,笑傲江湖
637251  4507957  1001136     4  2009-07-04T07:55:17+08:00     彼得・潘,童话
637252  4507957  1021615     5  2009-07-04T07:53:54+08:00   小王子,童话,经典
637253  4507957  1962929     5  2009-06-29T22:13:37+08:00          爱情

[637254 rows x 5 columns]


In [55]:

#用户和书籍的个数，构建 P Q 矩阵需要
users_nums = len(read_data['User'].unique())
books_nums = len(read_data['Book'].unique())

In [56]:
# 获取每个用户未评过分的书籍列表，用于充当负例
all_books = read_data['Book'].unique()
user_books = read_data.groupby('User')['Book'].unique()

user_unrated_books = {}
for user, rated_books in user_books.items():
    # 找到用户没有评价过的书籍
    unrated_books = list(set(all_books) - set(rated_books))
    user_unrated_books[user] = unrated_books

In [None]:
#自己的书籍评分数据集类，返回用户 id的index ，评过分和未评过分的书籍 id的index , 以及对应的评分
class BookScoreDataSet(torch.utils.data.Dataset):
    def __init__(self, read_data, user_unrated_books, all_dataset = None):
        self.read_data = read_data
        if all_dataset == None:
            #依据字典索引得到 user 和 book 的 id 列表
            self.users_unique_id_list = read_data['User'].unique()
            self.books_unique_id_list = read_data['Book'].unique()
            self.users_unique_id_list = sorted(set(self.users_unique_id_list))
            self.books_unique_id_list = sorted(set(self.books_unique_id_list))
            #依据id列表，创建 user 和 book 的分别从 id 到 index 的转换，便于实现利用索引访问对应的矩阵 factor
            self.user_id_to_index = {id : index for index, id in enumerate(self.users_unique_id_list)}
            self.book_id_to_index = {id : index for index, id in enumerate(self.books_unique_id_list)}
            self.user_unrated_books = user_unrated_books
        else:
            self.user_id_to_index = all_dataset.user_id_to_index
            self.book_id_to_index = all_dataset.book_id_to_index
            self.user_unrated_books = all_dataset.user_unrated_books
            
    def __getitem__(self, index):
        # 返回（user, rated_book, unrated_book）三元组，附带真实评分
        # 得到数据文件中index对应的一行
        one_row = self.read_data.iloc[index]
        user_index = self.user_id_to_index[one_row['User']]
        rated_book_index = self.book_id_to_index[one_row['Book']]
        u_b_rating = one_row['Rate'].astype('float32')
        unrated_book_index = self.book_id_to_index[random.choice(self.user_unrated_books[one_row['User']])]
        #返回 index
        return user_index, rated_book_index, unrated_book_index, u_b_rating
    
    def __len__(self):
        return len(self.read_data)

In [None]:

#划分数据集，这里我们使用助教样例代码中的train_test_split
train_data, test_data = train_test_split(read_data, test_size=0.5, random_state=42)

#创建数据集
all_dataset = BookScoreDataSet(read_data, user_unrated_books)
train_dataset = BookScoreDataSet(train_data, user_unrated_books, all_dataset)
test_dataset = BookScoreDataSet(test_data, user_unrated_books, all_dataset)

#创建训练和测试数据迭代器
batch_size = 8
train_iter = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
test_iter = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

In [None]:
# 基于 BPR 的 MF 模型
class BPRMFModel(nn.Module):
    def __init__(self, factor_dim, users_nums, books_nums):
        super(BPRMFModel, self).__init__()
        self.user_matrix = nn.Embedding(users_nums, factor_dim)
        self.book_matrix = nn.Embedding(books_nums, factor_dim)

    # 前向传播函数，得到预测评分值   
    def forward(self, X):
        user_index, rated_book_index, unrated_book_index = X
        user_vector = self.user_matrix(user_index)
        rated_book_vector = self.book_matrix(rated_book_index)
        unrated_book_vector = self.book_matrix(unrated_book_index)
        # 经过Embedding层出来的vector形状应该是（batch_size x factor_dim）
        # 做点积，得到（batch_size X 1）的预测评分
        return (user_vector * rated_book_vector).sum(dim = 1), (user_vector * unrated_book_vector).sum(dim = 1)


### 下面是主函数定义

In [None]:
def main(net, train_iter, test_iter, batch_size, device, factor_dim, num_epochs = 40, lr = 0.04):
    # 正则化超参数
    lambda_1 = 0.001
    # 迭代训练，每一个 epoch 打印 ndcg_score
    for epoch in range(num_epochs):
        # 训练
        net.train()
        for i, X in enumerate(train_iter):
            user_index, rated_book_index, unrated_book_index, u_b_rating = X 
            X = (user_index.to(device),rated_book_index.to(device),unrated_book_index.to(device))

            # 模型预测
            hat_rate_rated, hat_rate_unrated= net(X)
            # 目标函数为 BPR_opt
            with torch.no_grad():
                # 使用SGD手动更新参数
                # print(torch.sigmoid(-(hat_rate_rated - hat_rate_unrated)).shape)
                net.user_matrix.weight[user_index] += lr * (torch.sigmoid(-(hat_rate_rated - hat_rate_unrated)).unsqueeze(1).repeat(1, factor_dim) * 
                (net.book_matrix.weight[rated_book_index] - net.book_matrix.weight[unrated_book_index]) +
                lambda_1 * net.user_matrix.weight[user_index])

                net.book_matrix.weight[rated_book_index] += lr * (torch.sigmoid(-(hat_rate_rated - hat_rate_unrated)).unsqueeze(1).repeat(1, factor_dim) * 
                (net.user_matrix.weight[user_index]) +
                lambda_1 * net.book_matrix.weight[rated_book_index])

                net.book_matrix.weight[unrated_book_index] += lr * (torch.sigmoid(-(hat_rate_rated - hat_rate_unrated)).unsqueeze(1).repeat(1, factor_dim) * 
                (-net.user_matrix.weight[user_index]) +
                lambda_1 * net.book_matrix.weight[unrated_book_index])
        # 测试评估
        net.eval()
        results = []
        with torch.no_grad():
            for i, X in enumerate(test_iter):
                user_index, rated_book_index, unrated_book_index, u_b_rating = X 
                X = (user_index.to(device),rated_book_index.to(device),unrated_book_index.to(device))

                #得到预测得分
                predict_rate_rated, predict_rate_unrated = net(X)

                #下面计算测试集的ndcg_socre，来评估预测的排序的效果
                res = torch.cat([user_index.unsqueeze(1), predict_rate_rated.cpu().unsqueeze(1), u_b_rating.unsqueeze(1)], dim = 1)
                results.append(res)

            # results变为一个(num_test_instance, 3)的张量
            results = torch.stack(results, dim = 0)
            # 对不同用户分组，分别计算ndcg
            # 将 user_index, predict_rate 和 u_b_rating 拆分开
            user_indexs = results[:, 0].long()   # 用户 index
            pred_ratings = results[:, 1]         # 预测评分
            true_ratings = results[:, 2]         # 真实评分

            # 将用户 ID 转换为 numpy 数组，方便后续操作
            user_indexs_np = user_indexs.cpu().numpy()
            pred_ratings_np = pred_ratings.cpu().numpy()
            true_ratings_np = true_ratings.cpu().numpy()

            # 为每个用户计算 NDCG 分数
            ndcg_scores = []

            # 获取每个用户的唯一 index
            unique_users = np.unique(user_indexs_np)

            for user in unique_users:
                # 获取当前用户的所有评分数据
                user_true_ratings = true_ratings_np[user_indexs_np == user]
                user_pred_ratings = pred_ratings_np[user_indexs_np == user]
                user_true_ratings_abs = np.abs(user_true_ratings)
                
                # 计算该用户的 NDCG 分数
                if len(user_true_ratings_abs) > 1:
                    # ndcg@k，k取样例代码中的50
                    ndcg = ndcg_score([np.nan_to_num(user_true_ratings_abs)], [np.nan_to_num(user_pred_ratings)], k=50)
                    ndcg_scores.append(torch.from_numpy(np.array(ndcg)))

            # 将所有用户的 NDCG 分数存储在一个张量中
            ndcg_scores = torch.stack(ndcg_scores)

            # 计算平均 ndcg
            ndcg_score_ = ndcg_scores.mean()
            
        # print protocols
        print(f'At epoch {epoch}, ndcg_score {ndcg_score_}')        

In [61]:
device = device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
factor_dim = 20
net = BPRMFModel(factor_dim, users_nums, books_nums).to(device)
main(net, train_iter, test_iter, batch_size, device, factor_dim=factor_dim)

At epoch 0, ndcg_score 0.8350453978872957
At epoch 1, ndcg_score 0.8351739430054276


KeyboardInterrupt: 