## 我们实现了两种基于`Matrix Factorization`的协同过滤算法来进行书籍评分的预测，并会在实验报告中给出两种推荐算法的结果比较分析
1. 加入正则项的普通 \( MSE \) 矩阵分解算法
2. 基于`BPR: Bayesian Personalized Ranking`实现的矩阵分解算法


In [132]:
#导入所需库
import torch 
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
#sklearn库
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [133]:
# Load the book rating table from the CSV file and print a preview of it.
file_path = 'data/book_score.csv'
read_data = pd.read_csv(file_path)
print(read_data)

           User     Book  Rate                       Time         Tag
0       1398478  1467022     0  2011-03-29T12:48:35+08:00         NaN
1       1398478  1777823     0  2011-02-02T21:58:55+08:00         NaN
2       1398478  1902628     0  2011-01-31T15:57:58+08:00         NaN
3       1398478  1878708     0  2011-01-26T11:27:59+08:00         NaN
4       1398478  4238362     0  2011-01-21T13:04:15+08:00         NaN
...         ...      ...   ...                        ...         ...
637249  4507957  1125186     4  2009-07-04T08:02:13+08:00  张爱玲,半生缘,爱情
637250  4507957  1002299     5  2009-07-04T08:01:28+08:00  金庸,武侠,笑傲江湖
637251  4507957  1001136     4  2009-07-04T07:55:17+08:00     彼得・潘,童话
637252  4507957  1021615     5  2009-07-04T07:53:54+08:00   小王子,童话,经典
637253  4507957  1962929     5  2009-06-29T22:13:37+08:00          爱情

[637254 rows x 5 columns]


In [134]:

# Number of distinct users and books; these are passed to the model below
# as the sizes of its two embedding tables.
users_nums = len(pd.unique(read_data['User']))
books_nums = len(pd.unique(read_data['Book']))

In [135]:
# Custom book-rating dataset: each item is (user index, book index, rating).
class BookScoreDataSet(torch.utils.data.Dataset):
    """Map-style dataset over a rating table with 'User', 'Book' and 'Rate' columns.

    Raw user and book ids are translated into contiguous 0-based indices so
    they can be used directly to index embedding / factor matrices.
    """

    def __init__(self, read_data, all_dataset=None):
        """Store the table and build (or reuse) the id -> index mappings.

        Args:
            read_data: DataFrame holding the rows served by this dataset.
            all_dataset: Optional dataset whose ``user_id_to_index`` and
                ``book_id_to_index`` mappings are reused, so that subsets
                (e.g. train/test splits) share one index space. When omitted,
                the mappings are built from ``read_data`` itself.
        """
        self.read_data = read_data
        # Identity check: `== None` would be routed through any custom __eq__.
        if all_dataset is None:
            # unique() already removes duplicates; sorting makes the index
            # assignment deterministic regardless of row order.
            self.users_unique_id_list = sorted(read_data['User'].unique())
            self.books_unique_id_list = sorted(read_data['Book'].unique())
            # Map every raw id to its position in the sorted id list.
            self.user_id_to_index = {
                user_id: position
                for position, user_id in enumerate(self.users_unique_id_list)
            }
            self.book_id_to_index = {
                book_id: position
                for position, book_id in enumerate(self.books_unique_id_list)
            }
        else:
            self.user_id_to_index = all_dataset.user_id_to_index
            self.book_id_to_index = all_dataset.book_id_to_index

    def __getitem__(self, index):
        """Return ``(user_index, book_index, rating)`` for the row at position ``index``.

        Raises:
            KeyError: If the row's user or book id is missing from the mappings.
        """
        # Positional lookup of one row of the table.
        one_row = self.read_data.iloc[index]
        user_index = self.user_id_to_index[one_row['User']]
        book_index = self.book_id_to_index[one_row['Book']]
        # np.float32(...) accepts NumPy scalars as well as plain Python
        # numbers, whereas `.astype` only exists on NumPy scalars.
        u_b_rating = np.float32(one_row['Rate'])
        return user_index, book_index, u_b_rating

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.read_data)

In [136]:

# Split the rating table 50/50 into train and test rows with scikit-learn's
# train_test_split (as in the TA's sample code); random_state fixes the split.
train_data, test_data = train_test_split(read_data, test_size=0.5, random_state=42)

# Build the dataset over the full table first, then pass it to the train/test
# datasets so that both reuse its user/book id -> index mappings.
all_dataset = BookScoreDataSet(read_data)
train_dataset = BookScoreDataSet(train_data, all_dataset)
test_dataset = BookScoreDataSet(test_data, all_dataset)

# Create the train and test batch iterators. The original comment said the
# batch size follows the TA sample's 4096, but the value actually set is 512.
# NOTE(review): drop_last=True is also set on test_iter, which discards the
# final incomplete test batch — confirm that is intended for evaluation.
batch_size = 512
train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
test_iter = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

In [137]:
# Basic matrix-factorization model intended for training with an MSE loss.
class BaseMSEMFModel(nn.Module):
    """Predict a rating as the dot product of a user factor and a book factor."""

    def __init__(self, factor_dim, users_nums, books_nums):
        """Create one embedding table per entity type.

        Args:
            factor_dim: Length of each latent factor vector.
            users_nums: Number of rows in the user embedding table.
            books_nums: Number of rows in the book embedding table.
        """
        super().__init__()
        self.user_matrix = nn.Embedding(users_nums, factor_dim)
        self.book_matrix = nn.Embedding(books_nums, factor_dim)

    def forward(self, X):
        """Return the predicted rating for every (user index, book index) pair.

        Args:
            X: Pair ``(user_index, book_index)`` of index tensors.

        Returns:
            The element-wise product of the looked-up factors summed over
            dim 1, i.e. one score per pair for 1-D index tensors.
        """
        users, books = X
        # For 1-D index tensors each lookup yields (batch_size, factor_dim).
        user_factors = self.user_matrix(users)
        book_factors = self.book_matrix(books)
        interaction = user_factors * book_factors
        # Reducing over the factor dimension gives the per-pair dot product.
        return interaction.sum(dim=1)


In [138]:
def train(net, loss, train_iter, batch_size, device, num_epochs = 20, lr = 0.01):
    """Train ``net`` with plain SGD and print the average training loss per epoch.

    Args:
        net: Model called as ``net((user_index, book_index))``; it must already
            live on ``device``.
        loss: Loss callable applied as ``loss(prediction, rating)``. An
            element-wise loss (e.g. ``nn.MSELoss(reduction='none')``) makes the
            reported value a per-sample average.
        train_iter: Iterable yielding ``(user_index, book_index, rating)``
            tensor batches; it is iterated once per epoch.
        batch_size: Kept for backward compatibility. The average is computed
            from the number of loss elements actually seen, so short batches
            are weighted correctly.
        device: Device the batch tensors are moved to.
        num_epochs: Number of passes over ``train_iter``.
        lr: SGD learning rate.
    """
    trainer = torch.optim.SGD(net.parameters(), lr = lr)
    for epoch in range(num_epochs):
        net.train()
        total_loss = 0.0
        num_elements = 0
        for user_index, book_index, u_b_rating in train_iter:
            # Reset gradients left over from the previous step.
            trainer.zero_grad()
            X = (user_index.to(device), book_index.to(device))
            # Forward pass: predicted ratings for the batch.
            hat_rate = net(X)
            l = loss(hat_rate, u_b_rating.to(device))
            # Detach before accumulating so the running total does not keep
            # every batch's autograd graph alive.
            total_loss = total_loss + l.detach().sum()
            num_elements += l.numel()
            # Backpropagate the batch-mean loss and update the parameters.
            l.mean().backward()
            trainer.step()
        # Average over the elements actually processed; an empty iterator
        # reports NaN instead of failing on an undefined batch counter.
        train_loss = float(total_loss) / num_elements if num_elements else float('nan')
        print(f'At epoch {epoch}, train_loss {train_loss}')

In [139]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MSE matrix-factorization model with 50 latent factors per user/book.
net = BaseMSEMFModel(50, users_nums, books_nums)
# Element-wise loss: train() sums it for reporting and averages it for backprop.
loss = nn.MSELoss(reduction='none')
train(net.to(device), loss, train_iter, batch_size, device)

At epoch 0, train_loss 59.39773178100586
At epoch 1, train_loss 57.1633186340332
At epoch 2, train_loss 55.099525451660156
At epoch 3, train_loss 53.17218017578125
At epoch 4, train_loss 51.363590240478516
At epoch 5, train_loss 49.684539794921875
At epoch 6, train_loss 48.10369873046875
At epoch 7, train_loss 46.6208381652832
At epoch 8, train_loss 45.22600555419922
At epoch 9, train_loss 43.90733337402344
At epoch 10, train_loss 42.66625213623047
At epoch 11, train_loss 41.48992919921875
At epoch 12, train_loss 40.384735107421875
At epoch 13, train_loss 39.32802200317383
At epoch 14, train_loss 38.33295440673828
At epoch 15, train_loss 37.38587188720703
At epoch 16, train_loss 36.482826232910156
At epoch 17, train_loss 35.62949752807617
At epoch 18, train_loss 34.81391906738281
At epoch 19, train_loss 34.03607177734375
