## 这是基于`Matrix Factorization`的加入正则项的普通 \( MSE \)的协同过滤算法来进行的电影评分的预测，


In [1]:
# 导入所需库
import torch 
from torch import nn
import pandas as pd
import numpy as np
# sklearn 库
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [2]:
# 读取 movie_score_csv
file_path = ('data/movie_score.csv')
read_data = pd.read_csv(file_path)
print('movie_score.csv readed......')

movie_score.csv readed......


In [3]:

#用户和电影的个数，构建 P Q 矩阵需要
users_nums = len(read_data['User'].unique())
movies_nums = len(read_data['Movie'].unique())

In [4]:
#自己的电影评分数据集类，返回用户 id的index ，电影 id的index , 以及对应的评分
class MovieScoreDataSet(torch.utils.data.Dataset):
    def __init__(self, read_data, all_dataset = None):
        self.read_data = read_data
        if all_dataset == None:
            #依据字典索引得到 user 和 movie 的 id 列表
            self.users_unique_id_list = read_data['User'].unique()
            self.movies_unique_id_list = read_data['Movie'].unique()
            self.users_unique_id_list = sorted(set(self.users_unique_id_list))
            self.movies_unique_id_list = sorted(set(self.movies_unique_id_list))
            #依据id列表，创建 user 和 movie 的分别从 id 到 index 的转换，便于实现利用索引访问对应的矩阵 factor
            self.user_id_to_index = {id : index for index, id in enumerate(self.users_unique_id_list)}
            self.movie_id_to_index = {id : index for index, id in enumerate(self.movies_unique_id_list)}
        else:
            self.user_id_to_index = all_dataset.user_id_to_index
            self.movie_id_to_index = all_dataset.movie_id_to_index
            
    def __getitem__(self, index):
        # 得到数据文件中index对应的一行
        one_row = self.read_data.iloc[index]
        user_index = self.user_id_to_index[one_row['User']]
        movie_index = self.movie_id_to_index[one_row['Movie']]
        u_b_rating = one_row['Rate'].astype('float32')
        #返回 index
        return user_index, movie_index, u_b_rating
    
    def __len__(self):
        return len(self.read_data)

In [None]:

#划分数据集
train_data, test_data = train_test_split(read_data, test_size=0.5, random_state=42)

#创建数据集
all_dataset = MovieScoreDataSet(read_data)
train_dataset = MovieScoreDataSet(train_data, all_dataset)
test_dataset = MovieScoreDataSet(test_data, all_dataset)

#创建训练和测试数据迭代器，批量大小设置为256
batch_size = 256
train_iter = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
test_iter = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

In [6]:
#基础的基于MSE损失的 MF 模型
class BaseMSEMFModel(nn.Module):
    def __init__(self, factor_dim, users_nums, movies_nums):
        super(BaseMSEMFModel, self).__init__()
        self.user_matrix = nn.Embedding(users_nums, factor_dim)
        self.movie_matrix = nn.Embedding(movies_nums, factor_dim)

    # 前向传播函数，得到预测评分值   
    def forward(self, X):
        user_index, movie_index = X
        user_vector = self.user_matrix(user_index)
        movie_vector = self.movie_matrix(movie_index)
        # 经过Embedding层出来的vector形状应该是（batch_size x factor_dim）
        # 做点积，得到（batch_size X 1）的预测评分
        return (user_vector * movie_vector).sum(dim = 1)


In [7]:
def main(net, loss, train_iter, test_iter, batch_size, device, num_epochs = 100, lr = 0.02):
    # 定义优化器为 SGD 
    trainer = torch.optim.SGD(net.parameters(), lr = lr)
    # 正则化超参数
    lambda_1 = 0.001
    lambda_2 = 0.001
    # 迭代训练，每一个 epoch 打印 train loss 、test loss and test ndcg_score
    for epoch in range(num_epochs):
        # 训练
        net.train()
        train_loss = 0.0
        for i, X in enumerate(train_iter):
            user_index, movie_index, u_b_rating = X 
            # 梯度清零
            trainer.zero_grad()
            X = (user_index.to(device),movie_index.to(device))
            # 模型预测
            hat_rate = net(X)

            # 计算损失，加入L2范数进行正则化
            l = loss(hat_rate, u_b_rating.to(device)).mean() + lambda_1 * net.user_matrix.weight.norm(2) 
            + lambda_2 * net.movie_matrix.weight.norm(2)

            train_loss += l
            # 反向传播
            l.backward()
            # 更新参数
            trainer.step()

        train_loss /= i+1

        # 测试评估
        net.eval()
        test_loss = 0.0
        results = []
        with torch.no_grad():
            for i, X in enumerate(test_iter):
                user_index, movie_index, u_b_rating = X
                X = (user_index.to(device),movie_index.to(device))
                #得到预测得分
                predict_rate = net(X)
                #计算损失
                l = loss(predict_rate, u_b_rating.to(device)).mean()
                test_loss += l

                #下面计算测试集的ndcg_socre，来评估预测的排序的效果
                res = torch.cat([user_index.unsqueeze(1), predict_rate.cpu().unsqueeze(1), u_b_rating.unsqueeze(1)], dim = 1)
                results.append(res)

            # results变为一个(num_test_instance, 3)的张量
            results = torch.stack(results, dim = 0)
            results = results.flatten(start_dim=0, end_dim=1)
            # 对不同用户分组，分别计算ndcg
            # 将 user_index, predict_rate 和 u_b_rating 拆分开
            user_indexs = results[:, 0].long()   # 用户 index
            pred_ratings = results[:, 1]         # 预测评分
            true_ratings = results[:, 2]         # 真实评分

            # 将用户 ID 转换为 numpy 数组，方便后续操作
            user_indexs_np = user_indexs.cpu().numpy()
            pred_ratings_np = pred_ratings.cpu().numpy()
            true_ratings_np = true_ratings.cpu().numpy()

            # 为每个用户计算 NDCG 分数
            ndcg_scores = []

            # 获取每个用户的唯一 index
            unique_users = np.unique(user_indexs_np)

            for user in unique_users:
                # 获取当前用户的所有评分数据
                user_true_ratings = true_ratings_np[user_indexs_np == user]
                user_pred_ratings = pred_ratings_np[user_indexs_np == user]
                
                # 计算该用户的 NDCG 分数
                if len(user_true_ratings) > 1:
                    # ndcg@k，k取样例代码中的50
                    ndcg = ndcg_score([user_true_ratings], [user_pred_ratings], k=50)
                    ndcg_scores.append(torch.from_numpy(np.array(ndcg)))

            # 将所有用户的 NDCG 分数存储在一个张量中
            ndcg_scores = torch.stack(ndcg_scores)

            # 计算平均 ndcg
            ndcg_score_ = ndcg_scores.mean()
            
            test_loss /= i+1
        # print protocols
        print('At epoch [{}/{}], train_loss {:.6f}, test_loss {:.6f}, ndcg_score {:.6f}'.format(epoch+1, num_epochs, train_loss, test_loss, ndcg_score_))        

In [8]:
device = device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running on {device}......')
factor_dim = 50
net = BaseMSEMFModel(factor_dim, users_nums, movies_nums).to(device)
loss = nn.MSELoss(reduction='none')
main(net, loss, train_iter, test_iter, batch_size, device)

Running on cuda......
At epoch [1/100], train_loss 56.504391, test_loss 51.974949, ndcg_score 0.562953
At epoch [2/100], train_loss 47.222401, test_loss 45.304798, ndcg_score 0.562302
At epoch [3/100], train_loss 40.493698, test_loss 40.268356, ndcg_score 0.561438
At epoch [4/100], train_loss 35.446037, test_loss 36.343273, ndcg_score 0.560624
At epoch [5/100], train_loss 31.528646, test_loss 33.200550, ndcg_score 0.560548
At epoch [6/100], train_loss 28.410866, test_loss 30.621618, ndcg_score 0.560230
At epoch [7/100], train_loss 25.867178, test_loss 28.455381, ndcg_score 0.559970
At epoch [8/100], train_loss 23.743757, test_loss 26.592670, ndcg_score 0.559925
At epoch [9/100], train_loss 21.922279, test_loss 24.952925, ndcg_score 0.560072
At epoch [10/100], train_loss 20.327236, test_loss 23.473301, ndcg_score 0.560467
At epoch [11/100], train_loss 18.894247, test_loss 22.104540, ndcg_score 0.560956
At epoch [12/100], train_loss 17.576088, test_loss 20.808287, ndcg_score 0.561449
At 