## 需要安装推荐系统库surprise , pip install scikit-surprise

In [1]:
import numpy as np
import surprise

Surprise 库本身没有提供矩阵分解的算法，这里实现基于　Alternating Least Squares 的矩阵分解，使用梯度下降法优化
矩阵分解类 MatrixFactorization　继承了　suprise.AlgoBase, 方便我们使用surprise 库提供的其它功能

In [6]:
class MatrixFactorization(surprise.AlgoBase):
    def __init__(self, learning_rate, n_epochs, n_factors, lmd):
        self.lr = learning_rate     # 梯度下降法的学习率
        self.n_epochs = n_epochs    # 梯度下降法的迭代次数
        self.n_factors = n_factors  # 分解的矩阵的秩(rank)
        self.lmd = lmd              # 防止过拟合的正则化的强度
    
    def fit(self, trainset):
        '''
        梯度下降法训练，得出所有的　u_i 和　p_j 的值
        '''
        print("Fitting data with SGD")
        
        # 随机初始化　user 和　item 矩阵
        u = np.random.normal(0, .1, (trainset.n_users, self.n_factors))
        p = np.random.normal(0, .1, (trainset.n_items, self.n_factors))
        
        for _ in range(self.n_epochs):
            for i, j, r_ij in trainset.all_ratings():
                err = r_ij - np.dot(u[i], p[j])
                # 利用梯度调整　u_i 和　p_j
                u[i] -= -self.lr * err * p[j] + self.lr * self.lmd * u[i]
                p[j] -= -self.lr * err * u[i] + self.lr * self.lmd * p[j]
                # 注意: 修正 p_j 时, 按照严格定义, 我们应该使用 u_i 修正之前的值, 但是实际上差别微乎其微
        self.u = u
        self.p = p
        self.trainset = trainset
        
    def estimate(self, i, j):
        '''
        预测　user i 对　item j 的评分
        '''
        # 如果用户　i 和物品　j 是已知的值，返回　u_i 和　p_j 的点积
        # 否则使用全局平均评分 rating 值（cold start 冷启动问题）
        if self.trainset.knows_user(i) and self.trainset.knows_item(j):
            return np.dot(self.u[i], self.p[j])
        else:
            return self.trainset.global_mean

## 演示如何调用以上定义的矩阵分解类实现短视频的推荐

In [15]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
import os

# 数据文件
file_path = os.path.expanduser('./ml-100k/u.data')

# 数据文件的格式如下
reader = Reader(line_format = 'user item rating timestamp', sep='\t', rating_scale=(1,5))
data = Dataset.load_from_file(file_path, reader=reader)

In [9]:
## 拆分数据集，随机分为训练和测试数据集
trainset, testset = train_test_split(data, test_size=0.25)

# 初始化以上定义的矩阵分解类
algo = MatrixFactorization(learning_rate=0.005, n_epochs=60, n_factors=2, lmd=0.2)

# 训练
algo.fit(trainset)

# 预测
pred = algo.test(testset)

#　计算平均绝对误差
accuracy.mae(pred)

Fitting data with SGD
MAE:  0.7793


0.7792896831830767

In [10]:
# 使用 surprise 内建的基于最邻近的方法做比较　surprise.KNNBasic() 
algo = surprise.KNNBasic()
algo.fit(trainset)
pred = algo.test(testset)
accuracy.mae(pred)

Computing the msd similarity matrix...
Done computing similarity matrix.
MAE:  0.7744


0.774416921938454

In [11]:
# 使用 surprise 内建的基于SVD 的方法做比较
algo = surprise.SVD()
algo.fit(trainset)
pred = algo.test(testset)
accuracy.mae(pred)

MAE:  0.7399


0.7399076400916318