In [12]:
import numpy as np
from surprise import Dataset
from surprise import Reader
from surprise import SlopeOne
from surprise import accuracy
from surprise.model_selection import KFold

In [2]:
# Load the ratings data. The input must contain at least three columns:
# user, item and rating.
reader = Reader(
    line_format='user item rating timestamp',
    sep=',',
    skip_lines=1,  # skips the first line of the file (presumably a CSV header)
)
data = Dataset.load_from_file('./ratings.csv', reader=reader)

# Build a trainset from the loaded dataset.
train_set = data.build_full_trainset()

### SlopeOne算法

算法原理概述：  
当要预测用户u对项目i的评分时，先找出既对项目i评过分、又与用户u至少有一个共同已评分项目j的其他用户；对每个这样的项目j，计算这些用户对项目i与项目j的评分之差的平均值（即平均偏差）；最后基于这些平均偏差来预测用户u对项目i的评分。

1. 只用最后一个模型进行预测

In [8]:
# Evaluate SlopeOne with 5-fold cross-validation. The splitter is seeded so
# that the folds (and therefore the per-fold RMSE values and the final
# prediction below) are the same on every Restart & Run All.
kf = KFold(n_splits=5, random_state=42)
for trainset, testset in kf.split(data):
    algo = SlopeOne()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

# After the loop, `algo` is the model fitted on the last fold only.
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid, r_ui=4, verbose=True)  # prediction from the last trained model

RMSE: 0.8630
RMSE: 0.8637
RMSE: 0.8674
RMSE: 0.8663
RMSE: 0.8643
user: 196        item: 302        r_ui = 4.00   est = 4.30   {'was_impossible': False}


2. 取训练的k个模型进行预测的平均值

In [15]:
# Train one SlopeOne model per fold and keep all of them. The splitter is
# seeded so the folds, RMSE values and averaged prediction are reproducible.
kf = KFold(n_splits=5, random_state=42)
ls_algo = []
for trainset, testset in kf.split(data):
    algo = SlopeOne()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)
    ls_algo.append(algo)

uid = str(196)
iid = str(302)
# Known rating passed to predict() for display; named explicitly so the final
# print does not depend on the loop variable `pred` leaking out of the loop.
true_rating = 4

# Collect the estimate of every fold's model, then report their mean.
ls_res = []
for fold_algo in ls_algo:
    pred = fold_algo.predict(uid, iid, r_ui=true_rating, verbose=True)
    ls_res.append(pred.est)
print(f'实际结果为{true_rating}，预测结果为:{np.mean(ls_res)}')

RMSE: 0.8642
RMSE: 0.8633
RMSE: 0.8677
RMSE: 0.8664
RMSE: 0.8637
user: 196        item: 302        r_ui = 4.00   est = 4.29   {'was_impossible': False}
user: 196        item: 302        r_ui = 4.00   est = 4.16   {'was_impossible': False}
user: 196        item: 302        r_ui = 4.00   est = 4.45   {'was_impossible': False}
user: 196        item: 302        r_ui = 4.00   est = 4.44   {'was_impossible': False}
user: 196        item: 302        r_ui = 4.00   est = 4.24   {'was_impossible': False}
实际结果为4，预测结果为:4.316578010279427
