In [16]:
import pandas as pd
from surprise import KNNBaseline
from surprise import Dataset, Reader
from surprise.model_selection import KFold
from surprise.accuracy import mae, rmse

### 数据读取

In [7]:
# Load the ratings file: comma-separated, one header row skipped,
# columns in the order user, item, rating, timestamp.
# NOTE(review): no rating_scale is passed, so Surprise's default applies —
# confirm it matches the actual rating range in ratings.csv.
reader = Reader(
    line_format='user item rating timestamp',
    sep=',',
    skip_lines=1,
)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
# Trainset built from every rating in `data` (no hold-out).
trainset = data.build_full_trainset()

### 模型选择与训练

In [17]:
# KNN-with-baseline model: up to 30 neighbours, item-item (not user-user)
# cosine similarity, baseline estimates fitted with SGD.
algo = KNNBaseline(
    k=30,
    sim_options={
        'user_based': False,
        'name': 'cosine',
    },
    bsl_options={
        'method': 'sgd',
    },
)

In [19]:
# 3-fold cross-validation; the fixed random_state keeps the splits reproducible.
kf = KFold(n_splits=3, random_state=12)

# Maps 'Fold N' -> [RMSE, MAE] measured on that fold's test split.
d_fold = {}
# The per-fold training split gets its own name so the loop does not rebind
# the notebook-level `trainset` (the full-data trainset built when loading).
for i, (fold_trainset, testset) in enumerate(kf.split(data)):
    fold = f'Fold {i+1}'
    # fit() retrains `algo` on this fold's training split only.
    algo.fit(fold_trainset)
    predictions = algo.test(testset)
    # RMSE first, then MAE: this order has to match the ['RMSE', 'MAE'] row
    # labels used when the scores are tabulated.
    d_fold[fold] = [
        rmse(predictions, verbose=True),
        mae(predictions, verbose=True),
    ]

Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8711
MAE:  0.6688
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8710
MAE:  0.6685
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8711
MAE:  0.6683


In [21]:
# Tabulate the per-fold scores: one column per fold, rows are the metrics
# in the order they were stored in d_fold (RMSE, then MAE).
df_res = pd.DataFrame(d_fold, index=['RMSE', 'MAE'])

# Both summaries are computed from the fold-only frame *before* either summary
# column exists, so they always cover exactly the fold columns regardless of
# how many folds were run (no hard-coded column count). std() keeps the pandas
# default (sample standard deviation, ddof=1).
df_res = df_res.assign(
    Mean=df_res.mean(axis=1),
    Std=df_res.std(axis=1),
)
df_res

Unnamed: 0,Fold 1,Fold 2,Fold 3,Mean,Std
RMSE,0.871082,0.870958,0.871054,0.871031,6.5e-05
MAE,0.668752,0.668496,0.668257,0.668501,0.000248
