# 数据准备

In [1]:
import pandas as pd
import numpy as np
train = pd.read_table('../datasets/ml-100k/u1.base',
                      sep='\t', header=None).iloc[:, :3].values
test = pd.read_table('../datasets/ml-100k/u1.test',
                     sep='\t', header=None).iloc[:, :3].values
n_users, n_items = 943+1, 1682+1    # 数据idx从1开始
n_samples = train.shape[0]

print(train.shape, test.shape)

(80000, 3) (20000, 3)


# 模型基本
## 参数设定

In [3]:
k = 20    # 隐因子数量

glob_mean = np.mean(train[:, 2])    # 全局均分

bi = np.random.randn(n_items)
bu = np.random.randn(n_users)
qi = np.random.randn(n_items, k)
pu = np.random.randn(n_users, k)

# 查询用字典，避免生成大型稀疏矩阵
item_user_dict = dict()
user_item_dict = dict()

In [4]:
for sample in train:
    user_id,item_id,rating=sample
    item_user_dict.setdefault(item_id,{})
    user_item_dict.setdefault(user_id,{})
    
    item_user_dict[item_id][user_id]=rating
    user_item_dict[user_id][item_id]=rating

## 训练代码

In [5]:
max_iter = 20    # 迭代次数
lr = 0.01    # 学习率
alpha = 0.1    # 正则项系数

for epoch in range(max_iter):
    MSE = 0
    random_idxs = np.random.permutation(n_samples)

    for idx in random_idxs:
        user_id, item_id, rating = train[idx]
        y_pred = glob_mean+bi[item_id]+bu[user_id] + \
            np.dot(pu[user_id], qi[item_id].T)
        err = rating-y_pred
        MSE += err**2

        bu[user_id] += lr*(err-alpha*bu[user_id])
        bi[item_id] += lr*(err-alpha*bi[item_id])
        tmp = qi[item_id]
        qi[item_id] += lr*(err*pu[user_id]-alpha*qi[item_id])
        pu[user_id] += lr*(err*tmp-alpha*pu[user_id])

    MSE /= n_samples
    print(epoch, MSE)

0 1.034226696186526
1 0.9193063490388131
2 0.8905347580429747
3 0.8771195885444455
4 0.8688715384079316
5 0.8638050550667303
6 0.8600667901603428
7 0.8577347026688281
8 0.8552588375611702
9 0.8541307628195199
10 0.8527736487592168
11 0.8518794148114602
12 0.8508790143733819
13 0.8502341996279189
14 0.8494970644918951
15 0.8490790063503307
16 0.8486372883947207
17 0.8482090808401567
18 0.8477907383287997
19 0.8472711192077239


## 测试误差

In [6]:
Y_pred = list()
test_mse = 0
for sample in test:
    user_id, item_id, rating = sample
    y_pred = glob_mean+bi[item_id]+bu[user_id] + \
        np.dot(pu[user_id], qi[item_id].T)
    test_mse += (rating-y_pred)**2
test_mse /= len(test)

print(test_mse)

0.9148970313988244
