In [1]:
# 这些模块和包都是在逐步的探索中所需要的，然后全部汇总到这里，
#    并不是一开始就知道了 ^_^ ^_^ ^_^
# 1、导入模块和包
import pandas as pd    # 加载并处理csv文件
import datetime        # 利用datetime处理时间戳
import _pickle as cPickle    # 数据以二进制进行高效的储存到文件
from collections import defaultdict     # 利用Python设置稀疏矩阵的NULL位置的默认值
import scipy.sparse as ss     # 利用scipy构建稀疏矩阵
import scipy.io as sio    # 利用scipy储存评分矩阵
import numpy as np    # 利用numpy创建指定长度或形状的矩阵以及矩阵运算
from numpy.random import random    # numpy.random中的randn函数生成一些正态分布的随机数据
import time    # 利用Python内置模块，计算训练时迭代的时间
import json    # 将模型参数保存为json文件，加载模型参数json文件
import scipy    # 将储存加载的稀疏评分矩阵转换为numpy矩阵形式

In [50]:
# 4.2.1、LMF模型

# ##########################
#
# 核心算法实现
#
# @输入参数
#     R —— M*N 评分矩阵
#     k —— 隐向量的维度
#     theta —— 迭代次数
#     alpha —— 步长（学习率）
#     lamda —— 正则化系数
#
# @输出参数
#     分解之后的 P，Q
#     P：初始化用户特征矩阵 M*K
#     Q：初始化物品特征矩阵 N*K
#
# ##########################

# 设定模型参数
K = 20
theta = 10
alpha = 0.04
lamda = 0.15

# 核心算法
def LFM_grad_desc( R, K, theta, alpha, lamda ):
    # 基本维度参数定义
    M = R.shape[0]
    N = R.shape[1]
    
    # P,Q初始值，随机生成
    P = np.random.rand(M, K)
    Q = np.random.rand(N, K)
    Q = Q.T
    
    # 第一次迭代时间开始
    time_start = time.time()
    print("开始进行{}个step的训练".format(theta))
    each_time_start = time_start
        
    # 开始迭代
    for step in range(theta):
        print('The {}-th  step is running'.format(step))
        # 对所有的用户u、物品i做遍历，对应的特征向量Pu、Qi梯度下降
        for u in range(M):
            for i in range(N):
                # 对于每一个大于0的评分，求出预测评分误差
                if R[u, i] > 0:
                    eui = np.dot( P[u,:], Q[:,i] ) - R[u, i]
                    
                    # 代入公式，按照梯度下降算法更新当前的Pu、Qi
                    for k in range(K):
                        P[u][k] = P[u][k] - alpha * ( 2 * eui * Q[k][i] + 2 * lamda * P[u][k] )
                        Q[k][i] = Q[k][i] - alpha * ( 2 * eui * P[u][k] + 2 * lamda * Q[k][i] )
        
        # u、i遍历完成，所有特征向量更新完成，可以得到P、Q，可以计算预测评分矩阵
        predR = np.dot( P, Q )
        
        # 计算当前损失函数
        cost = 0
        for u in range(M):
            for i in range(N):
                if R[u, i] > 0:
                    cost += ( np.dot( P[u,:], Q[:,i] ) - R[u, i] ) ** 2
                    # 加上正则化项
                    for k in range(K):
                        cost += lamda * ( P[u][k] ** 2 + Q[k, i] ** 2 )
        # 学习率递减
        alpha = alpha * 0.93
        # 每次迭代时间结束
        each_time_tick = time.time()
        # 每次迭代消耗的时间
        each_cost_time = each_time_tick - each_time_start
        # 更新计算每次迭代的时间
        each_time_start = each_time_tick

        print("完成第{}个step的训练, each_cost={}, 耗时{:.4f}秒".format( step + 1, cost, each_cost_time))
        
        if cost < 0.0001:
            break
    # 计算训练数据集消耗的总时间
    time_end = time.time()
    total_cost_time = time_end - time_start
    print("结束了{}个step的训练，总耗时{:.4f}秒".format(theta, total_cost_time))
    
    return P, Q.T, cost

In [51]:
# 加载评分矩阵
data_path = "./../dataset/amazon-ratings/"
user_item_score = sio.mmread(data_path + "user_item_score")
# todense() 转换为矩阵 numpy 
R = scipy.sparse.csc_matrix.todense(user_item_score)
type(R)

numpy.matrix

In [52]:
# 开始训练 LFM 模型
P, Q, ess = LFM_grad_desc( R, K, theta, alpha, lamda )

开始进行10个step的训练
The 0-th  step is running
完成第1个step的训练, each_cost=3580.136309606088, 耗时7.1254秒
The 1-th  step is running
完成第2个step的训练, each_cost=3046.649098478592, 耗时6.9594秒
The 2-th  step is running
完成第3个step的训练, each_cost=2886.7610273649984, 耗时7.0084秒
The 3-th  step is running
完成第4个step的训练, each_cost=2801.4821734317493, 耗时6.8974秒
The 4-th  step is running
完成第5个step的训练, each_cost=2745.4728568742125, 耗时6.9314秒
The 5-th  step is running
完成第6个step的训练, each_cost=2704.860808819116, 耗时6.9614秒
The 6-th  step is running
完成第7个step的训练, each_cost=2673.435193903154, 耗时7.1084秒
The 7-th  step is running
完成第8个step的训练, each_cost=2648.019426194088, 耗时7.1244秒
The 8-th  step is running
完成第9个step的训练, each_cost=2626.852875087503, 耗时6.8634秒
The 9-th  step is running
完成第10个step的训练, each_cost=2608.8723528627575, 耗时7.0094秒
结束了10个step的训练，总耗时69.9890秒


In [53]:
# 通过训练的 P和 Q 计算出预测评分矩阵
pred_R = np.dot( P, Q.T )

# 将预测评分矩阵保存
sio.mmwrite(data_path + "pred_R", pred_R)

In [54]:
# 查看原始评分矩阵
print(R)

# 查看预测评分矩阵
print(pred_R)

[[5. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 5. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[4.83857931 3.73811531 4.77352667 ... 6.27202923 4.16146118 5.80847694]
 [3.04178985 1.00650255 2.69563913 ... 2.92992615 2.5458185  2.7614196 ]
 [5.23057142 4.16376098 4.85434267 ... 5.86524962 4.14283937 5.65993348]
 ...
 [4.61739479 3.95111858 4.39575309 ... 5.43838556 3.86436457 5.12906015]
 [4.85752078 3.59755094 4.83133172 ... 5.92794067 4.14090859 5.23389289]
 [1.70297629 1.58531603 1.87149025 ... 2.2917259  1.82844772 2.09862004]]


In [55]:
# 查看训练后的 P 和 Q 矩阵
print(P)
print("====================================================")
print(Q)

[[ 0.63515899  0.70087118  0.27532434 ...  0.70137965  0.46049864
   0.48081873]
 [ 0.42334738 -0.0409308   0.52977964 ...  0.13370648  0.51420447
   0.09585611]
 [ 0.59403949  0.20872392  0.95027102 ...  0.35812882  0.17618668
   0.91773258]
 ...
 [ 0.33612095  0.5602776   0.23844083 ...  0.92135     0.20579591
   0.46387385]
 [ 0.69463388  0.36420377  0.18903136 ...  0.09281428  0.14197468
   0.06744615]
 [ 0.76552926 -0.05297563  0.20699245 ...  0.28956855  0.58302395
   0.01316316]]
[[ 0.58639847  0.00698315  0.39374084 ...  0.76269012  0.7330314
   0.71257912]
 [ 0.37649124  0.90481357  0.61311008 ...  0.22732441 -0.0253832
   0.75034403]
 [ 0.51867303  0.22551452  0.48670581 ...  0.30595939  0.43547645
   0.48006123]
 ...
 [ 0.90491178  0.88393711  0.43567433 ...  0.0850964   0.42636425
   0.58598227]
 [ 0.23999208  0.37824131  0.24921961 ...  0.31357549  0.66681144
   0.21138466]
 [ 0.73067211  0.70065972  0.77790943 ...  0.06385877  0.40266697
   0.86059044]]


In [56]:
# LFM模型评测指标
# RMSE 和 MAE

def calc_evaluation(R, pred_R):
    mae_sum = 0.0
    rmse_sum = 0.0
    # 基本维度参数定义
    M = R.shape[0]
    N = R.shape[1]
    T = 0
    
    for user in range(M):
        for item in range(N):
            if R[user, item] > 0:
                T += 1
                ess = R[user, item] - pred_R[user, item]
                mae_sum += abs(ess)
                rmse_sum += ess**2
    
    # 计算总的 MAE 指标
    MAE = mae_sum / T
    # Root Mean Square Error RMSE
    RMSE = np.sqrt(rmse_sum / T)
    return MAE, RMSE

In [57]:
MAE, RMSE = calc_evaluation(R, pred_R)
print("the MAE of LFM:", MAE)
print("the RMSE OF LFM:", RMSE) # 迭代次数为 100

the MAE of LFM: 0.13170055561856733
the RMSE OF LFM: 0.146282740274264
