In [27]:
# 模块和包都是在逐步的探索中所需要的，全部汇总到这里，
#    并不是一开始就知道了 ^_^ ^_^ ^_^ 
# 不熟悉的模块和包，强烈建议查看官方文档说明以及例子
# 1、导入模块和包
import pandas as pd    # 加载并处理csv文件
import datetime        # 利用datetime处理时间戳
# cPickle 数据以二进制进行高效的储存到文件
import _pickle as cPickle 
# defaultdict 设置稀疏矩阵的 NULL 位置的默认值
from collections import defaultdict 
# 利用scipy sparse 构建稀疏矩阵
import scipy.sparse as ss     
import scipy.io as sio    # 利用scipy储存评分矩阵
# 利用numpy创建指定长度或形状的矩阵以及矩阵运算
import numpy as np 
# numpy.random.random draws uniform samples from [0.0, 1.0) (it is not randn, which samples a normal distribution)
from numpy.random import random    
import time    # 计算训练时迭代的时间
import json    # 将模型参数保存和加载 json文件
import scipy    # 将储存加载的稀疏评分矩阵转换为numpy矩阵

In [120]:
# 4.2.1 LFM (latent factor model)
# ############################################
#
# Core algorithm implementation
#
# @Inputs
#     R —— M*N rating matrix
#     K —— dimension of the latent vectors
#     theta —— number of iterations
#     alpha —— step size (learning rate)
#     lamda —— regularization coefficient
#
# @Outputs
#     The factors P and Q obtained from the decomposition, plus the final cost
#     P: user feature matrix, M*K
#     Q: item feature matrix, N*K
#
# ############################################
# Model hyperparameters
# 18 movie genres; values listed for K: 10, 12, 14, 16, 18, 20, 24, 36
# (presumably the candidates that were tried — TODO confirm)
K = 16
# Number of iterations; values listed: 20, 40, 60, 80, 100
theta = 80
alpha = 0.005
lamda = 0.15
# Core algorithm
def LFM_grad_desc( R, K, theta, alpha, lamda, decay=0.93, tol=0.0001 ):
    """Factorise a rating matrix into user/item latent factors with SGD.

    Only entries of ``R`` that are greater than 0 are treated as observed
    ratings; every other cell is ignored during training and in the cost.

    Args:
        R: M x N rating matrix. May be an ndarray, a ``numpy.matrix`` or any
            object exposing ``toarray()`` (e.g. a scipy sparse matrix).
        K: Number of latent dimensions.
        theta: Maximum number of passes over the observed ratings.
        alpha: Initial learning rate; multiplied by ``decay`` after each pass.
        lamda: L2 regularisation coefficient.
        decay: Learning-rate multiplier applied after each pass.
        tol: Training stops early once the cost drops below this value.

    Returns:
        A tuple ``(P, Q, cost)`` where ``P`` is the M x K user factor matrix,
        ``Q`` is the N x K item factor matrix and ``cost`` is the regularised
        squared error after the last completed pass (NaN if ``theta`` is 0,
        because no pass was run).
    """
    # Work on a plain 2-D ndarray so scalar indexing behaves uniformly.
    if hasattr(R, "toarray"):
        R = R.toarray()
    R = np.asarray(R)
    M = R.shape[0]
    N = R.shape[1]

    # Random initial factors (P is drawn first, then Q).
    P = np.random.rand(M, K)
    Q = np.random.rand(N, K)

    # The set of observed ratings never changes, so locate it once instead of
    # rescanning all M*N cells on every pass. np.nonzero yields indices in
    # row-major order, i.e. the same visiting order as a nested u/i loop.
    users, items = np.nonzero(R > 0)
    observed = list(zip(users.tolist(), items.tolist(), R[users, items].tolist()))
    # How many ratings each user / item takes part in; used to weight the
    # L2 penalty, which is charged once per observed rating.
    user_counts = np.bincount(users, minlength=M)
    item_counts = np.bincount(items, minlength=N)

    time_start = time.time()
    print("开始进行{}个step的训练".format(theta))
    each_time_start = time_start

    cost = float("nan")
    steps_run = 0
    for step in range(theta):
        print('The {}-th  step is running'.format(step))
        for u, i, r in observed:
            p_u = P[u]
            q_i = Q[i]
            eui = np.dot(p_u, q_i) - r
            # Compute both new vectors from the *old* values before writing
            # either back, so the step is a true simultaneous gradient update.
            new_p_u = p_u - alpha * ( eui * q_i + lamda * p_u )
            new_q_i = q_i - alpha * ( eui * p_u + lamda * q_i )
            P[u] = new_p_u
            Q[i] = new_q_i

        # Cost over the observed ratings: squared error plus the L2 penalty.
        squared_error = 0.0
        for u, i, r in observed:
            squared_error += ( np.dot(P[u], Q[i]) - r ) ** 2
        penalty = ( np.dot(user_counts, (P ** 2).sum(axis=1))
                    + np.dot(item_counts, (Q ** 2).sum(axis=1)) )
        cost = squared_error + lamda * penalty

        # Learning-rate decay.
        alpha = alpha * decay
        steps_run = step + 1

        # Wall-clock time spent on this pass.
        each_time_tick = time.time()
        each_cost_time = each_time_tick - each_time_start
        each_time_start = each_time_tick
        print("完成第{}个step的训练, each_cost={}, 耗时{:.4f}秒".format( step + 1, cost, each_cost_time))

        if cost < tol:
            break

    # Total training time; report the number of passes actually completed.
    time_end = time.time()
    total_cost_time = time_end - time_start
    print("结束了{}个step的训练，总耗时{:.4f}秒".format(steps_run, total_cost_time))
    return P, Q, cost

In [121]:
# Load the rating matrix stored in Matrix Market format
data_path = "./../dataset/ml-25m/"
user_item_score = sio.mmread(data_path + "user_item_score")
# Convert the loaded sparse matrix to a dense one by calling todense() on the
# object itself rather than through the unbound csc_matrix method
R = user_item_score.todense()
type(R)

numpy.matrix

In [122]:
# Train the LFM model with the hyperparameters configured above.
# NOTE(review): the recorded output of this cell reports 100 steps while theta
# is set to 80 in the configuration cell — re-run the notebook to refresh it.
P, Q, ess = LFM_grad_desc( R=R, K=K, theta=theta, alpha=alpha, lamda=lamda )

开始进行100个step的训练
The 0-th  step is running
完成第1个step的训练, each_cost=7184.14612003106, 耗时20.3322秒
The 1-th  step is running
完成第2个step的训练, each_cost=6695.706017622515, 耗时20.1422秒
The 2-th  step is running
完成第3个step的训练, each_cost=6308.151316812769, 耗时20.2222秒
The 3-th  step is running
完成第4个step的训练, each_cost=5994.669948825578, 耗时20.2682秒
The 4-th  step is running
完成第5个step的训练, each_cost=5737.061884935879, 耗时20.4552秒
The 5-th  step is running
完成第6个step的训练, each_cost=5522.54060775252, 耗时20.1332秒
The 6-th  step is running
完成第7个step的训练, each_cost=5341.867813744572, 耗时20.1732秒
The 7-th  step is running
完成第8个step的训练, each_cost=5188.210800808784, 耗时20.2842秒
The 8-th  step is running
完成第9个step的训练, each_cost=5056.414306819959, 耗时20.1532秒
The 9-th  step is running
完成第10个step的训练, each_cost=4942.521010529235, 耗时20.2352秒
The 10-th  step is running
完成第11个step的训练, each_cost=4843.44689524589, 耗时20.1842秒
The 11-th  step is running
完成第12个step的训练, each_cost=4756.756146745573, 耗时20.1402秒
The 12-th  step is run

In [123]:
# Rebuild the full predicted rating matrix from the trained factors P and Q
pred_R = P.dot(Q.T)
# Save the predicted rating matrix in Matrix Market format
sio.mmwrite(data_path + "pred_R", pred_R)

In [124]:
# Show the original rating matrix followed by the predicted rating matrix
print(R, pred_R, sep="\n")

[[3.5 0.  0.  ... 0.  0.  0. ]
 [0.  4.  0.  ... 0.  0.  0. ]
 [0.  0.  1.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]
[[4.42265701 3.70424086 2.92530286 ... 3.28586296 3.66743726 3.31891823]
 [4.91853062 3.86737123 3.000895   ... 4.35648991 4.17886201 3.463823  ]
 [3.38731123 3.08068239 2.71844636 ... 3.45452348 3.02973188 3.00132094]
 ...
 [5.41812968 5.07058586 4.3197803  ... 4.66438458 4.63318126 3.63906152]
 [4.277156   2.99367351 2.93057628 ... 4.03656947 3.45455137 2.62244854]
 [4.77669697 3.37390768 3.54481507 ... 4.35588632 3.9031377  3.24295518]]


In [125]:
# Show the trained P and Q factor matrices, separated by a ruler line
print(P, "====================================================", Q, sep="\n")

[[ 0.06593345  0.32186047  0.4976257  ... -0.09958345  0.1584439
   0.6848624 ]
 [ 0.36620279  0.31416959  0.41308537 ...  0.66331463  0.98444888
   0.28672297]
 [ 0.78441498  0.69788196  0.28960103 ...  0.50051075  0.58106854
   0.47003226]
 ...
 [ 0.9332154   0.70923459  0.33642715 ...  0.93963462  0.46401874
   0.96817776]
 [ 0.06704608  0.67819881  0.64192002 ...  0.66981962  0.52080464
   0.81221157]
 [ 0.61845541  0.46915964  0.55543854 ...  0.47280961  0.95707288
   0.1364307 ]]
[[ 1.79483818e-01  4.74321878e-01  7.15267388e-01 ...  9.94582543e-01
   6.72835420e-01  6.69857406e-01]
 [ 7.75868717e-01  5.20468549e-01  2.46878674e-01 ...  1.99141891e-01
  -2.17662902e-04  5.55480437e-01]
 [ 8.38112305e-01  4.71472199e-01  7.13348754e-01 ...  7.16759635e-01
   8.27744758e-03  1.50912311e-01]
 ...
 [ 6.13879168e-01  2.04580626e-01  2.17737030e-01 ...  8.83554850e-01
   8.03421987e-01  4.04945353e-01]
 [ 4.09148213e-01  5.65906870e-01  5.99735544e-01 ...  4.57723593e-01
   6.36497808e

In [126]:
# LFM模型评测指标
# RMSE 和 MAE
def calc_evaluation(R, pred_R):
    """Compute MAE and RMSE of predictions over the observed ratings.

    Only cells where ``R`` is greater than 0 are treated as observed and
    contribute to the metrics; all other cells are ignored.

    Args:
        R: Rating matrix (ndarray, ``numpy.matrix`` or an object exposing
            ``toarray()``).
        pred_R: Predicted rating matrix, indexed the same way as ``R``.

    Returns:
        A tuple ``(MAE, RMSE)``. Both values are NaN when ``R`` contains no
        rating greater than 0, since the metrics are undefined in that case.
    """
    actual, predicted = (
        np.asarray(m.toarray() if hasattr(m, "toarray") else m)
        for m in (R, pred_R)
    )

    # Positions of the observed ratings.
    rows, cols = np.nonzero(actual > 0)
    if rows.size == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return float("nan"), float("nan")

    errors = actual[rows, cols] - predicted[rows, cols]
    # Mean absolute error.
    MAE = np.mean(np.abs(errors))
    # Root mean square error.
    RMSE = np.sqrt(np.mean(errors ** 2))
    return MAE, RMSE

In [127]:
# Score the predictions against the ratings in R and report MAE / RMSE.
# NOTE: R is the same matrix the model was trained on in the training cell,
# so these figures are training-set errors, not held-out performance.
MAE, RMSE = calc_evaluation(R, pred_R)
print("the MAE of LFM:", MAE)
print("the RMSE OF LFM:", RMSE)

the MAE of LFM: 0.5479792516762387
the RMSE OF LFM: 0.7009523875427962
