In [1]:
# 模块和包都是在逐步的探索中所需要的，全部汇总到这里，
#    并不是一开始就知道了 ^_^ ^_^ ^_^ 
# 不熟悉的模块和包，强烈建议查看官方文档说明以及例子
# 1、导入模块和包
import pandas as pd    # 加载并处理csv文件
import datetime        # 利用datetime处理时间戳
# cPickle 数据以二进制进行高效的储存到文件
import _pickle as cPickle 
# defaultdict 设置稀疏矩阵的 NULL 位置的默认值
from collections import defaultdict 
# 利用scipy sparse 构建稀疏矩阵
import scipy.sparse as ss     
import scipy.io as sio    # 利用scipy储存评分矩阵
# 利用numpy创建指定长度或形状的矩阵以及矩阵运算
import numpy as np 
# numpy.random中的randn函数生成正态分布的随机数据
from numpy.random import random    
import time    # 计算训练时迭代的时间
import json    # 将模型参数保存和加载 json文件
import scipy    # 将储存加载的稀疏评分矩阵转换为numpy矩阵

In [2]:
# 4.2.1、LMF模型
# ############################################
#
# 核心算法实现
#
# @输入参数
#     R —— M*N 评分矩阵
#     k —— 隐向量的维度
#     theta —— 迭代次数
#     alpha —— 步长（学习率）
#     lamda —— 正则化系数
#
# @输出参数
#     分解之后的 P，Q
#     P：初始化用户特征矩阵 M*K
#     Q：初始化物品特征矩阵 N*K
#
# ############################################
# 设定模型参数\
# 18 个电影类型 1*18=18、2*18=36、3*18=54
K = 16
# 迭代次数：20、40、60、80、100
theta = 80
alpha = 0.04
lamda = 0.15
# 核心算法
def LFM_grad_desc( R, K, theta, alpha, lamda ):
    # 基本维度参数定义
    M = R.shape[0]
    N = R.shape[1]
    
    # P,Q初始值，随机生成
    P = np.random.rand(M, K)
    Q = np.random.rand(N, K)
    Q = Q.T
    
    # 第一次迭代时间开始
    time_start = time.time()
    print("开始进行{}个step的训练".format(theta))
    each_time_start = time_start
        
    # 开始迭代
    for step in range(theta):
        print('The {}-th  step is running'.format(step))
        # 对所有的用户u、物品i做遍历，
        # 对应的特征向量Pu、Qi梯度下降
        for u in range(M):
            for i in range(N):
                # 对于每一个大于0的评分，
                # 求出预测评分误差
                if R[u, i] > 0:
                    eui = np.dot( P[u,:], Q[:,i] ) - R[u, i]
                    
                    # 按照梯度下降算法更新当前的Pu、Qi
                    for k in range(K):
                        P[u][k] = P[u][k] - alpha * ( eui * Q[k][i] + lamda * P[u][k] )
                        Q[k][i] = Q[k][i] - alpha * ( eui * P[u][k] + lamda * Q[k][i] )
        
        # u、i遍历完成，所有特征向量更新完成，
        # 可以得到P、Q，可以计算预测评分矩阵
        predR = np.dot( P, Q )
        
        # 计算当前损失函数
        cost = 0
        for u in range(M):
            for i in range(N):
                if R[u, i] > 0:
                    cost += ( np.dot( P[u,:], Q[:,i] ) - R[u, i] ) ** 2
                    # 加上正则化项
                    for k in range(K):
                        cost += lamda * ( P[u][k] ** 2 + Q[k][i] ** 2 )
                        each_cost = np.sqrt(cost / K)
        # 每次迭代时间结束
        each_time_tick = time.time()
        # 每次迭代消耗的时间
        each_cost_time = each_time_tick - each_time_start
        # 更新计算每次迭代的时间
        each_time_start = each_time_tick
        print("完成第{}个step的训练, each_cost={}, 耗时{:.4f}秒".format( step + 1, cost, each_cost_time))
        
        if cost < 0.0001:
            break
    # 计算训练数据集消耗的总时间
    time_end = time.time()
    total_cost_time = time_end - time_start
    print("结束了{}个step的训练，总耗时{:.4f}秒".format(theta, total_cost_time))    
    return P, Q.T, cost

In [3]:
# 加载评分矩阵
data_path = "./../dataset/BX-CSV-Dump/"
user_item_score = sio.mmread(data_path + "user_item_score")
# todense() 转换为矩阵 numpy 
R = scipy.sparse.csc_matrix.todense(user_item_score)
type(R)

numpy.matrix

In [4]:
# 开始训练 LFM 模型
P, Q, ess = LFM_grad_desc( R, K, theta, alpha, lamda )

开始进行80个step的训练
The 0-th  step is running
完成第1个step的训练, each_cost=5071.82407674086, 耗时25.4025秒
The 1-th  step is running
完成第2个step的训练, each_cost=2428.30570024566, 耗时24.6024秒
The 2-th  step is running
完成第3个step的训练, each_cost=1936.9555259413457, 耗时24.0664秒
The 3-th  step is running
完成第4个step的训练, each_cost=1859.2634231259572, 耗时22.3633秒
The 4-th  step is running
完成第5个step的训练, each_cost=1841.632759960262, 耗时23.0523秒
The 5-th  step is running
完成第6个step的训练, each_cost=1833.8909418587646, 耗时21.9993秒
The 6-th  step is running
完成第7个step的训练, each_cost=1828.4322400637898, 耗时22.5573秒
The 7-th  step is running
完成第8个step的训练, each_cost=1823.7449300678002, 耗时22.1033秒
The 8-th  step is running
完成第9个step的训练, each_cost=1819.426104784929, 耗时22.1153秒
The 9-th  step is running
完成第10个step的训练, each_cost=1815.3373518755488, 耗时22.1123秒
The 10-th  step is running
完成第11个step的训练, each_cost=1811.42079909907, 耗时22.3393秒
The 11-th  step is running
完成第12个step的训练, each_cost=1807.6478852429195, 耗时23.2553秒
The 12-th  step 

In [5]:
# 通过训练的 P和 Q 计算出预测评分矩阵
pred_R = np.dot( P, Q.T )
# 将预测评分矩阵保存
sio.mmwrite(data_path + "pred_R", pred_R)

In [6]:
# 查看原始评分矩阵
print(R)
# 查看预测评分矩阵
print(pred_R)

[[ 0.  0.  0. ...  0.  0.  0.]
 [ 0. 10.  0. ...  0.  0.  0.]
 [ 0.  0.  6. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  4.  0.  0.]
 [ 0.  0.  0. ...  0.  5.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]
[[2.9543619  4.12678778 2.96176654 ... 2.57494027 2.8081596  3.12045711]
 [4.36364121 9.85162004 6.48141293 ... 5.76056107 6.30100043 5.08428771]
 [3.30191753 6.16128313 5.85316684 ... 3.95818428 4.15632789 4.16597372]
 ...
 [3.32815127 6.52026194 4.50157933 ... 3.86164959 4.39651496 3.8632879 ]
 [3.22373506 6.44220704 4.58782175 ... 3.67828396 4.85568411 3.8349583 ]
 [3.71136521 4.54247141 3.65722122 ... 2.94600587 3.35376052 3.07940526]]


In [7]:
# 查看训练后的 P 和 Q 矩阵
print(P)
print("====================================================")
print(Q)

[[0.00468092 0.40087031 0.28547441 ... 0.73797073 0.55272424 0.53061885]
 [0.75937665 0.48595437 0.30585587 ... 0.85160014 0.64951362 0.62342984]
 [0.84318547 0.65630566 0.27864805 ... 0.33177051 0.97956943 0.4131765 ]
 ...
 [0.47293716 0.62860248 0.46227091 ... 0.53530048 0.43329437 0.4442142 ]
 [0.83465556 0.5008916  0.24820184 ... 0.53286364 0.5616119  0.57069278]
 [0.39045181 0.04260213 0.48856657 ... 0.06984424 0.74276618 0.3699813 ]]
[[0.08080875 0.08849171 0.87807062 ... 0.27572461 0.39584158 0.75256576]
 [0.60105077 0.59872701 0.37296041 ... 0.91352148 0.73517096 0.75862097]
 [0.99095435 0.5244761  0.28116595 ... 0.30451613 0.8609495  0.35814884]
 ...
 [0.58173809 0.27754387 0.26614914 ... 0.69979786 0.52855633 0.29516541]
 [0.72274701 0.70404608 0.44724442 ... 0.75020163 0.58749634 0.32758831]
 [0.32082338 0.80140605 0.70629455 ... 0.81623924 0.88339137 0.9031128 ]]


In [8]:
# LFM模型评测指标
# RMSE 和 MAE
def calc_evaluation(R, pred_R):
    mae_sum = 0.0
    rmse_sum = 0.0
    # 基本维度参数定义
    M = R.shape[0]
    N = R.shape[1]
    T = 0
    
    for user in range(M):
        for item in range(N):
            if R[user, item] > 0:
                T += 1
                ess = R[user, item] - pred_R[user, item]
                mae_sum += abs(ess)
                rmse_sum += ess**2
    
    # 计算总的 MAE 指标
    MAE = mae_sum / T
    # Root Mean Square Error RMSE
    RMSE = np.sqrt(rmse_sum / T)
    return MAE, RMSE

In [9]:
MAE, RMSE = calc_evaluation(R, pred_R)
print("the MAE of LFM:", MAE)
print("the RMSE OF LFM:", RMSE)

the MAE of LFM: 0.14668918203365244
the RMSE OF LFM: 0.1472427659730991
