In [10]:
# 模块和包都是在逐步的探索中所需要的，全部汇总到这里，
#    并不是一开始就知道了 ^_^ ^_^ ^_^ 
# 不熟悉的模块和包，强烈建议查看官方文档说明以及例子
# 1、导入模块和包
import pandas as pd    # 加载并处理csv文件
import datetime        # 利用datetime处理时间戳
# cPickle 数据以二进制进行高效的储存到文件
import _pickle as cPickle 
# defaultdict 设置稀疏矩阵的 NULL 位置的默认值
from collections import defaultdict 
# 利用scipy sparse 构建稀疏矩阵
import scipy.sparse as ss     
import scipy.io as sio    # 利用scipy储存评分矩阵
# 利用numpy创建指定长度或形状的矩阵以及矩阵运算
import numpy as np 
# numpy.random中的randn函数生成正态分布的随机数据
from numpy.random import random    
import time    # 计算训练时迭代的时间
import json    # 将模型参数保存和加载 json文件
import scipy    # 将储存加载的稀疏评分矩阵转换为numpy矩阵

In [11]:
# 4.2.1、LMF模型
# ############################################
#
# 核心算法实现
#
# @输入参数
#     R —— M*N 评分矩阵
#     k —— 隐向量的维度
#     theta —— 迭代次数
#     alpha —— 步长（学习率）
#     lamda —— 正则化系数
#
# @输出参数
#     分解之后的 P，Q
#     P：初始化用户特征矩阵 M*K
#     Q：初始化物品特征矩阵 N*K
#
# ############################################
# 设定模型参数\
# 18 个电影类型 1*18=18、2*18=36、3*18=54
K = 16
# 迭代次数：20、40、60、80、100
theta = 80
alpha = 0.04
lamda = 0.15
# 核心算法
def LFM_grad_desc( R, K, theta, alpha, lamda ):
    # 基本维度参数定义
    M = R.shape[0]
    N = R.shape[1]
    
    # P,Q初始值，随机生成
    P = np.random.rand(M, K)
    Q = np.random.rand(N, K)
    Q = Q.T
    
    # 第一次迭代时间开始
    time_start = time.time()
    print("开始进行{}个step的训练".format(theta))
    each_time_start = time_start
        
    # 开始迭代
    for step in range(theta):
        print('The {}-th  step is running'.format(step))
        # 对所有的用户u、物品i做遍历，
        # 对应的特征向量Pu、Qi梯度下降
        for u in range(M):
            for i in range(N):
                # 对于每一个大于0的评分，
                # 求出预测评分误差
                if R[u, i] > 0:
                    eui = np.dot( P[u,:], Q[:,i] ) - R[u, i]
                    
                    # 按照梯度下降算法更新当前的Pu、Qi
                    for k in range(K):
                        P[u][k] = P[u][k] - alpha * ( eui * Q[k][i] + lamda * P[u][k] )
                        Q[k][i] = Q[k][i] - alpha * ( eui * P[u][k] + lamda * Q[k][i] )
        
        # u、i遍历完成，所有特征向量更新完成，
        # 可以得到P、Q，可以计算预测评分矩阵
        predR = np.dot( P, Q )
        
        # 计算当前损失函数
        cost = 0
        for u in range(M):
            for i in range(N):
                if R[u, i] > 0:
                    cost += ( np.dot( P[u,:], Q[:,i] ) - R[u, i] ) ** 2
                    # 加上正则化项
                    for k in range(K):
                        cost += lamda * ( P[u][k] ** 2 + Q[k][i] ** 2 )
                        each_cost = np.sqrt(cost / K)
        # 每次迭代时间结束
        each_time_tick = time.time()
        # 每次迭代消耗的时间
        each_cost_time = each_time_tick - each_time_start
        # 更新计算每次迭代的时间
        each_time_start = each_time_tick
        print("完成第{}个step的训练, each_cost={}, 耗时{:.4f}秒".format( step + 1, cost, each_cost_time))
        
        if cost < 0.0001:
            break
    # 计算训练数据集消耗的总时间
    time_end = time.time()
    total_cost_time = time_end - time_start
    print("结束了{}个step的训练，总耗时{:.4f}秒".format(theta, total_cost_time))    
    return P, Q.T, cost

In [12]:
# 加载评分矩阵
data_path = "./../dataset/BX-CSV-Dump/"
user_item_score = sio.mmread(data_path + "user_item_score")
# todense() 转换为矩阵 numpy 
R = scipy.sparse.csc_matrix.todense(user_item_score)
type(R)

numpy.matrix

In [13]:
# 开始训练 LFM 模型
P, Q, ess = LFM_grad_desc( R, K, theta, alpha, lamda )

开始进行80个step的训练
The 0-th  step is running
完成第1个step的训练, each_cost=1556.3372448524672, 耗时22.2953秒
The 1-th  step is running
完成第2个step的训练, each_cost=1286.4829695229348, 耗时21.5772秒
The 2-th  step is running
完成第3个step的训练, each_cost=1184.8205522979802, 耗时21.5242秒
The 3-th  step is running
完成第4个step的训练, each_cost=1140.789880940058, 耗时21.3562秒
The 4-th  step is running
完成第5个step的训练, each_cost=1118.550601145831, 耗时21.3252秒
The 5-th  step is running
完成第6个step的训练, each_cost=1105.1582712027584, 耗时21.3112秒
The 6-th  step is running
完成第7个step的训练, each_cost=1095.6269354752053, 耗时21.2362秒
The 7-th  step is running
完成第8个step的训练, each_cost=1087.9239681893448, 耗时21.2822秒
The 8-th  step is running
完成第9个step的训练, each_cost=1081.171646982558, 耗时21.1592秒
The 9-th  step is running
完成第10个step的训练, each_cost=1074.968153090898, 耗时21.3102秒
The 10-th  step is running
完成第11个step的训练, each_cost=1069.118514440358, 耗时21.3852秒
The 11-th  step is running
完成第12个step的训练, each_cost=1063.5224668060985, 耗时21.3322秒
The 12-th  st

In [14]:
# 通过训练的 P和 Q 计算出预测评分矩阵
pred_R = np.dot( P, Q.T )
# 将预测评分矩阵保存
sio.mmwrite(data_path + "pred_R", pred_R)

In [15]:
# 查看原始评分矩阵
print(R)
# 查看预测评分矩阵
print(pred_R)

[[ 0.  0.  0. ...  0.  0.  0.]
 [ 0. 10.  0. ...  0.  0.  0.]
 [ 0.  0.  6. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  4.  0.  0.]
 [ 0.  0.  0. ...  0.  5.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]
[[3.96721748 3.42702335 2.91267433 ... 2.24869418 2.2796187  3.06555235]
 [4.52907257 4.85635886 3.08850978 ... 2.96170881 2.82823102 4.12260219]
 [4.06449997 3.01765911 2.86766152 ... 2.49347607 2.21585425 3.38163361]
 ...
 [2.66174109 2.26134594 1.92663625 ... 1.87469318 1.60711235 2.11666156]
 [3.74082535 3.52250887 2.69288849 ... 2.41623118 2.36875035 3.55470725]
 [4.48982368 4.33140652 3.29320487 ... 2.77492591 2.64757183 3.95179228]]


In [16]:
# 查看训练后的 P 和 Q 矩阵
print(P)
print("====================================================")
print(Q)

[[0.03110579 0.90092573 0.10061658 ... 0.60953477 0.03940641 0.21467996]
 [0.44270401 0.87259874 0.86676969 ... 0.52550093 0.68544395 0.5310504 ]
 [0.63467921 0.25632915 0.44791758 ... 0.28532077 0.33009696 0.09333528]
 ...
 [0.37542351 0.31288138 0.20981461 ... 0.25432703 0.13745457 0.2213469 ]
 [0.08530476 0.47724678 0.57152931 ... 0.45820884 0.44714744 0.39290853]
 [0.0241565  0.91600459 0.29446936 ... 0.79659843 0.88448111 0.62515252]]
[[0.80610512 0.93504496 0.11566151 ... 0.87151794 0.42681622 0.11185077]
 [0.46218514 0.78289453 0.66194    ... 0.68970322 0.43536483 0.75562124]
 [0.32614656 0.65274206 0.31470409 ... 0.09986914 0.57197932 0.07601742]
 ...
 [0.23361816 0.16750703 0.42494413 ... 0.64421337 0.19562474 0.40976416]
 [0.20695398 0.2447751  0.45872985 ... 0.48627968 0.0929761  0.55330531]
 [0.36344401 0.19872457 0.94017584 ... 0.09782842 0.90329149 0.07286373]]


In [17]:
# LFM模型评测指标
# RMSE 和 MAE
def calc_evaluation(R, pred_R):
    mae_sum = 0.0
    rmse_sum = 0.0
    # 基本维度参数定义
    M = R.shape[0]
    N = R.shape[1]
    T = 0
    
    for user in range(M):
        for item in range(N):
            if R[user, item] > 0:
                T += 1
                ess = R[user, item] - pred_R[user, item]
                mae_sum += abs(ess)
                rmse_sum += ess**2
    
    # 计算总的 MAE 指标
    MAE = mae_sum / T
    # Root Mean Square Error RMSE
    RMSE = np.sqrt(rmse_sum / T)
    return MAE, RMSE

In [18]:
MAE, RMSE = calc_evaluation(R, pred_R)
print("the MAE of LFM:", MAE)
print("the RMSE OF LFM:", RMSE)

the MAE of LFM: 3.930479728104465
the RMSE OF LFM: 4.048104826394875
