## 库导入

In [1]:
from IPython.core.display import display
import numpy as np
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

## 评级数据导入


In [2]:
# import train data "user ,movie, rating, time "
rawData = sc.textFile("file:///Users/wente/GitHub/MovieRating/wente_file/ml-100k/u.data")
# 用不到时间戳——弃掉
rawRatings = rawData.map(lambda x: x.split("\t")).map(lambda x: x[:3]).map(lambda l: (int(l[0]), int(l[1]), float(l[2])))
# 将数据转换为Rating对象
ratings = rawRatings.map(lambda l: Rating(l[0], l[1], l[2]))
print (ratings.first())

predictData = rawRatings.map(lambda x: x[:2])
print (predictData.first())
# test
userNum  = rawRatings.map(lambda x: x[0]).distinct().count()
predictNum = rawRatings.map(lambda x: x[1]).distinct().count()
print (userNum, predictNum)

Rating(user=196, product=242, rating=3.0)
(196, 242)
943 1682


## 建模
### 参数的设置：

rank 因子个数，一般越多越好，不过内存开销也越大，合理值是 10-200。

iterations 迭代次数 对 ALS 10次就可以了。

lambda 正则化过程————>也就是过拟合情况，应该统计非样的测试数据 进行交叉验证调整。

In [37]:
## 训练
rank = 50
iterations = 10
theLambda  = 0.01
model = ALS.train(ratings, rank, iterations, theLambda)
print (model)


<pyspark.mllib.recommendation.MatrixFactorizationModel at 0x112130d30>

In [39]:
#test
## predict user789 对 movie123的rating预测
predict = model.predict(789,123)
print (predict)
# test
print (model.userFeatures())
print (model.userFeatures().count())# 6040
print (model.productFeatures().count()) # 3706

PythonRDD[851] at RDD at PythonRDD.scala:43

943

1682

## 模型评估
- 均方误差(Mean Squared Error,MSE) 
常用于显示评级的情形
- Root Mean Squared Error,RMSE
- K值平均准确率 MAPK 

### 均方误差

In [42]:
# 对测试数据集进行预测
# predictions 
predictions = model.predictAll(predictData).map(lambda x: ((x[0],x[1]),x[2]))
predictions.first()

# 原始数据整理
# 
result = rawRatings.map(lambda x: ((x[0],x[1]),x[2])).join(predictions)
display (result.first())

## MSE & RMSE
mse = result.map(lambda v: (v[1][0] - v[1][1]) ** 2).mean()
rmse = np.sqrt(mse)
print("mse = {:.3f}".format(mse))
print("rmse = {:.3f}".format(rmse))

((711, 707), (5.0, 5.199120610460281))

mse = 0.084
rmse = 0.290


### K值平均准确率（单个用户）

In [44]:
# MAP的Python脚本如下：
def avgPrecisionK(actual, predicted, k): 
    if len(predicted) > k:
        predK = predicted[:k]
    else:
        predK = predicted
    score = 0.0
    numHits = 0.0
    for i,p in enumerate(predK):
        if p in actual and p not in predK:
            numHits = numHits + 1
            score = score + numHits/(i+1)
    if not actual:
        return 1.0
    else:
        return score/min(len(actual),k)

In [45]:
# 下面计算模型对用户789推荐的MAP。首先提取出用户实际评级过的电影的ID和预测电影ID:
# 数据整理
userId = 7
K = 10
actualMovies =[elem[1:] for elem in model.recommendProducts(userId, K)]
predictMovies = [elem[1:] for elem in rawRatings.keyBy(lambda x: x[0]).lookup(userId)]
display (actualMovies[:1])
display (predictMovies[:1])

[(466, 6.180083966572809)]

[(32, 4.0)]

In [46]:
MAP10 = avgPrecisionK(actualMovies,predictMovies,10)
display (MAP10)

0.0

这里,APK的得分为0,这表明该模型在为该用户做相关电影预测上的表现并不理想。,推荐类任务的这个得分通常都较低,特别是当物品的数量极大时。

## 参数调整

## 用户推荐——为某个用户生产评分最高的前K个电影

In [47]:
# 为某个用户生产前K个推荐物品（评分最高的）
userId = 6
K = 10
topKRecs = model.recommendProducts(userId, K)
topKRecs

[Rating(user=6, product=190, rating=5.5493446692464685),
 Rating(user=6, product=603, rating=5.517621034424161),
 Rating(user=6, product=173, rating=5.422938849019065),
 Rating(user=6, product=657, rating=5.405587489069024),
 Rating(user=6, product=661, rating=5.273043507159072),
 Rating(user=6, product=514, rating=5.230886754323885),
 Rating(user=6, product=490, rating=5.21597693859621),
 Rating(user=6, product=652, rating=5.137018279668898),
 Rating(user=6, product=474, rating=5.089366284138842),
 Rating(user=6, product=493, rating=5.086634777428775)]

In [49]:
# 查找真实数据中user798对电影的评分
# 为了和上面的预测对比
moviesForUser = rawRatings.keyBy(lambda x: x[0]).lookup(userId)
len(moviesForUser)

211

In [50]:
# 对比上面的预测值topKRecs ——差别有点大啊
# 真实的评分最高的前K个物品（电影）
display (sorted(moviesForUser, key=lambda x:x[2], reverse=True)[:K])
display  ("——————下面是预测数据——————")
display  (topKRecs)

[(6, 14, 5.0),
 (6, 98, 5.0),
 (6, 492, 5.0),
 (6, 469, 5.0),
 (6, 211, 5.0),
 (6, 475, 5.0),
 (6, 134, 5.0),
 (6, 525, 5.0),
 (6, 523, 5.0),
 (6, 481, 5.0)]

'——————下面是预测数据——————'

[Rating(user=6, product=190, rating=5.5493446692464685),
 Rating(user=6, product=603, rating=5.517621034424161),
 Rating(user=6, product=173, rating=5.422938849019065),
 Rating(user=6, product=657, rating=5.405587489069024),
 Rating(user=6, product=661, rating=5.273043507159072),
 Rating(user=6, product=514, rating=5.230886754323885),
 Rating(user=6, product=490, rating=5.21597693859621),
 Rating(user=6, product=652, rating=5.137018279668898),
 Rating(user=6, product=474, rating=5.089366284138842),
 Rating(user=6, product=493, rating=5.086634777428775)]

## 物品推荐——计算与某个物品相似度最高的K个物品

In [51]:
# func余弦相似度计算

def cosineSimilarity (vec1, vec2):
    import numpy as np
    from numpy import linalg as la
    
    inA = np.mat(vec1) 
    inB = np.mat(vec2)
    num = float(inA * inB.T) #若为行向量: A * B.T
    donom = la.norm(inA) * la.norm(inB) ##余弦值 
    return 0.5+ 0.5*(num / donom) # 归一化
    #关于归一化：因为余弦值的范围是 [-1,+1] ，相似度计算时一般需要把值归一化到 [0,1]


In [52]:
# 测试一下，结果应该为1    
testx = np.array([1.0,2.0,3.0])
display(cosineSimilarity(testx,testx))

1.0

In [54]:
# 计算100 与其他的物品的余弦相似度
sims = model.productFeatures().map(lambda x: (x[0],cosineSimilarity(x[1], itemFactor))).collect()
simsSorted = sorted(sims, key=lambda x:x[1], reverse=True)
simsSorted[:10]

[(100, 1.0),
 (124, 0.93856940066241967),
 (654, 0.93809784134109064),
 (191, 0.93773602951034141),
 (484, 0.93494876203278809),
 (508, 0.93493009430963958),
 (185, 0.93386061802668741),
 (223, 0.93230556310196522),
 (963, 0.93220063939183218),
 (127, 0.93116954605370839)]