## 库导入

In [1]:
from IPython.core.display import display
import numpy as np
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

## 评级数据导入

### train data(80%)

In [5]:
# Load the training split (the section header labels it 80%); per the original
# note each tab-separated line holds "user, movie, rating, time".
rawData = sc.textFile("file:///Users/wente/GitHub/MovieRating/wente_file/ml-100k/u1.base")
# The timestamp is not used, so keep only the first three fields of every line.
rawRatings = rawData.map(lambda line: line.split("\t")[:3])
# Convert each [user, movie, rating] record into a Rating object.
ratings = rawRatings.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2]))
)
ratings.first()

Rating(user=1, product=1, rating=5.0)

In [16]:
# Count the distinct values of field 0 (user) and field 1 (movie) that occur
# in the training records, then show both counts.
userNum = rawRatings.map(lambda fields: fields[0]).distinct().count()
predictNum = rawRatings.map(lambda fields: fields[1]).distinct().count()
display(userNum, predictNum)

943

1650

### test data(20%)

In [26]:
# Load the test split; it is parsed the same way as the training file
# (tab-separated fields, first three are user, movie, rating).
rawTest = sc.textFile("file:///Users/wente/GitHub/MovieRating/wente_file/ml-100k/u1.test")
# Drop the unused timestamp, then cast each record to an (int, int, float) tuple.
testData = (
    rawTest
    .map(lambda line: line.split("\t")[:3])
    .map(lambda fields: (int(fields[0]), int(fields[1]), float(fields[2])))
)
display(testData.first())
# Keep only the (user, movie) pair of every test record as input for prediction.
predictData = testData.map(lambda record: record[:2])
predictData.first()

(1, 6, 5.0)

(1, 6)

## 训练模型
### 参数的设置：

rank 因子个数，一般越多越好，不过内存开销也越大，合理值是 10-200。

iterations 迭代次数 对 ALS 10次就可以了。

lambda 正则化参数：用于控制模型的过拟合程度，其取值应使用样本外（非训练）的测试数据，通过交叉验证来调整。

In [10]:
# Train the ALS matrix-factorization model on the training ratings.
rank = 50          # number of latent factors
iterations = 10    # number of ALS iterations
theLambda = 0.01   # regularization parameter
# ALS starts from randomly initialised factors; fixing the seed keeps the trained
# model (and every metric derived from it) reproducible across kernel restarts.
randomSeed = 42
model = ALS.train(ratings, rank, iterations=iterations, lambda_=theLambda, seed=randomSeed)
model

<pyspark.mllib.recommendation.MatrixFactorizationModel at 0x112038d68>

In [11]:
# Predicted rating of user 789 for movie 123.
predict = model.predict(789,123)
predict

2.692111402220827

In [13]:
# Sanity check: show the user-factor RDD, then how many factor vectors the
# model holds for users and for products.
display(model.userFeatures())
display(model.userFeatures().count())     # 943 in the recorded output
display(model.productFeatures().count())  # 1650 in the recorded output

PythonRDD[438] at RDD at PythonRDD.scala:43

943

1650

## 预测评分

In [27]:
# Predict a rating for every (user, movie) pair of the test split and re-key
# each prediction as ((user, movie), predicted rating).
predictions = model.predictAll(predictData).map(lambda p: ((p[0], p[1]), p[2]))
predictions.first()

((316, 1084), 2.106355562777414)

## 模型评估
- 均方误差(Mean Squared Error, MSE)
常用于显式评级的情形
- Root Mean Squared Error,RMSE
- K值平均准确率 MAPK 

In [32]:
# Key the true test ratings by (user, movie) and join them with the predictions,
# giving ((user, movie), (true rating, predicted rating)) records.
result = testData.map(lambda record: ((record[0], record[1]), record[2])).join(predictions)
display(result.first())


((213, 121), (5.0, 3.8524447211157113))

In [33]:
# Mean squared error over the joined (true, predicted) rating pairs, and its
# square root (RMSE).
mse = result.map(lambda joined: (joined[1][0] - joined[1][1]) ** 2).mean()
rmse = np.sqrt(mse)
print("mse = {:.3f}".format(mse))
print("rmse = {:.3f}".format(rmse))

mse = 1.505
rmse = 1.227


## 用户推荐——为某个用户生成评分最高的前K个电影

In [43]:
# Produce the top-K recommended items (those with the highest predicted rating)
# for a single user.
userId = 6
K = 10
topKRecs = model.recommendProducts(userId, K)
topKRecs

[Rating(user=6, product=81, rating=5.515287852490642),
 Rating(user=6, product=727, rating=5.448419671229706),
 Rating(user=6, product=528, rating=5.227340949258899),
 Rating(user=6, product=659, rating=5.207600911445117),
 Rating(user=6, product=173, rating=5.160539449994344),
 Rating(user=6, product=519, rating=5.111068850390716),
 Rating(user=6, product=490, rating=5.110495282701655),
 Rating(user=6, product=478, rating=5.076015375473031),
 Rating(user=6, product=707, rating=5.055637538497926),
 Rating(user=6, product=489, rating=5.0514676526394044)]

In [44]:
# Look up the ratings that user `userId` actually gave in the test split,
# so they can be compared with the model's recommendations.
moviesForUser = testData.keyBy(lambda record: record[0]).lookup(userId)
len(moviesForUser)

101

In [46]:
# Show the user's K highest true ratings (ties keep their original order),
# followed by the model's top-K recommendations for comparison.
# Original author's observation: the two lists differ quite a lot.
display(sorted(moviesForUser, key=lambda record: record[2], reverse=True)[:K])
display("——————下面是预测数据——————")
display(topKRecs)

[(6, 14, 5.0),
 (6, 59, 5.0),
 (6, 98, 5.0),
 (6, 100, 5.0),
 (6, 124, 5.0),
 (6, 131, 5.0),
 (6, 134, 5.0),
 (6, 135, 5.0),
 (6, 136, 5.0),
 (6, 197, 5.0)]

'——————下面是预测数据——————'

[Rating(user=6, product=81, rating=5.515287852490642),
 Rating(user=6, product=727, rating=5.448419671229706),
 Rating(user=6, product=528, rating=5.227340949258899),
 Rating(user=6, product=659, rating=5.207600911445117),
 Rating(user=6, product=173, rating=5.160539449994344),
 Rating(user=6, product=519, rating=5.111068850390716),
 Rating(user=6, product=490, rating=5.110495282701655),
 Rating(user=6, product=478, rating=5.076015375473031),
 Rating(user=6, product=707, rating=5.055637538497926),
 Rating(user=6, product=489, rating=5.0514676526394044)]

## 物品推荐——计算与某个物品相似度最高的K个物品

In [47]:
# func余弦相似度计算

def cosineSimilarity(vec1, vec2):
    """Return the cosine similarity of two vectors, rescaled to [0, 1].

    The raw cosine lies in [-1, 1]; it is mapped through ``0.5 + 0.5 * cos`` so
    that opposite vectors score 0, orthogonal vectors 0.5 and parallel vectors 1.

    Args:
        vec1: Array-like of numbers (list, ``array.array``, ndarray or a 1xN
            ``np.matrix`` row). It is flattened to 1-D before use.
        vec2: Array-like with the same number of elements as ``vec1``.

    Returns:
        float: The rescaled similarity. When either vector has zero norm the
        cosine is undefined; it is treated as 0, so 0.5 is returned rather
        than NaN.

    Raises:
        ValueError: If the two vectors do not have the same number of elements.
    """
    # Local import kept from the original so the function stays self-contained.
    import numpy as np

    # Plain 1-D ndarrays replace the np.mat alias (removed in NumPy 2.0) and
    # avoid converting a 1x1 matrix to a scalar.
    a = np.asarray(vec1, dtype=float).ravel()
    b = np.asarray(vec2, dtype=float).ravel()

    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        # Zero-length or all-zero vector: no direction, report the neutral value.
        return 0.5

    cosine = float(np.dot(a, b) / denom)
    return 0.5 + 0.5 * cosine


In [48]:
# Factor vector of the item with id 100.
itemId = 100
itemFactorRows = model.productFeatures().lookup(itemId)
# lookup() returns a list of matching values; an empty list means the model has
# no factors for this id, so fail loudly instead of building an empty matrix.
if not itemFactorRows:
    raise ValueError("no product factors found for itemId {}".format(itemId))
# np.asmatrix builds the same 1xN matrix as the np.mat alias did; the alias
# itself was removed in NumPy 2.0.
itemFactor = np.asmatrix(itemFactorRows)
display(itemFactor)
# Sanity check: the similarity of the item's factor vector with itself should be 1.
cosineSimilarity(itemFactor,itemFactor)

matrix([[ 0.940781  , -0.58813453,  0.42437467, -1.71968317, -0.85785496,
         -0.73507148,  0.1358735 , -0.41329709, -0.08961392, -0.22453223,
         -0.22866897,  0.36925006,  0.52131379,  0.40339893, -1.16111124,
         -0.06876885,  0.7550447 ,  1.34409595,  0.41636825, -0.75265735,
         -0.17132822,  1.04754162,  0.54877031, -1.19848192,  1.51626456,
          1.79317737, -0.63476217, -0.50708252, -1.73963869,  0.25465798,
          0.50217831, -0.39770052,  0.39043236,  0.09335203,  0.43171662,
         -0.70431143,  1.78544664, -0.10555007,  0.15560056, -0.15454867,
         -0.83584517, -0.65195519,  1.92159963,  0.4936581 ,  1.082726  ,
          0.13944294, -0.23512444, -1.02493489,  1.20973265, -0.39064667]])

0.99999999999999989

In [49]:
# Score every product's factor vector against itemFactor, collect the
# (product id, similarity) pairs, and rank them from most to least similar.
sims = (
    model.productFeatures()
    .map(lambda idAndFactor: (idAndFactor[0], cosineSimilarity(idAndFactor[1], itemFactor)))
    .collect()
)
simsSorted = sorted(sims, key=lambda pair: pair[1], reverse=True)
simsSorted[:5]

[(100, 0.99999999999999989),
 (223, 0.94453409281378875),
 (191, 0.94235086677379964),
 (963, 0.94204077112197449),
 (30, 0.94129789674558628)]