## 库导入

In [17]:
from IPython.core.display import display
import numpy as np
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

## 评级数据导入

### train data(80%)

In [18]:
# import train data "user ,movie, rating, time "
rawData = sc.textFile("file:///Users/wente/GitHub/MovieRating/wente_file/ml-100k/u1.base")
# 用不到时间戳——弃掉
rawRatings = rawData.map(lambda x: x.split("\t")).map(lambda x: x[:3])
# 将数据转换为Rating对象
ratings = rawRatings.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
ratings.first()

Rating(user=1, product=1, rating=5.0)

In [19]:
userNum  = rawRatings.map(lambda x: x[0]).distinct().count()
predictNum = rawRatings.map(lambda x: x[1]).distinct().count()
display (userNum, predictNum)

943

1650

### test data(20%)

In [20]:
# import train data "user ,movie, rating, time "
rawTest = sc.textFile("file:///Users/wente/GitHub/MovieRating/wente_file/ml-100k/u1.test")
# 用不到时间戳——弃掉--->格式转换
testData = rawTest.map(lambda x: x.split("\t")).map(lambda x: x[:3]).map(lambda l: (int(l[0]), int(l[1]), float(l[2])))
display(testData.first())
# 将数据转换为
predictData = testData.map(lambda x: x[:2])
predictData.first()

(1, 6, 5.0)

(1, 6)

## 训练模型
### 参数的设置：

rank 因子个数，一般越多越好，不过内存开销也越大，合理值是 10-200。

iterations 迭代次数 对 ALS 10次就可以了。

lambda 正则化过程————>也就是过拟合情况，应该统计非样的测试数据 进行交叉验证调整。

In [21]:
## 训练
rank = 50
iterations = 10
theLambda  = 0.01
model = ALS.train(ratings, rank, iterations, theLambda)
model

<pyspark.mllib.recommendation.MatrixFactorizationModel at 0x111eb76d8>

In [22]:
## predict user789 对 movie123的rating预测
predict = model.predict(789,123)
predict

2.7660992202595964

In [23]:
# test
display (model.userFeatures())
display (model.userFeatures().count())# 6040
display (model.productFeatures().count()) # 3706

PythonRDD[503] at RDD at PythonRDD.scala:43

943

1650

## 对测试数据集进行预测

In [24]:
# predictions 
predictions = model.predictAll(predictData).map(lambda x: ((x[0],x[1]),x[2]))
predictions.first()

((316, 1084), 2.4544591958770345)

## 模型评估
- 均方误差(Mean Squared Error,MSE) 
常用于显示评级的情形
- Root Mean Squared Error,RMSE
- K值平均准确率 MAPK 

### 均方误差

In [25]:
# 数据整理
result = testData.map(lambda x: ((x[0],x[1]),x[2])).join(predictions)
display (result.first())


((213, 121), (5.0, 3.6714270837582674))

In [26]:
## MSE & RMSE
mse = result.map(lambda v: (v[1][0] - v[1][1]) ** 2).mean()
rmse = np.sqrt(mse)
print("mse = {:.3f}".format(mse))
print("rmse = {:.3f}".format(rmse))

mse = 1.481
rmse = 1.217


### K值平均准确率（单个用户）

In [52]:
# MAP的Python脚本如下：
def avgPrecisionK(actual, predicted, k): 
    if len(predicted) > k:
        predK = predicted[:k]
    else:
        predK = predicted
    score = 0.0
    numHits = 0.0
    for i,p in enumerate(predK):
        if p in actual and p not in predK:
            numHits = numHits + 1
            score = score + numHits/(i+1)
    if not actual:
        return 1.0
    else:
        return score/min(len(actual),k)

In [50]:
# 下面计算模型对用户789推荐的MAP。首先提取出用户实际评级过的电影的ID和预测电影ID:
# 数据整理
userId = 6
K = 10
actualMovies =[elem[1:] for elem in model.recommendProducts(userId, K)]
predictMovies = [elem[1:] for elem in testData.keyBy(lambda x: x[0]).lookup(userId)]
display (actualMovies[:1])
display(predictMovies[:1])

[(659, 6.396913771184828)]

[(14, 5.0)]

In [53]:
MAP10 = avgPrecisionK(actualMovies,predictMovies,10)
display (MAP10)

0.0

这里,APK的得分为0,这表明该模型在为该用户做相关电影预测上的表现并不理想。,推荐类任务的这个得分通常都较低,特别是当物品的数量极大时。

## 参数调整

## 用户推荐——为某个用户生产评分最高的前K个电影

In [27]:
# 为某个用户生产前K个推荐物品（评分最高的）
userId = 6
K = 10
topKRecs = model.recommendProducts(userId, K)
topKRecs

[Rating(user=6, product=659, rating=6.396913771184828),
 Rating(user=6, product=515, rating=5.946965727342597),
 Rating(user=6, product=1142, rating=5.625788705127286),
 Rating(user=6, product=205, rating=5.472596687607102),
 Rating(user=6, product=190, rating=5.365075266920087),
 Rating(user=6, product=357, rating=5.339812961327616),
 Rating(user=6, product=208, rating=5.324037223548699),
 Rating(user=6, product=603, rating=5.220581687133934),
 Rating(user=6, product=498, rating=5.14110769806142),
 Rating(user=6, product=646, rating=5.1084513138580805)]

In [28]:
# 查找真实数据中user798对电影的评分
# 为了和上面的预测对比
moviesForUser = testData.keyBy(lambda x: x[0]).lookup(userId)
len(moviesForUser)

101

In [29]:
# 对比上面的预测值topKRecs ——差别有点大啊
# 真实的评分最高的前K个物品（电影）
display (sorted(moviesForUser, key=lambda x:x[2], reverse=True)[:K])
display  ("——————下面是预测数据——————")
display  (topKRecs)

[(6, 14, 5.0),
 (6, 59, 5.0),
 (6, 98, 5.0),
 (6, 100, 5.0),
 (6, 124, 5.0),
 (6, 131, 5.0),
 (6, 134, 5.0),
 (6, 135, 5.0),
 (6, 136, 5.0),
 (6, 197, 5.0)]

'——————下面是预测数据——————'

[Rating(user=6, product=659, rating=6.396913771184828),
 Rating(user=6, product=515, rating=5.946965727342597),
 Rating(user=6, product=1142, rating=5.625788705127286),
 Rating(user=6, product=205, rating=5.472596687607102),
 Rating(user=6, product=190, rating=5.365075266920087),
 Rating(user=6, product=357, rating=5.339812961327616),
 Rating(user=6, product=208, rating=5.324037223548699),
 Rating(user=6, product=603, rating=5.220581687133934),
 Rating(user=6, product=498, rating=5.14110769806142),
 Rating(user=6, product=646, rating=5.1084513138580805)]

## 物品推荐——计算与某个物品相似度最高的K个物品

In [30]:
# func余弦相似度计算

def cosineSimilarity (vec1, vec2):
    import numpy as np
    from numpy import linalg as la
    
    inA = np.mat(vec1) 
    inB = np.mat(vec2)
    num = float(inA * inB.T) #若为行向量: A * B.T
    donom = la.norm(inA) * la.norm(inB) ##余弦值 
    return 0.5+ 0.5*(num / donom) # 归一化
    #关于归一化：因为余弦值的范围是 [-1,+1] ，相似度计算时一般需要把值归一化到 [0,1]


In [34]:
# 测试一下，结果应该为1    
testx = np.array([1.0,2.0,3.0])
display(cosineSimilarity(testx,testx))

1.0

In [31]:
# 物品100对应的因子
itemId = 100
itemFactor = model.productFeatures().lookup(itemId)
itemFactor = np.mat(itemFactor)
display(itemFactor)
#test物品567对应的因子和自己的余弦相似度计算 ——应该为1
cosineSimilarity(itemFactor,itemFactor)

matrix([[ 0.18132576, -0.367594  , -2.02750278,  0.14757667,  0.01309552,
          0.3418301 ,  0.02846426, -0.97273409, -0.23761597,  0.05263272,
         -0.58303279, -0.20545818, -0.46925241, -1.1589216 ,  0.44593593,
          0.44556555, -0.18251772, -0.37772518,  0.62050122,  0.05390786,
          0.03757767, -0.1753062 , -0.4385958 , -0.7255674 ,  0.93506473,
         -0.24861954,  0.77538127, -0.45520648, -1.26482451, -1.43371737,
          0.16625632,  0.14440562, -0.99962944, -0.5653494 ,  1.64225149,
          0.21734481, -1.13297093, -0.61752391,  0.33847439, -0.75160211,
         -1.12853777,  0.85624087, -0.56932443,  2.26373148, -0.59060234,
          0.31641617, -0.04694546,  0.64048362,  1.45574427, -1.46515882]])

1.0

In [35]:
# 计算100 与其他的物品的余弦相似度
sims = model.productFeatures().map(lambda x: (x[0],cosineSimilarity(x[1], itemFactor))).collect()
simsSorted = sorted(sims, key=lambda x:x[1], reverse=True)
simsSorted[:10]

[(100, 1.0),
 (297, 0.944038793895003),
 (134, 0.9405541803974149),
 (483, 0.93822144282935938),
 (484, 0.93775358381210028),
 (603, 0.93757508867269945),
 (30, 0.93500356535489326),
 (657, 0.93466656135533255),
 (223, 0.9343424937357363),
 (479, 0.93382210706393409)]