## 库导入

In [119]:
from IPython.core.display import display
import numpy as np
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

## 评级数据导入

In [46]:
# 导入原始数据 "user ,movie, rating, time "
rawData = sc.textFile("file:///Users/wente/SparkTest/ml-100k/u.data")
rawData.take(5)                

['196\t242\t3\t881250949',
 '186\t302\t3\t891717742',
 '22\t377\t1\t878887116',
 '244\t51\t2\t880606923',
 '166\t346\t1\t886397596']

In [31]:
# 用不到时间戳——弃掉
rawRatings = rawData.map(lambda x: x.split("\t")).map(lambda x: x[:3])
rawRatings.first()

['196', '242', '3']

In [33]:
# 将数据转换为Rating对象
ratings = rawRatings.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
ratings.first()

Rating(user=196, product=242, rating=3.0)

## 导入电影数据

In [131]:
# 导入电影数据 ——
rawMovies = sc.textFile("file:///Users/wente/SparkTest/ml-100k/u.item")
rawMovies.take(2)

['1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0',
 '2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0']

In [52]:
# 处理电影数据——只保留“序号+名字”；--> 分割；-->去掉（年份）这样的东西
def extract_title(raw):
    import re
    grps = re.search("\((\w+)\)",raw)
    # print (grps)
    if grps:
        return raw[:grps.start()].strip()
    else :
        return raw
    
movies = rawMovies.map(lambda x: x.split("|")).map(lambda x: (int(x[0]), extract_title(x[1])))
movies.take(5)

[(1, 'Toy Story'),
 (2, 'GoldenEye'),
 (3, 'Four Rooms'),
 (4, 'Get Shorty'),
 (5, 'Copycat')]

In [136]:
# 转换成map
titles = movies.collectAsMap()
display(titles[123])

'Frighteners, The'

## 训练模型

In [41]:
## 训练
model = ALS.train(ratings, 50, 10, 0.01)
model

<pyspark.mllib.recommendation.MatrixFactorizationModel at 0x11205bbe0>

In [42]:
## predict user789 对 movie123的rating预测
predict = model.predict(789,123)
predict

3.1257644207381152

## 用户推荐——为某个用户生产评分最高的前K个电影

In [45]:
# 为某个用户生产前K个推荐物品（评分最高的）
userId = 789
K = 10
topKRecs = model.recommendProducts(userId, K)
topKRecs

[Rating(user=789, product=179, rating=6.219926182505045),
 Rating(user=789, product=642, rating=5.671590135799223),
 Rating(user=789, product=502, rating=5.564187725338691),
 Rating(user=789, product=573, rating=5.41679249716231),
 Rating(user=789, product=521, rating=5.404343444980412),
 Rating(user=789, product=182, rating=5.211604324881899),
 Rating(user=789, product=962, rating=5.114975343515041),
 Rating(user=789, product=183, rating=5.112916949918486),
 Rating(user=789, product=1019, rating=5.112229193436341),
 Rating(user=789, product=32, rating=5.1004955779490695)]

In [58]:
# 查找真实数据中user798对电影的评分
# 为了和上面的预测对比
moviesForUser = ratings.keyBy(lambda x: x.user).lookup(789)
len(moviesForUser)

33

In [120]:
# 对比上面的预测值topKRecs ——差别有点大啊
# 真实的评分最高的前K个物品（电影）
display (sorted(moviesForUser, key=lambda x:x.rating, reverse=True)[:K])
display  ("——————下面是预测数据——————")
display  (topKRecs)

[Rating(user=789, product=127, rating=5.0),
 Rating(user=789, product=475, rating=5.0),
 Rating(user=789, product=9, rating=5.0),
 Rating(user=789, product=50, rating=5.0),
 Rating(user=789, product=150, rating=5.0),
 Rating(user=789, product=276, rating=5.0),
 Rating(user=789, product=129, rating=5.0),
 Rating(user=789, product=100, rating=5.0),
 Rating(user=789, product=741, rating=5.0),
 Rating(user=789, product=1012, rating=4.0)]

'——————下面是预测数据——————'

[Rating(user=789, product=179, rating=6.219926182505045),
 Rating(user=789, product=642, rating=5.671590135799223),
 Rating(user=789, product=502, rating=5.564187725338691),
 Rating(user=789, product=573, rating=5.41679249716231),
 Rating(user=789, product=521, rating=5.404343444980412),
 Rating(user=789, product=182, rating=5.211604324881899),
 Rating(user=789, product=962, rating=5.114975343515041),
 Rating(user=789, product=183, rating=5.112916949918486),
 Rating(user=789, product=1019, rating=5.112229193436341),
 Rating(user=789, product=32, rating=5.1004955779490695)]

## 物品推荐——计算与某个物品相似度最高的K个物品

In [132]:
# func余弦相似度计算

def cosineSimilarity (vec1, vec2):
    import numpy as np
    from numpy import linalg as la
    
    inA = np.mat(vec1) 
    inB = np.mat(vec2)
    num = float(inA * inB.T) #若为行向量: A * B.T
    donom = la.norm(inA) * la.norm(inB) ##余弦值 
    return 0.5+ 0.5*(num / donom) # 归一化
    #关于归一化：因为余弦值的范围是 [-1,+1] ，相似度计算时一般需要把值归一化到 [0,1]


In [122]:
# 物品567对应的因子
itemId = 567
itemFactor = model.productFeatures().lookup(itemId)
itemFactor = np.mat(itemFactor)
display(itemFactor)
#test物品567对应的因子和自己的余弦相似度计算 ——应该为1
cosineSimilarity(itemFactor,itemFactor)

matrix([[ 0.30001232, -0.82738632,  0.04953906,  0.11014523, -0.7215237 ,
         -1.16773701, -0.06502534, -0.59308511,  0.96469522,  0.3042264 ,
         -1.0616982 , -0.24189724,  0.19097036,  0.99766499,  1.03015256,
          0.82253695,  0.43349966,  0.23172313, -0.29670504,  0.68732196,
         -0.05919091,  0.90864134,  0.19700296, -0.33594021, -1.31641686,
          0.92831606,  0.53180593, -0.37785533,  0.07913431, -0.38576484,
         -0.34545758,  0.14826232, -0.41177404, -0.18283407, -0.00584139,
         -0.50690824, -0.00588502, -0.78301221, -0.4702107 , -0.14911231,
          0.34510347, -0.09306512,  0.05045496,  0.45057371,  1.23036456,
         -0.89369899, -0.380521  ,  0.0276823 ,  0.43097624,  0.31901327]])

1.0

In [123]:
# 计算735 与其他的物品的余弦相似度
sims = model.productFeatures().map(lambda x: (x[0],cosineSimilarity(x[1], itemFactor))).collect()
simsSorted = sorted(sims, key=lambda x:x[1], reverse=True)
simsSorted[:5]

[(567, 1.0),
 (413, 0.86113899935782356),
 (899, 0.85484505016523227),
 (219, 0.84834957136326716),
 (1527, 0.844293892019359)]

In [114]:
# test编号对应的名字
titles[itemId]

"Wes Craven's New Nightmare"

In [128]:
# 编号修改为名字
simsWithName = list(map((lambda x : (titles[x[0]],x[1])),simsSorted))
simsWithName[:5]

[("Wes Craven's New Nightmare", 1.0),
 ('Tales from the Crypt Presents: Bordello of Blood', 0.86113899935782356),
 ('Winter Guest, The', 0.85484505016523227),
 ('Nightmare on Elm Street, A', 0.84834957136326716),
 ('Senseless', 0.844293892019359)]