## 使用协同过滤基于网易云音乐数据构建模型并进行预测

In [1]:
import os
import pickle
from surprise import KNNBaseline, Reader
from surprise import Dataset

path = "./data/output/popular/"

# 重建歌单id到歌单名的映射字典
id_name_dic = pickle.load(open( path+"popular_playlist.pkl","rb"))
print("加载歌单id到歌单名的映射字典完成...")
# 重建歌单名到歌单id的映射字典
name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("加载歌单名到歌单id的映射字典完成...")

file_path = os.path.expanduser(path+"popular_music_suprise_format.txt")
# 指定文件格式
reader = Reader(line_format='user item rating timestamp', sep=',')
# 从文件读取数据
music_data = Dataset.load_from_file(file_path, reader=reader)
# 计算歌曲和歌曲之间的相似度
print("构建数据集...")
trainset = music_data.build_full_trainset()
#sim_options = {'name': 'pearson_baseline', 'user_based': False}

加载歌单id到歌单名的映射字典完成...
加载歌单名到歌单id的映射字典完成...
构建数据集...


In [2]:
i = list(id_name_dic.keys())[2]
i

'21770258'

In [3]:
print(id_name_dic[i])

周杰伦好听的“三字曲”


In [4]:
trainset.n_items

50539

In [5]:
trainset.n_users

1076

## 基于用户的协同过滤

主要思想：找出和当前用户兴趣相近的用户，针对网易云音乐歌单数据而言，这里的用户就是歌单

In [6]:
print("开始训练模型...")
#sim_options = {'user_based': False}
#algo = KNNBaseline(sim_options=sim_options)
algo = KNNBaseline()

algo.fit(trainset)

current_playlist = list(name_id_dic.keys())[39]
print("歌单名称", current_playlist)

# 取出近邻
# 映射名字到id
playlist_id = name_id_dic[current_playlist]
print("歌单id", playlist_id)
# 取出来对应的内部user id => to_inner_uid
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print("内部id", playlist_inner_id)

playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)

# 把歌曲id转成歌曲名字
# to_raw_uid映射回去
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)
                       for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[playlist_id]
                       for playlist_id in playlist_neighbors)

print()
print("和歌单 《", current_playlist, "》 最接近的10个歌单为：\n")
for playlist in playlist_neighbors:
    print(playlist, algo.trainset.to_inner_uid(name_id_dic[playlist]))

开始训练模型...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
歌单名称 适合吉他初学者弹奏的歌曲
歌单id 69758545
内部id 723

和歌单 《 适合吉他初学者弹奏的歌曲 》 最接近的10个歌单为：

当过千评论的华语翻唱遇上“原唱”【更新】 1
【华语】暖心物语 纯白思念 3
〖循环〗单曲循环是强迫症吗？ 4
简单的爱总是那么吸引人 6
『华语/回忆』95后陪伴我中学时期的歌曲 13
所有的大人，曾经都是小孩 16
有没有一首歌让你泪流满面 17
专属你的周杰伦 18
云村村民专属歌单 20
「华语歌曲」 23


## 基于协同过滤的用户评分预测

In [7]:
import pickle
# 重建歌曲id到歌曲名的映射字典
song_id_name_dic = pickle.load(open(path+"popular_song.pkl","rb"))
print("加载歌曲id到歌曲名的映射字典完成...")
# 重建歌曲名到歌曲id的映射字典
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id
print("加载歌曲名到歌曲id的映射字典完成...")

加载歌曲id到歌曲名的映射字典完成...
加载歌曲名到歌曲id的映射字典完成...


In [8]:
#内部编码的4号用户
user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x:x[0], user_rating)
for song in items:
    print(algo.predict(user_inner_id, song, r_ui=1), song_id_name_dic[algo.trainset.to_raw_iid(song)])

user: 4          item: 478        r_ui = 1.00   est = 1.00   {'was_impossible': False} 听见下雨的声音	魏如昀
user: 4          item: 429        r_ui = 1.00   est = 1.00   {'was_impossible': False} 梦一场	萧敬腾
user: 4          item: 936        r_ui = 1.00   est = 1.00   {'was_impossible': False} 干杯	西瓜Kune
user: 4          item: 937        r_ui = 1.00   est = 1.00   {'was_impossible': False} 给自己的歌 (Live) - live	纵贯线
user: 4          item: 938        r_ui = 1.00   est = 1.00   {'was_impossible': False} 小半	陈粒
user: 4          item: 939        r_ui = 1.00   est = 1.00   {'was_impossible': False} 思念是一种病(Live) - live	张震岳
user: 4          item: 940        r_ui = 1.00   est = 1.00   {'was_impossible': False} 可以不可以	丁当
user: 4          item: 941        r_ui = 1.00   est = 1.00   {'was_impossible': False} 秋酿	房东的猫
user: 4          item: 616        r_ui = 1.00   est = 1.00   {'was_impossible': False} 退后	周杰伦
user: 4          item: 942        r_ui = 1.00   est = 1.00   {'was_impossible': False} 阴天	莫文蔚
user: 4        

## 基于矩阵分解的用户评分预测

In [9]:
### 使用NMF
from surprise import NMF
from surprise import Dataset

file_path = os.path.expanduser(path+'./popular_music_suprise_format.txt')
# 指定文件格式
reader = Reader(line_format='user item rating timestamp', sep=',')
# 从文件读取数据
music_data = Dataset.load_from_file(file_path, reader=reader)
# 构建数据集和建模
algo = NMF()
trainset = music_data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x63a80b8>

In [10]:
user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x:x[0], user_rating)
for song in items:
    print(algo.predict(algo.trainset.to_raw_uid(user_inner_id), algo.trainset.to_raw_iid(song), r_ui=1), song_id_name_dic[algo.trainset.to_raw_iid(song)])

user: 400232387  item: 27724082   r_ui = 1.00   est = 1.00   {'was_impossible': False} 听见下雨的声音	魏如昀
user: 400232387  item: 167916     r_ui = 1.00   est = 1.00   {'was_impossible': False} 梦一场	萧敬腾
user: 400232387  item: 408307325  r_ui = 1.00   est = 1.00   {'was_impossible': False} 干杯	西瓜Kune
user: 400232387  item: 394618     r_ui = 1.00   est = 1.00   {'was_impossible': False} 给自己的歌 (Live) - live	纵贯线
user: 400232387  item: 421423806  r_ui = 1.00   est = 1.00   {'was_impossible': False} 小半	陈粒
user: 400232387  item: 394485     r_ui = 1.00   est = 1.00   {'was_impossible': False} 思念是一种病(Live) - live	张震岳
user: 400232387  item: 5239563    r_ui = 1.00   est = 1.00   {'was_impossible': False} 可以不可以	丁当
user: 400232387  item: 30635613   r_ui = 1.00   est = 1.00   {'was_impossible': False} 秋酿	房东的猫
user: 400232387  item: 185884     r_ui = 1.00   est = 1.00   {'was_impossible': False} 退后	周杰伦
user: 400232387  item: 276936     r_ui = 1.00   est = 1.00   {'was_impossible': False} 阴天	莫文蔚
user: 400232387

## 模型保存与加载

In [11]:
import surprise
surprise.dump.dump('./model/recommendation.model', algo=algo)
# 可以用下面的方式载入
algo = surprise.dump.load('./model/recommendation.model')

## 不同的推荐系统算法评估

In [12]:
import os
from surprise import Reader, Dataset
# 指定文件路径
file_path = os.path.expanduser(path+'./popular_music_suprise_format.txt')
# 指定文件格式
reader = Reader(line_format='user item rating timestamp', sep=',')
# 从文件读取数据
music_data = Dataset.load_from_file(file_path, reader=reader)

In [13]:
from surprise.model_selection import cross_validate

### 使用BaselineOnly

In [15]:
from surprise import BaselineOnly
algo = BaselineOnly()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
RMSE (testset)    0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
Fit time          0.90    0.93    0.75    0.72    0.76    0.81    0.09    
Test time         0.64    0.63    0.40    0.41    0.42    0.50    0.11    


### 使用基础版协同过滤

In [16]:
from surprise import KNNBasic
algo = KNNBasic()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
RMSE (testset)    0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
Fit time          0.13    0.24    0.20    0.21    0.21    0.20    0.04    
Test time         2.01    2.12    1.99    1.98    1.98    2.01    0.06    


### 使用均值协同过滤

In [17]:
from surprise import KNNWithMeans
algo = KNNWithMeans()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
RMSE (testset)    0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
Fit time          0.30    0.30    0.25    0.28    0.27    0.28    0.02    
Test time         2.85    2.17    2.21    2.16    1.93    2.27    0.31    


### 使用协同过滤baseline

In [18]:
from surprise import KNNBaseline
algo = KNNBaseline()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
RMSE (testset)    0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  
Fit time          0.86    0.87    1.08    0.96    0.92    0.94    0.08    
Test time         2.47    2.36    3.00    2.57    2.48    2.57    0.22    


### 使用SVD

In [19]:
from surprise import SVD
algo = SVD()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.0165  0.0167  0.0167  0.0165  0.0166  0.0166  0.0001  
RMSE (testset)    0.0365  0.0366  0.0368  0.0364  0.0369  0.0366  0.0002  
Fit time          12.87   12.77   13.10   12.83   12.82   12.88   0.12    
Test time         0.63    0.48    0.65    0.46    0.61    0.56    0.08    


由于云平台资源有限，下面的代码没有继续运行演示，同学们要看效果的话可以重启一下，然后只运行部分算法

### 使用SVD++

In [None]:
from surprise import SVDpp
algo = SVDpp()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

### 使用NMF

In [None]:
from surprise import NMF
algo = NMF()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)