# 推荐系统入门项目

In [2]:
# 本项目使用的是MovieLens 1M 数据集，包含6000个用户在近4000部电影上的1亿条评论。
#数据集分为三个文件：用户数据users.dat，电影数据movies.dat和评分数据ratings.dat
! ls data

movies.dat  ratings.dat  README  u1.base  u1.test  users.dat


In [3]:
# 导入相关包
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import gc
import math
pd.set_option('display.float_format',lambda x : '%.3f' % x)
plt.style.use('seaborn-dark') 
plt.rcParams['axes.unicode_minus']=False 
plt.rcParams['figure.figsize'] = (10.0, 5.0)
plt.rcParams['font.sans-serif'] = ['SimHei']



In [4]:
def read_rating_data(path):
    """
    读取评分数据并存储为csv文件
    """
    f = pd.read_table(path,sep='::',names=['UserID','MovieID','Rating','Timestamp'])
    # f.to_csv('ratings.csv',index=False)
    return f

In [5]:
# 首先我们来看下数据
# 评分数据分别有用户ID、电影ID、评分和时间戳等字段
ratings = read_rating_data("data/ratings.dat")

  """


In [6]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## User-CF基于用户的协同过滤
一般流程：
1. 找到和目标用户兴趣相似的用户集合
2. 找和这个集合中的用户喜欢的，且目标用户没有听说过的物品推荐给目标用户

In [7]:
def calcute_similar(series1,series2):
    """ 
        计算余弦相似度
    """
    unionLen = len(set(series1) & set(series2))
    if unionLen == 0: return 0.0
    product = len(series1) * len(series2)
    similarity = unionLen / math.sqrt(product)
    return similarity


In [8]:
series1 = ratings[ratings['UserID'] == 1]['MovieID']
series2 = ratings[ratings['UserID'] == 2]['MovieID']
calcute_similar(series1, series2)

0.08465746311541544

In [9]:
def get_user_topk_sim(df,userid=1, k=10):
    """ 
        计算与userid相似度最高topk
    """
    target = df[df['UserID'] == userid]['MovieID']
    other_users = set(df['UserID'].unique()) - set([userid])
    others = [df[df['UserID'] == i]['MovieID'] for i in other_users]
    similarlist = [calcute_similar(target,other) for other in others]
    return pd.Series(similarlist,index=other_users).sort_values(ascending=False)[:k]



In [10]:
get_user_topk_sim(ratings, 1, k=10)

5343   0.389
1481   0.385
5190   0.377
1283   0.365
5705   0.334
6006   0.321
1858   0.319
4718   0.316
5762   0.316
681    0.312
dtype: float64

In [11]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [12]:
def calcute_interest(frame, similarSeries, targetItemID):
    """
        计算目标用户对目标物品的感兴趣程度
    """
    similarUserID = similarSeries.index                                                 #和用户兴趣最相似的K个用户
    similarUsers = [frame[frame['UserID'] == i] for i in similarUserID]                 #K个用户数据
    similarUserValues = similarSeries.values                                            #用户和其他用户的兴趣相似度
    UserInstItem = []
    for u in similarUsers:                                                              #其他用户对物品的感兴趣程度
        if targetItemID in u['MovieID'].values:
            UserInstItem.append(u[u['MovieID']==targetItemID]['Rating'].values[0])
        else:
            UserInstItem.append(0)
    interest = sum([similarUserValues[v]*UserInstItem[v]/5 for v in range(len(similarUserValues))])
    return interest


def calcuteItem(df, userid=1,k=10):
    """ 
        推荐topk给用户
    """
    similars = get_user_topk_sim(df, userid)
    user_movie_ids = set(df[df['UserID'] == userid]['MovieID'])
    other_movie_ids = set(df[df['UserID'] != userid]['MovieID'])
    movie_ids = user_movie_ids ^ other_movie_ids #差集
    interestlist = [calcute_interest(df, similars ,movie) for movie in movie_ids]
    return pd.Series(interestlist, index=movie_ids).sort_values(ascending=False)[:k]


In [13]:
calcuteItem(ratings, 1)

2081   2.642
2078   2.356
2096   2.164
2085   2.088
2080   1.864
596    1.836
364    1.696
593    1.579
2137   1.570
480    1.564
dtype: float64

In [15]:
! ls data/

movies.dat  ratings.dat  README  u1.base  u1.test  users.dat


In [16]:
from sklearn.metrics import mean_squared_error

def rmse(pred, actual):
    '''计算预测结果的rmse'''
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return np.sqrt(mean_squared_error(pred, actual))

In [17]:

title=['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv("data/u1.base",sep='\t',names = title)
test_df = pd.read_csv("data/u1.test",sep='\t',names = title)

In [18]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [19]:
# 构造评分矩阵
ratings = np.zeros((np.max(df['user_id']), np.max(df['item_id'])))
for row in df.itertuples():
    ratings[row[1]-1,row[2]-1] = row[3]
ratings

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [20]:
# 可以看出来评分矩阵是个非常稀疏的矩阵，95%的数据都是空值
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print('训练集矩阵密度为: {:4.2f}%'.format(sparsity))

训练集矩阵密度为: 5.04%


In [21]:
ratings.shape

(943, 1682)

In [22]:
all_mean = np.mean(ratings[ratings!=0])
user_mean = sum(ratings.T)/sum((ratings!=0).T)
item_mean = sum(ratings)/sum((ratings!=0))
#用all_mean填充user_mean和item_mean可能存在的空值Nan
user_mean = np.where(np.isnan(user_mean), all_mean, user_mean)
item_mean = np.where(np.isnan(item_mean), all_mean, item_mean)
# all_mean, user_mean, item_mean

  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
def predict_naive(user, item):
    prediction = item_mean[item] + user_mean[user] - all_mean
    return prediction

In [24]:
print('------ 基线算法(baseline) ------')
print('载入测试集...')
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用基线算法进行预测...')
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_naive(user, item))
    targets.append(actual)
print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))

------ 基线算法(baseline) ------
载入测试集...
测试集大小为 20000
采用基线算法进行预测...
测试结果的rmse为 0.9802


In [25]:
print('------ item-based协同过滤算法(相似度未归一化) ------')

------ item-based协同过滤算法(相似度未归一化) ------


In [26]:
def cal_similarity(ratings, kind, epsilon=1e-9):
    '''利用余弦距离计算相似度'''
    '''epsilon: 防止分母为0的异常'''
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [27]:
print('计算相似度矩阵...')
user_similarity = cal_similarity(ratings, kind='user')
item_similarity = cal_similarity(ratings, kind='item')
print('计算完成.')
print('相似度矩阵样例: (item-item)')
print(np.round_(item_similarity[:10,:10], 3))

计算相似度矩阵...
计算完成.
相似度矩阵样例: (item-item)
[[1.    0.358 0.309 0.374 0.234 0.088 0.542 0.41  0.422 0.235]
 [0.358 1.    0.222 0.419 0.283 0.084 0.335 0.286 0.193 0.12 ]
 [0.309 0.222 1.    0.263 0.144 0.078 0.308 0.187 0.289 0.148]
 [0.374 0.419 0.263 1.    0.273 0.1   0.402 0.4   0.339 0.185]
 [0.234 0.283 0.144 0.273 1.    0.017 0.276 0.185 0.206 0.04 ]
 [0.088 0.084 0.078 0.1   0.017 1.    0.123 0.07  0.14  0.139]
 [0.542 0.335 0.308 0.402 0.276 0.123 1.    0.336 0.446 0.256]
 [0.41  0.286 0.187 0.4   0.185 0.07  0.336 1.    0.32  0.201]
 [0.422 0.193 0.289 0.339 0.206 0.14  0.446 0.32  1.    0.213]
 [0.235 0.12  0.148 0.185 0.04  0.139 0.256 0.201 0.213 1.   ]]


In [28]:
def predict_itemCF(user, item, k=100):
    '''item-based协同过滤算法,预测rating'''
    nzero = ratings[user].nonzero()[0]
    prediction = ratings[user, nzero].dot(item_similarity[item, nzero])\
                / sum(item_similarity[item, nzero])
    return prediction

In [29]:
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用item-based协同过滤算法进行预测...')
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_itemCF(user, item))
    targets.append(actual)
print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))

测试集大小为 20000
采用item-based协同过滤算法进行预测...
测试结果的rmse为 1.0331


In [30]:
print('------ 结合基线算法的item-based协同过滤算法(相似度未归一化) ------')

------ 结合基线算法的item-based协同过滤算法(相似度未归一化) ------


In [31]:
def predict_itemCF_baseline(user, item, k=100):
    '''结合baseline的item-basedCF算法,预测rating'''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    prediction = (ratings[user, nzero] - baseline[nzero]).dot(item_similarity[item, nzero])\
                / sum(item_similarity[item, nzero]) + baseline[item]
    return prediction 

In [32]:
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用结合baseline的item-item协同过滤算法进行预测...')
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_itemCF_baseline(user, item))
    targets.append(actual)
print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))

测试集大小为 20000
采用结合baseline的item-item协同过滤算法进行预测...
测试结果的rmse为 0.9456


In [33]:
print('------ user-based协同过滤算法(相似度未归一化) ------')

def predict_userCF(user, item, k=100):
    '''user-user协同过滤算法,预测rating'''
    nzero = ratings[:,item].nonzero()[0]
    baseline = user_mean + item_mean[item] - all_mean
    prediction = ratings[nzero, item].dot(user_similarity[user, nzero])\
                / sum(user_similarity[user, nzero])
    # 冷启动问题: 该item暂时没有评分
    if np.isnan(prediction):
        prediction = baseline[user]
    return prediction


predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用user-user协同过滤算法进行预测...')

for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_userCF(user, item))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))

------ user-based协同过滤算法(相似度未归一化) ------
测试集大小为 20000
采用user-user协同过滤算法进行预测...


  


测试结果的rmse为 1.0264


In [34]:
print('------ 结合基线算法的的user-user协同过滤算法(相似度未归一化) ------')

def predict_userCF_baseline(user, item, k=100):
    '''结合baseline的user-user协同过滤算法,预测rating'''
    nzero = ratings[:,item].nonzero()[0]
    baseline = user_mean + item_mean[item] - all_mean
    prediction = (ratings[nzero, item] - baseline[nzero]).dot(user_similarity[user, nzero])\
                / sum(user_similarity[user, nzero]) + baseline[user]
    if np.isnan(prediction):
        prediction = baseline[user]
    return prediction

predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用结合baseline的user-user协同过滤算法进行预测...')

for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_userCF_baseline(user, item))
    targets.append(actual)
    
print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))

------ 结合基线算法的的user-user协同过滤算法(相似度未归一化) ------
测试集大小为 20000
采用结合baseline的user-user协同过滤算法进行预测...


  


测试结果的rmse为 0.9679


In [35]:
print('------ 经过修正后的协同过滤 ------')
def predict_biasCF(user, item, k=100):
    '''结合基线算法的item-based CF算法,预测rating'''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    prediction = (ratings[user, nzero] - baseline[nzero]).dot(item_similarity[item, nzero])\
                / sum(item_similarity[item, nzero]) + baseline[item]
    if prediction > 5:
        prediction = 5
    if prediction < 1:
        prediciton = 1
    return prediction

predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用结合baseline的item-based协同过滤算法进行预测...')
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_biasCF(user, item))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))

------ 经过修正后的协同过滤 ------
测试集大小为 20000
采用结合baseline的item-based协同过滤算法进行预测...
测试结果的rmse为 0.9455


In [36]:
print('------ Top-k协同过滤(item-based + baseline)------')
def predict_topkCF(user, item, k=10):
    '''top-k CF算法,以item-based协同过滤为基础，结合baseline,预测rating'''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    choice = nzero[item_similarity[item, nzero].argsort()[::-1][:k]]
    prediction = (ratings[user, choice] - baseline[choice]).dot(item_similarity[item, choice])\
                / sum(item_similarity[item, choice]) + baseline[item]
    if prediction > 5: prediction = 5
    if prediction < 1: prediction = 1
    return prediction 

predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用top K协同过滤算法进行预测...')
k = 20
print('选取的K值为%d.' % k)
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_topkCF(user, item, k))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))

------ Top-k协同过滤(item-based + baseline)------
测试集大小为 20000
采用top K协同过滤算法进行预测...
选取的K值为20.
测试结果的rmse为 0.9309


In [37]:
def cal_similarity_norm(ratings, kind, epsilon=1e-9):
    '''采用归一化的指标:Pearson correlation coefficient'''
    if kind == 'user':
        # 对同一个user的打分归一化
        rating_user_diff = ratings.copy()
        for i in range(ratings.shape[0]):
            nzero = ratings[i].nonzero()
            rating_user_diff[i][nzero] = ratings[i][nzero] - user_mean[i]
        sim = rating_user_diff.dot(rating_user_diff.T) + epsilon
    elif kind == 'item':
        # 对同一个item的打分归一化
        rating_item_diff = ratings.copy()
        for j in range(ratings.shape[1]):
            nzero = ratings[:,j].nonzero()
            rating_item_diff[:,j][nzero] = ratings[:,j][nzero] - item_mean[j]
        sim = rating_item_diff.T.dot(rating_item_diff) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

print('计算归一化的相似度矩阵...')
user_similarity_norm = cal_similarity_norm(ratings, kind='user')
item_similarity_norm = cal_similarity_norm(ratings, kind='item')
print('计算完成.')
print('相似度矩阵样例: (item-item)')
print(np.round_(item_similarity_norm[:10,:10], 3))

计算归一化的相似度矩阵...
计算完成.
相似度矩阵样例: (item-item)
[[ 1.     0.067  0.081  0.049  0.077  0.028  0.115  0.082  0.042  0.058]
 [ 0.067  1.    -0.002  0.127  0.054 -0.01   0.082  0.162 -0.045 -0.016]
 [ 0.081 -0.002  1.    -0.039  0.023  0.058  0.024 -0.036  0.     0.016]
 [ 0.049  0.127 -0.039  1.    -0.127  0.006  0.052  0.121  0.067  0.036]
 [ 0.077  0.054  0.023 -0.127  1.    -0.016  0.051  0.031  0.024 -0.046]
 [ 0.028 -0.01   0.058  0.006 -0.016  1.    -0.029 -0.01   0.013  0.035]
 [ 0.115  0.082  0.024  0.052  0.051 -0.029  1.     0.065  0.137  0.005]
 [ 0.082  0.162 -0.036  0.121  0.031 -0.01   0.065  1.     0.025  0.053]
 [ 0.042 -0.045  0.     0.067  0.024  0.013  0.137  0.025  1.    -0.013]
 [ 0.058 -0.016  0.016  0.036 -0.046  0.035  0.005  0.053 -0.013  1.   ]]


In [38]:
def predict_norm_CF(user, item, k=20):
    '''baseline + item-based + 皮尔森归一化'''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    choice = nzero[item_similarity_norm[item, nzero].argsort()[::-1][:k]]
    prediction = (ratings[user, choice] - baseline[choice]).dot(item_similarity_norm[item, choice])\
                / sum(item_similarity_norm[item, choice]) + baseline[item]
    if prediction > 5: prediction = 5
    if prediction < 1: prediction = 1
    return prediction 

predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用归一化矩阵方法，结合其它trick进行预测...')
k = 15
print('选取的K值为%d.' % k)
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_norm_CF(user, item, k))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))

测试集大小为 20000
采用归一化矩阵方法，结合其它trick进行预测...
选取的K值为15.
测试结果的rmse为 0.9388
