# 基于用户的协同过滤

# 电影的五分制评分的数据推荐
使用用户对电影的五分制评分的数据，采用基于用户的协同过滤算法，实现如下功能： 1.查看不同用户的皮尔逊相关系数 2.使用皮尔逊相似度计算一下与用户3兴趣最相近的3个用户 3.尝试对用户3做出影片推荐

In [16]:
import pandas as pd
table_name = ['userId', 'movieId', 'rating', 'timestamp']
ratings = pd.read_table('user_cf.csv', sep=',', header=0, names=table_name)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,3.5,1260759144
1,1,2,2.0,1260759179
2,1,4,4.5,1260759185
3,1,5,5.0,1260759205
4,1,6,1.5,1260759151
5,1,7,2.5,1260759187
6,1,8,2.0,1260759148
7,2,1,2.0,1260759125
8,2,2,3.5,1260759131
9,2,3,4.0,1260759135


In [2]:
# 转换成User-Item矩阵
df = ratings.pivot(index='userId', columns='movieId', values='rating')
df

movieId,1,2,3,4,5,6,7,8
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3.5,2.0,,4.5,5.0,1.5,2.5,2.0
2,2.0,3.5,4.0,,2.0,3.5,,3.0
3,5.0,1.0,1.0,3.0,5.0,1.0,,
4,3.0,4.0,4.5,,3.0,4.5,4.0,2.0
5,,4.0,1.0,4.0,,,4.0,1.0
6,,4.5,4.0,5.0,5.0,4.5,4.0,4.0
7,5.0,2.0,,3.0,5.0,4.0,5.0,
8,3.0,,,5.0,4.0,2.5,3.0,4.0


In [4]:
# 构建共同的评分向量
def build_xy(user_id1, user_id2):
    bool_array = df.loc[user_id1].notnull() & df.loc[user_id2].notnull()
    return df.loc[user_id1, bool_array], df.loc[user_id2, bool_array]
build_xy(1,2)  #userId分别为1和2的两个用户的共同评分矩阵
#对比UI矩阵，1和2的共同评分向量是正确的，即用户1和用户2都曾经对电影1、2、5、6、8做出过评价。

(movieId
 1    3.5
 2    2.0
 5    5.0
 6    1.5
 8    2.0
 Name: 1, dtype: float64, movieId
 1    2.0
 2    3.5
 5    2.0
 6    3.5
 8    3.0
 Name: 2, dtype: float64)

In [5]:
# 曼哈顿距离
def manhattan(user_id1, user_id2):
    x, y = build_xy(user_id1, user_id2)
    value = sum(abs(x - y))
    return value


# 欧几里德距离
def euclidean(user_id1, user_id2):
    x, y = build_xy(user_id1, user_id2)
    value = sum((x - y)**2)**0.5
    return value

# 余弦相似度
def cosine(user_id1, user_id2):
    x, y = build_xy(user_id1, user_id2)
    # 分母
    denominator = (sum(x*x)*sum(y*y))**0.5
    try:
        value = sum(x*y)/denominator
    except ZeroDivisionError:
        value = 0
    return value
# 皮尔逊相关系数
def pearson(user_id1, user_id2):
    x, y = build_xy(user_id1, user_id2)
    mean1, mean2 = x.mean(), y.mean()
    # 分母
    denominator = (sum((x-mean1)**2)*sum((y-mean2)**2))**0.5
    try:
        value = sum((x - mean1) * (y - mean2)) / denominator
    except ZeroDivisionError:
        value = 0
    return value
#用户1和用户2的皮尔逊相关系数
print(pearson(1,2))

-0.9040534990682686


In [12]:
#使用皮尔逊相似度计算一下与用户3兴趣最相近的3个用户：
metric_funcs = {
    'manhattan':manhattan,
    'euclidean': euclidean,
    'pearson': pearson,
    'cosine': cosine
}
# 计算最近的邻居
def computeNearestNeighbor(user_id, metric='pearson', k=3):
    """
    metric: 度量函数
    k:      返回k个邻居
    返回：pd.Series，其中index是邻居名称，values是距离
    """
    if metric in ['manhattan', 'euclidean']:
        return df.drop(user_id).index.to_series().apply(metric_funcs[metric], args=(user_id,)).nsmallest(k)
    elif metric in ['pearson', 'cosine']:
        return df.drop(user_id).index.to_series().apply(metric_funcs[metric], args=(user_id,)).nlargest(k)
print(computeNearestNeighbor(3))

userId
1    0.819782
6    0.801784
7    0.766965
Name: userId, dtype: float64


In [13]:
# 向给定用户推荐（返回：pd.Series）
def recommend(user_id):
    # 找到距离最近的用户id
    nearest_user_id = computeNearestNeighbor(user_id, metric='cosine').index[0]
    print('最近邻用户id：', nearest_user_id)
    # 找出邻居评价过、但自己未曾评价的商品
    # 结果：index是商品名称，values是评分
    return df.loc[nearest_user_id, df.loc[user_id].isnull() & df.loc[nearest_user_id].notnull()].sort_values()
#尝试对用户3做出推荐：
recommend(3)

最近邻用户id： 1


movieId
8    2.0
7    2.5
Name: 1, dtype: float64

# 基于物品的协同过滤

# 基于物品的协同过滤算法主要分为两步：
计算物品之间的相似度；
根据物品的相似度和用户的历史行为给用户生成推荐列表。 本程序简化为，找一个与该物品相似的物品

In [21]:
import math
import pdb
import pandas as pd
import os
os.chdir(r'C:\Users\CDA\data')

In [19]:
train = dict()
#用户-物品的评分表
for line in open("item_book.txt"):
    user,score,item = line.strip().split(",")
    train.setdefault(user,{})
    train[user][item] = int(float(score))

In [20]:
train

{'Liu Yi': {'1001': 3, '1002': 3, '1003': 4, '1004': 4, '1005': 5},
 'Chen Er': {'1001': 4},
 'Zhang San': {'1001': 3, '1003': 5, '1004': 3},
 'Li Si': {'1001': 3, '1002': 4, '1003': 5}}

In [24]:
pd.DataFrame(train)

Unnamed: 0,Liu Yi,Chen Er,Zhang San,Li Si
1001,3,4.0,3.0,3.0
1002,3,,,4.0
1003,4,,5.0,5.0
1004,4,,3.0,
1005,5,,,


In [21]:
cooccur = dict()  #物品-物品的共现矩阵
buy = dict()  #物品被多少个不同用户购买N
for user,items in train.items():
    for i in items.keys():
        buy.setdefault(i,0)
        buy[i] += 1
        cooccur.setdefault(i,{})
        for j in items.keys():
            if i == j : continue
            cooccur[i].setdefault(j,0)
            cooccur[i][j] += 1
#计算相似度矩阵
similar = dict()
for i,related_items in cooccur.items():
    similar.setdefault(i,{})
    for j,cij in related_items.items():
        similar[i][j] = cij / (math.sqrt(buy[i] * buy[j]))

In [23]:
import pandas as pd
pd.DataFrame(similar)

Unnamed: 0,1001,1002,1003,1004,1005
1001,,0.707107,0.866025,0.707107,0.5
1002,0.707107,,0.816497,0.5,0.707107
1003,0.866025,0.816497,,0.816497,0.57735
1004,0.707107,0.5,0.816497,,0.707107
1005,0.5,0.707107,0.57735,0.707107,


In [27]:
#给用户user推荐，前K个相关用户，前N个物品
user="Li Si"
K=3
N=10    
rank = dict()
action_item = train[user]     
#用户user产生过行为的item和评分
for item,score in action_item.items():
    sortedItems = sorted(similar[item].items(),key=lambda x:x[1],reverse=True)[0:K]
    for j,wj in sortedItems:
        if j in action_item.keys():
            continue
        rank.setdefault(j,0)
        rank[j] += score * wj
dict(sorted(rank.items(),key=lambda x:x[1],reverse=True)[0:N])

{'1004': 6.203803248198273, '1005': 2.82842712474619}

# ☆使用Surprise实现电影推荐

In [18]:
#https://blog.csdn.net/a378812/article/details/82813728
#数据集介绍

In [24]:
from surprise import Reader, Dataset, KNNBaseline

In [25]:
#查看数据
# 评分数据包含四列：userId, movieId, rating, timestamp
# 电影数据包含三列：movieId, title, genres
ratings = pd.read_csv('ml-latest-small/ratings.csv') 
movies = pd.read_csv('ml-latest-small/movies.csv')
# 把两个csv的数据合并成一份数据，以movieId为基准
df = pd.merge(ratings, movies, on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [7]:
df.shape

(100836, 6)

In [6]:
df.userId.unique().shape  #表中人数

(610,)

In [31]:
df.movieId.unique().shape  #表中电影数

(9724,)

In [26]:
# 用Reader对象指定数据格式
reader = Reader(rating_scale=(0.5, 5.0))
# 通过surprise.Dataset的load_from_df方法，加载pandas数据格式
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
# 生成训练数据集
trainset = data.build_full_trainset()

In [11]:
trainset

<surprise.trainset.Trainset at 0x6a79c50>

## 基于用户的协同过滤

In [27]:
sim_options = {'name': 'pearson_baseline', 'user_based': True}
# 用字典打包参数

user_based = KNNBaseline(sim_options=sim_options)
user_based.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0xedfcfd0>

In [29]:
user_based.compute_similarities().shape

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


(610, 610)

In [38]:
# 首先,把userId转换为surprise内部id，因为真实的外部userId是不连续的，不利于计算
inner_user_id = user_based.trainset.to_inner_uid(1)#查找id为1的用户的邻居
# 然后，通过内部id获取邻近用户id
neighbor_users = user_based.get_neighbors(inner_user_id, k=5)
print('neighbor users inner id: ', neighbor_users)
# 最后，通过to_raw_uid把内部用户id转换为真实的用户id
for u in neighbor_users:
    print('real user id: ', user_based.trainset.to_raw_uid(u))

neighbor users inner id:  [203, 316, 169, 146, 166]
real user id:  597
real user id:  369
real user id:  484
real user id:  414
real user id:  477


In [36]:
user_based.predict(1,1)

Prediction(uid=1, iid=1, r_ui=None, est=4.486371971135822, details={'actual_k': 40, 'was_impossible': False})

In [19]:
df[df.userId == 1].head()#id为1的用户看过的电影

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
215,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
267,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
369,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
572,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [20]:
uid = user_based.trainset.to_raw_uid(neighbor_users[0])#597
df[df.userId == uid].head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
203,597,1,4.0,941557863,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
363,597,6,3.0,940420695,Heat (1995),Action|Crime|Thriller
564,597,47,4.0,940361541,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
769,597,50,5.0,940362491,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
825,597,70,2.0,941559139,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller


In [29]:
df1=df[df.userId == 1]#用户1看过的电影
df2=df[df.userId == uid]#用户597看过的电影
#求差集
df2=df2.append(df1)
df2=df2.append(df1)
df2=df2.drop_duplicates(subset=['movieId'],keep=False)
df2[df2.rating==5].reset_index(drop=True)#评分为五分的电影

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,597,1704,5.0,940361937,Good Will Hunting (1997),Drama|Romance
1,597,1124,5.0,941639767,On Golden Pond (1981),Drama
2,597,1263,5.0,941640278,"Deer Hunter, The (1978)",Drama|War
3,597,1272,5.0,941631197,Patton (1970),Drama|War
4,597,1302,5.0,941639635,Field of Dreams (1989),Children|Drama|Fantasy
...,...,...,...,...,...,...
118,597,3068,5.0,941639767,"Verdict, The (1982)",Drama|Mystery
119,597,2240,5.0,941639635,My Bodyguard (1980),Drama
120,597,2241,5.0,941641362,Class (1983),Comedy
121,597,2259,5.0,941641362,Blame It on Rio (1984),Comedy|Romance


## 基于物品的协同过滤

In [30]:
# 这里基于物品的算法，所以要指定user_bases为False
sim_options = {'name': 'pearson_baseline', 'user_based': False}
item_baesd = KNNBaseline(sim_options=sim_options)
item_baesd.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x13ecd748>

In [32]:
#将item_id转化为内部ID
inner_item_id = user_based.trainset.to_inner_iid(1)
neighbor_items = user_based.get_neighbors(inner_item_id, k=10)
print('neighbor items inner id: ', neighbor_items)
# to_raw_uid转换为真实的物品id
for u in neighbor_items:
    print('real item id: ', item_baesd.trainset.to_raw_iid(u))

neighbor items inner id:  [203, 316, 169, 146, 166, 356, 92, 532, 59, 310]
real item id:  3147
real item id:  450
real item id:  2616
real item id:  2291
real item id:  2571
real item id:  1266
real item id:  1377
real item id:  168
real item id:  1060
real item id:  345


In [33]:
df1=df[df.userId == 1]#第一个用户看过的电影
df1.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
215,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
267,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
369,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
572,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [34]:
movieid=[item_baesd.trainset.to_raw_iid(u) for u in neighbor_items]
movieid

[3147, 450, 2616, 2291, 2571, 1266, 1377, 168, 1060, 345]

In [35]:
df2=pd.DataFrame()#与第一个电影最相邻的k个电影
for i in movieid:
    df2=df2.append(df[df.movieId == i])
df2=df2.drop_duplicates(subset=['movieId'],keep='first')
df2

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
15050,1,3147,5.0,964983873,"Green Mile, The (1999)",Crime|Drama
19714,4,450,2.0,986848828,With Honors (1994),Comedy|Drama
12971,1,2616,4.0,964983080,Dick Tracy (1990),Action|Crime
11782,1,2291,5.0,964983664,Edward Scissorhands (1990),Drama|Fantasy|Romance
12642,1,2571,5.0,964981888,"Matrix, The (1999)",Action|Sci-Fi|Thriller
21511,4,1266,4.0,986849037,Unforgiven (1992),Drama|Western
9245,1,1377,3.0,964982653,Batman Returns (1992),Action|Crime
29139,6,168,5.0,845553695,First Knight (1995),Action|Drama|Romance
5972,1,1060,4.0,964980924,Swingers (1996),Comedy|Drama
19441,4,345,4.0,945629063,"Adventures of Priscilla, Queen of the Desert, ...",Comedy|Drama


In [36]:
df2=df2.append(df1)
df2=df2.append(df1)
df2.drop_duplicates(subset=['movieId'],keep=False)#

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
19714,4,450,2.0,986848828,With Honors (1994),Comedy|Drama
21511,4,1266,4.0,986849037,Unforgiven (1992),Drama|Western
29139,6,168,5.0,845553695,First Knight (1995),Action|Drama|Romance
19441,4,345,4.0,945629063,"Adventures of Priscilla, Queen of the Desert, ...",Comedy|Drama
