# 基于用户的协同过滤

In [1]:
# Ratings given by seven critics ('A'..'G') to a small set of movies.
# Layout: critics[critic][movie title] -> score.  A critic only has entries
# for the movies they rated, so the inner dicts differ in size.
critics = {
    'A': {'老炮儿':3.5,'唐人街探案': 1.0},
    'B': {'老炮儿':2.5,'唐人街探案': 3.5,'星球大战': 3.0, '寻龙诀': 3.5, '神探夏洛克': 2.5, '小门神': 3.0},
    'C': {'老炮儿':3.0,'唐人街探案': 3.5,'星球大战': 1.5, '寻龙诀': 5.0, '神探夏洛克': 3.0, '小门神': 3.5},
    'D': {'老炮儿':2.5,'唐人街探案': 3.5,'寻龙诀': 3.5, '神探夏洛克': 4.0},
    'E': {'老炮儿':3.5,'唐人街探案': 2.0,'星球大战': 4.5, '神探夏洛克': 3.5, '小门神': 2.0},
    'F': {'老炮儿':3.0,'唐人街探案': 4.0,'星球大战': 2.0, '寻龙诀': 3.0, '神探夏洛克': 3.0, '小门神': 2.0},
    'G': {'老炮儿':4.5,'唐人街探案': 1.5,'星球大战': 3.0, '寻龙诀': 5.0, '神探夏洛克': 3.5}
    }

# Look up a single rating: critic 'B' on '星球大战'.
print(critics['B']['星球大战'])
from math import sqrt


def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance-based similarity between two people.

    Only items rated by both people are compared.  The score is
    ``1 / (1 + d)``, where ``d`` is the Euclidean distance between the two
    rating vectors over the shared items, so identical ratings give 1.0 and
    the score shrinks towards 0 as the ratings diverge.

    Args:
        prefs: Mapping ``person -> {item: rating}``.
        person1: Key of the first person in ``prefs``.
        person2: Key of the second person in ``prefs``.

    Returns:
        ``0`` when the two people share no rated item, otherwise a float in
        ``(0, 1]``.
    """
    # Items rated by both people, in person1's iteration order.
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared:
        return 0

    ratings_a = prefs[person1]
    ratings_b = prefs[person2]
    squared_diffs = [pow(ratings_a[item] - ratings_b[item], 2) for item in shared]
    distance = sqrt(sum(squared_diffs))
    return 1 / (1 + distance)

# Example: similarity between critics 'A' and 'B' over the movies both rated.
print(sim_distance(critics, 'A', 'B'))



def getRecommendations(prefs, person, similarity=sim_distance):
    """Rank the items ``person`` has not rated by predicted score.

    For every item that ``person`` has no rating for, the prediction is the
    similarity-weighted average of the other people's ratings:
    ``sum(rating * sim) / sum(sim)``.

    Args:
        prefs: Mapping ``person -> {item: rating}``.
        person: Key in ``prefs`` to produce recommendations for.
        similarity: Callable ``(prefs, person_a, person_b) -> number`` used
            to weight each other person's ratings.

    Returns:
        List of ``(predicted_score, item)`` tuples sorted in descending
        order.  Items whose similarity weights sum to zero are left out,
        because no weighted average can be formed for them.
    """
    seen = prefs[person]
    totals = {}
    simSums = {}
    for other, other_ratings in prefs.items():
        # Never compare a person with themselves.
        if other == person:
            continue
        # Every other person contributes, whatever their similarity: no
        # threshold on low or non-positive scores is applied here.
        sim = similarity(prefs, person, other)
        for item, rating in other_ratings.items():
            # Only score items the person has not rated yet.
            if item in seen:
                continue
            totals[item] = totals.get(item, 0) + rating * sim
            simSums[item] = simSums.get(item, 0) + sim

    # Normalise by the summed weights.  A zero sum (e.g. the item was rated
    # only by people with similarity 0) would divide by zero, so such items
    # are skipped instead of aborting the whole ranking.
    rankings = [
        (total / simSums[item], item)
        for item, total in totals.items()
        if simSums[item] != 0
    ]

    rankings.sort(reverse=True)
    return rankings

# Example: movies critic 'A' has not rated, highest predicted score first.
getRecommendations(critics, 'A')

3.0
0.2708131845707603


[(4.152703901679927, '寻龙诀'),
 (3.304207244554503, '神探夏洛克'),
 (3.045124682040546, '星球大战'),
 (2.5333970389243956, '小门神')]

# 基于物品的协同过滤 

##### 皮尔逊相关系数计算方法

In [2]:
from scipy import stats

# Pearson correlation between two short rating lists; the result carries the
# correlation coefficient and its p-value.  The values match the ratings that
# critics B, C and F gave to '寻龙诀' and '小门神' in the critics dict.
stats.pearsonr([3.5, 5.0, 3.0], [3.0, 3.5, 2.0])

PearsonRResult(statistic=0.8910421112136306, pvalue=0.2999500933457434)

##### 读取数据

In [3]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from scipy import stats
# Turn off pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None

# Load the ratings table from a tab-separated file; the cells below read its
# "user", "movie" and "rate" columns.
data = pd.read_csv("data/movies.csv", sep="\t")
data

Unnamed: 0,user,movie,rate
0,A,老炮儿,3.5
1,A,唐人街探案,1.0
2,B,老炮儿,2.5
3,B,唐人街探案,3.5
4,B,星球大战,3.0
5,B,寻龙诀,3.5
6,B,神探夏洛克,2.5
7,B,小门神,3.0
8,C,老炮儿,3.0
9,C,唐人街探案,3.5


In [4]:
# Distinct movie titles as a plain list; a title's position in this list is
# used below as its row/column index in the similarity matrix.
item_names:list = data["movie"].unique().tolist()   
print(item_names)

['老炮儿', '唐人街探案', '星球大战', '寻龙诀', '神探夏洛克', '小门神']


##### 构造相似度矩阵

In [5]:
def item_sim(item_i_df:DataFrame, item_j_df:DataFrame):
    """Return the Pearson correlation between the ratings of two items.

    The two frames are matched on ``user`` with an inner join, so only users
    who rated both items contribute to the correlation.

    Neither input frame is modified; the same frame may safely be passed for
    both arguments and reused afterwards.

    Args:
        item_i_df: Ratings of the first item; must have ``user`` and ``rate``
            columns.
        item_j_df: Ratings of the second item, same layout.

    Returns:
        The Pearson correlation coefficient of the paired ratings.

    Raises:
        ValueError: Propagated from ``scipy.stats.pearsonr`` when fewer than
            two users rated both items.
    """
    # Work on derived Series instead of editing the caller's frames in place.
    rate_i = item_i_df.set_index("user")["rate"].rename("rate_i")
    rate_j = item_j_df.set_index("user")["rate"].rename("rate_j")

    # Keep only the users present on both sides.
    joined_df = rate_i.to_frame().join(rate_j, how="inner")
    s, _ = stats.pearsonr(joined_df["rate_i"], joined_df["rate_j"])

    return s

# Item-item similarity matrix; row/column k corresponds to item_names[k].
sim_matrix = np.zeros((len(item_names), len(item_names)))

item_group = data.groupby("movie")
group_laopao = item_group.get_group("老炮儿")
print(group_laopao)

# Fill every (i, j) cell, diagonal included.  The two group frames are
# fetched again for each pair rather than cached and shared between calls.
for i, movie_i in enumerate(item_names):
    for j, movie_j in enumerate(item_names):
        sim_matrix[i, j] = item_sim(
            item_group.get_group(movie_i),
            item_group.get_group(movie_j),
        )

print(sim_matrix)

   user movie  rate
0     A   老炮儿   3.5
2     B   老炮儿   2.5
8     C   老炮儿   3.0
14    D   老炮儿   2.5
18    E   老炮儿   3.5
23    F   老炮儿   3.0
29    G   老炮儿   4.5
[[ 1.         -0.76916737  0.3007374   0.65060005  0.2533202  -0.54433105]
 [-0.76916737  1.         -0.67120927 -0.68551062 -0.38138504  0.32075015]
 [ 0.3007374  -0.67120927  1.         -0.08084521  0.44129801 -0.54594868]
 [ 0.65060005 -0.68551062 -0.08084521  1.          0.11720181  0.89104211]
 [ 0.2533202  -0.38138504  0.44129801  0.11720181  1.         -0.54433105]
 [-0.54433105  0.32075015 -0.54594868  0.89104211 -0.54433105  1.        ]]


##### 根据看过的电影和未看过的电影，查询相似度矩阵

In [6]:
# All rating rows of user "A".
user_df: DataFrame = data.groupby("user").get_group("A")
watched_movies = user_df["movie"]
# Position of each watched movie in item_names, i.e. its row/column in
# sim_matrix.
watched_movie_indexes = list(map(item_names.index, watched_movies))
watched_movie_indexes

[0, 1]

In [7]:
# Indexes of the movies the user has not rated.  Built by scanning the index
# range so the order is always ascending; converting a set difference to a
# list gives no ordering guarantee, and the later cells pair these indexes
# with score columns by position.
watched_index_set = set(watched_movie_indexes)
unwatched_movie_indexes = [
    idx for idx in range(len(item_names)) if idx not in watched_index_set
]
unwatched_movie_indexes

[2, 3, 4, 5]

In [8]:
# Sub-matrix of sim_matrix: one row per watched movie, one column per
# unwatched movie.
watched_unwatched_movie_sim_array = sim_matrix[
    np.ix_(watched_movie_indexes, unwatched_movie_indexes)
]
watched_unwatched_movie_sim_array

array([[ 0.3007374 ,  0.65060005,  0.2533202 , -0.54433105],
       [-0.67120927, -0.68551062, -0.38138504,  0.32075015]])

##### 查询用户对看过的电影的评分

In [9]:
# Map each watched movie title to its index in item_names.
rep = dict(zip(watched_movies, watched_movie_indexes))
# Re-encode only the "movie" column.  A frame-wide replace would also rewrite
# any other cell (e.g. a user id) that happens to equal a movie title.
watched_movie_rate_df = user_df.assign(movie=user_df["movie"].map(rep))
watched_movie_rate_df

Unnamed: 0,user,movie,rate
0,A,0,3.5
1,A,1,1.0


In [10]:
# Two-column array: movie index, rating given by the user.
watched_movie_rate_array = watched_movie_rate_df[["movie", "rate"]].to_numpy()
watched_movie_rate_array

array([[0. , 3.5],
       [1. , 1. ]])

##### 推荐度计算

In [11]:
# Score of each unwatched movie: the user's ratings (column 1) dotted with
# that movie's similarities to the watched movies.
recommend_score = np.dot(
    watched_movie_rate_array[:, 1], watched_unwatched_movie_sim_array
)
recommend_score

array([ 0.38137165,  1.59158955,  0.50523566, -1.58440854])

In [12]:
# Pair each unwatched movie index with its score.
list(zip(unwatched_movie_indexes, recommend_score))

[(2, 0.38137164735743034),
 (3, 1.5915895488293916),
 (4, 0.5052356592353362),
 (5, -1.5844085392815683)]

In [13]:
# Same (index, score) pairs, highest score first.
sorted(zip(unwatched_movie_indexes, recommend_score), key=lambda x:x[1], reverse=True)

[(3, 1.5915895488293916),
 (4, 0.5052356592353362),
 (2, 0.38137164735743034),
 (5, -1.5844085392815683)]

# 基于物品的协同过滤 - PySpark

In [14]:
import os
import pandas as pd
from scipy import stats

def sim(ur_ur):
    """Compute the Pearson correlation for one pair of movies.

    Args:
        ur_ur: ``(movie, [ratings_a, ratings_b])`` where each ratings entry
            is a list of ``[user, rating]`` pairs.  Ratings are converted to
            float before the correlation is computed.

    Returns:
        ``[movie, pearson_value]`` with ``movie`` taken unchanged from
        ``ur_ur[0]``.  Only users present in both rating lists are used.
    """
    movie_key = ur_ur[0]
    ratings_pair = ur_ur[1]

    frame_a = pd.DataFrame(ratings_pair[0], columns=['user', 'rating_a']).set_index('user')
    frame_b = pd.DataFrame(ratings_pair[1], columns=['user', 'rating_b']).set_index('user')

    # Inner join keeps only the users who rated both movies.
    u_r_r = frame_a.join(frame_b, how='inner')
    print(u_r_r)

    pearson_value, _ = stats.pearsonr(
        u_r_r['rating_a'].astype(float),
        u_r_r['rating_b'].astype(float),
    )
    print(movie_key, pearson_value)
    return [movie_key, pearson_value]


def recommend(bc, m_sim):
    """Score one candidate movie against the movies a user already rated.

    Args:
        bc: Object whose ``value`` attribute is a list of ``[movie, rating]``
            pairs for the movies the user has rated.
        m_sim: List of ``[movie, similarity]`` pairs giving the candidate's
            similarity to other movies.

    Returns:
        The dot product of the user's ratings with the candidate's
        similarities to those same rated movies.
    """
    rated_df = pd.DataFrame(bc.value, columns=['movie', 'value']).set_index('movie')
    all_sims_df = pd.DataFrame(m_sim, columns=['movie', 'value']).set_index('movie')

    # Keep only the similarities towards the movies the user has rated.
    m_sim_sub_df = all_sims_df.loc[rated_df.index.values]

    print(m_sim_sub_df)

    # (1 x k) similarities times (k x 1) ratings gives a 1 x 1 frame.
    sims_row = m_sim_sub_df.T.astype(float)
    score = sims_row.dot(rated_df.astype(float))
    print("dot: \n ", score)
    return score.loc['value']['value']


if __name__ == '__main__':
    # Item-based collaborative filtering on Spark RDDs, following p.122 of
    # "Data Mining and Machine Learning" (《数据挖掘与机器学习》).
    #
    # NOTE(review): `sc` is never created in this file; it has to be supplied
    # by the environment (presumably a SparkContext from a PySpark shell or
    # kernel -- confirm).  Without it this block stops with
    # "NameError: name 'sc' is not defined".
    file_path = r"movies.csv"

    # Parse each line into [user, movie, rating]; ratings stay strings here.
    raw_umr_rdd = sc.textFile(file_path)  # 'A\t老炮儿\t3.5'
    u_m_r_rdd = raw_umr_rdd.map(lambda line: line.split("\t")[:3])  # ['A', '老炮儿', '3.5']

    # Group the ratings per movie: (movie, [[user, rating], ...]).
    m_ur_rdd = u_m_r_rdd.map(lambda line: (line[1], [line[0], line[2]]))
    m_urs_rdd = m_ur_rdd.groupByKey().mapValues(list)

    # Every ordered pair of distinct movies.
    m_urs_cartesian_rdd = m_urs_rdd.cartesian(m_urs_rdd)
    m_urs_cartesian_unique_rdd = m_urs_cartesian_rdd.filter(lambda m_urs: m_urs[0][0] != m_urs[1][0])

    # (movie, (other_movie, [ratings of movie, ratings of other_movie]))
    m__m_rrs_rdd = m_urs_cartesian_unique_rdd.map(lambda m_urs_2: (m_urs_2[0][0], (m_urs_2[1][0], [m_urs_2[0][1], m_urs_2[1][1]])))
    # (movie, [other_movie, pearson]), then collected per movie.
    m__m_sim_rdd = m__m_rrs_rdd.mapValues(sim)
    m__m_sim_s_rdd = m__m_sim_rdd.groupByKey().mapValues(list)

    # Movies each user has already rated.
    u_mr_rdd = u_m_r_rdd.map(lambda line: (line[0], [line[1], line[2]]))
    u_mrs_rdd = u_mr_rdd.groupByKey().mapValues(list)  # (u, [ [m,r],[m,r],[m,r]] )

    m_r_d = u_mrs_rdd.filter(lambda x: x[0] == 'A').values()  # [ [m,r],[m,r] ]
    # Fetch user A's [movie, rating] list once and reuse it, instead of
    # triggering the same Spark job for each consumer.
    watched_m_r = m_r_d.first()
    m_r_d_rdd = sc.parallelize(watched_m_r)
    bc = sc.broadcast(watched_m_r)

    # Drop the movies the user already rated, then score the remaining ones.
    sub_sim_rdd = m__m_sim_s_rdd.subtractByKey(m_r_d_rdd)  # (m,[[m,s],[m,s]])
    m_score_rdd = sub_sim_rdd.mapValues(lambda x: recommend(bc, x))
    m_score_sorted_rdd = m_score_rdd.sortBy(lambda x: x[1], False)

    # Evaluate the sorted RDD once; the top 3 are the head of that result.
    ranked = m_score_sorted_rdd.collect()
    print(ranked)
    print(ranked[:3])


NameError: name 'sc' is not defined