In [9]:
import os
import time

import json

from operator import itemgetter

# data science imports
import math
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# utils import
from fuzzywuzzy import fuzz

# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt

## Load Json Data

In [10]:
def read_josn (path):
    file = open(path, "r", encoding = 'utf-8')
    df = []
    for line in file.readlines():
        dic = json.loads(line)
        df.append(dic)
    return df

In [11]:
# load business data
path = "dataset/Peoria_Data/Peoria_business.json"
business_list = read_josn(path)
# load reviews data
path = "dataset/Peoria_Data/Peoria_Review.json"
review_list = read_josn(path)
# load user data
path = "dataset/Peoria_Data/Peoria_User.json"
user_list = read_josn(path)

## Transform to dataframe

In [12]:
# business list into dataframe
df_business = pd.DataFrame()
business_id = []
name = []
for element in business_list:
    business_id.append(element["business_id"])
    name.append(element["name"])

Business_Num_Id = {}
U = 1
B = 1
Business_Id = []
for index in range(len(business_id)):
    if business_id[index] not in Business_Num_Id.keys():
        Business_Num_Id[business_id[index]] = B
        B = B + 1
    Business_Id.append(Business_Num_Id[business_id[index]])
    

df_business["RawBusinessId"] = business_id
df_business["businessId"] = Business_Id
df_business["title"] = name

df_business.head()

Unnamed: 0,RawBusinessId,businessId,title
0,Q1GhjqlLENaT383k9Ex5wg,1,Daily Donut
1,cz5kBEZMHKyJ_nZXn-IRIA,2,Hall Brent R DDS
2,U1iF2RC18uBgBe01z5BMXw,3,Church's Chicken
3,9eY5ZfKPc_oz6Co5WCPynQ,4,Fullerton Financial Planning
4,aERMWmIT6yJVGYIn4qQizQ,5,GO AZ Motorcycles


In [13]:
# User list into dataframe
df_user = pd.DataFrame()
user_id = []
name = []
for element in user_list:
    user_id.append(element["user_id"])
    name.append(element["name"])

User_Num_Id = {}
U = 1
User_Id = []
for index in range(len(user_id)):
    if user_id[index] not in User_Num_Id.keys():
        User_Num_Id[user_id[index]] = U
        U = U + 1
    User_Id.append(User_Num_Id[user_id[index]])
    
df_user["RawUserId"] = user_id
df_user["UserId"] = User_Id
df_user["name"] = name

df_user.head()

Unnamed: 0,RawUserId,UserId,name
0,eSlOI3GhroEtcbaD_nFXJQ,1,Jason
1,U4INQZOPSUaj8hMjLlZ3KA,2,Michael
2,xZAmw5gihOVO4duMN2Ju6Q,3,Flynn
3,F78tJHr0qW6FsYBtWMmLpA,4,Christopher
4,TUE3NJYN4i7xyGVSIXMLvw,5,Kymberly


In [14]:
# review list into dataframe
df_ratings = pd.DataFrame()
#review_id = []
user_id = []
business_id = []
ratings = []
for element in review_list:
    uid = User_Num_Id[element["user_id"]]
    bid = Business_Num_Id[element["business_id"]]
    
    user_id.append(uid)
    business_id.append(bid)
    ratings.append(element["stars"])

#df_ratings["review_id"] = review_id
df_ratings["userId"] = user_id
df_ratings["businessId"] = business_id
df_ratings["rating"] = ratings

df_ratings.head()

Unnamed: 0,userId,businessId,rating
0,5042,153,5.0
1,5763,50,5.0
2,79,58,1.0
3,6284,63,5.0
4,4844,130,2.0


## Filter Data

### For Business

In [15]:
# get rating frequency
df_business_cnt = pd.DataFrame(df_ratings.groupby('businessId').size(), columns=['count'])
df_business_cnt.head()

Unnamed: 0_level_0,count
businessId,Unnamed: 1_level_1
1,33
2,4
3,3
4,4
5,44


In [16]:
# top quantiles of rating counts
df_business_cnt['count'].quantile(np.arange(1, 0.6, -0.05))

1.00    620.0
0.95    124.2
0.90     71.0
0.85     46.0
0.80     34.0
0.75     27.0
0.70     21.0
0.65     18.0
Name: count, dtype: float64

In [17]:
# filter data - movies count less than 20
popularity_thres = 10
popular_business = list(set(df_business_cnt.query('count >= @popularity_thres').index))
df_ratings_drop_business = df_ratings[df_ratings.businessId.isin(popular_business)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular business: ', df_ratings_drop_business.shape)

shape of original ratings data:  (57857, 3)
shape of ratings data after dropping unpopular business:  (53338, 3)


### For Users

In [18]:
# get number of ratings given by every user
df_users_cnt = pd.DataFrame(df_ratings_drop_business.groupby('userId').size(), columns=['count'])
df_users_cnt.head()

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,1
2,2
3,6
4,10
6,7


In [19]:
df_users_cnt['count'].quantile(np.arange(1, 0.5, -0.05))

1.00    109.0
0.95      5.0
0.90      3.0
0.85      2.0
0.80      2.0
0.75      2.0
0.70      1.0
0.65      1.0
0.60      1.0
0.55      1.0
Name: count, dtype: float64

In [20]:
# filter data
ratings_thres = 2
active_users = list(set(df_users_cnt.query('count >= @ratings_thres').index))
df_ratings_drop_users = df_ratings_drop_business[df_ratings_drop_business.userId.isin(active_users)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping both unpopular business and inactive users: ', df_ratings_drop_users.shape)

shape of original ratings data:  (57857, 3)
shape of ratings data after dropping both unpopular business and inactive users:  (32281, 3)


In [21]:
df_ratings_drop_users.index = list(range(0, df_ratings_drop_users.shape[0]))
df_ratings_drop_users

Unnamed: 0,userId,businessId,rating
0,79,58,1.0
1,4844,130,2.0
2,1598,78,5.0
3,3163,16,3.0
4,4758,88,4.0
...,...,...,...
32276,22959,1821,1.0
32277,15889,1740,4.0
32278,3020,1831,5.0
32279,4671,1831,5.0


In [22]:
# the number of users and movies
num_users = len(df_ratings_drop_users.userId.unique())
num_items = len(df_ratings_drop_users.businessId.unique())
print('There are {} unique users and {} unique movies in this data set'.format(num_users, num_items))

There are 8393 unique users and 1016 unique movies in this data set


## Transform dataframe to dictionary

In [23]:
dataDic = {}
dataDic_len = 0
# 遍历data的每一行， 把userId, movidId, rating按照{user: {movidId: rating}}的方式存储， 当然定义一个随机种子进行数据集划分
for ele in df_ratings_drop_users.itertuples():   # 遍历行这里推荐用itertuples， 比iterrows会高效很多
    user, business, rating = getattr(ele, 'userId'), getattr(ele, 'businessId'), getattr(ele, 'rating')
    
    dataDic.setdefault(user, {})
    dataDic[user][business] = rating
    dataDic_len += 1
    
print('dataDic = %s' % dataDic_len)

dataDic = 32281


In [24]:
dataDic.keys()

dict_keys([79, 4844, 1598, 3163, 4758, 2015, 2741, 4003, 1927, 4314, 1220, 1669, 5934, 2802, 4322, 5677, 5669, 822, 6784, 1681, 127, 4666, 6104, 3272, 6428, 6375, 6761, 7029, 1414, 4808, 1980, 581, 87, 6632, 6526, 15, 6675, 1324, 2961, 4078, 2772, 4144, 3079, 4686, 4178, 798, 5350, 3514, 4831, 4216, 3004, 1041, 3033, 628, 2010, 3278, 2957, 1975, 3397, 671, 4233, 3126, 5013, 6648, 3141, 2621, 2705, 2683, 922, 1817, 2664, 6519, 586, 3444, 504, 1323, 1093, 2918, 5909, 5758, 2047, 3197, 2314, 2178, 1132, 5772, 1545, 649, 747, 6672, 4164, 1415, 6621, 6091, 6872, 5149, 5699, 5045, 6870, 6442, 2888, 5358, 513, 6690, 6815, 6162, 2652, 6984, 2539, 2347, 5019, 3229, 6334, 6952, 1822, 6209, 689, 132, 1330, 6609, 7100, 5179, 2873, 5316, 5111, 3684, 3651, 6454, 5131, 3972, 1393, 1089, 1195, 2611, 261, 6559, 973, 5353, 6439, 1172, 493, 490, 7083, 5087, 6689, 113, 66, 1981, 331, 1162, 6991, 449, 90, 2983, 2400, 6837, 5051, 4591, 5461, 5641, 3026, 5856, 7067, 3755, 3307, 4112, 7070, 3304, 5932, 3273, 

## Get Similarity

In [25]:
# 计算每部电影的流行程度, 也就是每部电影被用户看的总次数， 这个衡量相似度的时候作为分母
business_popular = {}
for user, businesses in dataDic.items():   # 这里的user就是每个用户， movies还是个字典， {movieID: rating}
    for business in businesses:       # 这里的movie就是movieID了
        if business not in business_popular:     
            business_popular[business] = 0  
        business_popular[business] += 1


business_count = len(business_popular)
print('Total business number = %d' % business_count)


# 下面建立电影相似矩阵
print('Build user co-rated movies matrix ...')
business_sim_matrix = {}
for user, businesses in dataDic.items():     # 注意这个地方与UserCF的不同
    for b1 in businesses:           # 对于每个电影， 都得双层遍历
        for b2 in businesses:
            if b1 == b2:
                continue
            business_sim_matrix.setdefault(b1, {})      # 把字典的值设置为字典的形式
            business_sim_matrix[b1].setdefault(b2, 0)
            business_sim_matrix[b1][b2] += 1     # 这里统计两个电影被同一个用户产生行为的次数， 这个就是余弦相似度的分子
            
print('Build user co-rated businesses matrix success!')


# 下面计算电影之间的相似性
print('Calculating movies similarity matrix ...')
for b1, related_businesses in business_sim_matrix.items():
    for b2, count in related_businesses.items():    # 这里面m2是相关电影， count是共同被同一个用户打分的次数
        # 这里注意零向量的处理， 即某电影的用户数为0
        if business_popular[b1] == 0  or business_popular[b2] == 0:
            business_sim_matrix[b1][b2] = 0
        else:
            business_sim_matrix[b1][b2] = count / math.sqrt(business_popular[b1] * business_popular[b2]) 
            
print('Calculate movies similarity matrix success!')

Total business number = 1016
Build user co-rated movies matrix ...
Build user co-rated businesses matrix success!
Calculating movies similarity matrix ...
Calculate movies similarity matrix success!


In [26]:
# 这里先把产生推荐的那个封装成函数才能测试所有的测试样本
def recommend(aim_user, data, sim_matrix, k=20, n=20):
    rank ={}
    rated_businesses = data[aim_user]      # 找出目标用户看到电影
    
    # 753
    w_sum = {}
    R = {}
    
    for business, rating in rated_businesses.items():
        #遍历与物品item最相似的前k个产品，获得这些物品及相似分数
        for related_business, w in sorted(sim_matrix[business].items(), key=itemgetter(1), reverse=True)[:k]:
            # 若该物品用户看过， 跳过
            if related_business in rated_businesses:
                continue

            # 计算用户user对related_business的偏好值， 初始化该值为0
            rank.setdefault(related_business, 0)
            #通过与其相似物品对物品related_business的偏好值相乘并相加。
            #排名的依据—— > 推荐电影与该已看电影的相似度(累计) * 用户对已看电影的评分
            rank[related_business] += w * float(rating)
    
    # 产生最后的推荐列表
    result =  sorted(rank.items(), key=itemgetter(1), reverse=True)[:n]  # itemgetter(1) 是简洁写法
    
    rank_position = list(range(len(result)))
    
    return result, rank_position

#### 一个例子

In [27]:
aim_user = 1202
recom_list, rank_posi = recommend(aim_user, dataDic, business_sim_matrix)

In [28]:
recom_list

[(1002, 0.5555510285603042),
 (1717, 0.4720908403610231),
 (284, 0.47150240839898494),
 (262, 0.4074108571913334),
 (1236, 0.4068746026617149),
 (936, 0.37171128201453396),
 (554, 0.3474411464169823),
 (1837, 0.34691329926842285),
 (312, 0.30534353162371025),
 (1891, 0.2975975299507521),
 (1464, 0.29417420270727607),
 (748, 0.2699527623995085),
 (919, 0.2672861022670582),
 (441, 0.2613541867446584),
 (1378, 0.2602485194454143),
 (881, 0.2584542794136979),
 (1514, 0.2508726030021272),
 (1124, 0.25058072087536704),
 (895, 0.24027781497213452),
 (659, 0.23624976928744731)]

In [29]:
rank_posi

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

## PRU

In [30]:
from scipy import stats

def get_SRC(recom_list, rank_posi, business_popular):
    # popularity list
    popularity = []
    for bid, rating in recom_list:
        popularity.append(business_popular[bid])
    
    SRC,_ = stats.spearmanr(rank_posi, popularity)
    
    return SRC    

In [31]:
get_SRC(recom_list, rank_posi, business_popular)

-0.14904036598446474

In [32]:
# 这一步对于小数据集，需要10min左右
# print的目的是，查看运行到哪里，可删除
SRC_list = []
SRC = 0
pop_list = []
for uid in list(dataDic.keys()):
    #print(uid)
    recom_list, rank_posi = recommend(uid, dataDic, business_sim_matrix,k = 500, n = 500)
    SRC = get_SRC(recom_list, rank_posi, business_popular)
    SRC_list.append(SRC)
    
PRU = -np.mean(SRC_list)

In [33]:
PRU

nan

In [34]:
len(SRC_list)

8393

In [35]:
print(np.argwhere(np.isnan(SRC_list)))

[[5466]
 [7724]
 [7727]
 [7875]]


In [36]:
-np.sum(list(map(lambda x: 0.0 if math.isnan(x) else x, SRC_list)))/8389

0.039631979358876926