In [1]:
import pandas as pd
import numpy as np
import math
import operator

In [2]:
data = pd.read_excel("order_data.xlsx")

In [3]:
data.head(5)

Unnamed: 0,user_id,product_id,catalog_id,brand_id,vender_id,orders_num,buy_num,price
0,27,100103539,1127.0,967861.0,51,1,1,22.0
1,27,100118748,1993.0,6071.0,11,1,1,63.51
2,27,100144115,1127.0,969673.0,51,2,2,61.0
3,27,100152505,1206.0,1234.0,191,1,5,3.0
4,27,100156451,1104.0,913062.0,191,1,2,12.5


In [4]:
# 将空由0替换
data = data.fillna(0)

In [5]:
data = data[(data.product_id > 0)]
data["product_id"] = data["product_id"].apply(lambda x: int(x))

In [6]:
# 过滤一个用户至少买过两个商品的记录
user_data = data.groupby("user_id").size()
user_data = user_data[user_data > 1]
data = data[data.user_id.isin(user_data.keys())]
user_list = data.values.tolist()

In [7]:
len(data)

76121

In [8]:
all_product_id = list(set(data["product_id"].values.tolist()))

In [9]:
len(all_product_id)

19462

In [10]:
product_to_index = {}
index_to_product = {}
for index,value in enumerate(all_product_id):
    product_to_index[value] = index
    index_to_product[index] = value

In [11]:
index_to_product[0]

100499457

In [12]:
# 第一步创建用户-物品的倒排索引
user_item_index = {}
for user_id in user_data.keys():
    product_ids = data[data.user_id == user_id]["product_id"].values.tolist()
    for index,value in enumerate(product_ids):
        product_ids[index] = product_to_index[value]
    user_item_index[user_id] = product_ids

In [13]:
# 第二步创建共现矩阵
product_length = len(product_to_index)
matrix_c = np.zeros((product_length,product_length))

# 循环用户-商品倒排索引 对于用一个用户购买的任意两个商品 在共现矩阵中要加1
for user_id in user_item_index:
    product_ids = user_item_index[user_id]
    for i, value in enumerate(product_ids):
        if(i < len(product_ids) - 1):
            list_other = product_ids[(i+1):len(product_ids)]
            for second_product_index in list_other:
                matrix_c[value][second_product_index] += 1
                matrix_c[second_product_index][value] += 1

In [14]:
matrix_c[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [15]:
# 第三步根据算法得到商品的相似矩阵
# 算法：cij/sqrt(|N(i)|*|N(j)|)
product_index_count_dic = {}
product_group = data.groupby("product_id").size()
for product_id in product_group.keys():
    product_index_count_dic[product_to_index[product_id]] = product_group[product_id]

matrix_w = np.zeros((product_length, product_length))

# 共现矩阵大于0的下标list
index_i_list, index_j_list = np.where(matrix_c > 0)
for index,value in enumerate(index_i_list):
    i = value
    j = index_j_list[index]
    score = matrix_c[i][j]/math.sqrt(product_index_count_dic[i] * product_index_count_dic[j])
    matrix_w[i][j] = score
    matrix_w[j][i] = score

In [17]:
a = np.zeros(product_length)
a[1] = 3
a[2] = 4
a[5] = 6
a = (a-np.min(a))/(np.max(a) - np.min(a))

In [18]:
a

array([0.        , 0.5       , 0.66666667, ..., 0.        , 0.        ,
       0.        ])

In [19]:
def normalize(value):
    value = (value - np.min(value))/(np.max(value) - np.min(value))
    return value

In [20]:
# 第四步创建用户的喜好商品矩阵，并进行归一化
user_like_item_dic = {}
for user_id in user_data.keys():
    user_like_item = data[data.user_id == user_id]
    user_item_like_matrix = np.zeros(product_length)
    for i in range(len(user_like_item)):
        index = product_to_index[user_like_item.iloc[i].product_id]
        user_item_like_matrix[index] = user_like_item.iloc[i].orders_num
    user_like_item_dic[user_id] = normalize(user_item_like_matrix)

In [21]:
user_like_item_dic[27]

array([0., 0., 0., ..., 0., 0., 0.])

In [22]:
#获得最相似的k个商品
def getMostSimilar(matrix_w,index,k):
    c_list = matrix_w[index]
    similar_item = pd.DataFrame({"value":c_list})
    similar_item = similar_item.sort_values(by="value",ascending=False).iloc[0:k]
    similar_item_dic = {}
    for i in range(len(similar_item)):
        similar_item_dic[similar_item.iloc[i].name] = similar_item.iloc[i].value
    return similar_item_dic

In [24]:
getMostSimilar(matrix_w,0,10)

{17074: 1.0,
 1915: 1.0,
 8425: 0.5773502691896258,
 2099: 0.5773502691896258,
 19254: 0.5,
 14051: 0.3779644730092272,
 2622: 0.3333333333333333,
 18137: 0.24253562503633297,
 12974: 0.0,
 12973: 0.0}

In [25]:
like_list = np.where(user_like_item_dic[753664] > 0)
print(like_list)

(array([ 862, 6749, 6907, 7157]),)


In [26]:
def reommendItem(user_id,matrix_w,user_like_item_dic,k):
    recommend_dic = {}
    user_like_list = user_like_item_dic[user_id]
    user_like_item_index_list = np.where(user_like_list > 0)
    user_like_item_index_list = user_like_item_index_list[0]
    for product_index in user_like_item_index_list:
        like_score = user_like_list[product_index]
        most_similar_item = getMostSimilar(matrix_w,product_index,k)
        for key in most_similar_item.keys():
            if key in user_like_item_index_list:
                continue
            #最终得分是用户对商品的喜欢程度 * 商品的相似程度
            score = like_score * most_similar_item[key]
            if key in recommend_dic.keys():
                score += recommend_dic[key]
            recommend_dic[key] = score
    #返回得分最高的k个商品
    sorted_x = sorted(recommend_dic.items(), key=operator.itemgetter(1))
    sorted_x.reverse()
    return sorted_x[0:k]

In [27]:
recommend_dic = reommendItem(27,matrix_w,user_like_item_dic,10)
recommend_dic

[(7348, 0.35355339059327373),
 (6851, 0.25),
 (9369, 0.25),
 (15451, 0.20952908873087345),
 (12060, 0.20412414523193154),
 (10971, 0.15811388300841897),
 (12248, 0.15617376188860607),
 (14435, 0.15617376188860607),
 (9366, 0.15617376188860607),
 (10009, 0.15617376188860607)]

In [33]:
#第五步给用户推荐商品

def getAllUserRecommend():
    user_recommend = {}
    for user_id in user_like_item_dic.keys():
        #print(user_id)
        recommend_dic = reommendItem(user_id,matrix_w,user_like_item_dic,10)
        value = ""
        for key in recommend_dic:
            index = key[0]
            if value == "":
                value += str(index_to_product[index])
            else:
                value += "," + str(index_to_product[index])
        user_recommend[user_id] = value
    return user_recommend

In [None]:
res = getAllUserRecommend()

In [31]:
print(res)

{27: '100181857,100279367,100152592,100229198,100321430,100155976,100157822,100162018,100513031,100514430', 150: '100514581,100245191,100323925,100489774,100248259,100473952,100488699,100246915,100323750,100511141', 157: '100140582,100494210,100228406,100255244,100297358,100460068,100239142,100496523,100461354,100172798', 189: '100200005,100504161,100465504,100495678,100504374,100511437,100495741,100143393,100279156,100142645', 218: '100078076,100181362,100157494,100155347,100217526,100067539,100158065,100467555,100278304,100322962', 219: '100330512,100247511,100206116,100078435,100340965,100494848,100497151,100319595,100491389,100504802', 257: '100173098,100173150,100172335,100142762,100356967,100173509,100155331,100144654,100183047,100173660', 401: '100473173,100141567,100141955,100075831,100340279,100508899,100322861,100498780,100177029,100279664', 610: '100259364,100256253,100272015,100284396,100067827,100439057,100516219,100521125,100513777,100514334', 774: '100164492,100167639,10