In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import time

start_time = time.time()
# 数据读取
file_path ="./dataset/UserBehavior.csv"
data=pd.read_csv(file_path, header=None, index_col=None, nrows=20000000)
data.columns=['User_Id','Item_Id','Category_Id','Behavior_type','Timestamp']
# data.head(5)
end_time = time.time()
print("Read and preprocess data finished! Time cost: %.2f s" % (end_time - start_time))

Read and preprocess data finished! Time cost: 15.17 s


In [2]:
print(data.head(5))

   User_Id  Item_Id  Category_Id Behavior_type   Timestamp
0        1  2268318      2520377            pv  1511544070
1        1  2333346      2520771            pv  1511561733
2        1  2576651       149192            pv  1511572885
3        1  3830808      4181361            pv  1511593493
4        1  4365585      2520377            pv  1511596146


In [3]:
# 一、数据读取和数据清洗
# 1.检查空值 ---检查结果数据无空值
print(pd.isnull(data["Timestamp"]).value_counts())
print(pd.isnull(data["Item_Id"]).value_counts())

False    20000000
Name: Timestamp, dtype: int64
False    20000000
Name: Item_Id, dtype: int64


In [4]:
# 2.检查异常值
# 1）Timestamp存在负值
print(data[data["Timestamp"]<0])
data = data[data["Timestamp"]>0]
# 2）Behavior_type存在异常值
print(data["Behavior_type"].value_counts())
# 3）检查异常处理结果
print(data[data["Timestamp"]<0])
print(data["Behavior_type"].value_counts())

# print(data.info())  # [19999969 rows x 5 columns]
data.head()

          User_Id  Item_Id  Category_Id Behavior_type   Timestamp
5896224     34939  1384459      4719814            pv -1586903608
9025584     49210  2691161      2640118            pv -2034497153
9025585     49210  4409284      2640118            pv -2034497135
9025586     49210  4944697      2640118            pv -2034497078
9025587     49210  4409284      2640118            pv -2034496919
9025588     49210  1973455      2578647            pv -2034496686
9025589     49210  2156732      2578647            pv -2034496606
9025590     49210  4386323      2578647            pv -2034496566
9025591     49210  4882098      2578647            pv -2034496546
9025592     49210  3552038      2578647            pv -2034496512
9025593     49210   890796      2578647            pv -2034496438
9025594     49210  5105888      2578647            pv -2034496407
9025595     49210  3713509      1467750            pv -2034496405
9025596     49210  5105888      2578647            pv -2034496402
9025597   

Unnamed: 0,User_Id,Item_Id,Category_Id,Behavior_type,Timestamp
0,1,2268318,2520377,pv,1511544070
1,1,2333346,2520771,pv,1511561733
2,1,2576651,149192,pv,1511572885
3,1,3830808,4181361,pv,1511593493
4,1,4365585,2520377,pv,1511596146


In [5]:
# 3.时间戳转换时间单位
data.loc[:,'Timestamp']=data['Timestamp'].apply(lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x)))
data.loc[:,'Date']=data['Timestamp'].apply(lambda x:x.split(' ')[0])
data.loc[:,'Time']=data['Timestamp'].apply(lambda x:x.split(' ')[1])
# 只取2017年11月25日至2017年12月3日之间的数据进行研究
# print(data["Date"].value_counts())
cleaned_data=data[(data["Date"]>='2017-11-25')&(data["Date"]<='2017-12-03')]
print(cleaned_data["Date"].value_counts())

2017-12-02    2762194
2017-12-03    2742393
2017-12-01    2167368
2017-11-26    2124383
2017-11-30    2087264
2017-11-25    2070079
2017-11-29    2047138
2017-11-27    2012639
2017-11-28    1975748
Name: Date, dtype: int64


In [6]:
# 4.去除重复数据
print(cleaned_data.duplicated().value_counts())
cleaned_data=cleaned_data.drop_duplicates(subset=['User_Id','Item_Id','Category_Id','Behavior_type','Timestamp'],keep='first')
print(cleaned_data.duplicated().value_counts())


False    19989198
True            8
dtype: int64
False    19989198
dtype: int64


In [7]:
# 5.预处理数据保存
cleaned_data.to_csv('./dataset/cleaned_data.csv', encoding = 'utf-8', index = False)
print(cleaned_data.duplicated().value_counts())

False    19989198
dtype: int64


In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
file_path ="./dataset/cleaned_data.csv"
data=pd.read_csv(file_path)
data.head()

Unnamed: 0,User_Id,Item_Id,Category_Id,Behavior_type,Timestamp,Date,Time
0,1,2268318,2520377,pv,2017-11-25 01:21:10,2017-11-25,01:21:10
1,1,2333346,2520771,pv,2017-11-25 06:15:33,2017-11-25,06:15:33
2,1,2576651,149192,pv,2017-11-25 09:21:25,2017-11-25,09:21:25
3,1,3830808,4181361,pv,2017-11-25 15:04:53,2017-11-25,15:04:53
4,1,4365585,2520377,pv,2017-11-25 15:49:06,2017-11-25,15:49:06


In [9]:
describe=pd.DataFrame({"用户数量":len(set(list(data["User_Id"]))),"产品数量":len(set(list(data["Item_Id"]))),"产品种类数量":len(set(list(data["Category_Id"]))),"统计天数":len(set(list(data["Date"])))},index=['1'])
describe

Unnamed: 0,用户数量,产品数量,产品种类数量,统计天数
1,198001,2208950,8562,9


In [10]:
data['Behavior_type'] = data['Behavior_type'].map({'pv':0,'cart':1,'fav':2,'buy':3})
print(data)
data['Behavior_type'] = data['Behavior_type'].apply(lambda x: list(str(x)))
print(data)
c = data.groupby(['User_Id', 'Category_Id'])['Behavior_type'].sum()
process = pd.DataFrame()
process['Behavior_list'] = c
process

          User_Id  Item_Id  Category_Id  Behavior_type            Timestamp  \
0               1  2268318      2520377              0  2017-11-25 01:21:10   
1               1  2333346      2520771              0  2017-11-25 06:15:33   
2               1  2576651       149192              0  2017-11-25 09:21:25   
3               1  3830808      4181361              0  2017-11-25 15:04:53   
4               1  4365585      2520377              0  2017-11-25 15:49:06   
...           ...      ...          ...            ...                  ...   
19989193  1002513  3285794      1045172              0  2017-11-28 23:31:22   
19989194  1002513   638230      1045172              0  2017-11-28 23:31:43   
19989195  1002513  1286197      1045172              0  2017-11-28 23:32:36   
19989196  1002513  2009534      1045172              0  2017-11-28 23:34:39   
19989197  1002513  1286197      1045172              0  2017-11-28 23:35:29   

                Date      Time  
0         2017-11-

Unnamed: 0_level_0,Unnamed: 1_level_0,Behavior_list
User_Id,Category_Id,Unnamed: 2_level_1
1,149192,"[0, 0, 0, 0, 0, 0]"
1,411153,"[0, 0, 0, 0, 0, 0]"
1,982926,[0]
1,1080785,[0]
1,1320293,"[0, 0]"
...,...,...
1018011,2322253,[0]
1018011,2885642,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1018011,3669044,"[3, 0]"
1018011,4284875,[0]


In [11]:
behaviors = []
for item in c:
    behaviors.append(item)
    
# collect index for users and items
u_count = 0
i_count = 0
user_dict = dict()
item_dict = dict()
for index in c.index:
    if index[0] not in user_dict:
        user_dict[index[0]] = u_count
        u_count += 1
    if index[1] not in item_dict:
        item_dict[index[1]] = i_count
        i_count += 1
print(u_count)
print(i_count)

198001
8562


In [24]:
# construct matrix
import numpy as np
import math
view_ratio = 1
cart_fav_ratio = 2
bug_ratio = 10
user_num = len(set(list(data["User_Id"]))) // 10
cate_num = len(set(list(data["Category_Id"])))
R = np.zeros((user_num, cate_num), dtype=np.float16)
print(user_num, cate_num)
# indexs = c.index
for i in range(len(c.index)):
    user_id = c.index[i][0]
    category_id = c.index[i][1]
    if user_dict[user_id] >= user_num:
        continue
    behavior = behaviors[i]
#     print(behavior)
    view_weight = behavior.count('0') / len(behavior) * view_ratio
    cart_weight = behavior.count('1') / len(behavior) * cart_fav_ratio
    fav_weigth = behavior.count('2') / len(behavior) * cart_fav_ratio
    buy_weight = behavior.count('3') / len(behavior) * bug_ratio
#     print(view_weight)
#     print(user_dict[user_id], item_dict[category_id])
    R[user_dict[user_id]][item_dict[category_id]] = view_weight * math.exp((cart_weight + fav_weigth) / 2 + buy_weight)
#     R[user_id - 1][category_id - 1] = 
print(R[0][:50])
print(R[1][:50])

19800 8562
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
[0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


In [25]:
# normalize into [0,5]
max_scope = -1
for item in R:
    if max(item) > max_scope:
        max_scope = max(item)
print(max_scope)

R = R / max_scope * 5
print(R[0])

650.0
[0.00769 0.00769 0.00769 ... 0.      0.      0.     ]


In [26]:
# # sklearn nmf
# from sklearn.decomposition import NMF
# nmf = NMF(n_components=6, init='nndsvda', tol=5e-3)
# result = nmf.fit_transform(R)
# print(result)


[[1.86412789e-05 3.48962702e-03 9.43909632e-04 1.57518277e-04
  1.79303598e-03 8.12441789e-04]
 [1.29450335e-05 3.37485955e-03 7.89810840e-04 1.28127748e-04
  1.42106029e-03 4.34870523e-04]
 [1.73174414e-05 6.46179432e-04 1.18360289e-03 2.80986165e-04
  1.84313469e-03 7.00879624e-04]
 ...
 [1.15705249e-05 3.48959149e-04 7.69242961e-04 7.63512055e-05
  1.08862246e-03 3.41068167e-04]
 [1.04635561e-05 1.47473219e-04 5.11308134e-03 6.23822668e-05
  5.39974225e-03 4.51418607e-04]
 [2.54548621e-05 3.38430400e-03 8.28682108e-04 2.50455556e-04
  4.15373229e-03 7.69746784e-04]]


In [31]:
# sklearn nmf
from sklearn.decomposition import NMF
nmf = NMF(n_components=6, init='nndsvda', tol=5e-3)
W = nmf.fit_transform(R)
H = nmf.components_
print(H)
print(W.dot(H))
result = W.dot(H)

[[2.84471968e-04 6.00299861e-05 3.64802083e-03 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.81102679e-02 7.61989097e-03 4.91485183e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.70264237e-02 6.84004827e-03 3.98979463e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [6.72415800e-03 4.66497130e-03 2.39094226e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.34089107e-02 6.29156192e-03 4.60405755e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.38138312e-02 8.06258644e-03 4.01055686e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]
[[1.60526148e-04 5.16132263e-05 3.28130246e-04 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.34842797e-04 4.41623124e-05 2.83347570e-04 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.17235360e-04 3.15711040e-05 1.98700168e-04 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [6.89200166e-05 1.78758416e-05 1.13501903e-04 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.

In [33]:
result

array([[1.60526148e-04, 5.16132263e-05, 3.28130246e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.34842797e-04, 4.41623124e-05, 2.83347570e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.17235360e-04, 3.15711040e-05, 1.98700168e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [6.89200166e-05, 1.78758416e-05, 1.13501903e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.29566412e-04, 7.40022050e-05, 4.79494016e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.09226925e-04, 6.49655472e-05, 4.27583877e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [32]:
np.save("Result.npy",result) # save the result