# 预处理飞猪数据集

飞猪数据集说明：https://tianchi.aliyun.com/dataset/113649

首先 check 文件是否存在：

In [7]:
import os
import numpy as np
import pandas as pd

# Directory that holds the raw Fliggy CSV files (relative to this notebook).
data_dir = '../data/fliggy'

# Input files, in the order: user profiles, item profiles, behavior log.
files = [
    'user_profile.csv',
    'item_profile.csv',
    'user_item_behavior_history.csv',
]

# Fail fast on the first input file that is missing.
for f in files:
    if os.path.exists(os.path.join(data_dir, f)):
        continue
    raise FileNotFoundError(f'Not found file: {f} in directory {os.path.abspath(data_dir)}')

print('Check files over.')

Check files over.


In [8]:
# Column names applied to user_profile.csv when it is read.
USER_NAMES = [
    'UserID',
    'Age',
    'Gender',
    'Occupation',
    'UserCity',
    'uLabel',
]
# Column names applied to item_profile.csv when it is read.
ITEM_NAMES = [
    'ItemID',
    'CateID',
    'Item_city',
    'iLabel',
]
# Column names applied to user_item_behavior_history.csv when it is read.
BEHAVIOR_NAMES = [
    'UserID',
    'ItemID',
    'BehaviorType',
    'TimeStamp',
]

In [9]:
def mapped2sequential(df: pd.DataFrame, columns: list, start_from_1=True):
    """Re-encode categorical columns of ``df`` in place as consecutive integers.

    Each entry of ``columns`` is either:

    * a column name (``str``): the column gets its own id space, or
    * a ``list`` of column names: all listed columns share ONE id space, so
      the same raw value receives the same id in every column of the group.

    Ids are handed out in order of first appearance, starting at 1 (or at 0
    when ``start_from_1`` is false). Columns that are not present in ``df``
    are skipped silently, and entries that are neither ``str`` nor ``list``
    are ignored.

    Args:
        df: Frame to modify in place.
        columns: Column names and/or lists of column names (one nesting level).
        start_from_1: Start ids at 1 when true, otherwise at 0.

    Returns:
        None. ``df`` is mutated.

    Raises:
        ValueError: If a list entry contains something other than a string
            (i.e. more than one level of nesting). Columns of that group
            that precede the offending element have already been re-encoded.
    """
    first_id = 1 if start_from_1 else 0
    for entry in columns:
        # Normalise both supported shapes to "a group of column names" so the
        # encoding logic exists only once.
        if isinstance(entry, str):
            group = [entry]
        elif isinstance(entry, list):
            group = entry
        else:
            continue

        mapping = {}
        next_id = first_id
        for col in group:
            if not isinstance(col, str):
                raise ValueError('最多支持二级list')
            if col not in df.columns:
                continue
            for value in df[col].unique().tolist():
                if value not in mapping:
                    mapping[value] = next_id
                    next_id += 1
            df[col] = df[col].map(mapping)


def min_max_normalize(df: pd.DataFrame, columns: list):
    """Rescale the given columns of ``df`` in place to the range [0, 1].

    Each present column is transformed as ``(x - min) / (max - min)``.
    Columns that are not in ``df`` are skipped silently.

    A constant column (``max == min``) would divide zero by zero and turn
    into all-NaN; it is mapped to ``0.0`` instead.

    Args:
        df: Frame to modify in place.
        columns: Names of the columns to normalise.

    Returns:
        None. ``df`` is mutated.
    """
    for c in columns:
        if c not in df.columns:
            continue
        low = df[c].min()
        span = df[c].max() - low
        shifted = df[c] - low
        if span == 0:
            # Every value equals the minimum: avoid the 0/0 division.
            df[c] = shifted.astype(float)
        else:
            df[c] = shifted / span


def std_normalize(df: pd.DataFrame, columns: list):
    """Standardise the given columns of ``df`` in place (z-score).

    Each present column is transformed as ``(x - mean) / std`` using the
    pandas ``Series.std`` of that column. Columns that are not in ``df`` are
    skipped silently.

    A column whose standard deviation is exactly zero would divide zero by
    zero and turn into all-NaN; it is mapped to ``0.0`` instead.

    Args:
        df: Frame to modify in place.
        columns: Names of the columns to standardise.

    Returns:
        None. ``df`` is mutated.
    """
    for c in columns:
        if c not in df.columns:
            continue
        centered = df[c] - df[c].mean()
        std = df[c].std()
        if std == 0:
            # Constant column: every centered value is 0, skip the 0/0 division.
            df[c] = centered.astype(float)
        else:
            df[c] = centered / std


def multi_value_process(df: pd.DataFrame, column: str, sep: str):
    """Encode a multi-valued string column as lists of integer ids.

    Every cell of ``df[column]`` is split on ``sep``; each distinct token is
    assigned an id in order of first appearance, starting at 1.

    Args:
        df: Frame holding the column.
        column: Name of the column whose cells are ``sep``-joined strings.
        sep: Separator between the values inside one cell.

    Returns:
        A tuple ``(encoded, size)`` where ``encoded`` has one list of ids per
        row and ``size`` is the number of distinct tokens plus one.
    """
    token_ids = {}
    encoded = [
        # ``len(token_ids) + 1`` is evaluated before a new token is inserted,
        # so a fresh token receives the next unused id.
        [token_ids.setdefault(token, len(token_ids) + 1) for token in cell.split(sep)]
        for cell in df[column]
    ]
    return encoded, len(token_ids) + 1

读取数据函数

In [10]:
def read_data(file: str, sample_size, sep, names=None, dtype=None):
    """Read a delimited text file into a DataFrame, optionally only its head.

    Args:
        file: Path of the file to read.
        sample_size: Number of rows to read from the start of the file. Any
            value that is not greater than 0 means "read the whole file".
        sep: Field separator passed to ``pandas.read_csv``.
        names: Optional column names passed to ``pandas.read_csv``.
        dtype: Optional dtype specification passed to ``pandas.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If ``file`` does not exist.
    """
    if not os.path.exists(file):
        e = f'The file: {file} not exists.'
        raise FileNotFoundError(e)
    # ``nrows`` limits the read directly, so no iterator/reader object (and its
    # open file handle) is left behind unclosed after the call.
    nrows = sample_size if sample_size > 0 else None
    return pd.read_csv(file, names=names, sep=sep, dtype=dtype, nrows=nrows)

开始读取数据

In [11]:
# The behavior log is too large to load in full (about 200 million rows
# according to the original note), so only its first `sample_num` rows are read.
# NOTE(review): the original comment said 50 million rows, but the value in the
# code is 1e8 (100 million) — confirm which one is intended.
sample_num = 100_000_000  # an int, not the float 1e8: it is used as a row count
SEP = ','

# -1 means "read the whole file" for the two (much smaller) profile tables.
users = read_data(f'{data_dir}/{files[0]}', -1, SEP, USER_NAMES)
items = read_data(f'{data_dir}/{files[1]}', -1, SEP, ITEM_NAMES)
behavior = read_data(f'{data_dir}/{files[2]}', sample_num, SEP, BEHAVIOR_NAMES)

In [12]:
# Clean up abnormal user ages: every age that is not <= 75 (which includes
# missing values) is replaced by the integer mean age.
# NOTE(review): the mean is computed over ALL ages, abnormal ones included —
# confirm that this is intended.
avg_age = int(users['Age'].mean())
# Vectorised equivalent of a per-row lambda: keep values where the condition
# holds, use avg_age everywhere else.
users['Age'] = users['Age'].where(users['Age'] <= 75, avg_age)

0          53.292499
1          53.292499
2          53.292499
3          53.292499
4          53.292499
             ...    
5655133    46.000000
5655134    73.000000
5655135    13.000000
5655136    59.000000
5655137    46.000000
Name: Age, Length: 5655138, dtype: float64

观察 behavior 数据集，会发现其中存在 UserID、ItemID、BehaviorType 三者完全一致、仅 TimeStamp 不同的记录：

In [13]:
# Preview the first 10 rows of the behavior log as loaded.
behavior.head(10)

Unnamed: 0,UserID,ItemID,BehaviorType,TimeStamp
0,2499531,264562,clk,1559587204
1,2499531,264562,clk,1559587234
2,2499531,264562,clk,1559587325
3,3744925,23419,clk,1559641606
4,3744925,23419,clk,1559641305
5,3744925,23419,clk,1559641392
6,3744925,23419,clk,1559640610
7,3744925,23419,clk,1559641105
8,3744925,23419,clk,1559640580
9,3744925,23419,clk,1559640564


因此合并三者都相同的项，使用 BehaviorCount 来表示交互的次数

In [14]:
# Collapse rows that share (UserID, ItemID, BehaviorType) into a single row and
# store how many such rows existed in BehaviorCount. TimeStamp is discarded.
# groupby(...).size() counts the rows of each group directly, so no dummy
# column has to be added to the full frame just to be counted.
behavior = (
    behavior.drop(columns=['TimeStamp'])
    .groupby(['UserID', 'ItemID', 'BehaviorType'])
    .size()
    .reset_index(name='BehaviorCount')
)

将三个数据集（behavior、users、items）进行合并

In [15]:
# Join the aggregated behavior rows with the user profile and then with the
# item profile. Both joins are inner joins (the pandas default), so behavior
# rows whose UserID or ItemID has no profile row are dropped.
behavior = behavior.merge(users, how='inner', on='UserID')
behavior = behavior.merge(items, how='inner', on='ItemID')
behavior.head(10)

Unnamed: 0,UserID,ItemID,BehaviorType,BehaviorCount,Age,Gender,Occupation,UserCity,uLabel,CateID,Item_city,iLabel
0,1,269,clk,3,77,2,5,392,19;11;13;20,38,221,-1
1,35036,269,clk,3,120,2,11,267,11;14;17;16,38,221,-1
2,858047,269,clk,4,54,2,3,378,19;2;12;20,38,221,-1
3,1480494,269,clk,3,77,3,-1,363,11;14;17;16,38,221,-1
4,2688687,269,clk,3,63,2,9,267,13;14;7;16,38,221,-1
5,2889835,269,clk,6,123,2,3,224,19;8;15;14,38,221,-1
6,3111981,269,clk,6,61,3,11,279,5;14;21;16,38,221,-1
7,3359256,269,cart,5,8,3,3,288,13;12;20;16,38,221,-1
8,3359256,269,clk,6,8,3,3,288,13;12;20;16,38,221,-1
9,3746177,269,clk,3,54,2,6,57,19;12;20;6,38,221,-1


In [16]:
# The former min_max_normalize(behavior, ['TimeStamp']) call was removed: the
# TimeStamp column is dropped when duplicate interactions are aggregated, and
# min_max_normalize skips columns that are absent, so the call did nothing.

# Re-encode the categorical columns as consecutive integer ids starting at 1.
# UserCity and Item_city are passed as one group so both columns share a
# single id space.
mapped2sequential(behavior, ['UserID', 'ItemID', 'Occupation', 'CateID', 'BehaviorType', ['UserCity', 'Item_city']])
behavior.head(10)

Unnamed: 0,UserID,ItemID,BehaviorType,BehaviorCount,Age,Gender,Occupation,UserCity,uLabel,CateID,Item_city,iLabel
0,1,1,1,3,77,2,1,1,19;11;13;20,1,156,-1
1,2,1,1,3,120,2,2,2,11;14;17;16,1,156,-1
2,3,1,1,4,54,2,3,3,19;2;12;20,1,156,-1
3,4,1,1,3,77,3,4,4,11;14;17;16,1,156,-1
4,5,1,1,3,63,2,5,2,13;14;7;16,1,156,-1
5,6,1,1,6,123,2,3,5,19;8;15;14,1,156,-1
6,7,1,1,6,61,3,2,6,5;14;21;16,1,156,-1
7,8,1,2,5,8,3,3,7,13;12;20;16,1,156,-1
8,8,1,1,6,8,3,3,7,13;12;20;16,1,156,-1
9,9,1,1,3,54,2,6,8,19;12;20;6,1,156,-1


In [17]:
# Handle the multi-valued attributes: split the ';'-separated label strings and
# encode them as lists of integer ids. The second return value is the number of
# distinct labels plus one.
uLabel, u_label_vocab = multi_value_process(behavior, 'uLabel', ';')
iLabel, i_label_vocab = multi_value_process(behavior, 'iLabel', ';')

In [18]:
# Replace the raw label string columns (uLabel / iLabel) with the encoded
# list-valued columns (uLabels / iLabels).
behavior['uLabels'] = uLabel
behavior['iLabels'] = iLabel
behavior = behavior.drop(columns=['uLabel', 'iLabel'])
behavior.head(10)

Unnamed: 0,UserID,ItemID,BehaviorType,BehaviorCount,Age,Gender,Occupation,UserCity,CateID,Item_city,uLabels,iLabels
0,1,1,1,3,77,2,1,1,1,156,"[1, 2, 3, 4]",[1]
1,2,1,1,3,120,2,2,2,1,156,"[2, 5, 6, 7]",[1]
2,3,1,1,4,54,2,3,3,1,156,"[1, 8, 9, 4]",[1]
3,4,1,1,3,77,3,4,4,1,156,"[2, 5, 6, 7]",[1]
4,5,1,1,3,63,2,5,2,1,156,"[3, 5, 10, 7]",[1]
5,6,1,1,6,123,2,3,5,1,156,"[1, 11, 12, 5]",[1]
6,7,1,1,6,61,3,2,6,1,156,"[13, 5, 14, 7]",[1]
7,8,1,2,5,8,3,3,7,1,156,"[3, 9, 4, 7]",[1]
8,8,1,1,6,8,3,3,7,1,156,"[3, 9, 4, 7]",[1]
9,9,1,1,3,54,2,6,8,1,156,"[1, 9, 4, 15]",[1]


为每个用户记录其所有交互过的 item

In [19]:
# For every user, collect the ItemID of each of that user's behavior rows into
# one list.
# NOTE(review): a (user, item) pair has one row per BehaviorType, so a list can
# contain the same ItemID more than once — confirm that is intended.
item_list_per_user = behavior.groupby(['UserID'])['ItemID'].apply(list).reset_index()
item_list_per_user.head(10)

Unnamed: 0,UserID,ItemID
0,1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]"
1,2,"[1, 122, 127, 377, 526, 958, 1491, 2015, 2015,..."
2,3,"[1, 401, 22282, 39759, 105532, 136174]"
3,4,"[1, 532, 904, 1519, 25592]"
4,5,"[1, 489, 520, 559, 626, 647, 863, 1163, 1545, ..."
5,6,"[1, 5264, 20027, 32293, 159346, 159346]"
6,7,"[1, 109, 109, 114, 738, 1051, 1052, 1052, 2434..."
7,8,"[1, 1, 379, 379, 1398, 1398, 2319, 2375, 2482,..."
8,9,"[1, 61, 2559, 4842, 4895, 9329, 22857]"
9,10,"[1, 5092, 31778]"


In [20]:
# Rename the aggregated ItemID column to InteractItems.
item_list_per_user.columns = ['UserID', 'InteractItems']

In [21]:
# Attach each user's InteractItems list to every behavior row of that user.
behavior = behavior.merge(item_list_per_user, on='UserID')

In [22]:
# Query-side columns: the user profile fields plus the interaction descriptors.
# NOTE(review): USER_NAMES still contains 'uLabel', which has been dropped from
# `behavior` by now, and query_col is reassigned further down in this notebook
# — this definition looks unused; confirm before relying on it.
query_col = USER_NAMES + ['BehaviorType', 'BehaviorCount']

In [23]:
# Shuffle the rows randomly. A fixed random_state makes the shuffle — and with
# it the train/test split taken from this order — reproducible across runs.
behavior = behavior.sample(frac=1, random_state=42).reset_index(drop=True)
behavior.shape

(27351913, 13)

In [24]:
# Split the shuffled rows 90% / 10% into train and test sets.
# The row count is kept in `n_rows` rather than `len`, so the builtin len() is
# not shadowed for the rest of the notebook session.
n_rows = behavior.shape[0]
split_at = int(n_rows * 0.9)
train_data = behavior[:split_at].reset_index(drop=True)
test_data = behavior[split_at:].reset_index(drop=True)
train_data.shape, test_data.shape

((24616721, 13), (2735192, 13))

In [25]:
# Keep only the item-side columns of every behavior row.
item_data = behavior[['ItemID', 'CateID', 'Item_city', 'iLabels']]
# Serialise each iLabels list into a comma-joined string; the string form is
# what gets de-duplicated in the following cell.
t = item_data['iLabels'].apply(lambda x: ','.join(list(map(str, x))))
t

0               1
1           58,17
2               1
3             176
4               1
            ...  
27351908        1
27351909        1
27351910        1
27351911        1
27351912    13,14
Name: iLabels, Length: 27351913, dtype: object

In [26]:
# Swap the list-valued iLabels column for its string form, then keep one row
# per distinct (ItemID, CateID, Item_city, iLabels) combination.
item_data = item_data.drop(columns=['iLabels'])
item_data['iLabels'] = t
item_data = item_data.drop_duplicates().reset_index(drop=True)
# Compare the de-duplicated item rows with the full item profile table.
item_data.shape, items.shape

((236572, 5), (273188, 4))

In [29]:
# Turn the comma-joined iLabels strings back into lists of ints.
t = item_data['iLabels'].apply(lambda x: [int(i) for i in x.split(',')])
item_data = item_data.drop(columns=['iLabels'])
item_data['iLabels'] = t

In [30]:
# Display the de-duplicated item table.
item_data

Unnamed: 0,index,ItemID,CateID,Item_city,iLabels
0,0,4737,12,5,[1]
1,1,274,3,16,"[58, 17]"
2,2,33609,3,163,[1]
3,3,6090,2,39,[176]
4,4,3311,1,23,[1]
...,...,...,...,...,...
236567,27349251,230468,9,304,[1]
236568,27349793,225657,3,44,[1]
236569,27350012,222002,12,52,[1]
236570,27351823,232062,2,267,[1]


In [55]:
# Query-side columns: the user profile fields plus the interaction descriptors.
# The raw 'uLabel' column was replaced by the encoded 'uLabels' column earlier,
# so substitute it by name instead of relying on its position in USER_NAMES.
query_col = ['uLabels' if name == 'uLabel' else name for name in USER_NAMES]
query_col += ['BehaviorType', 'BehaviorCount', 'InteractItems']
# Take the query-side values of the first behavior row and repeat them once per
# item, so that this single query is paired with every row of item_data.
tmp = behavior.iloc[0, :][query_col]
item_df = pd.DataFrame([tmp.tolist()] * item_data.shape[0], columns=query_col)
item_df = pd.concat([item_df, item_data], axis=1)
# Put the columns in the same order as the training data.
item_df = item_df[train_data.columns]
item_df

['UserID',
 'Age',
 'Gender',
 'Occupation',
 'UserCity',
 'uLabels',
 'BehaviorType',
 'BehaviorCount',
 'InteractItems']

In [61]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,UserID,ItemID,BehaviorType,BehaviorCount,Age,Gender,Occupation,UserCity,CateID,Item_city,uLabels,iLabels,InteractItems
0,858963,4737,1,3,8,3,7,47,12,5,"[1, 19, 6, 10]",[1],"[190, 4737, 4737, 4737, 5996, 66070, 82237]"
1,967330,274,1,1,115,2,3,23,3,16,"[1, 13, 4, 15]","[58, 17]","[274, 3527, 5531, 26835]"
2,2140076,33609,1,1,39,2,1,21,3,163,"[1, 3, 9, 5]",[1],"[2558, 6524, 6524, 33609, 42701, 68060]"
3,265572,6090,1,1,38,2,6,204,2,39,"[1, 9, 4, 15]",[176],"[61, 1331, 6090, 28433, 74290, 94927, 108785, ..."
4,93508,3311,1,1,52,2,3,5,1,23,"[1, 8, 9, 5]",[1],"[21, 41, 492, 3311, 22797]"


In [62]:
# Print the final list of query-side column names.
print(query_col)

['UserID', 'Age', 'Gender', 'Occupation', 'UserCity', 'uLabels', 'BehaviorType', 'BehaviorCount', 'InteractItems']
