# 预处理飞猪数据集

飞猪数据集说明：https://tianchi.aliyun.com/dataset/113649

首先 check 文件是否存在：

In [None]:
import os
import numpy as np
import pandas as pd

# Directory holding the raw Fliggy CSV files, relative to this notebook.
data_dir = '../data/fliggy'

# Input files, in order: user profiles, item profiles, behavior log.
files = ['user_profile.csv', 'item_profile.csv', 'user_item_behavior_history.csv']

# Fail fast on the first input file that is missing from data_dir.
for f in files:
    if os.path.exists(os.path.join(data_dir, f)):
        continue
    raise FileNotFoundError(f'Not found file: {f} in directory {os.path.abspath(data_dir)}')

print('Check files over.')

In [None]:
# Column names passed as `names=` when each CSV is loaded with read_data.
# 'uLabel' and 'iLabel' hold multi-valued fields that are later split on ';'.
USER_NAMES = ['UserID', 'Age', 'Gender', 'Occupation', 'UserCity', 'uLabel']
ITEM_NAMES = ['ItemID', 'CateID', 'Item_city', 'iLabel']
BEHAVIOR_NAMES = ['UserID', 'ItemID', 'BehaviorType', 'TimeStamp']

In [None]:
def mapped2sequential(df: pd.DataFrame, columns: list, start_from_1=True):
    """Re-encode column values as consecutive integers, in place.

    Each entry of ``columns`` is either a column name (encoded with its own
    mapping) or a list of column names (encoded with one mapping shared by
    all of them, so equal values in different columns get the same ID).
    IDs are handed out in order of first appearance, starting at 1, or at 0
    when ``start_from_1`` is false.

    Column names that are not present in ``df`` are skipped, and entries
    that are neither ``str`` nor ``list`` are ignored.

    Raises:
        ValueError: if a nested list contains something other than a string.
    """
    first_id = 1 if start_from_1 else 0
    for entry in columns:
        if isinstance(entry, str):
            group = [entry]
        elif isinstance(entry, list):
            group = entry
        else:
            continue

        # One mapping per entry: columns in the same group share it.
        mapping = {}
        next_id = first_id
        for name in group:
            if not isinstance(name, str):
                raise ValueError('最多支持二级list')
            if name not in df.columns:
                continue
            for value in df[name].unique().tolist():
                if value not in mapping:
                    mapping[value] = next_id
                    next_id += 1
            df[name] = df[name].map(mapping)


def min_max_normalize(df: pd.DataFrame, columns: list):
    """Rescale each listed column to the [0, 1] range, in place.

    Every value becomes ``(value - min) / (max - min)``. Column names that
    are not present in ``df`` are skipped. A column whose values are all
    equal is mapped to 0.0 rather than NaN.
    """
    for c in columns:
        if c not in df.columns:
            continue
        lowest = df[c].min()
        span = df[c].max() - lowest
        shifted = df[c] - lowest
        # A constant column has zero span; dividing would give 0 / 0 and turn
        # the whole column into NaN, so keep the (all-zero) shifted values.
        df[c] = shifted.astype(float) if span == 0 else shifted / span


def std_normalize(df: pd.DataFrame, columns: list):
    """Standardize each listed column to zero mean and unit std, in place.

    Every value becomes ``(value - mean) / std`` using pandas' default
    ``Series.std``. Column names that are not present in ``df`` are skipped.
    A column whose standard deviation is exactly zero is mapped to its
    centered values (all 0.0) rather than NaN.
    """
    for c in columns:
        if c not in df.columns:
            continue
        centered = df[c] - df[c].mean()
        spread = df[c].std()
        # Zero spread would give 0 / 0 and turn the whole column into NaN.
        df[c] = centered.astype(float) if spread == 0 else centered / spread


def multi_value_process(df: pd.DataFrame, column: str, sep: str):
    """Encode a multi-valued string column as lists of integer IDs.

    Each cell of ``df[column]`` is split on ``sep``; every distinct piece is
    given an ID in order of first appearance, starting at 1.

    Returns:
        A tuple ``(encoded, size)`` where ``encoded`` holds one list of IDs
        per row and ``size`` is the number of distinct pieces plus one.
    """
    vocab = {}
    encoded = [
        # setdefault evaluates len(vocab) + 1 before inserting, so a new
        # piece receives the next unused ID.
        [vocab.setdefault(piece, len(vocab) + 1) for piece in cell.split(sep)]
        for cell in df[column]
    ]
    return encoded, len(vocab) + 1

读取数据函数

In [None]:
def read_data(file: str, sample_size, sep, names=None, dtype=None):
    """Load a CSV file, optionally limited to its first rows.

    Args:
        file: Path of the CSV file.
        sample_size: Number of rows to read from the top of the file; a
            value of zero or less reads the whole file.
        sep: Field separator passed to ``pd.read_csv``.
        names: Optional column names passed to ``pd.read_csv``.
        dtype: Optional dtype specification passed to ``pd.read_csv``.

    Returns:
        The loaded ``pd.DataFrame``.

    Raises:
        FileNotFoundError: if ``file`` does not exist.
    """
    if not os.path.exists(file):
        e = f'The file: {file} not exists.'
        raise FileNotFoundError(e)
    # Passing nrows lets read_csv open and close the file itself; the earlier
    # iterator=True / get_chunk approach left the reader (and its file handle)
    # open after the rows had been fetched.
    nrows = sample_size if sample_size > 0 else None
    return pd.read_csv(file, names=names, sep=sep, dtype=dtype, nrows=nrows)

开始读取数据

In [None]:
# The behavior log (about 200 million rows per the original note) is too
# large to load fully, so only the first sample_num rows are read.
# NOTE(review): the original comment said 50 million rows, but 1e8 is
# 100 million — confirm which limit is intended.
sample_num = 1e8
SEP = ','

# A sample size of -1 makes read_data load the whole file.
users = read_data(f'{data_dir}/{files[0]}', -1, SEP, USER_NAMES)
items = read_data(f'{data_dir}/{files[1]}', -1, SEP, ITEM_NAMES)
behavior = read_data(f'{data_dir}/{files[2]}', sample_num, SEP, BEHAVIOR_NAMES)

In [None]:
# Clean up anomalous ages: any age that is not <= 75 is replaced by the
# truncated mean age computed over all users.
avg_age = int(users['Age'].mean())
users['Age'] = users['Age'].where(users['Age'] <= 75, avg_age)

观察 behavior 数据集，会发现其中存在 UserID、ItemID、BehaviorType 三者完全一致、只有 TimeStamp 不同的记录

In [None]:
# Preview the first 10 rows of the behavior log.
behavior.head(10)

因此合并三者都相同的项，使用 BehaviorCount 来表示交互的次数

In [None]:
# Collapse rows sharing (UserID, ItemID, BehaviorType) into a single row whose
# BehaviorCount is the number of times that combination occurred.
behavior = (
    behavior
    .drop(columns=['TimeStamp'])
    .groupby(['UserID', 'ItemID', 'BehaviorType'])
    .size()
    .reset_index(name='BehaviorCount')
)

将三个数据集进行合并

In [None]:
# Join user attributes on UserID, then item attributes on ItemID.
behavior = pd.merge(pd.merge(behavior, users, on='UserID'), items, on='ItemID')
behavior.head(10)

In [None]:
# NOTE(review): 'TimeStamp' was dropped from behavior before the merge, and
# min_max_normalize skips columns that are absent, so this call currently has
# no effect.
min_max_normalize(behavior, ['TimeStamp'])
# Re-encode the categorical columns as consecutive integer IDs starting at 1.
# UserCity and Item_city are passed as one nested list so they share a mapping.
mapped2sequential(behavior, ['UserID', 'ItemID', 'Occupation', 'CateID', 'BehaviorType', ['UserCity', 'Item_city']])
behavior.head(10)

In [None]:
# Encode the multi-valued label columns: each ';'-separated value becomes an
# integer ID. The second return value is the number of distinct values plus one.
uLabel, u_label_vocab = multi_value_process(behavior, 'uLabel', ';')
iLabel, i_label_vocab = multi_value_process(behavior, 'iLabel', ';')

In [None]:
# Swap the raw label strings for their encoded ID lists.
behavior = behavior.drop(columns=['uLabel', 'iLabel']).assign(uLabels=uLabel, iLabels=iLabel)
behavior.head(10)

为每个用户记录其所有交互过的 item

In [None]:
# For every user, gather the ItemIDs of all of that user's rows into one list.
item_list_per_user = (
    behavior
    .groupby('UserID')['ItemID']
    .apply(list)
    .reset_index()
)
item_list_per_user.head(10)

In [None]:
# Name the aggregated ItemID column 'InteractItems'.
item_list_per_user = item_list_per_user.rename(columns={'ItemID': 'InteractItems'})

In [None]:
# Attach each user's interacted-item list to every row of that user.
behavior = pd.merge(behavior, item_list_per_user, on='UserID')

In [None]:
# User-side column names followed by the two behavior columns.
query_col = [*USER_NAMES, 'BehaviorType', 'BehaviorCount']

In [None]:
# Shuffle all rows and renumber the index.
# NOTE(review): no random_state is passed to sample(), so the shuffle is not
# seeded here and the row order may differ between runs.
behavior = behavior.sample(frac=1).reset_index(drop=True)
behavior.shape

In [None]:
# Split the rows 90% / 10% into train and test sets.
# The row count is stored in n_rows rather than `len`: binding `len` at module
# level shadows the builtin, which breaks every later len(...) call in the
# notebook (including the ones inside multi_value_process).
n_rows = behavior.shape[0]
split_at = int(n_rows * 0.9)
train_data = behavior[:split_at].reset_index(drop=True)
test_data = behavior[split_at:].reset_index(drop=True)
train_data.shape, test_data.shape

In [None]:
# Item-side columns. The label ID lists are serialized to comma-joined strings
# so that the rows can be de-duplicated in the next step.
item_data = behavior[['ItemID', 'CateID', 'Item_city', 'iLabels']]
t = item_data['iLabels'].apply(lambda ids: ','.join(str(i) for i in ids))
t

In [None]:
# Put the string form of the labels in place of the lists, then keep one row
# per distinct item record.
item_data = item_data.assign(iLabels=t).drop_duplicates().reset_index(drop=True)
item_data.shape, items.shape

In [None]:
# Parse the comma-joined label strings back into lists of integer IDs.
t = item_data['iLabels'].apply(lambda joined: list(map(int, joined.split(','))))
item_data = item_data.assign(iLabels=t)

In [None]:
# Display the de-duplicated item table.
item_data

In [None]:
# User-side columns plus the behavior columns and the interacted-item list.
# Index 5 is USER_NAMES' 'uLabel', which is swapped for the encoded 'uLabels'.
query_col = USER_NAMES + ['BehaviorType', 'BehaviorCount', 'InteractItems']
query_col[5] = 'uLabels'
# Take the query-side values from the first row of behavior.
tmp = behavior.iloc[0, :][query_col]
# Repeat those values once per row of item_data, then place the item columns
# next to them so that the one query row is paired with every item.
item_df = pd.DataFrame([tmp.tolist()]*item_data.shape[0], columns=query_col)
item_df = pd.concat([item_df, item_data], axis=1)
# Reorder the columns to match train_data.
item_df = item_df[train_data.columns]
item_df

In [None]:
# Preview the first rows of the training split.
train_data.head()

In [None]:
# Show the final list of query-side column names.
print(query_col)