In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
import gc
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [2]:
%%time
path = './data/'
data = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'submission.csv')
#按customer_id和order_pay_time升序排序
data = data.sort_values(["customer_id", "order_pay_time"]).reset_index(drop=True)
data.head(3)

CPU times: user 11.9 s, sys: 1.34 s, total: 13.2 s
Wall time: 13.2 s


Unnamed: 0,order_detail_id,order_id,order_total_num,order_amount,order_total_payment,order_total_discount,order_pay_time,order_status,order_count,is_customer_rate,order_detail_status,order_detail_goods_num,order_detail_amount,order_detail_payment,order_detail_discount,customer_province,customer_city,member_id,customer_id,customer_gender,member_status,is_member_actived,goods_id,goods_class_id,goods_price,goods_status,goods_has_discount,goods_list_time,goods_delist_time
0,1000000,1000000,1.0,239.9,96.9,0.0,2012-11-01 00:10:56,6,1.0,0.0,6.0,1.0,96.9,96.9,143.0,北京,北京市,0.0,1000000,,,,998,998,54.909289,1.0,0.0,2014-10-25 11:08:07,2014-11-01 11:08:07
1,1181340,1155016,1.0,129.9,66.9,0.0,2013-08-03 21:34:49,6,1.0,0.0,6.0,1.0,66.9,66.9,63.0,湖南省,长沙市,0.0,1000014,,,,1038,1038,49.554744,1.0,0.0,2014-10-28 14:26:28,2014-11-04 14:26:28
2,1183974,1157250,2.0,299.8,99.9,0.0,2013-03-08 10:21:29,6,2.0,0.0,6.0,1.0,0.0,0.0,59.9,广东省,广州市,0.0,1000034,,,,2853,2853,41.310164,2.0,0.0,2014-01-10 15:54:23,2014-01-11 12:46:14


## 数据预处理

In [3]:
def reduce_mem_usage(df):
    """内存优化"""
    start_mem = df.memory_usage().sum() / 1024 ** 2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
def time2multi(df, dt_fea):
    """获取dt_fea的月、日、星期几以及小时数据；对于2012年的月份，还要进行 减12 处理"""
    df_res = pd.concat([df[dt_fea].dt.month, df[dt_fea].dt.day, df[dt_fea].dt.weekday + 1, df[dt_fea].dt.hour],
                       axis=1)
    df_res.columns = [dt_fea + "_" + f for f in ["month", "day", "weekday", "hour"]]
    df_res[dt_fea + "_month"] = df_res[dt_fea + "_month"].apply(lambda x: x - 12 if x > 8 else x)
    return pd.concat([df, df_res], axis=1)

In [5]:
def preprocess(data):
    """数据预处理"""
    # goods_id不连续，进行标签编码
    data["goods_id"] = pd.factorize(data["goods_id"])[0]
    # goods_id和goods_class_id取值完全一样，因此删除goods_class_id列
    data.drop(["goods_class_id"], axis=1, inplace=True)
    # 删除is_member_actived列
    data.drop(["is_member_actived"], axis=1, inplace=True)
    # 按goods_id使用价格均值填充goods_price
    for g_id in set(data[data["goods_price"].isna()]["goods_id"]):
        data["goods_price"][data["goods_id"] == g_id] = data["goods_price"][data["goods_id"] == g_id].fillna(
            data["goods_price"][data["goods_id"] == g_id].mean())
    # customer_province和customer_city填充缺失值“未知”，标签编码
    data["customer_province"].fillna("未知", inplace=True)
    data["customer_city"].fillna("未知", inplace=True)
    data["customer_province"] = pd.factorize(data["customer_province"])[0]
    data["customer_city"] = pd.factorize(data["customer_city"])[0]
    # 处理id字段
    data['order_detail_id'] = data['order_detail_id'].astype(np.uint32)
    data['order_id'] = data['order_id'].astype(np.uint32)
    data['customer_id'] = data['customer_id'].astype(np.uint32)
    data['goods_id'] = data['goods_id'].astype(np.uint32)
    # train['goods_class_id'] = train['goods_class_id'].astype(np.uint32)
    data['member_id'] = data['member_id'].astype(np.uint32)
    # 处理状态字段，这里同时处理空值，将空值置为0
    data['order_status'] = data['order_status'].astype(np.uint8)
    data['goods_has_discount'] = data['goods_has_discount'].astype(np.uint8)
    data["customer_gender"] = data["customer_gender"].fillna(0).astype(np.uint8)  # 填充0
    data["member_status"] = data["member_status"].fillna(0).astype(np.uint8)  # 填充0
    data['is_customer_rate'] = data['is_customer_rate'].astype(np.uint8)
    data['order_detail_status'] = data['order_detail_status'].astype(np.uint8)
    # 处理日期
    data['goods_list_time'] = pd.to_datetime(data['goods_list_time'], format="%Y-%m-%d %H:%M:%S")
    data['order_pay_time'] = pd.to_datetime(data['order_pay_time'], format="%Y-%m-%d %H:%M:%S")
    data['goods_delist_time'] = pd.to_datetime(data['goods_delist_time'], format="%Y-%m-%d %H:%M:%S")
    data = time2multi(data, "order_pay_time")
    return data

In [6]:
data = preprocess(data)

## 划分数据集

In [7]:
def split_dataset(data, pred_month=8):
    """
    根据待预测的时间pred_month，以pred_month的用户复购情况作为label.
    需要注意的是，如果在pred_month之前没产生过购买数据的用户应进行剔除，不属于复购预测。
    """
    # 待预测用户id（即pred_month之前是否已产生过购买行为）
    pred_cust_id = set(data[data["order_pay_time_month"] < pred_month]["customer_id"])
    # 进行购买的用户id
    true_cust_id = set(data[data["order_pay_time_month"] == pred_month]["customer_id"])
    train_y = data[data["customer_id"].isin(pred_cust_id)][["customer_id"]].drop_duplicates().reset_index(drop=True)
    train_y['label'] = train_y["customer_id"].apply(lambda x: 1 if x in true_cust_id else 0)
    train_data = data[(data["order_pay_time_month"] < pred_month)].reset_index(drop=True)
    return train_data, train_y

In [8]:
# 验证集的预测月份为8月
tra_data, tra_y = split_dataset(data, pred_month=8)  

## 提取特征

In [9]:
def get_time_series_features(raw_data, pred_month, months=None, days=None):
    """获取最近months个月客户的时序特征，当months=None时，按全部时间获取。"""
    if months:  # 如果不是None
        data = raw_data[(raw_data["order_pay_time_month"] < pred_month) & (
                    raw_data["order_pay_time_month"] >= pred_month - months)].reset_index(drop=True)
    else:
        if not isinstance(days, int):
            raise Exception("days必须为整数.")
        end_time = datetime.strptime(f"2013-{pred_month}", "%Y-%m")
        beg_time = end_time - timedelta(days=days)
        data = raw_data[(raw_data["order_pay_time"] >= beg_time) & (raw_data["order_pay_time"] < end_time)].reset_index(drop=True)
    t = datetime.strptime(f'2013-{pred_month}', "%Y-%m")  # 待预测月份的第一天0时
    data["order_pay_time_diff"] = (t - data["order_pay_time"]).dt.seconds / 3600  # 交易过去了多少小时
    data["order_all_discount"] = data["order_amount"] - data["order_total_payment"]
    if months:  # 如果不是None
        data["weight_time"] = 1 - data["order_pay_time_diff"] / (months * 30 * 24) / 2
    else:
        data["weight_time"] = 1 - data["order_pay_time_diff"] / (days * 24) / 2
    df = data[["customer_id"]].drop_duplicates().set_index("customer_id")
    df["not_nan"] = 1
    gb_cust = data.groupby("customer_id")  # 复用，提高效率
    # RFM模型的基本特征
    df["Recency"] = gb_cust["order_pay_time_diff"].min()  # 最近一次交易过去了多少小时
    df["Freq_time"] = gb_cust["order_pay_time"].nunique()  # 按付款时间计算频率，下单次数
    df["Freq_time_weight"] = data.groupby(["customer_id", "order_pay_time"])["weight_time"].last().groupby(
        "customer_id").sum()
    df["Freq_detail_order"] = gb_cust["order_detail_id"].nunique()  # 按详单个数计算频率，下单个数
    df["Freq_detail_order_weight"] = data.groupby(["customer_id", "order_detail_id"])["weight_time"].last().groupby(
        "customer_id").sum()
    df["Freq_order"] = gb_cust["order_id"].nunique()  # 按商家个数计算频率
    df["Freq_order_weight"] = data.groupby(["customer_id", "order_id"])["weight_time"].last().groupby(
        "customer_id").sum()
    df["Freq_avg_detail"] = gb_cust[["order_detail_id", "order_id"]].nunique().apply(
        lambda x: x["order_detail_id"] / x["order_id"], axis=1)  # 按详单个数/商家个数计算频率
    df["Monetary"] = gb_cust["order_total_payment"].sum()
    df["Monetary_weight"] = data.groupby(["customer_id", "order_id"])["order_total_payment", "weight_time"].last().prod(
        axis=1) \
        .groupby("customer_id").sum()
    # 计算加权平均每个月下单人数比例
    weight = data.groupby(["order_pay_time_month"])["weight_time"].max()  # 权重
    num_buy = ((data.groupby(["order_pay_time_month"])[
                    "customer_id"].nunique() * weight) / weight.sum()).mean()  # 加权平均每个月下单人数
    num_cust = len(raw_data[raw_data["order_pay_time_month"] < pred_month]["customer_id"].drop_duplicates())
    ratio_buy = num_buy / num_cust
    # 计算划分阈值
    threshold_r, threshold_f, threshold_m = df["Recency"].quantile(q=ratio_buy), df["Freq_time_weight"].quantile(
        q=1 - ratio_buy), df["Monetary_weight"].quantile(q=1 - ratio_buy)
    # RFM模型的8分类
    df["R_high"] = df["Recency"].apply(lambda x: 1 if x <= threshold_r else 0)
    df["F_high"] = df["Freq_time_weight"].apply(lambda x: 1 if x >= threshold_f else 0)
    df["M_high"] = df["Monetary_weight"].apply(lambda x: 1 if x >= threshold_m else 0)
    df["RFM_cate"] = pd.factorize(df[["M_high", "R_high", "F_high"]].applymap(str).apply(lambda x: ''.join(x), axis=1))[
        0]
    # 其他交易特征
    df["total_goods_num"] = gb_cust["order_total_num"].sum()  # 购买商品总数
    df["pay_per_time"] = df["Monetary"] / df["Freq_time"]  # 平均每次交易的支出
    df["pay_per_time_weight"] = df["Monetary_weight"] / df["Freq_time"]
    df["pay_per_good"] = df["Monetary"] / df["total_goods_num"]  # 平均每件商品的支出
    df["pay_per_good_weight"] = df["Monetary_weight"] / df["total_goods_num"]
    df["num_per_time"] = df["total_goods_num"] / df["Freq_time"]  # 平均每次交易的商品购买数量
    df["goods_list_time_diff"] = (t - gb_cust["goods_list_time"].last()).dt.seconds / 3600
    df["goods_delist_time_diff"] = (t - gb_cust["goods_delist_time"].last()).dt.seconds / 3600
    # 总优惠率、商家优惠率和子订单优惠率
    df["order_all_discount_rate"] = gb_cust["order_all_discount"].sum() / gb_cust["order_amount"].sum()
    df["order_discount_rate"] = gb_cust["order_total_discount"].sum() / gb_cust["order_amount"].sum()
    df["order_detail_discount_rate"] = gb_cust["order_detail_discount"].sum() / (
                gb_cust["order_detail_discount"].sum() + gb_cust["order_detail_payment"].sum())
    df["goods_has_discount_rate"] = gb_cust["goods_has_discount"].mean()
    df.drop("total_goods_num", axis=1, inplace=True)  # 该特征后面还会产生，避免重复
    # 统计特征暴力衍生
    stats_dict = {
        "order_total_num": ["mean", "sum"],  # total_goods_num = sum(order_total_num)
        "order_total_payment": ["mean", "std"],  # Monetary = sum(order_total_payment)
        "order_amount": ["mean", "sum", "std"], "order_detail_payment": ["mean", "sum", "std"],
        "order_all_discount": ["mean", "sum", "std"], "order_total_discount": ["mean", "sum", "std"],
        "order_detail_discount": ["mean", "sum", "std"],
    }
    tmp = gb_cust.agg(stats_dict)
    tmp.columns = ["_".join(tup) for tup in tmp.columns]
    # 部分特征按时间加权处理
    for f in stats_dict.keys():
        stats_ls = [x for x in stats_dict[f] if x != "std"]  # 不处理std特征
        if "detail" in f:  # 按子订单划分
            tmp1 = data.groupby(["customer_id", "order_detail_id"])[f, "weight_time"].last().prod(axis=1) \
                .groupby("customer_id").agg(stats_ls)
        else:  # 按订单划分
            tmp1 = data.groupby(["customer_id", "order_id"])[f, "weight_time"].last().prod(axis=1) \
                .groupby("customer_id").agg(stats_ls)
        tmp1.columns = [f + "_weight_" + m for m in stats_ls]
        tmp = tmp.merge(tmp1, how="left", on="customer_id")
    # 交叉特征衍生？

    # 按全部用户id合并
    df = pd.merge(raw_data[["customer_id"]].drop_duplicates(), df, how="left", on="customer_id")
    df = pd.merge(df, tmp, how="left", on="customer_id")
    # 处理时间diff缺失值
    time_feas = ["Recency", "goods_list_time_diff", "goods_delist_time_diff"]
    for fea in time_feas:
        df[fea].fillna(df[fea].max() * 1.5, inplace=True)  # 取一个足够大的值
    # 0值填充
    for fea in df.columns:
        if fea not in time_feas:
            df[fea].fillna(0, inplace=True)
    if months is not None:  # 修改特征名
        df.columns = ["customer_id"] + [f + f'_{months}m' for f in df.columns if f != "customer_id"]
    else:
        df.columns = ["customer_id"] + [f + f'_{days}d' for f in df.columns if f != "customer_id"]
    return df

In [10]:
def get_features(data, pred_month, ts_months=None, ts_days=None):
    """获取数据集中用户的相关特征。"""
    cust_df = data[['customer_id']].drop_duplicates().set_index("customer_id")
    # 静态特征
    gb_cust = data.groupby("customer_id")  # 复用，提高效率
    cust_df["cust_gender"] = gb_cust["customer_gender"].max()  # customer_gender同时存在两个值（其中必有未知性别值0）的，取已知性别值
    #     cust_df["member_id_last"] = data.groupby("customer_id")["member_id"].last()
    cust_df["member_status"] = gb_cust["member_status"].last()
    cust_df["cust_province_last"] = gb_cust["customer_province"].last()
    cust_df["cust_province_count"] = gb_cust["customer_province"].count()
    cust_df["cust_city_last"] = gb_cust["customer_city"].last()
    cust_df["cust_city_count"] = gb_cust["customer_city"].count()
    cust_df["cust_rate_ratio"] = (
                data[data["is_customer_rate"] == 1].groupby("customer_id")["is_customer_rate"].count()
                / gb_cust["is_customer_rate"].count()).fillna(0)  # 用户对订单进行评价的比例
    # 商品相关
    cust_df["goods_id_last"] = gb_cust["goods_id"].last()
    cust_df.reset_index(inplace=True)
    # 时序特征
    # 按月滑窗
    if isinstance(ts_months, list):
        for months in ts_months:
            ts = get_time_series_features(data, pred_month, months)
            cust_df = cust_df.merge(ts, on="customer_id", how='left')
        # 只保留时间窗口最长的Recency特征
        if None in ts_months:
            raise Exception("Something wrong!")
            # cust_df.drop([f for f in cust_df.columns if f.startswith("Recency_")], axis=1, inplace=True)
        else:
            cust_df.drop(
                sorted([f for f in cust_df.columns if f.startswith("Recency")], key=lambda x: int(x[-2]))[:-1],
                axis=1, inplace=True)
    else:
        ts = get_time_series_features(data, pred_month, ts_months)
        cust_df = cust_df.merge(ts, on="customer_id", how='left')
    # 按日滑窗
    if ts_days:
        if isinstance(ts_days, list):
            for days in ts_days:
                ts = get_time_series_features(data, pred_month, days=days)
                cust_df = cust_df.merge(ts, on="customer_id", how='left')
            if ts_months is None:
                cust_df.drop(
                    sorted([f for f in cust_df.columns if f.startswith("Recency") and f.endswith("d")], key=lambda x: int(x[-2]))[:-1],
                    axis=1, inplace=True)
            else:
                cust_df.drop([f for f in cust_df.columns if f.startswith("Recency") and f.endswith("d")], axis=1, inplace=True)
        else:
            raise Exception("ts_days请使用list格式.")
    # 内存优化
    cust_df = reduce_mem_usage(cust_df)
    return cust_df

In [17]:
%%time
months = [1, 2, 3, 4, 5]  # 使用5种滑动窗口提取特征
days = [4, 7, 14, 21]
# tra_X = get_features(tra_data, pred_month=7, ts_months=months)
tra_X = get_features(tra_data, pred_month=8, ts_months=months, ts_days=days)
test_X = get_features(data, pred_month=9, ts_months=months, ts_days=days)

Memory usage after optimization is: 1496.22 MB
Decreased by 73.8%
Memory usage after optimization is: 1638.05 MB
Decreased by 74.1%
CPU times: user 12min 2s, sys: 16min 10s, total: 28min 13s
Wall time: 28min 31s


In [19]:
# 保存数据
tra_X.to_pickle(path + "train_X.pkl")
test_X.to_pickle(path + "test_X.pkl")

## 模型训练

In [20]:
# 分类变量
cate_feas = ["cust_gender", "member_status", "cust_province_last", "cust_city_last", "goods_id_last"] + \
             [f for f in tra_X.columns if f[:-3] in ["not_nan", "R_high", "F_high", "M_high", "RFM_cate"]]
len(cate_feas)

40

In [21]:
train_data = tra_X.copy()
train_data = train_data.merge(tra_y, on='customer_id')

In [56]:
%%time
# 采用CV=5折交叉验证
kf = KFold(n_splits=5, shuffle=True)
y_pred = 0
for train_index , valid_index in kf.split(train_data):
    # 设置每一折的 train和valid
    X_train, X_valid, y_train, y_valid = train_data.drop('label', axis=1).iloc[train_index], \
        train_data.drop('label', axis=1).iloc[valid_index], train_data['label'].values[train_index], \
        train_data['label'].values[valid_index]
    param = {
        'num_leaves': 63,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.05,
        'metric': 'auc',
        'verbose': -1,
    }
    # 使用lgb进行训练
    trn_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_valid, y_valid)
    lgbm = lgb.train(param, trn_data, valid_sets=[trn_data, val_data], num_boost_round=10000, 
                     early_stopping_rounds=100, verbose_eval=50, categorical_feature=cate_feas)
    # 计算test的预测结果
    y_pred = y_pred + lgbm.predict(test_X) * 0.2

Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.867352	valid_1's auc: 0.812382
[100]	training's auc: 0.888371	valid_1's auc: 0.81113
Early stopping, best iteration is:
[43]	training's auc: 0.863185	valid_1's auc: 0.812542
Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.867076	valid_1's auc: 0.81219
[100]	training's auc: 0.887511	valid_1's auc: 0.809376
Early stopping, best iteration is:
[29]	training's auc: 0.854128	valid_1's auc: 0.812478
Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.865886	valid_1's auc: 0.813909
[100]	training's auc: 0.887548	valid_1's auc: 0.813045
[150]	training's auc: 0.894887	valid_1's auc: 0.811677
Early stopping, best iteration is:
[57]	training's auc: 0.869847	valid_1's auc: 0.814088
Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.86662	valid_1's auc: 0.815182
[100]	training's auc: 0.886581	valid_1's auc: 0.8134

In [70]:
test["result"] = y_pred
# threshold = test["result"].quantile(q=1-0.12)  # 8月的复购比例为0.12
threshold = test["result"].mean()
test["result"] = test["result"].apply(lambda x: 1 if x >= threshold else 0)
test

Unnamed: 0,customer_id,result
0,1000000,0
1,1000014,1
2,1000034,0
3,1000046,1
4,1000048,0
...,...,...
1585981,2826570,1
1585982,2826571,0
1585983,2826572,0
1585984,2826573,0


In [71]:
test.to_csv("./result/res_20201226_2.csv", index=None)