In [1]:
import pandas as pd
import numpy as np
from datetime import date
import datetime as dt

# 获取数据

In [2]:
off_train = pd.read_csv('./data/ccf_offline_stage1_train.csv', keep_default_na=False, header=0)
off_test = pd.read_csv('./data/ccf_offline_stage1_test_revised.csv',keep_default_na=False, header=0)
on_train = pd.read_csv('./data/ccf_online_stage1_train.csv', keep_default_na=False, header=0)

In [3]:
# 更换特征名称
off_train.columns=['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']
off_test.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received']
on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date']


# 数据分批处理

#将数据分为3个数据集 利用滑窗法
#将2016年1月1日到4月13日的数据提取特征，利用4月14日的到5月14日的作为测试集
#将2月1日到5月14日的作为数据集提取特征，利用5月15日6月15日的作为测试集
#将3月15日到6月30日作为数据集提取特征，再测试7月1日到7月31日的数据

In [4]:
# 将2016年1月1日到4月13日的数据提取特征
feature1 = off_train[(off_train.date >= '20160101') & (off_train.date <= '20160413') | (
    (off_train.date == 'null') & (off_train.date_received >= '20160101') & (off_train.date_received <= '20160413'))]
# 利用4月14日的到5月14日的作为测试集
dataset1 = off_train[(off_train.date_received >= '201604014')
                     & (off_train.date_received <= '20160514')]

In [5]:
# 在2月1日到5月14日之间使用了券,只要领取时间在2月1日到5月14日之间,并包括没有数据中没有领取券的
feature2 = off_train[(off_train.date >= '20160201') & (off_train.date <= '20160514') | (
    (off_train.date == 'null') & (off_train.date_received >= '20160201') & (off_train.date_received <= '20160514'))]
# 提取数据集2的测试集
dataset2 = off_train[(off_train.date_received >= '20160515')
                     & (off_train.date_received <= '20160615')]

In [6]:
# 数据集3的特征为 取数据中领券和用券日期大于3月15日和小于6月30日的
feature3 = off_train[((off_train.date >= '20160315') & (off_train.date <= '20160630')) | (
    (off_train.date == 'null') & (off_train.date_received >= '20160315') & (off_train.date_received <= '20160630'))]
# 使数据集3等于test集 没有label标签
dataset3 = off_test

# 特征工程

## 用户特征处理

In [7]:
def is_firstlastone(x):
    if x == 0:
        return 1
    elif x > 0:
        return 0
    else:
        return -1  # 表明这个优惠券只接受了一次

def get_day_gap_before(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        # 将时间差转化为天数
        this_gap = (dt.date(int(date_received[0:4]), int(date_received[4:6]), int(
            date_received[6:8]))-dt.date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)


def get_day_gap_after(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (dt.datetime(int(d[0:4]), int(d[4:6]), int(d[6:8]))-dt.datetime(
            int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)

In [8]:
def GetOtherFeature(dataset):
    # 对于测试集，提取用户的Id
    dataset3 = dataset
    t = dataset3[['user_id']]
    # 相当于给原有数据加上一列，这个月用户收取的所有优惠券数目，并初始化为1
    t['this_month_user_receive_all_coupon_count'] = 1

    # 将t按照用户id进行分组，然后统计所有用户收取的优惠券数目,并初始化一个索引值
    t = t.groupby('user_id').agg('sum').reset_index()
    # 提取数据集的优惠券Id和用户Id
    t1 = dataset3[['user_id', 'coupon_id']]
    # 提取这个月用户收到的相同的优惠券的数量
    t1['this_month_user_receive_same_coupn_count'] = 1
    t1 = t1.groupby(['user_id', 'coupon_id']).agg('sum').reset_index()

    # 提取数据集的用户id，优惠券id以及优惠券接收的时间
    t2 = dataset3[['user_id', 'coupon_id', 'date_received']]

    # 将数据转换为str类型
    t2.date_received = t2.date_received.astype('str')
    # 如果出现相同的用户接收相同的优惠券在接收时间上用‘：’连接上第n次接受优惠券的时间
    t2 = t2.groupby(['user_id', 'coupon_id'])['date_received'].agg(
        lambda x: ':'.join(x)).reset_index()
    # 将接收时间的一组按着':'分开，这样就可以计算接受了优惠券的数量,apply是合并
    t2['receive_number'] = t2.date_received.apply(lambda s: len(s.split(':')))
    t2 = t2[t2.receive_number > 1]
    # 最大接受的日期
    t2['max_date_received'] = t2.date_received.apply(
        lambda s: max([int(d) for d in s.split(':')]))
    # 最小的接收日期
    t2['min_date_received'] = t2.date_received.apply(
        lambda s: min([int(d) for d in s.split(':')]))
    t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]

    t3 = dataset3[['user_id', 'coupon_id', 'date_received']]
    # 将两表融合只保留左表数据,这样得到的表，相当于保留了最近接收时间和最远接受时间
    t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
    # 这个优惠券最近接受时间
    t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - \
        t3.date_received.astype(int)
    # 这个优惠券最远接受时间
    t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype(
        int)-t3.min_date_received

    t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(
        is_firstlastone)
    t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(
        is_firstlastone)
    t3 = t3[['user_id', 'coupon_id', 'date_received', 'this_month_user_receive_same_coupon_lastone',
             'this_month_user_receive_same_coupon_firstone']]
    # 将表格中接收优惠券日期中为最近和最远的日期时置为1其余为0，若只接受了一次优惠券为-1

    # 提取第四个特征,一个用户所接收到的所有优惠券的数量
    t4 = dataset3[['user_id', 'date_received']]
    t4['this_day_receive_all_coupon_count'] = 1
    t4 = t4.groupby(['user_id', 'date_received']).agg('sum').reset_index()

    # 提取第五个特征,一个用户不同时间所接收到不同优惠券的数量
    t5 = dataset3[['user_id', 'coupon_id', 'date_received']]
    t5['this_day_user_receive_same_coupon_count'] = 1
    t5 = t5.groupby(['user_id', 'coupon_id', 'date_received']
                    ).agg('sum').reset_index()
    # 一个用户不同优惠券 的接受时间
    t6 = dataset3[['user_id', 'coupon_id', 'date_received']]
    t6.date_received = t6.date_received.astype('str')
    t6 = t6.groupby(['user_id', 'coupon_id'])['date_received'].agg(
        lambda x: ':'.join(x)).reset_index()
    # 重命名inplace代表深拷贝
    t6.rename(columns={'date_received': 'dates'}, inplace=True)


    t7 = dataset3[['user_id', 'coupon_id', 'date_received']]
    # 将t6和t7融合
    t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
    # 注意这里所有的时间格式都已经是'str'格式
    t7['date_received_date'] = t7.date_received.astype('str')+'-'+t7.dates
    # print(t7)
    t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
    t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
    t7 = t7[['user_id', 'coupon_id', 'date_received',
             'day_gap_before', 'day_gap_after']]

    # 将所有特征融合在一张表中
    other_feature3 = pd.merge(t1, t, on='user_id')
    other_feature3 = pd.merge(other_feature3, t3, on=['user_id', 'coupon_id'])
    other_feature3 = pd.merge(other_feature3, t4, on=['user_id', 'date_received'])
    other_feature3 = pd.merge(other_feature3, t5, on=[
                              'user_id', 'coupon_id', 'date_received'])
    other_feature3 = pd.merge(other_feature3, t7, on=[
                              'user_id', 'coupon_id', 'date_received'])
    return other_feature3

## 提取优惠券的相关特征

In [9]:
def calc_discount_rate(s):
    s = str(s)
    s = s.split(':')
    if len(s) == 1:
        return float(s[0])
    else:
        return 1.0-float(s[1])/float(s[0])
def get_discount_man(s):
    s = str(s)
    s = s.split(':')
    if len(s)==1:
        return 'null'
    else:
        return int(s[0])
def get_discount_jian(s):
    s = str(s)
    s = s.split(':')
    if len(s) == 1:
        return 'null'
    else:
        return int(s[1])

def is_man_jian(s):
    s = str(s)
    s = s.split(':')
    if len(s)==1:
        return 0
    else:
        return 1 

In [10]:
def GetCouponFeature(dataset, feature):
    # 对于数据集
    # 将时间转化为第几周
    # 显示时间是第几周
    # tt是获取到的特征中消费的最大时间
    dataset3 = dataset
    tt = feature[feature.date != 'null'].date.unique().max()
    dataset3['day_of_week'] = dataset3.date_received.astype('str').apply(
        lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday()+1)
    # 显示时间是几月
    dataset3['day_of_month'] = dataset3.date_received.astype(
        'str').apply(lambda x: int(x[6:8]))
    # 显示时期和截止日之间的天数
    dataset3['days_distance'] = dataset3.date_received.astype('str').apply(
        lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))-date(int(tt[0:4]), int(tt[4:6]), int(tt[6:8]))).days)
    # 显示满了多少钱后开始减
    dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
    # 显示满减的减少的钱
    dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian)
    # 返回优惠券是否是满减券
    dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian)
    # 显示打折力度
    dataset3['discount_rate'] = dataset3.discount_rate.apply(
        calc_discount_rate)
    d = dataset3[['coupon_id']]
    d['coupon_count'] = 1
    # 显示每一种优惠券的数量
    d = d.groupby('coupon_id').agg('sum').reset_index()

    dataset3 = pd.merge(dataset3, d, on='coupon_id', how='left')
    return dataset3

## 提取商品的特征

In [11]:
def GetMerchantFeature(feature):
    #提取商品的特征
    #对于数据集
    feature3 = feature
    merchant3 = feature3[['merchant_id','coupon_id','distance','date_received','date']]

    t = merchant3[['merchant_id']]
    #删除重复行数据
    t.drop_duplicates(inplace=True)

    #显示卖出的商品
    t1 = merchant3[merchant3.date!='null'][['merchant_id']]
    t1['total_sales'] = 1
    #显示每个商品的销售数量
    t1 = t1.groupby('merchant_id').agg('sum').reset_index()


    #显示使用了优惠券消费的商品，正样本
    t2 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id']]
    t2['sales_use_coupon'] = 1
    t2 = t2.groupby('merchant_id').agg('sum').reset_index()


    #显示了商品的优惠券的总数量
    t3 = merchant3[merchant3.coupon_id != 'null'][['merchant_id']]
    t3 ['total_coupon'] = 1
    t3 = t3.groupby('merchant_id').agg('sum').reset_index()

    #显示商品销量和距离的关系
    t4 = merchant3[(merchant3.date != 'null')&(merchant3.coupon_id != 'null')][['merchant_id','distance']]
    #把数据中的null值全部替换为-1
    t4.replace('null',-1,inplace=True)
    t4.distance = t4.distance.astype('int')
    #再把数据中的-1全部替换为NaN
    t4.replace(-1,np.nan,inplace=True)

    #返回用户离商品的距离最小值
    t5 = t4.groupby('merchant_id').agg('min').reset_index()
    t5.rename(columns={'distance':'merchant_min_distance'},inplace = True)

    #返回用户离商品的距离最大值
    t6 = t4.groupby('merchant_id').agg('max').reset_index()
    t6.rename(columns={'distance':'merchant_max_distance'},inplace = True)
    #print(t6)

    #返回距离的平均值
    t7 = t4.groupby('merchant_id').agg('mean').reset_index()
    t7.rename(columns = {'distance':'merchant_mean_distance'},inplace= True)

    #返回距离的中位值
    t8 = t4.groupby('merchant_id').agg('median').reset_index()
    t8.rename(columns={'distance':'merchant_median_distance'},inplace = True)

    merchant3_feature = pd.merge(t,t1,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t2,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t3,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t5,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t6,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t7,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t8,on='merchant_id',how='left')

    #将数据中的NaN用0来替换
    merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(np.nan,0)
    #即优惠券的使用率
    merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon.astype('float')/merchant3_feature.total_coupon
    #即卖出商品中使用优惠券的占比
    merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales
    #将数据中的NaN用0来替换
    merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(np.nan,0)
    return merchant3_feature

## 用户的相关信息

In [12]:
def get_user_date_datereceived_gap(s):
    s = s.split(':')
    return (date(int(s[0][0:4]), int(s[0][4:6]), int(s[0][6:8])) - date(int(s[1][0:4]), int(s[1][4:6]), int(s[1][6:8]))).days

In [13]:
def GetUserRelateInfo(feature):
    
    feature3 = feature
    user3 = feature3[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]

    t = user3[['user_id']]
    #去掉数据中重复的用户Id
    t.drop_duplicates(inplace=True)

    #用户购买商品的种类数
    t1 = user3[user3.date!='null'][['user_id','merchant_id']]
    #同样去掉重复用的用户id和商品id
    t1.drop_duplicates(inplace=True)
    t1.merchant_id = 1
    t1 = t1.groupby('user_id').agg('sum').reset_index()
    t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)


    #使用了优惠券购买商品的用户id和距离
    t2 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id','distance']]
    #将null值替换为-1
    t2.replace('null',-1,inplace=True)
    t2.distance = t2.distance.astype('int')#转换数据类型为int
    t2.replace(-1,np.nan,inplace=True)

    #得到使用优惠券购买商品的用户离店铺的最短距离
    t3 = t2.groupby('user_id').agg('min').reset_index()
    t3.rename(columns={'distance':'user_min_distance'},inplace=True)

    #得到最大距离
    t4 = t2.groupby('user_id').agg('max').reset_index()
    t4.rename(columns={'distance':'user_max_distance'},inplace=True)

    #得到平均距离
    t5 = t2.groupby('user_id').agg('mean').reset_index()
    t5.rename(columns={'distance':'user_mean_distance'},inplace=True)

    #得到中间距离
    t6 = t2.groupby('user_id').agg('median').reset_index()
    t6.rename(columns={'distance':'user_median_distance'},inplace=True)

    #每个用户使用优惠券购买的物品数量
    t7 = user3[(user3.date != 'null')&(user3.coupon_id != 'null')][['user_id']]
    t7['buy_use_coupon'] = 1
    t7 = t7.groupby('user_id').agg('sum').reset_index()

    #购买物品的总数
    t8 = user3[user3.date != 'null'][['user_id']]
    t8['buy_total'] = 1
    t8 = t8.groupby('user_id').agg('sum').reset_index()

    #接受的优惠券的总数
    t9 = user3[user3.coupon_id != 'null'][['user_id']]
    t9['coupon_received'] = 1
    t9 = t9.groupby('user_id').agg('sum').reset_index()

    #接受到优惠券的日期和使用之间的间隔
    t10 = user3[(user3.date_received != 'null')&(user3.date != 'null')][['user_id','date_received','date']]
    t10['user_date_datereceived_gap'] = t10.date + ':'+ t10.date_received
    t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
    t10 = t10[['user_id','user_date_datereceived_gap']]

    #将用户优惠券使用时间的间隔取平均数
    t11 = t10.groupby('user_id').agg('mean').reset_index()
    t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)

    #间隔天数的最小值
    t12 = t10.groupby('user_id').agg('min').reset_index()
    t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)

    #间隔天数的最大值
    t13 = t10.groupby('user_id').agg('max').reset_index()
    t13.rename(columns={'user_date_datareceived_gap':'max_user_date_datereceived_gap'},inplace=True)

    #将提取的特征合并
    user3_feature = pd.merge(t,t1,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t3,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t4,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t5,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t6,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t7,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t8,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t9,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t11,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t12,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t13,on='user_id',how='left')

    user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan,0)
    user3_feature.buy_user_coupon = user3_feature.buy_use_coupon.replace(np.nan,0)
    user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.buy_total.astype('float')#使用优惠券购买的商品占总数的多少
    user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.coupon_received.astype('float')
    user3_feature.buy_total = user3_feature.buy_total.replace(np.nan,0)#将数据中的NaN值转为0
    user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan,0)
    return user3_feature

## 用户和商家之间的特征关系

In [14]:
def GetUserMerchantRelateInfo(feature):
    #4.user_merchant:
    #times_user_buy_merchant_before. 
    feature3 = feature
    all_user_merchant = feature3[['user_id','merchant_id']]
    all_user_merchant.drop_duplicates(inplace=True)
    
    #只保留销售了商品的商户id
    t = feature3[['user_id','merchant_id','date']]
    t = t[t.date!='null'][['user_id','merchant_id']]
    
    #用户一共买了这家商户的多少商品
    t['user_merchant_buy_total'] = 1
    t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t.drop_duplicates(inplace=True)

    t1 = feature3[['user_id','merchant_id','coupon_id']]
    t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
    
    #用户一共收到一个商户的多少优惠券
    t1['user_merchant_received'] = 1
    t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t1.drop_duplicates(inplace = True)

    t2 = feature3[['user_id','merchant_id','date','date_received']]
    t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
    
    #用户在一家商户中使用优惠券购买的商品的数目
    t2['user_merchant_buy_use_coupon'] = 1
    t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t2.drop_duplicates(inplace = True)

    #用户在一家商家的所有记录总数
    t3 = feature3[['user_id','merchant_id']]
    t3['user_merchant_any'] = 1
    t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t3.drop_duplicates(inplace = True)

    t4 = feature3[['user_id','merchant_id','date','coupon_id']]
    t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
    
    #用户没有使用优惠券购买的商品的数目
    t4['user_merchant_buy_common'] = 1
    t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t4.drop_duplicates(inplace = True)

    user_merchant3 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t1,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t2,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t3,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t4,on=['user_id','merchant_id'],how='left')
    
    #都是针对一家商户和一个用户
    user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(np.nan,0)
    user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(np.nan,0)
    #y优惠券的转换率，用户使用了的优惠券/一共收到的优惠券
    user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_received.astype('float')
    #用户使用优惠券的概率，在一家商户使用优惠券购买的商品/在一家商户购买商品的总数
    user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
    #用户在商户消费的概率 用户在商户购买的总数/在一家商户浏览的总次数
    user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total.astype('float') / user_merchant3.user_merchant_any.astype('float')
    #用户在一家商户不适用优惠券购买的概率 普通购买的商品数/购买商品的总数
    user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
    return user_merchant3

## 构建训练集和测试集

In [15]:
def get_label(s):
    s = s.split(':')
    if s[0]=='null':
        return 0
    elif (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8]))-date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days<=15:
        return 1
    else:
        return -1 

In [16]:
def GenerateData(dataset, feature, label=True):
    # 获取各个特征处理后的结果
    coupon_feature = GetCouponFeature(dataset, feature)
    merchant_feature = GetMerchantFeature(feature)
    user_feature = GetUserRelateInfo(feature)
    user_merchant = GetUserMerchantRelateInfo(feature)
    other_feature = GetOtherFeature(dataset)

    dataset = pd.merge(coupon_feature, merchant_feature,
                       on='merchant_id', how='left')
    dataset = pd.merge(dataset, user_feature, on='user_id', how='left')
    dataset = pd.merge(dataset, user_merchant, on=[
                       'user_id', 'merchant_id'], how='left')
    dataset = pd.merge(dataset, other_feature, on=[
                       'user_id', 'coupon_id', 'date_received'], how='left')
    dataset.drop_duplicates(inplace=True)

    dataset.user_merchant_buy_total = dataset.user_merchant_buy_total.replace(
        np.nan, 0)
    dataset.user_merchant_any = dataset.user_merchant_any.replace(np.nan, 0)
    dataset.user_merchant_received = dataset.user_merchant_received.replace(
        np.nan, 0)
    dataset['is_weekend'] = dataset.day_of_week.apply(
        lambda x: 1 if x in (6, 7) else 0)
    weekday_dummies = pd.get_dummies(dataset.day_of_week)
    weekday_dummies.columns = [
        'weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
    dataset = pd.concat([dataset, weekday_dummies], axis=1)

    # 如果是训练集要记得处理label标签值  但是在测试集中不用处理label标签 注意off_train和off_test字段
    if label:
        dataset['label'] = dataset.date.astype(
            'str') + ':' + dataset.date_received.astype('str')
        dataset.label = dataset.label.apply(get_label)
        dataset.drop(['merchant_id', 'day_of_week', 'date', 'date_received',
                     'coupon_count'], axis=1, inplace=True)

    else:
        dataset.drop(['merchant_id', 'day_of_week', 'coupon_count'],
                 axis=1, inplace=True)
        
    # 所有的表都要一起处理null
    dataset = dataset.replace('null', np.nan)

    return dataset

In [17]:
GenerateData1 = GenerateData(dataset1, feature1)
GenerateData2 = GenerateData(dataset2, feature2)
GenerateData3 = GenerateData(dataset3, feature3, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http:/

## 保存处理好的特征值, 以便后续使用

In [18]:
GenerateData1.to_csv('./GenerateData1.csv', index=None)
GenerateData2.to_csv('./GenerateData2.csv', index=None)
GenerateData3.to_csv('./GenerateData3.csv', index=None)