In [1]:
import datetime as dt
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Display settings: allow up to 100 columns and do not wrap wide frames across several blocks.
# The full option names are spelled out so the lookup does not depend on pandas'
# partial-name matching, which raises if a prefix ever becomes ambiguous.
pd.set_option("display.max_columns", 100)
pd.set_option("display.expand_frame_repr", False)

In [3]:
# Locations of the three input CSV files: offline train, offline test, online train.
off_train_file=r"D:\dataset\o2o\ccf_offline_stage1_train.csv"
off_test_file=r"D:\dataset\o2o\ccf_offline_stage1_test_revised.csv"
on_train_file=r"D:\dataset\o2o\ccf_online_stage1_train.csv"

# Read each file into its own DataFrame (same order as the paths above).
off_train, off_test, on_train = [
    pd.read_csv(csv_path)
    for csv_path in (off_train_file, off_test_file, on_train_file)
]

In [4]:
# Inspect the data: first rows of the offline training set
off_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,
2,1439408,2632,8591.0,20:1,0.0,20160217.0,
3,1439408,2632,1078.0,20:1,0.0,20160319.0,
4,1439408,2632,8591.0,20:1,0.0,20160613.0,


In [5]:
# Column dtypes and memory usage of the offline training set
off_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
 #   Column         Dtype  
---  ------         -----  
 0   User_id        int64  
 1   Merchant_id    int64  
 2   Coupon_id      float64
 3   Discount_rate  object 
 4   Distance       float64
 5   Date_received  float64
 6   Date           float64
dtypes: float64(4), int64(2), object(1)
memory usage: 93.7+ MB


In [6]:
# First rows of the online training set
on_train.head()

Unnamed: 0,User_id,Merchant_id,Action,Coupon_id,Discount_rate,Date_received,Date
0,13740231,18907,2,100017492.0,500:50,20160513.0,
1,13740231,34805,1,,,,20160321.0
2,14336199,18907,0,,,,20160618.0
3,14336199,18907,0,,,,20160618.0
4,14336199,18907,0,,,,20160618.0


In [7]:
# Column dtypes and memory usage of the online training set
on_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11429826 entries, 0 to 11429825
Data columns (total 7 columns):
 #   Column         Dtype  
---  ------         -----  
 0   User_id        int64  
 1   Merchant_id    int64  
 2   Action         int64  
 3   Coupon_id      object 
 4   Discount_rate  object 
 5   Date_received  float64
 6   Date           float64
dtypes: float64(2), int64(3), object(2)
memory usage: 610.4+ MB


In [8]:
# First rows of the offline test set
off_test.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,4129537,450,9983,30:5,1.0,20160712
1,6949378,1300,3429,30:5,,20160706
2,2166529,7113,6928,200:20,5.0,20160727
3,2166529,7113,1808,100:10,5.0,20160727
4,6172162,7605,6500,30:1,2.0,20160708


In [9]:
# Column dtypes, non-null counts and memory usage of the offline test set
off_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113640 entries, 0 to 113639
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   User_id        113640 non-null  int64  
 1   Merchant_id    113640 non-null  int64  
 2   Coupon_id      113640 non-null  int64  
 3   Discount_rate  113640 non-null  object 
 4   Distance       101576 non-null  float64
 5   Date_received  113640 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 5.2+ MB


In [10]:
# Count missing values per column (offline training set)
off_train.isnull().sum()

User_id               0
Merchant_id           0
Coupon_id        701602
Discount_rate    701602
Distance         106003
Date_received    701602
Date             977900
dtype: int64

In [11]:
# Count missing values per column (online training set)
on_train.isnull().sum()

User_id                 0
Merchant_id             0
Action                  0
Coupon_id        10557469
Discount_rate    10557469
Date_received    10557469
Date               655898
dtype: int64

In [12]:
# Count missing values per column (offline test set)
off_test.isnull().sum()

User_id              0
Merchant_id          0
Coupon_id            0
Discount_rate        0
Distance         12064
Date_received        0
dtype: int64

In [13]:
# Inspect the data boundaries
# Coupon receive date: earliest and latest value in each data set.
# Date_received is a numeric column whose missing entries are NaN, so they are
# dropped explicitly; comparing the column with the string "null" never filters anything.
for frame in (off_train, on_train, off_test):
    received = frame["Date_received"].dropna()
    display(received.min())
    display(received.max())

20160101.0

20160615.0

20160101.0

20160615.0

20160701

20160731

In [14]:
# Consumption date: earliest and latest value in each training set.
# Date is a numeric column whose missing entries are NaN, so they are dropped
# explicitly; comparing the column with the string "null" never filters anything.
for frame in (off_train, on_train):
    used = frame["Date"].dropna()
    display(used.min())
    display(used.max())

20160101.0

20160630.0

20160101.0

20160630.0

In [15]:
# Relationship between the training sets and the test set:
# how much the User_id values of the online / offline training sets overlap with the test set.
# Selecting with a list (frame[["User_id"]]) keeps a one-column DataFrame instead of a
# Series, which is convenient for the merge performed later.
off_train_user = off_train[["User_id"]].drop_duplicates().copy()
off_test_user = off_test[["User_id"]].drop_duplicates().copy()
on_train_user = on_train[["User_id"]].drop_duplicates().copy()

In [16]:
# Number of distinct User_id values: offline train, online train, offline test
for user_frame in (off_train_user, on_train_user, off_test_user):
    display(len(user_frame))

539438

762858

76309

In [17]:
# Left-join the test users onto each training user list; the share of test users
# whose flag is 1 after the join is the overlap between the two sets.
off_train_user["off_train_flag"] = 1
off_merge = (
    off_test_user
    .merge(off_train_user, on="User_id", how="left")
    .reset_index()
    .fillna(0)  # test users absent from the training set get flag 0
)
off_flag = off_merge["off_train_flag"]
display(off_flag.sum() / off_flag.count())

on_train_user["on_train_flag"] = 1
on_merge = (
    off_test_user
    .merge(on_train_user, on="User_id", how="left")
    .reset_index()
    .fillna(0)
)
on_flag = on_merge["on_train_flag"]
display(on_flag.sum() / on_flag.count())

0.9999737907717308

0.5655296229802513

In [18]:
# How much the Merchant_id values of the online / offline training sets overlap with the test set
off_train_merchant = off_train[["Merchant_id"]].drop_duplicates().copy()
off_test_merchant = off_test[["Merchant_id"]].drop_duplicates().copy()
on_train_merchant = on_train[["Merchant_id"]].drop_duplicates().copy()

In [19]:
# Number of distinct Merchant_id values: offline train, online train, offline test
for merchant_frame in (off_train_merchant, on_train_merchant, off_test_merchant):
    display(len(merchant_frame))

8415

7999

1559

In [20]:
# Left-join the test merchants onto each training merchant list to see how many can be matched.
# The constant column "off_train_flag" marks rows that come from off_train_merchant.
off_train_merchant["off_train_flag"] = 1
off_merge = (
    off_test_merchant
    .merge(off_train_merchant, on="Merchant_id", how="left")
    .reset_index()
    .fillna(0)  # test merchants absent from the training set get flag 0
)
off_flag = off_merge["off_train_flag"]
display(off_flag.sum() / off_flag.count())

on_train_merchant["on_train_flag"] = 1
on_merge = (
    off_test_merchant
    .merge(on_train_merchant, on="Merchant_id", how="left")
    .reset_index()
    .fillna(0)
)
on_flag = on_merge["on_train_flag"]
display(on_flag.sum() / on_flag.count())

0.9993585631815266

0.0

In [21]:
# Feature engineering

In [22]:
# Utility helpers
# Compute the discount rate so that "full:reduction" coupons and plain rates share one scale
def get_discount_rate(s):
    """Convert a raw discount description into a single discount rate.

    Args:
        s: Raw value; it is converted with ``str`` before parsing. Expected
            forms are a plain rate such as ``"0.95"``, a ``"full:reduction"``
            pair such as ``"150:20"``, or a missing marker (``"null"`` or NaN).

    Returns:
        ``-1`` for a missing value, the rate as ``float`` for a plain rate,
        and ``1 - reduction / full`` rounded to 3 decimals for a pair.
    """
    text = str(s)
    # Empty CSV cells are loaded as NaN, whose str() is "nan"; without this
    # check they would come back as float("nan") instead of the -1 marker.
    if text in ('null', 'nan'):
        return -1
    parts = text.split(':')
    if len(parts) == 1:
        return float(parts[0])
    return round((1.0 - float(parts[1]) / float(parts[0])), 3)

# Whether the coupon is a full-reduction promotion
def get_if_fd(s):
    """Return 1 when the value has the ``"full:reduction"`` form, otherwise 0.

    Args:
        s: Raw discount value; converted with ``str`` before the check, so
            NaN and plain rates (no ``:``) both yield 0.
    """
    # The ":" separator is written literally; the name previously used for it
    # is not defined anywhere in the notebook.
    return 0 if len(str(s).split(':')) == 1 else 1
        
# Threshold amount of a full-reduction coupon
def get_full_value(s):
    """Return the "full" part of a ``"full:reduction"`` value.

    Args:
        s: Raw discount value; converted with ``str`` before parsing.

    Returns:
        The part before ``:`` as ``int``, or ``np.nan`` when the value
        contains no ``:`` (plain rate or missing value).
    """
    # The ":" separator is written literally; the name previously used for it
    # is not defined anywhere in the notebook.
    parts = str(s).split(':')
    if len(parts) == 1:
        return np.nan
    return int(parts[0])
        
# Reduction amount of a full-reduction coupon
def get_reduction_value(s):
    """Return the "reduction" part of a ``"full:reduction"`` value.

    Args:
        s: Raw discount value; converted with ``str`` before parsing.

    Returns:
        The part after ``:`` as ``int``, or ``np.nan`` when the value
        contains no ``:`` (plain rate or missing value).
    """
    # The ":" separator is written literally; the name previously used for it
    # is not defined anywhere in the notebook.
    parts = str(s).split(':')
    if len(parts) == 1:
        return np.nan
    return int(parts[1])

# Day gap between consumption and coupon receipt; the input is "Date:Date_received"
def get_day_gap(s):
    """Return the number of days from the receive date to the consumption date.

    Args:
        s: ``"<date>:<date_received>"`` as built by ``add_day_gap``. Each part
            starts with ``yyyymmdd`` digits (a trailing ``".0"`` from a float
            column is ignored) or is a missing marker (``"null"`` / ``"nan"``).

    Returns:
        ``date - date_received`` in days, or ``-1`` when either part is missing.
    """
    missing = ('null', 'nan')  # "nan" is what astype('str') produces for NaN
    parts = s.split(':')
    used, received = parts[0], parts[1]
    if used in missing or received in missing:
        return -1
    used_day = date(int(used[0:4]), int(used[4:6]), int(used[6:8]))
    received_day = date(int(received[0:4]), int(received[4:6]), int(received[6:8]))
    return (used_day - received_day).days


# Build the label; the input is "Date:Date_received"
def get_label(s):
    """Label one record from its consumption date and coupon receive date.

    Args:
        s: ``"<date>:<date_received>"`` as built by ``add_label``. Each part
            starts with ``yyyymmdd`` digits (a trailing ``".0"`` from a float
            column is ignored) or is a missing marker (``"null"`` / ``"nan"``).

    Returns:
        ``0`` when there is no consumption date; otherwise ``-1`` when there
        is no receive date; otherwise ``1`` if the consumption happened at
        most 15 days after receipt and ``0`` if later.
    """
    missing = ('null', 'nan')  # "nan" is what astype('str') produces for NaN
    parts = s.split(':')
    used, received = parts[0], parts[1]
    if used in missing:
        return 0
    if received in missing:
        return -1
    used_day = date(int(used[0:4]), int(used[4:6]), int(used[6:8]))
    received_day = date(int(received[0:4]), int(received[4:6]), int(received[6:8]))
    return 1 if (used_day - received_day).days <= 15 else 0

# Add the discount-related feature columns
def add_discount(df):
    """Derive the discount features on ``df`` in place and return ``df``.

    Reads the lowercase columns ``discount_rate`` and ``distance``. Adds
    ``if_fd``, ``full_value`` and ``reduction_value`` from the raw discount
    text, then overwrites ``discount_rate`` with the numeric rate and replaces
    the string ``'null'`` in ``distance`` with NaN.
    """
    raw_rate = df['discount_rate']  # raw text, captured before it is overwritten below
    df['if_fd'] = raw_rate.apply(get_if_fd)
    df['full_value'] = raw_rate.apply(get_full_value)
    df['reduction_value'] = raw_rate.apply(get_reduction_value)
    df['discount_rate'] = raw_rate.apply(get_discount_rate)
    df['distance'] = df['distance'].replace('null', np.nan)
    return df

# Compute the day gap between consumption and coupon receipt
def add_day_gap(df):
    """Add a ``day_gap`` column to ``df`` in place and return ``df``.

    The lowercase columns ``date`` and ``date_received`` are converted to
    strings, joined as ``"<date>:<date_received>"`` and passed row by row to
    ``get_day_gap``.
    """
    paired = df['date'].astype('str') + ':' + df['date_received'].astype('str')
    df['day_gap'] = paired
    df['day_gap'] = df['day_gap'].apply(get_day_gap)
    return df

# Attach the label column
def add_label(df):
    """Add a ``label`` column to ``df`` in place and return ``df``.

    The lowercase columns ``date`` and ``date_received`` are converted to
    strings, joined as ``"<date>:<date_received>"`` and passed row by row to
    ``get_label``.
    """
    paired = df['date'].astype('str') + ':' + df['date_received'].astype('str')
    df['label'] = paired
    df['label'] = df['label'].apply(get_label)
    return df

def is_firstlastone(x):
    """Map a difference to a flag: 1 for exactly 0, 0 for positive, NaN otherwise.

    Negative values and NaN both fall through to the NaN result.
    """
    if x > 0:
        return 0
    if x == 0:
        return 1
    return np.nan

def get_day_gap_before(s):
    """Days since the closest earlier receipt among a list of receive dates.

    Args:
        s: ``"<date_received>-<d1>:<d2>:..."``. Every date starts with
            ``yyyymmdd`` digits (a trailing ``".0"`` is ignored); the markers
            ``"nan"`` / ``"null"`` are treated as missing.

    Returns:
        The smallest strictly positive ``date_received - d`` in days, or
        ``np.nan`` when no listed date is earlier than ``date_received`` or
        ``date_received`` itself is missing.
    """
    missing = ('nan', 'null')
    date_received, dates = s.split('-')
    if date_received in missing:
        return np.nan
    # Parsed once; it does not change inside the loop.
    received_day = dt.date(int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))
    gaps = []
    for d in dates.split(':'):
        if d in missing:
            continue
        # Convert the difference to a number of days
        this_gap = (received_day - dt.date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    return min(gaps) if gaps else np.nan
    
def get_day_gap_after(s):
    """Days until the closest later receipt among a list of receive dates.

    Args:
        s: ``"<date_received>-<d1>:<d2>:..."``. Every date starts with
            ``yyyymmdd`` digits (a trailing ``".0"`` is ignored); the markers
            ``"nan"`` / ``"null"`` are treated as missing.

    Returns:
        The smallest strictly positive ``d - date_received`` in days, or
        ``np.nan`` when no listed date is later than ``date_received`` or
        ``date_received`` itself is missing.
    """
    missing = ('nan', 'null')
    date_received, dates = s.split('-')
    if date_received in missing:
        return np.nan
    # Parsed once; it does not change inside the loop.
    received_day = dt.date(int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))
    gaps = []
    for d in dates.split(':'):
        if d in missing:
            continue
        this_gap = (dt.date(int(d[0:4]), int(d[4:6]), int(d[6:8])) - received_day).days
        if this_gap > 0:
            gaps.append(this_gap)
    return min(gaps) if gaps else np.nan

In [23]:
# Utility function (computes group statistics)
# df:          DataFrame that receives the new features
# df_group:    DataFrame the statistics are computed from
# group_cols:  column(s) used for the group-by
# value_col:   column being aggregated
# agg_ops:     list of aggregations, e.g. count, mean, sum, std, max, min, nunique
# col_names:   names of the new feature columns, one per entry of agg_ops
def add_agg_feature_names(df,df_group,group_cols,value_col,agg_ops,col_names):
    """Aggregate ``value_col`` of ``df_group`` per group and left-join the result onto ``df``.

    ``value_col`` is cast to float on an internal copy, so ``df_group`` itself
    is left untouched. ``group_cols`` may be a list of column names or a
    single name.

    Returns:
        A new DataFrame: ``df`` plus the columns named in ``col_names``
        (NaN for groups that do not occur in ``df_group``).
    """
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)
    # Work on a copy so the caller's frame is neither mutated nor written through a slice.
    values = df_group[keys + [value_col]].copy()
    values[value_col] = values[value_col].astype("float")
    df_agg = values.groupby(keys)[value_col].agg(agg_ops).reset_index()
    df_agg.columns = keys + list(col_names)
    return df.merge(df_agg, on=keys, how="left")

# Statistical feature helper
# Column names are generated automatically as keyword + "_" + value_col + "_" + op
def add_agg_feature(df,df_group,group_cols,value_col,agg_ops,keyword):
    """Call ``add_agg_feature_names`` with auto-generated feature names.

    One name is built per entry of ``agg_ops`` by joining ``keyword``,
    ``value_col`` and the operation name with underscores.
    """
    col_names = ["_".join([keyword, value_col, op]) for op in agg_ops]
    return add_agg_feature_names(df, df_group, group_cols, value_col, agg_ops, col_names)

# Count features are very common, so they get a dedicated helper
def add_count_new_feature(df,df_group,group_cols,new_feature_name):
    """Count the rows of ``df_group`` per group and left-join the count onto ``df``.

    Args:
        df: DataFrame that receives the new column.
        df_group: DataFrame whose rows are counted; it is not modified.
        group_cols: Column name or list of column names to group by.
        new_feature_name: Name of the count column.

    Returns:
        A new DataFrame: ``df`` plus ``new_feature_name`` (NaN for groups that
        do not occur in ``df_group``).
    """
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)
    # size() counts rows per group without adding a helper column to the
    # caller's frame and without summing unrelated columns it may carry.
    counts = df_group.groupby(keys).size().reset_index(name=new_feature_name)
    return df.merge(counts, on=keys, how="left")

In [24]:
# Feature groups (feature construction)
# Merchant feature group
def get_merchant_feature(feature):
    """Build one row of statistics per merchant.

    Args:
        feature: DataFrame with the columns ``Merchant_id``, ``Coupon_id``,
            ``Distance``, ``Date_received`` and ``Date``. A value counts as
            missing when it is NaN or the string ``"null"``.

    Returns:
        DataFrame keyed by ``Merchant_id`` with ``total_sales``,
        ``sales_use_coupon`` (missing filled with 0), ``total_coupon``
        (missing filled with 0 after the rates are computed),
        ``merchant_Distance_min/max/mean/median``,
        ``merchant_coupon_transfer_rate`` and ``coupon_rate``.
    """
    def has_value(col):
        # read_csv loads empty cells as NaN, so `!= "null"` alone never filters
        # anything; the string marker is still honoured for frames that use it.
        return col.notna() & (col != "null")

    merchant = feature[['Merchant_id','Coupon_id','Distance','Date_received','Date']].copy()
    bought = has_value(merchant.Date)
    got_coupon = has_value(merchant.Coupon_id)

    # One row per merchant; every feature below is left-joined onto it.
    t = merchant[['Merchant_id']].drop_duplicates()

    # Number of purchases at the merchant
    t1 = merchant.loc[bought, ["Merchant_id"]].copy()
    merchant_feature = add_count_new_feature(t, t1, "Merchant_id", "total_sales")

    # Number of purchases made with a coupon (positive samples)
    t2 = merchant.loc[bought & got_coupon, ["Merchant_id"]].copy()
    merchant_feature = add_count_new_feature(merchant_feature, t2, "Merchant_id", "sales_use_coupon")

    # Number of coupons issued by the merchant
    t3 = merchant.loc[got_coupon, ["Merchant_id"]].copy()
    merchant_feature = add_count_new_feature(merchant_feature, t3, "Merchant_id", "total_coupon")

    # User distance over coupon purchases: min, max, mean, median.
    # Rows without a distance are excluded; casting NaN to int would raise.
    t4 = merchant.loc[bought & got_coupon & has_value(merchant.Distance), ["Merchant_id","Distance"]].copy()
    t4["Distance"] = t4["Distance"].astype("int")
    merchant_feature = add_agg_feature(merchant_feature, t4, ["Merchant_id"], "Distance", ["min","max","mean","median"], "merchant")

    # Merchants without any coupon purchase get 0 instead of NaN
    merchant_feature["sales_use_coupon"] = merchant_feature["sales_use_coupon"].fillna(0)

    # Share of issued coupons that were used
    merchant_feature["merchant_coupon_transfer_rate"] = merchant_feature.sales_use_coupon.astype("float") / merchant_feature.total_coupon

    # Share of purchases that used a coupon
    merchant_feature["coupon_rate"] = merchant_feature.sales_use_coupon.astype("float") / merchant_feature.total_sales

    # Filled only after the rates above, so a merchant that issued no coupon keeps a NaN rate
    merchant_feature["total_coupon"] = merchant_feature["total_coupon"].fillna(0)

    return merchant_feature


In [25]:
# User feature group
def get_user_feature(feature):
    """Build one row of statistics per user.

    Args:
        feature: DataFrame with the columns ``User_id``, ``Merchant_id``,
            ``Coupon_id``, ``Discount_rate``, ``Distance``, ``Date_received``
            and ``Date``. A value counts as missing when it is NaN or the
            string ``"null"``.

    Returns:
        DataFrame keyed by ``User_id`` with ``count_merchant``,
        ``user_Distance_min/max/mean/median``, ``buy_use_coupon``,
        ``buy_total``, ``coupon_received``,
        ``user_day_gap_min/max/mean/median``, ``buy_use_coupon_rate`` and
        ``user_coupon_transfer_rate``. The four count columns have missing
        values filled with 0; ``buy_total`` and ``coupon_received`` are filled
        only after the rates are computed.
    """
    def has_value(col):
        # read_csv loads empty cells as NaN, so `!= "null"` alone never filters
        # anything; the string marker is still honoured for frames that use it.
        return col.notna() & (col != "null")

    user = feature[['User_id','Merchant_id','Coupon_id','Discount_rate','Distance','Date_received','Date']].copy()
    bought = has_value(user.Date)
    got_coupon = has_value(user.Coupon_id)
    used_coupon = bought & got_coupon

    # One row per user; every feature below is left-joined onto it.
    t = user[['User_id']].drop_duplicates()

    # Number of distinct merchants the user bought from
    t1 = user.loc[bought, ["User_id","Merchant_id"]].drop_duplicates()[["User_id"]].copy()
    user_feature = add_count_new_feature(t, t1, "User_id", "count_merchant")

    # Merchant distance over the user's coupon purchases: min, max, mean, median.
    # Rows without a distance are excluded; casting NaN to int would raise.
    t2 = user.loc[used_coupon & has_value(user.Distance), ["User_id","Distance"]].copy()
    t2["Distance"] = t2["Distance"].astype('int')
    user_feature = add_agg_feature(user_feature, t2, ["User_id"], "Distance", ["min","max",'mean',"median"], "user")

    # Number of purchases made with a coupon
    t7 = user.loc[used_coupon, ["User_id"]].copy()
    user_feature = add_count_new_feature(user_feature, t7, "User_id", "buy_use_coupon")

    # Total number of purchases
    t8 = user.loc[bought, ["User_id"]].copy()
    user_feature = add_count_new_feature(user_feature, t8, "User_id", "buy_total")

    # Total number of coupons received
    t9 = user.loc[got_coupon, ["User_id"]].copy()
    user_feature = add_count_new_feature(user_feature, t9, "User_id", "coupon_received")

    # Days between receiving and using a coupon: min, max, mean, median.
    # add_day_gap reads the lowercase columns `date` / `date_received`, so the
    # two date columns are renamed on the way in.
    t10 = user.loc[has_value(user.Date_received) & bought, ["User_id","Date_received","Date"]].copy()
    t10 = add_day_gap(t10.rename(columns={"Date_received": "date_received", "Date": "date"}))
    t10 = t10[["User_id","day_gap"]].copy()
    user_feature = add_agg_feature(user_feature, t10, ["User_id"], "day_gap", ["min",'max',"mean","median"], "user")

    # Users without any purchase / coupon purchase get 0 instead of NaN
    user_feature["count_merchant"] = user_feature["count_merchant"].fillna(0)
    user_feature["buy_use_coupon"] = user_feature["buy_use_coupon"].fillna(0)

    # Share of the user's purchases that used a coupon
    user_feature["buy_use_coupon_rate"] = user_feature.buy_use_coupon.astype("float") / user_feature.buy_total.astype("float")

    # Share of the user's received coupons that were used
    user_feature["user_coupon_transfer_rate"] = user_feature.buy_use_coupon.astype("float") / user_feature.coupon_received.astype("float")

    # Filled only after the rates above, so a zero denominator shows up as NaN in the rates
    user_feature["buy_total"] = user_feature["buy_total"].fillna(0)
    user_feature["coupon_received"] = user_feature["coupon_received"].fillna(0)
    return user_feature

In [26]:
# User-merchant interaction feature group
def get_user_merchant_feature(feature):
    """Build one row of statistics per (user, merchant) pair.

    Args:
        feature: DataFrame with the columns ``User_id``, ``Merchant_id``,
            ``Coupon_id``, ``Date_received`` and ``Date``. A value counts as
            missing when it is NaN or the string ``"null"``.

    Returns:
        DataFrame keyed by ``User_id`` and ``Merchant_id`` with
        ``user_merchant_buy_total``, ``user_merchant_received``,
        ``user_merchant_buy_use_coupon`` (missing filled with 0),
        ``user_merchant_any``, ``user_merchant_buy_common`` (missing filled
        with 0), ``user_merchant_coupon_transfer_rate``,
        ``user_merchant_coupon_buy_rate``, ``user_merchant_rate`` and
        ``user_merchant_common_buy_rate``.
    """
    def has_value(col):
        # read_csv loads empty cells as NaN, so `!= "null"` alone never filters
        # anything; the string marker is still honoured for frames that use it.
        return col.notna() & (col != "null")

    pair = ["User_id", "Merchant_id"]
    bought = has_value(feature["Date"])
    got_coupon = has_value(feature["Coupon_id"])
    received = has_value(feature["Date_received"])

    # One row per (user, merchant); every feature below is left-joined onto it.
    t = feature[pair].drop_duplicates()

    # Number of purchases of the user at the merchant.
    # The column name is all lowercase because the ratios below read it under that name.
    t0 = feature.loc[bought, pair].copy()
    user_merchant = add_count_new_feature(t, t0, pair, "user_merchant_buy_total")

    # Number of coupons the user received from the merchant
    t1 = feature.loc[got_coupon, pair].copy()
    user_merchant = add_count_new_feature(user_merchant, t1, pair, 'user_merchant_received')

    # Number of purchases made with a coupon (both a consumption and a receive date)
    t2 = feature.loc[bought & received, pair].copy()
    user_merchant = add_count_new_feature(user_merchant, t2, pair, 'user_merchant_buy_use_coupon')

    # Number of records of any kind for the pair ("visits")
    t3 = feature[pair].copy()
    user_merchant = add_count_new_feature(user_merchant, t3, pair, 'user_merchant_any')

    # Number of purchases made without a coupon
    t4 = feature.loc[bought & ~got_coupon, pair].copy()
    user_merchant = add_count_new_feature(user_merchant, t4, pair, 'user_merchant_buy_common')

    # Pairs without such purchases get 0 instead of NaN
    user_merchant["user_merchant_buy_use_coupon"] = user_merchant["user_merchant_buy_use_coupon"].fillna(0)
    user_merchant["user_merchant_buy_common"] = user_merchant["user_merchant_buy_common"].fillna(0)

    # Share of the merchant's coupons received by the user that were used
    user_merchant['user_merchant_coupon_transfer_rate'] = user_merchant.user_merchant_buy_use_coupon.astype('float') / user_merchant.user_merchant_received.astype('float')
    # Share of the user's purchases at the merchant that used a coupon
    user_merchant['user_merchant_coupon_buy_rate'] = user_merchant.user_merchant_buy_use_coupon.astype('float') / user_merchant.user_merchant_buy_total.astype('float')
    # Share of the pair's records that are purchases
    user_merchant['user_merchant_rate'] = user_merchant.user_merchant_buy_total.astype('float') / user_merchant.user_merchant_any.astype('float')
    # Share of the user's purchases at the merchant made without a coupon
    user_merchant['user_merchant_common_buy_rate'] = user_merchant.user_merchant_buy_common.astype('float') / user_merchant.user_merchant_buy_total.astype('float')
    return user_merchant


In [1]:
# Extract the leakage feature group (statistics taken from the data set being labelled itself)
def get_leakage_feature(dataset):
    """Build receive-pattern features per ``(user_id, coupon_id, date_received)``.

    Args:
        dataset: DataFrame with the lowercase columns ``user_id``,
            ``coupon_id`` and ``date_received``. ``date_received`` must be
            convertible to ``int64`` (``yyyymmdd``), so it may be an int or
            float column but must not contain missing values.

    Returns:
        DataFrame with the key columns ``user_id``, ``coupon_id``,
        ``date_received`` and the features
        ``this_month_user_receive_same_coupn_count``,
        ``this_month_user_receive_all_coupon_count``,
        ``this_month_user_receive_same_coupon_lastone``,
        ``this_month_user_receive_same_coupon_firstone``,
        ``this_day_receive_all_coupon_count``,
        ``this_day_user_receive_same_coupon_count``,
        ``day_gap_before`` and ``day_gap_after``.
    """
    keys = ['user_id', 'coupon_id']
    records = dataset[['user_id', 'coupon_id', 'date_received']].copy()

    # Number of coupons the user received in this data set
    t = records[['user_id']].copy()
    t['this_month_user_receive_all_coupon_count'] = 1
    t = t.groupby('user_id').agg('sum').reset_index()

    # Number of times the user received this particular coupon.
    # The column name (including its "coupn" spelling) is kept unchanged.
    t1 = records[keys].copy()
    t1['this_month_user_receive_same_coupn_count'] = 1
    t1 = t1.groupby(keys).agg('sum').reset_index()

    # Latest and earliest receive date per (user, coupon), kept only when the
    # coupon was received more than once. Dates are aggregated as integers:
    # a float column stringifies to "20160528.0", which int() cannot parse.
    t2 = records[keys].assign(date_received=records['date_received'].astype('int64'))
    t2 = t2.groupby(keys)['date_received'].agg(['count', 'max', 'min']).reset_index()
    t2.columns = keys + ['receive_number', 'max_date_received', 'min_date_received']
    t2 = t2[t2.receive_number > 1]
    t2 = t2[keys + ['max_date_received', 'min_date_received']]

    # Left join keeps every record; coupons received only once get NaN bounds.
    t3 = pd.merge(records, t2, on=keys, how='left')
    received_int = t3.date_received.astype('int64')
    # 1 when this record is the latest receipt of the coupon, 0 when a later one exists
    t3['this_month_user_receive_same_coupon_lastone'] = (t3.max_date_received - received_int).apply(is_firstlastone)
    # 1 when this record is the earliest receipt, 0 when an earlier one exists.
    # Computed from its own difference rather than from the already-flagged lastone column.
    t3['this_month_user_receive_same_coupon_firstone'] = (received_int - t3.min_date_received).apply(is_firstlastone)
    t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]

    # Number of coupons the user received on the same day
    t4 = records[['user_id','date_received']].copy()
    t4['this_day_receive_all_coupon_count'] = 1
    t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()

    # Number of times the user received this coupon on the same day
    t5 = records[['user_id','coupon_id','date_received']].copy()
    t5['this_day_user_receive_same_coupon_count'] = 1
    t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()

    # All receive dates of a (user, coupon), joined with ':' as "yyyymmdd:yyyymmdd:..."
    t6 = records[keys].assign(dates=records['date_received'].astype('int64').astype('str'))
    t6 = t6.groupby(keys)['dates'].agg(lambda x: ':'.join(x)).reset_index()

    # Days to the closest earlier / later receipt of the same coupon
    t7 = pd.merge(records, t6, on=keys, how='left')
    t7['date_received_date'] = t7.date_received.astype('int64').astype('str') + '-' + t7.dates
    t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
    t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
    t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]

    other_feature = pd.merge(t1, t, on='user_id')
    other_feature = pd.merge(other_feature, t3, on=['user_id','coupon_id'])
    other_feature = pd.merge(other_feature, t4, on=['user_id','date_received'])
    other_feature = pd.merge(other_feature, t5, on=['user_id','coupon_id','date_received'])
    other_feature = pd.merge(other_feature, t7, on=['user_id','coupon_id','date_received'])
    return other_feature

In [2]:
######################### Feature set versions #############################
# Feature set 1: only the most basic features
def f1(dataset,if_train):
    """Build the basic feature set: discount features and, for training data, the label.

    Args:
        dataset: Records to build features for. Column names may be
            capitalised as in the CSV headers (``Discount_rate``) or already
            lowercase; they are lowercased on a copy, so the caller's frame is
            not modified.
        if_train: When true, a ``label`` column is added with ``add_label``.

    Returns:
        A new, de-duplicated DataFrame with lowercase column names.
    """
    # The helper functions index lowercase column names.
    to_lower = lambda name: name.lower() if isinstance(name, str) else name
    result = add_discount(dataset.rename(columns=to_lower))
    result = result.drop_duplicates()
    if if_train:
        result = add_label(result)
    return result

# Feature set 2: adds the merchant, user and user-merchant feature groups
def f2(dataset,feature,if_train):
    """Build feature set 2: basic features plus the three statistical feature groups.

    Args:
        dataset: Records to build features for. Column names may be
            capitalised as in the CSV headers or already lowercase; they are
            lowercased on a copy, so the caller's frame is not modified.
        feature: Records the statistics are computed from; passed unchanged to
            ``get_merchant_feature``, ``get_user_feature`` and
            ``get_user_merchant_feature``.
        if_train: When true, a ``label`` column is added with ``add_label``.

    Returns:
        A new, de-duplicated DataFrame with lowercase key columns.
    """
    # The helper functions index lowercase column names.
    to_lower = lambda name: name.lower() if isinstance(name, str) else name
    result = add_discount(dataset.rename(columns=to_lower))

    # The feature groups are keyed by the capitalised CSV names; align the keys
    # with the lowercase result before merging (a no-op if already lowercase).
    merchant_feature = get_merchant_feature(feature).rename(columns={'Merchant_id': 'merchant_id'})
    result = result.merge(merchant_feature, on='merchant_id', how="left")

    user_feature = get_user_feature(feature).rename(columns={'User_id': 'user_id'})
    result = result.merge(user_feature, on='user_id', how="left")

    user_merchant = get_user_merchant_feature(feature).rename(
        columns={'User_id': 'user_id', 'Merchant_id': 'merchant_id'})
    result = result.merge(user_merchant, on=['user_id','merchant_id'], how="left")

    result = result.drop_duplicates()

    if if_train:
        result = add_label(result)

    return result

# Feature set 3: adds the leakage feature group
def f3(dataset,feature,if_train):
    """Build feature set 3: feature set 2 plus the leakage features.

    Args:
        dataset: Records to build features for. Column names may be
            capitalised as in the CSV headers or already lowercase; they are
            lowercased on a copy, so the caller's frame is not modified. The
            same lowercased copy is the input of ``get_leakage_feature``.
        feature: Records the statistics are computed from; passed unchanged to
            ``get_merchant_feature``, ``get_user_feature`` and
            ``get_user_merchant_feature``.
        if_train: When true, a ``label`` column is added with ``add_label``.

    Returns:
        A new, de-duplicated DataFrame with lowercase key columns.
    """
    # The helper functions index lowercase column names.
    to_lower = lambda name: name.lower() if isinstance(name, str) else name
    base = dataset.rename(columns=to_lower)
    result = add_discount(base)

    # The feature groups are keyed by the capitalised CSV names; align the keys
    # with the lowercase result before merging (a no-op if already lowercase).
    merchant_feature = get_merchant_feature(feature).rename(columns={'Merchant_id': 'merchant_id'})
    result = result.merge(merchant_feature, on='merchant_id', how="left")

    user_feature = get_user_feature(feature).rename(columns={'User_id': 'user_id'})
    result = result.merge(user_feature, on='user_id', how="left")

    user_merchant = get_user_merchant_feature(feature).rename(
        columns={'User_id': 'user_id', 'Merchant_id': 'merchant_id'})
    result = result.merge(user_merchant, on=['user_id','merchant_id'], how="left")

    leakage_feature = get_leakage_feature(base)
    result = result.merge(leakage_feature, on=['user_id','coupon_id','date_received'], how='left')

    # The leakage table can repeat a key, so the merge may duplicate rows; drop them here.
    result = result.drop_duplicates()
    if if_train:
        result = add_label(result)

    return result
    