 # 优惠券预测
 
## 选取特征
 - 选取Distance作为特征，空值填充为1
 - 选取discount_rate数据并且对其进行one-hot处理
 - 添加一列特征，表示每张优惠券领取日期对应的星期几
 - 添加每个商铺被领取的优惠券数目
## 使用算法
 
 - 逻辑回归（无任何参数设置）
 
## 实验测试集AUC
 

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# 读入数据

In [2]:
# Load the three competition CSV files. The sample submission is read with
# header=None, i.e. its first line is treated as data rather than column names.
off_train = pd.read_csv(
    "ccf_offline_stage1_train_have_coupon.csv",
    encoding="utf-8",
)
off_test = pd.read_csv("ccf_offline_stage1_test_revised.csv")
sample = pd.read_csv(
    "sample_submission.csv",
    encoding="utf-8",
    header=None,
)

# 数据预处理

Distance选取为特征

In [3]:
# Missing Distance values are replaced by 1 in both frames so the column can
# be used directly as a numeric feature.
distance_fill = {'Distance': 1}
off_train = off_train.fillna(value=distance_fill)  # training set
off_test = off_test.fillna(value=distance_fill)    # test set

添加星期数作为特征

In [4]:
# Add a "Week" feature: the weekday (Monday=0 ... Sunday=6) on which the
# coupon was received. Date_received holds YYYYMMDD numbers, so it is cast to
# int (dropping any ".0" from a float column), then to str, and parsed with an
# explicit format instead of relying on per-value format inference.

# Training set
off_train["Week"] = pd.to_datetime(
    off_train["Date_received"].astype("int").astype("str"),
    format="%Y%m%d",
).dt.weekday

# Test set
off_test["Week"] = pd.to_datetime(
    off_test["Date_received"].astype("int").astype("str"),
    format="%Y%m%d",
).dt.weekday

判断使用优惠券日期与领取优惠券日期之差是否不超过15天：不超过15天取1，超过15天（或任一日期缺失）取0

In [5]:
import datetime

def days(z):
    """Flag whether a coupon was used within 15 days of being received.

    ``z`` is a row-like mapping with ``Date_received`` and ``Date`` entries
    holding YYYYMMDD numbers (or NaN when missing).

    Returns 1 when the gap between ``Date`` and ``Date_received`` is at most
    15 days, and 0 when the gap is larger or either value is missing.
    """
    # A missing value stringifies to 'nan'; such rows can never qualify.
    if str(z['Date_received']) == 'nan' or str(z['Date']) == 'nan':
        return 0

    strptime = datetime.datetime.strptime
    used_on = strptime(str(int(z['Date'])), "%Y%m%d")
    received_on = strptime(str(int(z['Date_received'])), "%Y%m%d")
    return int((used_on - received_on).days <= 15)

# Flag every training row with days(): 1 when the coupon was used within
# 15 days of being received, otherwise 0.
off_train['less_15'] = off_train.apply(days, axis=1)

# Users and coupons are many-to-many, so the per-row flags are summed per
# user. The rows must NOT be de-duplicated on User_id before summing: that
# would leave a single row per user and reduce the "sum" to one flag.
# NOTE(review): the aggregate is built from the same rows the model is trained
# on, so it may leak label information into the feature - confirm intended.
train = off_train.groupby('User_id')['less_15'].sum().reset_index()

# Drop the row-level flag so the per-user total can be merged back under the
# same column name.
off_train = off_train.drop(columns='less_15')

# Attach the per-user total to both the training and the test set.
off_train = pd.merge(off_train, train, on='User_id', how='left')
off_test = pd.merge(off_test, train, on='User_id', how='left')

# Test-set users that never appear in the training set have no total after
# the left merge; treat them as 0.
off_test.less_15 = off_test.less_15.fillna(0)

用户优惠券使用数目

In [6]:
# Per-user number of non-null Date values, exposed as Consume_count.
# Only User_id and Date are selected, so the 'Coupon_id' entry in the rename
# mapping matches no column and has no effect.
User_Coupon_notna = (
    off_train[['User_id', 'Date']]
    .groupby('User_id')
    .count()
    .reset_index()
    .rename(columns={'Coupon_id': 'Coupon_id_count', 'Date': 'Consume_count'})
)

# Left-merge the count into both frames. Test-set users absent from the
# training set get no value from the merge, so their count is set to 0.
consume_counts = User_Coupon_notna[['User_id', 'Consume_count']]
off_train = off_train.merge(consume_counts, on='User_id', how='left')
off_test = off_test.merge(consume_counts, on='User_id', how='left')
off_test['Consume_count'] = off_test['Consume_count'].fillna(0)

对Discount_rate进行one-hot编码

In [7]:
# One-hot encode Discount_rate in both frames. The column is named explicitly
# so that no other object-typed column is encoded by accident (which would
# also make the single prefix invalid).
off_train = pd.get_dummies(off_train, columns=['Discount_rate'], prefix='Discount_rate')
off_test = pd.get_dummies(off_test, columns=['Discount_rate'], prefix='Discount_rate')

# The two sets do not contain the same discount codes, so each frame can lack
# columns the other has. Both differences are computed before either frame is
# modified, then the missing columns are added as all-zero columns. This also
# copies any other train-only column into the test frame as zeros.
train_only = [col for col in off_train.columns if col not in off_test.columns]
test_only = [col for col in off_test.columns if col not in off_train.columns]

for col in train_only:
    off_test[col] = 0

# Computed from the data instead of hard-coding one column name, so an
# existing training column is never overwritten with zeros.
for col in test_only:
    off_train[col] = 0

筛选训练测试所需要的行和列

In [8]:
# Identifier/date columns that are not used as model inputs.
id_cols = ['Date_received', 'User_id', 'Merchant_id', 'Coupon_id']

# Training features are everything except the identifiers, the consumption
# date and the label; the label is the 'result' column.
x_train = off_train.drop(columns=id_cols + ['Date', 'result'])
y_train = off_train['result']

# Test features: drop the identifiers, then select exactly the training
# feature columns in the same order.
x_test = off_test.drop(columns=id_cols)
x_test = x_test.loc[:, list(x_train.columns)]

# 训练模型

In [9]:
# Fit a logistic regression with the library's default hyper-parameters.
clf = LogisticRegression()
clf.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# 输出数据

In [10]:
# Score the test set and build the output frame: the three identifying
# columns followed by column 1 of predict_proba as 'probability'.
predict = clf.predict_proba(x_test)
save = off_test.loc[:, ['User_id', 'Coupon_id', 'Date_received']]
save.insert(loc=3, column='probability', value=predict[:, 1], allow_duplicates=True)

In [11]:
# Write the result as CSV without the index column.
# The path literal is a placeholder ("set your own output path here") and
# must be replaced with a real file path before running.
save.to_csv(path_or_buf='这里为你需要设置的路径', index=False)