In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score

import os, sys, pickle
import matplotlib.pyplot as plt
from datetime import date
from sklearn.linear_model import SGDClassifier

In [26]:
dfon = pd.read_csv('C:/Users/jxjsj/Desktop/tianchi/o2ocoupon/ccf_online_stage1_train.csv')

In [27]:
dfoff = pd.read_csv('C:/Users/jxjsj/Desktop/tianchi/o2ocoupon/ccf_offline_stage1_train.csv')

In [28]:
dftest = pd.read_csv('C:/Users/jxjsj/Desktop/tianchi/o2ocoupon/ccf_offline_stage1_test_revised.csv')

In [29]:
# 1. 将满xx减yy类型(`xx:yy`)的券变成折扣率 : `1 - yy/xx`，同时建立折扣券相关的特征 `discount_rate, discount_man, discount_jian, discount_type`
# 2. 将距离 `str` 转为 `int`
# convert Discount_rate and Distance
def getDiscountType(row):
    if pd.isnull(row):
        return np.nan
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if pd.isnull(row):
        return 1.0
    elif ':' in str(row):
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in str(row):
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in str(row):
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0
print("tool is ok.")

tool is ok.


In [30]:
def processData(df):
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].apply(convertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    print(df['discount_rate'].unique())
    # convert distance
    df['distance'] = df['Distance'].fillna(-1).astype(int)
    return df

In [31]:
# 特征生成
dfoff = processData(dfoff)
dftest = processData(dftest)

[1.         0.86666667 0.95       0.9        0.83333333 0.8
 0.5        0.85       0.75       0.66666667 0.93333333 0.7
 0.6        0.96666667 0.98       0.99       0.975      0.33333333
 0.2        0.4       ]
[0.83333333 0.9        0.96666667 0.8        0.95       0.75
 0.98       0.5        0.86666667 0.6        0.66666667 0.7
 0.85       0.33333333 0.94       0.93333333 0.975      0.99      ]


In [32]:
# 收到优惠券的日子 去重 排序
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[pd.notnull(date_received)])

In [33]:
# 购买商品的日子 不去重 排序
date_buy = dfoff['Date'].unique()
date_buy = sorted(date_buy[pd.notnull(date_buy)])
date_buy = sorted(dfoff[dfoff['Date'].notnull()]['Date'])

In [34]:
# 对同一天收到的优惠券聚合，计数
couponbydate = dfoff[dfoff['Date_received'].notnull()
                    ][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
couponbydate.columns = ['Date_received','count']

In [35]:
# 对同一天收到的优惠券，且未来被花掉了聚合，计数
buybydate_temp = dfoff[(dfoff['Date'].notnull())]
buybydate = buybydate_temp[(buybydate_temp['Date_received'].notnull())][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received','count']

In [36]:
# date对象的方法weekday()返回0-6表示周一到周日
def getWeekday(row):
    if row == 'nan': # NaN被str()才可被 ==‘nan’识别
        return np.nan
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1

dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)

In [37]:
# weekday_type :  周六和周日为1，其他为0
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
dftest['weekday_type'] = dftest['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )

In [38]:
# change weekday to one-hot encoding 
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
# tmpdf = pd.get_dummies(dfoff['weekday'].replace('nan', np.nan))
tmpdf = pd.get_dummies(dfoff['weekday'])
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

In [39]:
tmpdf = pd.get_dummies(dftest['weekday'])
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

In [40]:
def label(row):
    if pd.isnull(row['Date_received']):
        return -1
    if pd.notnull(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(10, 'D'):
            return 1
    return 0
dfoff['label'] = dfoff.apply(label, axis = 1)

In [48]:
# feature
original_feature_label = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type','label'] + weekdaycols
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols

# data split - train2train&valid
print("-----data split------")
df = dfoff[dfoff['label'] != -1].copy() # 把需要研究的对象0与1挑选出来
train = df[(df['Date_received'] < 20160516)].copy()[original_feature_label]
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()[original_feature_label]

x_train = train[original_feature]
y_train = train['label']
x_valid = valid[original_feature]
y_valid = valid['label']

print("end")

-----data split------
end


In [42]:
# tol sample
x_tol = pd.concat([x_train,x_valid],axis=0)
y_tol = pd.concat([y_train,y_valid],axis=0)

In [50]:
model = MLPClassifier(activation='tanh', 
                    solver='lbfgs',
                    alpha=1e-5,
                    hidden_layer_sizes=(len(x_train.columns)*2+1,),
                    random_state=1, 
                   )
model.fit(x_train,y_train)

MLPClassifier(activation='tanh', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(29,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [52]:
model = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha = 0.01,
    l1_ratio = 0.01,
    n_jobs=1,
    class_weight=None
)
model.fit(x_train, y_train)



SGDClassifier(alpha=0.01, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.01, learning_rate='optimal', loss='log', max_iter=100,
       n_iter=None, n_iter_no_change=5, n_jobs=1, penalty='elasticnet',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [53]:
# #### 预测以及结果评价
# print(model.score(valid[original_feature], valid['label']))

y_train_predict = model.predict(x_train)
y_valid_predict = model.predict(x_valid)

print('trainAccracy:',model.score(x_train,y_train))
print(classification_report(y_train,y_train_predict)) #真实数据在前 训练结果在后！
print('testAccracy:',model.score(x_valid,y_valid))
print(classification_report(y_valid,y_valid_predict))

trainAccracy: 0.9534654850280256


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.95      1.00      0.98    763436
           1       0.00      0.00      0.00     37260

   micro avg       0.95      0.95      0.95    800696
   macro avg       0.48      0.50      0.49    800696
weighted avg       0.91      0.95      0.93    800696

testAccracy: 0.9206567268177968
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    232545
           1       0.00      0.00      0.00     20041

   micro avg       0.92      0.92      0.92    252586
   macro avg       0.46      0.50      0.48    252586
weighted avg       0.85      0.92      0.88    252586



  'precision', 'predicted', average, warn_for)


In [None]:
print("---save model---")
with open('1_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('1_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [134]:
# test prediction for submission
y_test_pred = model.predict_proba(dftest[original_feature])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('C:/Users/jxjsj/Desktop/tianchi/submission.csv', index=False, header=False)
dftest1.head()

Unnamed: 0,User_id,Coupon_id,Date_received,label
0,4129537,9983,20160712,9.233964e-07
1,6949378,3429,20160706,1.647855e-06
2,2166529,6928,20160727,1.843014e-07
3,2166529,1808,20160727,1.896743e-07
4,6172162,6500,20160708,5.266069e-12


In [137]:
# test prediction for submission - tol sample
model_tol = MLPClassifier(activation='tanh', 
                    solver='lbfgs',
                    alpha=1e-5,
                    hidden_layer_sizes=(len(x_tol.columns)*2+1,),
                    random_state=1, 
                   )
model_tol.fit(x_tol,y_tol)

y_test_pred = model_tol.predict_proba(dftest[original_feature])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('C:/Users/jxjsj/Desktop/tianchi/submission.csv', index=False, header=False)
dftest1.head()

Unnamed: 0,User_id,Coupon_id,Date_received,label
0,4129537,9983,20160712,6.520989e-09
1,6949378,3429,20160706,8.427492e-09
2,2166529,6928,20160727,6.38715e-09
3,2166529,1808,20160727,6.384778e-09
4,6172162,6500,20160708,6.40361e-09
