In [1]:
from datetime import date, datetime

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingClassifier


import copy
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
DATA_ROOT = "../input/"

df_train = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
df_train = df_train[~df_train.Coupon_id.isna()]
df_train.reset_index(drop=True, inplace=True)

df_test = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
df_test = df_test[~df_test.Coupon_id.isna()]
df_test.reset_index(drop=True, inplace=True)
print(df_train.shape)
print(df_test.shape)

(746969, 7)
(306313, 6)


In [3]:
df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,8591.0,20:1,0.0,20160217.0,
1,1439408,2632,1078.0,20:1,0.0,20160319.0,
2,1832624,3381,7610.0,200:20,0.0,20160429.0,
3,2029232,3381,11951.0,200:20,1.0,20160129.0,
4,2223968,3381,9776.0,10:5,2.0,20160129.0,


In [4]:
# 幫訓練資料加標籤 - 規則：拿到 Coupon 後 15 天內有使用的為 1，未使用的為 0，沒拿到 Coupon 則為 -1
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

df_train["label"] = df_train.apply(label, axis=1)
print(df_train["label"].value_counts())
df_train.head()

0    710665
1     36304
Name: label, dtype: int64


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0


In [5]:
# 拿到 coupon 的星期及 平日/假日
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(int(row), format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

df_train['weekday'] = df_train['Date_received'].apply(getWeekday)
df_test['weekday'] = df_test['Date_received'].apply(getWeekday)

# 平日、假日
def getWeekendType(row):
    if (np.isnan(row)) or (row == -1):
        return row
    else:
        return 1 if int(row) in [6, 7] else 0

# weekday_type (weekend = 1)
df_train['weekday_type'] = df_train['weekday'].apply(lambda x: x if np.isnan(x) else 1 if int(x) in [6,7] else 0)
df_test['weekday_type'] = df_test['weekday'].apply(lambda x: x if np.isnan(x) else 1 if int(x) in [6,7] else 0)

df_train.head()


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,1
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5,0
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0


In [6]:
# 拿到 coupon 的星期的 onehot encoding
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(df_train['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
df_train[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(df_test['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
df_test[weekdaycols] = tmpdf

df_train.head()

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0,0,0,1,0,0,0,0
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,1,0,0,0,0,0,1,0
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5,0,0,0,0,0,1,0,0
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0,0,0,0,0,1,0,0
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0,0,0,0,0,1,0,0


In [7]:
# 拿到 coupon 的時間散佈在每月的何種階段 1: 上旬, 2: 中旬, 3: 下旬
def get_month_term(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        d = pd.to_datetime(row, format = '%Y%m%d').day
        if d < 10:
            return 1
        elif d < 20:
            return 2
        else:
            return 3
        
df_train['month_term'] = df_train['Date_received'].apply(get_month_term)
df_test['month_term'] = df_test['Date_received'].apply(get_month_term)

df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,month_term
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0,0,0,1,0,0,0,0,2
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,1,0,0,0,0,0,1,0,2
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5,0,0,0,0,0,1,0,0,3
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0,0,0,0,0,1,0,0,3
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0,0,0,0,0,1,0,0,3


In [8]:
# onehot encoding for month_term
month_term_cols = ['early_term', 'mid_term', 'late_term']

tmpdf = pd.get_dummies(df_train['month_term'])
tmpdf.columns = month_term_cols
df_train[month_term_cols] = tmpdf

tmpdf = pd.get_dummies(df_test['month_term'])
tmpdf.columns = month_term_cols
df_test[month_term_cols] = tmpdf

df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,month_term,early_term,mid_term,late_term
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0,...,0,1,0,0,0,0,2,0,1,0
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,1,...,0,0,0,0,1,0,2,0,1,0
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5,0,...,0,0,0,1,0,0,3,0,0,1
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0,...,0,0,0,1,0,0,3,0,0,1
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0,...,0,0,0,1,0,0,3,0,0,1


In [9]:
# 整理 coupon 券的各種型態
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0
    
# 計算優惠價差
def getDicountSpread(row):
    if ':' in row:
        p = row.split(':')
        return int(p[0]) - int(p[1])
#         return math.log(int(p[0]) - int(p[1]))
    else:
        # 折扣率以 5 元為進行計算（因為資料中最小為5元）
        return 10*(1-float(row))
#         return  math.log(10*(1-float(row)))
    

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    df['discount_spread'] = df['Discount_rate'].astype('str').apply(getDicountSpread)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

df_train = processData(df_train)
df_test = processData(df_test)
df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_7,month_term,early_term,mid_term,late_term,discount_rate,discount_man,discount_jian,discount_type,discount_spread
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0,...,0,2,0,1,0,0.95,20,1,1,19.0
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,1,...,0,2,0,1,0,0.95,20,1,1,19.0
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5,0,...,0,3,0,0,1,0.9,200,20,1,180.0
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0,...,0,3,0,0,1,0.9,200,20,1,180.0
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0,...,0,3,0,0,1,0.5,10,5,1,5.0


In [10]:
# 將距離簡化成少數類型
distance_cols =  ['0', '1', '2', '3', '4']
bins  = [-1, 0, 1, 3, 10, 100]
df_train['distance_category'] = pd.cut( df_train['Distance'], bins = bins, include_lowest=False, labels=distance_cols)
tmpdf = pd.get_dummies(df_train['distance_category'], prefix="distance")
df_train[list(tmpdf.columns)] = tmpdf

df_test['distance_category'] = pd.cut( df_test['Distance'], bins = bins, include_lowest=False, labels=distance_cols)
tmpdf = pd.get_dummies(df_test['distance_category'], prefix="distance")
df_test[list(tmpdf.columns)] = tmpdf

distance_cols = list(tmpdf.columns)

df_train.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,discount_man,discount_jian,discount_type,discount_spread,distance_category,distance_0,distance_1,distance_2,distance_3,distance_4
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0,...,20,1,1,19.0,0,1,0,0,0,0
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,1,...,20,1,1,19.0,0,1,0,0,0,0
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5,0,...,200,20,1,180.0,0,1,0,0,0,0
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0,...,200,20,1,180.0,1,0,1,0,0,0
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0,...,10,5,1,5.0,2,0,0,1,0,0
5,73611,2099,12034.0,100:10,99.0,20160207.0,,0,7,1,...,100,10,1,90.0,4,0,0,0,0,1
6,163606,1569,5054.0,200:30,10.0,20160421.0,,0,4,0,...,200,30,1,170.0,3,0,0,0,1,0
7,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,6,1,...,200,20,1,180.0,3,0,0,0,1,0
8,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2,0,...,200,20,1,180.0,2,0,0,1,0,0
9,253750,8390,7531.0,20:5,0.0,20160327.0,,0,7,1,...,20,5,1,15.0,0,1,0,0,0,0


In [11]:
### 決定投入的參數
# predict_feature = \
#     [ 'Distance', 'distance_category' ] + distance_cols + \
#     [ 'weekday', 'weekday_type' ] + weekdaycols + \
#     [ 'month_term' ] + month_term_cols + \
#     [ 'discount_rate', 'discount_man', 'discount_jian', 'discount_type', 'discount_spread']

predict_feature = \
    [ 'Distance' ] + distance_cols + \
    [ 'weekday', 'weekday_type' ] + weekdaycols + \
    [ 'month_term' ] + month_term_cols + \
    [ 'discount_rate', 'discount_man', 'discount_jian', 'discount_type', 'discount_spread']


# predict_feature = \
#     [ 'Distance', 'weekday_type' ] + weekdaycols + month_term_cols + \
#     [ 'discount_rate', 'discount_man', 'discount_jian', 'discount_type', 'discount_spread']

print(len(predict_feature),predict_feature)

24 ['Distance', 'distance_0', 'distance_1', 'distance_2', 'distance_3', 'distance_4', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7', 'month_term', 'early_term', 'mid_term', 'late_term', 'discount_rate', 'discount_man', 'discount_jian', 'discount_type', 'discount_spread']


In [12]:
# 切割訓練、測試用資料

# 正規化
sc=StandardScaler()
# sc.fit(df_train)

# df_train = sc.transform(df_train)
# df_test = sc.transform(df_test)

# 訓練用特徵資料
train_feature =  df_train[predict_feature]
train_label = df_train['label']

# 預測用特徵資料
test_feature = df_test[predict_feature]

In [13]:
# 顯示訓練狀況
def show_predict_result(labeld, predict_proba):
    auc_score = roc_auc_score(y_true=labeld, y_score=predict_proba[:,1])
    acc = accuracy_score(y_true=labeld, y_pred=predict_proba.argmax(axis=1))
    print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))
    return auc_score, acc

In [14]:
# 保存預測結果

def save_predict_data(classifier, predict_data, predict_feature, filename = None):

    data = predict_data[predict_feature].copy()

    out_name = filename if filename is not None else classifier.__class__.__name__

    predict_result = classifier.predict_proba(data)
    data['pred_prob'] = predict_result[:, 1]

    output = pd.concat((predict_data[["User_id", "Coupon_id", "Date_received"]], data['pred_prob']), axis=1)

    output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
    output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
    output.reset_index(drop=True, inplace=True)

    out = output.groupby("uid", as_index=False).mean()
    out = out[["uid", "pred_prob"]]
    out.columns = ["uid", "label"]

    now = datetime.now().strftime( '%Y%m%dT%H%M%S' )
    output_filename = "{}-{}.csv".format(out_name, now)

    out.to_csv(output_filename, header=["uid", "label"], index=False) # submission format

    return output_filename

In [15]:
train_x, valid_x, train_y, valid_y  =  train_test_split(train_feature,  train_label, test_size=0.2)

In [16]:
# LogisticRegression
lr = LogisticRegression()
lr.fit(train_x, train_y)
valid_predict_lr = lr.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_lr)



Validation AUC: 0.806, Accuracy: 0.951


(0.8063263529420865, 0.9513902834116498)

In [17]:
save_predict_data(lr, df_test, predict_feature, filename = 'lr' )

'lr-20190619T230726.csv'

In [18]:
# GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(train_x, train_y)
valid_predict_gbc = gbc.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_gbc)

Validation AUC: 0.824, Accuracy: 0.951


(0.8237016621017139, 0.9513969771209018)

In [19]:
save_predict_data(gbc, df_test, predict_feature, filename = 'gbc' )

'gbc-20190619T230832.csv'

In [20]:
# LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(train_x, train_y)
valid_predict_lgbm = lgbm.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_lgbm)

Validation AUC: 0.831, Accuracy: 0.951


(0.8310306605727533, 0.9514036708301539)

In [21]:
save_predict_data(lgbm, df_test, predict_feature, filename = 'lgbm' )

'lgbm-20190619T230837.csv'

In [22]:
# XGBClassifier
xgb = XGBClassifier()
xgb.fit(train_x, train_y)
valid_predict_xgb = xgb.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_xgb)

Validation AUC: 0.823, Accuracy: 0.951


(0.823002272285429, 0.9513902834116498)

In [23]:
save_predict_data(xgb, df_test, predict_feature, filename = 'xgb' )

'xgb-20190619T230925.csv'

In [24]:
# AdaBoost
ada = AdaBoostClassifier()
ada.fit(train_x, train_y)
valid_predict_ada = ada.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_ada)

Validation AUC: 0.813, Accuracy: 0.951


(0.8125889176777212, 0.9513902834116498)

In [25]:
save_predict_data(ada, df_test, predict_feature, filename = 'ada' )

'ada-20190619T230947.csv'

In [26]:
# 決策樹 - gini
dt_gini = DecisionTreeClassifier(criterion = 'gini')
dt_gini.fit(train_x, train_y)
# gini_dt_acc = cross_val_score(decisionTree_gini, train_x, train_y, cv = 10, scoring = 'accuracy')
# gini_dt_f1 = cross_val_score(decisionTree_gini, train_x, train_y, cv = 10, scoring = 'f1')
# gini_dt_auc = cross_val_score(decisionTree_gini, train_x, train_y, cv = 10, scoring = 'accuracy')
# print(gini_dt_acc.mean(), gini_dt_f1.mean(), gini_dt_auc.mean())
valid_predict_dt_gini = dt_gini.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_dt_gini)

Validation AUC: 0.811, Accuracy: 0.951


(0.8111718594325641, 0.9512028595525924)

In [27]:
save_predict_data(dt_gini, df_test, predict_feature, filename = 'dt_gini' )

'dt_gini-20190619T230953.csv'

In [28]:
# 決策樹 - entropy
dt_entropy = DecisionTreeClassifier(criterion = 'entropy')
dt_entropy.fit(train_x, train_y)
valid_predict_dt_entropy = dt_entropy.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_dt_entropy)

Validation AUC: 0.811, Accuracy: 0.951


(0.8110274965169635, 0.9511894721340883)

In [29]:
save_predict_data(dt_entropy,  df_test, predict_feature, filename = 'dt_entropy' )

'dt_entropy-20190619T230958.csv'

In [30]:
### 隨機森林
random_forest = RandomForestClassifier(n_estimators=200)
random_forest.fit(train_x, train_y)
valid_predict_random_forest = random_forest.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_random_forest)

Validation AUC: 0.816, Accuracy: 0.951


(0.8156623462723775, 0.9512430218081047)

In [31]:
pd.DataFrame(random_forest.feature_importances_, train_x.columns)

Unnamed: 0,0
Distance,0.171948
distance_0,0.11156
distance_1,0.008107
distance_2,0.010943
distance_3,0.055245
distance_4,0.008031
weekday,0.034529
weekday_type,0.007783
weekday_1,0.007112
weekday_2,0.01009


In [32]:
save_predict_data(random_forest,  df_test, predict_feature, filename = 'random_forest' )

'random_forest-20190619T231155.csv'

In [33]:
mlp_adam = MLPClassifier(hidden_layer_sizes=(100,), batch_size = 500, learning_rate = 'invscaling', solver = 'adam')
mlp_adam.fit(train_x, train_y)
valid_predict_mlp_adam = mlp_adam.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_mlp_adam)

Validation AUC: 0.826, Accuracy: 0.951


(0.825900099668794, 0.9513635085746416)

In [34]:
save_predict_data(mlp_adam,  df_test, predict_feature, filename = 'mlp_adam' )

'mlp_adam-20190619T231254.csv'

In [35]:
mlp_adam.partial_fit(valid_x, valid_y)
valid_predict_mlp_adam2 = mlp_adam.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_mlp_adam2)

Validation AUC: 0.825, Accuracy: 0.951


(0.8253278303294901, 0.9513768959931457)

In [36]:
save_predict_data(mlp_adam,  df_test, predict_feature, filename = 'mlp_adam2' )

'mlp_adam2-20190619T231258.csv'

In [37]:
# mlp sgd
mlp_sgd = MLPClassifier(hidden_layer_sizes=(100,), batch_size = 500, learning_rate = 'invscaling', solver = 'sgd')
mlp_sgd.fit(train_x, train_y)
valid_predict_mlp_sgd = mlp_sgd.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_mlp_sgd)

Validation AUC: 0.795, Accuracy: 0.951


(0.7947100599414869, 0.9513902834116498)

In [38]:
save_predict_data(mlp_sgd,  df_test, predict_feature, filename = 'mlp_sgd' )

'mlp_sgd-20190619T231313.csv'

In [39]:
mlp_sgd.fit(valid_x, valid_y)
valid_predict_mlp_sgd2 = mlp_sgd.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_mlp_sgd2)

Validation AUC: 0.760, Accuracy: 0.951


(0.7595738013111315, 0.9513902834116498)

In [40]:
save_predict_data(mlp_sgd,  df_test, predict_feature, filename = 'mlp_sgd2' )

'mlp_sgd2-20190619T231319.csv'

In [41]:
# knn
knn = KNeighborsClassifier()
knn.fit(train_x, train_y)
valid_predict_knn = knn.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_knn)

Validation AUC: 0.657, Accuracy: 0.947


(0.6565659916422624, 0.9472736522216421)

In [42]:
save_predict_data(knn,  df_test, predict_feature, filename = 'knn' )

'knn-20190619T231837.csv'

In [43]:
knn.leaf_size

30

In [44]:
# svm
# svm = SVC(kernel='linear', probability=True)
# svm.fit(train_x, train_y)
# valid_predict_svm = svm.predict_proba(valid_x)
# show_predict_result(valid_y, valid_predict_svm)

In [45]:
# save_predict_data(svm,  df_test, predict_feature, filename = 'svm' )

In [46]:
# GaussianNB
gaussian = GaussianNB()
gaussian.fit(train_x, train_y)
valid_predict_gaussian = gaussian.predict_proba(valid_x)
show_predict_result(valid_y, valid_predict_gaussian)

Validation AUC: 0.794, Accuracy: 0.798


(0.7937850419115754, 0.7976960252754461)

In [47]:
save_predict_data(gaussian,  df_test, predict_feature, filename = 'gaussian' )

'gaussian-20190619T231840.csv'