# 員工離職預測 Employee Resignation Prediction

# 1. 環境設定 Environment Setting

In [None]:
# 基礎套件
import time, random, os
import pandas as pd
import numpy as np
from scipy.misc import derivative
from scipy.special import expit
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, fbeta_score, roc_curve, auc, f1_score
from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization

In [None]:
# Work from the competition data folder, then seed Python's built-in RNG.
path = 'C:/Users/twnjeu/Desktop/Jeff/AIdea'
os.chdir(path)
random.seed(123)

# 2. 資料讀取 Read Data

In [None]:
# Load the train, test, seasonal and submission-template tables from the
# current working directory, in that order.
tr, te, se, sm = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'season.csv', 'submission.csv'),
)

# 3. 資料處理 Data Manipulation

## 3.1 Seasonal Data

In [None]:
# Combine the A/B business-trip and leave counts, add overtime to get the
# overall activity total, then express each component as a share of it.
trips = se['出差數A'] + se['出差數B']
leaves = se['請假數A'] + se['請假數B']
se['se_btrip'] = trips
se['se_leave'] = leaves
se['se_trilev'] = trips + leaves
se['se_all'] = se['se_trilev'] + se['加班數']
for share_col, part_col in (
    ('se_btrip_rt', 'se_btrip'),
    ('se_leave_rt', 'se_leave'),
    ('se_overtime_rt', '加班數'),
    ('se_trilev_rt', 'se_trilev'),
):
    se[share_col] = se[part_col] / se['se_all']
# Rows whose total is zero produce NaN shares (0/0); treat them as zero.
se.fillna(0, inplace=True)

In [None]:
# Collapse the seasonal rows to one row per (employee, year) with five
# summary statistics for every seasonal measure.
se_agg_funcs = ['sum', 'mean', 'std', 'max', 'min']
se_nm = [e for e in se.columns if e not in ('PerNo', 'yyyy', 'periodQ')]
# Select exactly the columns that receive names below. Aggregating the whole
# frame would also pull in `periodQ`, which the generated names skip, so the
# column assignment could fail on a length mismatch.
se_group = se.groupby(['PerNo', 'yyyy'])[se_nm].agg(se_agg_funcs)
# The aggregated columns come out as (measure, statistic) pairs with the
# measure varying slowest, which is the order generated here.
nm = [col + '_se_' + func for col in se_nm for func in se_agg_funcs]
se_group.columns = nm
# `std` is NaN for groups with a single row; use 0 instead.
se_group = se_group.fillna(0)
se_gp = se_group.reset_index()

In [None]:
# Seasonal history visible to each modelling stage: before 2017 for training,
# up to and including 2017 for train+validation, everything for the test stage.
is_before_2017 = se_gp['yyyy'] < 2017
is_up_to_2017 = se_gp['yyyy'] <= 2017
tr_se_cat = se_gp[is_before_2017]
tv_se_cat = se_gp[is_up_to_2017]
al_se_cat = se_gp

In [None]:
# Summarise each employee's yearly seasonal statistics across years with a
# mean and a standard deviation, once per data split.
se_nm = [c for c in tr_se_cat.columns if c not in ('PerNo', 'yyyy')]
nm = [c + '_' + stat for c in se_nm for stat in ('mean', 'std')]


def _mean_std_by_person(yearly):
    """Return per-PerNo mean/std of every yearly column, with flat names."""
    summary = yearly.drop('yyyy', axis=1).groupby('PerNo').agg(['mean', 'std'])
    summary.columns = nm
    return summary


tr_se_f = _mean_std_by_person(tr_se_cat)
tv_se_f = _mean_std_by_person(tv_se_cat)
al_se_f = _mean_std_by_person(al_se_cat)
tr_se_final = tr_se_f.reset_index()
tv_se_final = tv_se_f.reset_index()
al_se_final = al_se_f.reset_index()

## 3.2 Data Partition

In [None]:
# Stack train and test rows, attach the per-year seasonal aggregates, and
# impute every remaining gap with the column's most frequent value.
base = pd.concat([tr, te], axis=0)
base = pd.merge(base, se_gp, on=['PerNo', 'yyyy'], how='left')
for i in base.columns:
    modes = base[i].mode()
    if modes.empty:
        # An all-NaN column has no mode; leave it for later imputation
        # instead of failing on modes[0].
        continue
    # Assign the result back: `base[i].fillna(..., inplace=True)` works on an
    # intermediate object and does not update `base` under copy-on-write.
    base[i] = base[i].fillna(modes[0])
# NOTE(review): the loop also covers PerStatus, so any unlabeled rows receive
# the modal label — confirm this is intended for the test rows.

In [None]:
# Seniority = years elapsed since the employee's earliest year in the data.
min_year = (
    base.groupby('PerNo', as_index=False)
    .agg({'yyyy': 'min'})
)
min_year.columns = ['PerNo', 'first_year']
base = base.merge(min_year, on='PerNo', how='left')
base['seniority'] = base['yyyy'] - base['first_year']

In [None]:
def _row_total(names):
    """Add the named `base` columns left to right and return the result."""
    total = base[names[0]]
    for name in names[1:]:
        total = total + base[name]
    return total


def _ratio(numerator, denominator):
    """Return base[numerator] / base[denominator] element-wise."""
    return base[numerator] / base[denominator]


# Totals across the numbered / lettered sub-columns, followed by each part's
# share of its total. Column creation order is kept as in the original cell.
base['work_exp'] = _row_total(['工作資歷1', '工作資歷2', '工作資歷3', '工作資歷4', '工作資歷5'])
base['seniority_exp'] = _row_total(['年資層級A', '年資層級B', '年資層級C'])
base['training'] = _row_total(['訓練時數A', '訓練時數B', '訓練時數C'])
for letter in ('A', 'B', 'C'):
    base['training_' + letter + '_rt'] = _ratio('訓練時數' + letter, 'training')

# Leave: last three months versus last year, overall and per leave type.
base['leave_M3'] = _row_total(['近三月請假數A', '近三月請假數B'])
base['leave_Y1'] = _row_total(['近一年請假數A', '近一年請假數B'])
base['leave_rt'] = _ratio('leave_M3', 'leave_Y1')
base['leave_A_rt'] = _ratio('近三月請假數A', '近一年請假數A')
base['leave_B_rt'] = _ratio('近三月請假數B', '近一年請假數B')
base['leave_M3_rt'] = _ratio('近三月請假數A', 'leave_M3')
base['leave_Y1_rt'] = _ratio('近一年請假數A', 'leave_Y1')

# Business trips, project hours per project, and performance grades.
base['btrip'] = _row_total(['出差數A', '出差數B'])
base['btrip_A_rt'] = _ratio('出差數A', 'btrip')
base['proj_rt'] = _ratio('專案時數', '專案總數')
base['performance'] = _row_total(['年度績效等級A', '年度績效等級B', '年度績效等級C'])
base['performance_mean'] = base['performance'] / 3

In [None]:
# Pack pairs (or triples) of coded columns into one combined code by
# multiplying the leading column(s) by a power of 100.
for combo, major, minor in (
    ('age_marry', '年齡層級', '婚姻狀況'),
    ('marry_child', '婚姻狀況', '眷屬量'),
    ('title_age', '職等', '年齡層級'),
    ('manage_age', '管理層級', '年齡層級'),
    ('manage_title', '管理層級', '職等'),
):
    base[combo] = base[major] * 100 + base[minor]

for combo, stem in (
    ('seniority_cat', '年資層級'),
    ('performance_cat', '年度績效等級'),
):
    base[combo] = base[stem + 'A'] * 10000 + base[stem + 'B'] * 100 + base[stem + 'C']

for combo, major, minor in (
    ('dept_title', '歸屬部門', '職等'),
    ('dept_manage', '歸屬部門', '管理層級'),
):
    base[combo] = base[major] * 100 + base[minor]

In [None]:
# Columns treated as categorical keys: each one is grouped together with the
# year in the target-encoding and WOE/IV loops that follow. The list mixes raw
# columns, one seasonal aggregate and the combined codes built above.
cat_var = ['sex', '工作分類', '職等', '廠區代碼', '管理層級', '工作資歷1', '工作資歷2', '工作資歷3', '工作資歷4', '工作資歷5', '專案總數', '當前專案角色', 
           '工作地點', '榮譽數', '是否升遷', '升遷速度', '出差集中度', '年度績效等級A', '年度績效等級B', '年度績效等級C', '年齡層級', '婚姻狀況', 
            '年資層級A', '年資層級B', '年資層級C', '任職前工作平均年數', '最高學歷', '畢業學校類別', '畢業科系類別', '眷屬量', '通勤成本', '歸屬部門', 
            '加班數_se_sum', 'training', 'btrip', 'work_exp', 'seniority_exp', 'leave_M3', 'leave_Y1', 'age_marry', 'marry_child', 'title_age', 'manage_age', 'manage_title',
           'seniority_cat', 'performance_cat', 'dept_title', 'dept_manage']

## 3.3 Target Encoding

In [None]:
# Target-encode every categorical column with the PREVIOUS year's mean
# PerStatus: the statistics for year y are shifted to y + 1 before merging,
# so a row never sees labels from its own year.
for i in cat_var:
    ec_col = i + '_ec'
    t_encoding = base.groupby(['yyyy', i], as_index=False).agg({'PerStatus': 'mean'})
    t_encoding['yyyy'] = t_encoding['yyyy'] + 1
    t_encoding.columns = ['yyyy', i, ec_col]
    base = pd.merge(base, t_encoding, on=['yyyy', i], how='left')
    # Rows with no prior-year statistic fall back to the column mean. Assign
    # the result back: a chained `fillna(..., inplace=True)` on `base[col]`
    # does not update `base` under copy-on-write.
    base[ec_col] = base[ec_col].fillna(base[ec_col].mean())

## 3.4 Weight of Evidence (WOE)

In [None]:
# WOE / IV style encodings per (year, category), shifted forward one year
# before merging so a row only sees the previous year's label statistics.
for i in cat_var:
    woe_col = i + '_woe'
    iv_col = i + '_iv'
    woe = base.groupby(['yyyy', i], as_index=False).agg({'PerStatus': ['count', 'sum']})
    woe.columns = ['yyyy', i, 'total', 'bad']
    woe['good'] = woe['total'] - woe['bad']
    # Shares are taken over the whole grouped table (all years together).
    woe['p_bad'] = woe['bad'] / woe['bad'].sum()
    woe['p_good'] = woe['good'] / woe['good'].sum()
    # log1p of the good/bad ratio, as in the original formulation.
    woe[woe_col] = np.log1p(woe['p_good'] / woe['p_bad'])
    woe[iv_col] = (woe['p_good'] - woe['p_bad']) * woe[woe_col]
    woe['yyyy'] = woe['yyyy'] + 1
    # Groups with no "bad" rows divide by zero; cap the resulting inf at 2.
    # Results are assigned back rather than using chained inplace=True calls,
    # which do not update the parent frame under copy-on-write.
    woe[woe_col] = woe[woe_col].replace(np.inf, 2)
    woe[iv_col] = woe[iv_col].replace(np.inf, 2)
    base = pd.merge(base, woe[['yyyy', i, woe_col, iv_col]], on=['yyyy', i], how='left')
    # Unmatched rows (no previous-year group) fall back to the column mean.
    base[woe_col] = base[woe_col].fillna(base[woe_col].mean())
    base[iv_col] = base[iv_col].fillna(base[iv_col].mean())

In [None]:
# Attach each employee's previous-year row as "_ly" columns: the copy is
# re-keyed to year + 1 so it lines up with the following year's record.
lag_excluded = ['PerStatus', 'sex', '任職前工作平均年數', '最高學歷', '畢業學校類別', '畢業科系類別']
base_ly = base.drop(lag_excluded, axis=1)
base_ly['yyyy'] = base_ly['yyyy'] + 1
base_ly = base_ly.set_index(['PerNo', 'yyyy'])
nm_o = base_ly.columns
nm_n = [col + '_ly' for col in nm_o]
base_ly.columns = nm_n
base_ly = base_ly.reset_index()
base_f = base.merge(base_ly, on=['PerNo', 'yyyy'], how='left')
# Rows without a previous-year record get the column mean.
for col in base_f.columns:
    col_mean = base_f[col].mean()
    base_f[col] = base_f[col].fillna(col_mean)

In [None]:
# Every column except the label is a model feature; any value still missing
# after the mean imputation becomes 0.
feat = [col for col in base_f.columns if col != 'PerStatus']
base_f.fillna(0, inplace=True)

In [None]:
# Time-based split: before 2017 = train, 2017 = validation, 2018 = test.
year = base_f['yyyy']
is_train = year < 2017
is_valid = year == 2017
is_train_valid = year <= 2017
is_test = year == 2018

tr_x, tr_y = base_f.loc[is_train, feat], base_f.loc[is_train, 'PerStatus']
va_x, va_y = base_f.loc[is_valid, feat], base_f.loc[is_valid, 'PerStatus']
tv_x, tv_y = base_f.loc[is_train_valid, feat], base_f.loc[is_train_valid, 'PerStatus']
te_x = base_f.loc[is_test, feat]
all_x = base_f[feat]

## 3.5 Feature of First Year, Last Year

In [None]:
# fl_var: every training feature column except the identifiers and the listed
# columns; used below to take each employee's first and last record.
# dif_var: the columns for which a last-minus-first difference and a relative
# change are computed.
fl_var = tr_x.columns.difference(['PerNo', 'FLG', 'yyyy', 'sex', '任職前工作平均年數', '最高學歷', '畢業學校類別', '畢業科系類別'])
dif_var = ['訓練時數A', '訓練時數B', '訓練時數C', '生產總額', '榮譽數', '近三月請假數A', '近一年請假數A', '近三月請假數B', '近一年請假數B', '出差數A', '出差數B', '年度績效等級A', '年度績效等級B', '年度績效等級C', '出差集中度','特殊專案佔比', '眷屬量', '通勤成本']

In [None]:
# Training window: each employee's first and last record, then the absolute
# and relative change between them for the columns in dif_var.
tr_by_person = tr_x.groupby('PerNo')[fl_var]
tr_first = tr_by_person.first()
tr_last = tr_by_person.last()
tr_first.columns = [name + '_ft' for name in fl_var]
tr_last.columns = [name + '_lt' for name in fl_var]
tr_fl = pd.concat([tr_first, tr_last], axis=1).reset_index()
for col in dif_var:
    start = tr_fl[col + '_ft']
    tr_fl[col + '_dif'] = tr_fl[col + '_lt'] - start
    tr_fl[col + '_rt'] = tr_fl[col + '_dif'] / start

In [None]:
# Train + validation window: each employee's first and last record, then the
# absolute and relative change between them for the columns in dif_var.
tv_by_person = tv_x.groupby('PerNo')[fl_var]
tv_first = tv_by_person.first()
tv_last = tv_by_person.last()
tv_first.columns = [name + '_ft' for name in fl_var]
tv_last.columns = [name + '_lt' for name in fl_var]
tv_fl = pd.concat([tv_first, tv_last], axis=1).reset_index()
for col in dif_var:
    start = tv_fl[col + '_ft']
    tv_fl[col + '_dif'] = tv_fl[col + '_lt'] - start
    tv_fl[col + '_rt'] = tv_fl[col + '_dif'] / start

In [None]:
# Full history: each employee's first and last record, then the absolute and
# relative change between them for the columns in dif_var.
all_by_person = all_x.groupby('PerNo')[fl_var]
all_first = all_by_person.first()
all_last = all_by_person.last()
all_first.columns = [name + '_ft' for name in fl_var]
all_last.columns = [name + '_lt' for name in fl_var]
all_fl = pd.concat([all_first, all_last], axis=1).reset_index()
for col in dif_var:
    start = all_fl[col + '_ft']
    all_fl[col + '_dif'] = all_fl[col + '_lt'] - start
    all_fl[col + '_rt'] = all_fl[col + '_dif'] / start

## 3.6 Dummy Variable

In [None]:
# Columns that are cast to integer categories and one-hot encoded in the
# next cell.
dum_var = ['婚姻狀況', '最高學歷', '畢業學校類別', '畢業科系類別', '工作地點', '工作分類', '職等', '廠區代碼', '管理層級', '歸屬部門', '當前專案角色', '年齡層級', 
           '年度績效等級A', '年度績效等級B', '年度績效等級C', '年資層級A', '年資層級B', '年資層級C',
           'seniority', 'age_marry', 'marry_child', 'title_age', 'manage_age', 'manage_title', 'seniority_cat', 'performance_cat', 'dept_title', 'dept_manage']

In [None]:
# One-hot encode the dum_var columns (as integer-valued categories) and append
# the indicator columns to the feature table.
dummy_source = all_x[dum_var].astype('int').astype('category')
all_cat = pd.get_dummies(dummy_source)
all_dum = pd.concat([all_x, all_cat], axis=1)

In [None]:
# Column lists for the per-employee aggregations in the following cells; each
# list is paired there with one statistic and one name suffix.
# dct_var: columns counted by number of distinct values (suffix _dcnt).
dct_var = ['yyyy', '婚姻狀況', '最高學歷', '畢業學校類別', '畢業科系類別', '工作地點', '工作分類', '職等', '廠區代碼', '管理層級', '歸屬部門', '當前專案角色', '年齡層級']
# sum_var: columns summed over an employee's rows (suffix _sum).
sum_var = ['專案時數', '專案總數', '訓練時數A', '訓練時數B', '訓練時數C', '生產總額', '榮譽數', '是否升遷', 
           '近三月請假數A', '近一年請假數A', '近三月請假數B', '近一年請假數B', '出差數A', '出差數B', 
           '年度績效等級A', '年度績效等級B', '年度績效等級C', 'training', 'leave_M3', 'leave_Y1', 'btrip', 'performance']
# avg_var: columns averaged over an employee's rows (suffix _avg).
avg_var = ['專案時數', '專案總數', '特殊專案佔比', '訓練時數A', '訓練時數B', '訓練時數C', '生產總額', '榮譽數', '是否升遷', '升遷速度', 
           '近三月請假數A', '近一年請假數A', '近三月請假數B', '近一年請假數B', '出差數A', '出差數B', '出差集中度', 
           '年度績效等級A', '年度績效等級B', '年度績效等級C', '年資層級A', '年資層級B', '年資層級C', '眷屬量', '通勤成本',
           'training', 'training_A_rt', 'training_B_rt', 'training_C_rt', 
           'leave_M3','leave_Y1', 'leave_rt', 'leave_A_rt', 'leave_B_rt', 'leave_M3_rt', 'leave_Y1_rt', 'btrip', 'btrip_A_rt', 'proj_rt', 'performance', 'performance_mean']
# max_var: columns reduced with max (suffix _max).
max_var = ['專案時數', '專案總數', '特殊專案佔比', '訓練時數A', '訓練時數B', '訓練時數C', '生產總額', '榮譽數', '是否升遷', '升遷速度', 
           '近三月請假數A', '近一年請假數A', '近三月請假數B', '近一年請假數B', '出差數A', '出差數B', '出差集中度', 
           '年度績效等級A', '年度績效等級B', '年度績效等級C', '年資層級A', '年資層級B', '年資層級C', '眷屬量', '通勤成本',
           '工作資歷1', '工作資歷2', '工作資歷3', '工作資歷4', '工作資歷5', '任職前工作平均年數',
           'training', 'training_A_rt', 'training_B_rt', 'training_C_rt', 
           'leave_M3','leave_Y1', 'leave_rt', 'leave_A_rt', 'leave_B_rt', 'leave_M3_rt', 'leave_Y1_rt', 'btrip', 'btrip_A_rt', 'proj_rt', 'performance', 'performance_mean']
# min_var: columns reduced with min (suffix _min).
min_var = ['專案時數', '專案總數', '特殊專案佔比', '訓練時數A', '訓練時數B', '訓練時數C', '生產總額', '榮譽數', '升遷速度', 
           '近三月請假數A', '近一年請假數A', '近三月請假數B', '近一年請假數B', '出差數A', '出差數B', '出差集中度', 
           '年度績效等級A', '年度績效等級B', '年度績效等級C', '年資層級A', '年資層級B', '年資層級C', '眷屬量', '通勤成本',
           'training', 'training_A_rt', 'training_B_rt', 'training_C_rt', 
           'leave_M3','leave_Y1', 'leave_rt', 'leave_A_rt', 'leave_B_rt', 'leave_M3_rt', 'leave_Y1_rt', 'btrip', 'btrip_A_rt', 'proj_rt', 'performance', 'performance_mean']
# std_var: names used for the standard-deviation columns (suffix _std).
# NOTE(review): the cells below select min_var for the std aggregation and only
# take the output names from std_var; the two lists currently look identical.
std_var = ['專案時數', '專案總數', '特殊專案佔比', '訓練時數A', '訓練時數B', '訓練時數C', '生產總額', '榮譽數', '升遷速度', 
           '近三月請假數A', '近一年請假數A', '近三月請假數B', '近一年請假數B', '出差數A', '出差數B', '出差集中度', 
           '年度績效等級A', '年度績效等級B', '年度績效等級C', '年資層級A', '年資層級B', '年資層級C', '眷屬量', '通勤成本',
           'training', 'training_A_rt', 'training_B_rt', 'training_C_rt', 
           'leave_M3','leave_Y1', 'leave_rt', 'leave_A_rt', 'leave_B_rt', 'leave_M3_rt', 'leave_Y1_rt', 'btrip', 'btrip_A_rt', 'proj_rt', 'performance', 'performance_mean']

In [None]:
def _aggregate_per_employee(df):
    """Return one row per PerNo with the dummy maxima and grouped statistics.

    Parameters
    ----------
    df : DataFrame holding 'PerNo', the one-hot columns of `all_cat`, and the
        columns named in dct_var / sum_var / avg_var / max_var / min_var /
        std_var.

    Returns
    -------
    DataFrame with 'PerNo' as a regular column followed by the aggregated
    columns, suffixed _dcnt, _sum, _avg, _max, _min and _std.
    """
    by_person = df.groupby('PerNo')
    parts = [by_person[all_cat.columns].agg('max')]
    for cols, func, suffix in (
        (dct_var, pd.Series.nunique, '_dcnt'),
        (sum_var, 'sum', '_sum'),
        (avg_var, 'mean', '_avg'),
        (max_var, 'max', '_max'),
        (min_var, 'min', '_min'),
        (std_var, 'std', '_std'),
    ):
        part = by_person[cols].agg(func)
        part.columns = [col + suffix for col in cols]
        parts.append(part)
    return pd.concat(parts, axis=1).reset_index()


# Aggregates for the three windows, in order: train (< 2017),
# train + validation (<= 2017) and the full history.
# The subsets are built here because the tr_dum / tv_dum frames are only
# created in later cells, and each result is collected in `dfs` (the earlier
# version assigned it into a column of the input frame and left `dfs` empty).
dfs = [
    _aggregate_per_employee(subset)
    for subset in (
        all_dum[all_dum['yyyy'] < 2017],
        all_dum[all_dum['yyyy'] <= 2017],
        all_dum,
    )
]
    

In [None]:
# Per-employee aggregates over the training window (years before 2017).
tr_dum = all_dum[all_dum['yyyy'] < 2017]
tr_by_person = tr_dum.groupby('PerNo')
tr_dummax = tr_by_person[all_cat.columns].agg('max')
tr_dct = tr_by_person[dct_var].agg(pd.Series.nunique)
tr_sum = tr_by_person[sum_var].agg('sum')
tr_avg = tr_by_person[avg_var].agg('mean')
tr_max = tr_by_person[max_var].agg('max')
tr_min = tr_by_person[min_var].agg('min')
# Select std_var here: the columns are renamed from std_var below, so taking
# them from min_var only works while the two lists stay identical.
tr_std = tr_by_person[std_var].agg('std')
tr_dct.columns = [i + '_dcnt' for i in dct_var]
tr_sum.columns = [i + '_sum' for i in sum_var]
tr_avg.columns = [i + '_avg' for i in avg_var]
tr_max.columns = [i + '_max' for i in max_var]
tr_min.columns = [i + '_min' for i in min_var]
tr_std.columns = [i + '_std' for i in std_var]
tr_group = pd.concat([tr_dummax, tr_dct, tr_sum, tr_avg, tr_max, tr_min, tr_std], axis=1).reset_index()

In [None]:
# Per-employee aggregates over the train + validation window (up to 2017).
tv_dum = all_dum[all_dum['yyyy'] <= 2017]
tv_by_person = tv_dum.groupby('PerNo')
tv_dummax = tv_by_person[all_cat.columns].agg('max')
tv_dct = tv_by_person[dct_var].agg(pd.Series.nunique)
tv_sum = tv_by_person[sum_var].agg('sum')
tv_avg = tv_by_person[avg_var].agg('mean')
tv_max = tv_by_person[max_var].agg('max')
tv_min = tv_by_person[min_var].agg('min')
# Select std_var here: the columns are renamed from std_var below, so taking
# them from min_var only works while the two lists stay identical.
tv_std = tv_by_person[std_var].agg('std')
tv_dct.columns = [i + '_dcnt' for i in dct_var]
tv_sum.columns = [i + '_sum' for i in sum_var]
tv_avg.columns = [i + '_avg' for i in avg_var]
tv_max.columns = [i + '_max' for i in max_var]
tv_min.columns = [i + '_min' for i in min_var]
tv_std.columns = [i + '_std' for i in std_var]
tv_group = pd.concat([tv_dummax, tv_dct, tv_sum, tv_avg, tv_max, tv_min, tv_std], axis=1).reset_index()

In [None]:
# tr_final = pd.concat([df.set_index('PerNo') for df in [tr_x, tr_group, tr_fl, tr_se_final]], axis = 1, join = 'inner').drop(['FLG', 'yyyy'], axis = 1)
# va_final = pd.concat([df.set_index('PerNo') for df in [va_x, tv_group, tv_fl, tv_se_final]], axis = 1, join = 'inner').drop(['FLG', 'yyyy'], axis = 1)
# tv_final = pd.concat([df.set_index('PerNo') for df in [tv_x, tv_group, tv_fl, tv_se_final]], axis = 1, join = 'inner').drop(['FLG', 'yyyy'], axis = 1)
# te_final = pd.concat([df.set_index('PerNo') for df in [all_x[all_x['FLG'] == 'test'], all_group, all_fl, al_se_final]], axis = 1, join = 'inner').drop(['FLG', 'yyyy'], axis = 1)

In [None]:
# Row-per-year modelling table: a cleaned copy of the dummy-augmented features
# in which +inf (from divisions by zero) and missing values become 0.
all_byyear = all_dum.replace(np.inf, 0).fillna(0)

In [None]:
# Split the yearly table by period and move the (PerNo, yyyy) key into the
# index so that it is not used as a model feature.
row_key = ['PerNo', 'yyyy']
row_year = all_byyear['yyyy']
tr_byyear = all_byyear[row_year < 2017].set_index(row_key)
va_byyear = all_byyear[row_year == 2017].set_index(row_key)
tv_byyear = all_byyear[row_year <= 2017].set_index(row_key)
te_byyear = all_byyear[row_year == 2018].set_index(row_key)

In [None]:
# Fit the min-max scaler on the training rows only and reuse it for the other
# splits, so no validation/test statistics reach the scaling step.
minmax = MinMaxScaler()
scaled_cols = tr_byyear.columns
tr_year = pd.DataFrame(minmax.fit_transform(tr_byyear), columns=scaled_cols)


def _rescale(frame):
    """Apply the already-fitted scaler and restore the column names."""
    return pd.DataFrame(minmax.transform(frame), columns=scaled_cols)


va_year = _rescale(va_byyear)
tv_year = _rescale(tv_byyear)
te_year = _rescale(te_byyear)

In [None]:
# tr_final = pd.concat([df.set_index('PerNo') for df in [tr_byyear.reset_index(), tr_fl, tr_se_final]], axis = 1, join = 'inner').drop(['yyyy'], axis = 1)
# va_final = pd.concat([df.set_index('PerNo') for df in [va_byyear.reset_index(), tv_fl, tv_se_final]], axis = 1, join = 'inner').drop(['yyyy'], axis = 1)
# tv_final = pd.concat([df.set_index('PerNo') for df in [tv_byyear.reset_index(), tv_fl, tv_se_final]], axis = 1, join = 'inner').drop(['yyyy'], axis = 1)
# te_final = pd.concat([df.set_index('PerNo') for df in [te_byyear.reset_index(), all_fl, al_se_final]], axis = 1, join = 'inner').drop(['yyyy'], axis = 1)

# 4. 模型建置 Modeling

## 4.1 訓練及驗證樣本 Train and Validation Data

In [None]:
# LightGBM datasets built from the min-max scaled frames. The train and
# validation sets keep their raw data (free_raw_data=False).
keep_raw = {'free_raw_data': False}
d_train = lgb.Dataset(tr_year, label=tr_y, **keep_raw)
d_valid = lgb.Dataset(va_year, label=va_y, **keep_raw)
d_all = lgb.Dataset(tv_year, label=tv_y)

## 4.2 客製化損失函數 Customized Loss Function

In [None]:
# def Find_Optimal_Cutoff(target, predicted):
#     """ Find the optimal probability cutoff point for a classification model related to event rate
#     Parameters
#     ----------
#     target : Matrix with dependent or target data, where rows are observations

#     predicted : Matrix with predicted data, where rows are observations

#     Returns
#     -------     
#     list type, with optimal cutoff value
        
#     """
#     fpr, tpr, threshold = roc_curve(target, predicted)
#     i = np.arange(len(tpr)) 
#     roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), 'threshold' : pd.Series(threshold, index=i)})
#     roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]

#     return list(roc_t['threshold'])

In [None]:
# Evaluation metric (not a training objective)
def lgb_fbeta_score(y_hat, data):
    """F-beta (beta = 1.5) evaluation metric for LightGBM.

    Scores are turned into hard labels with a fixed 0.05 cutoff.
    Returns the (name, value, is_higher_better) triple LightGBM expects.
    """
    labels = data.get_label()
    hard_preds = [int(score >= 0.05) for score in y_hat]
    return 'fbeta', fbeta_score(labels, hard_preds, beta = 1.5), True

In [None]:
def focal_loss_lgb(y_pred, dtrain, alpha, gamma):
    """Focal-loss objective for LightGBM: per-row gradient and hessian.

    Parameters
    ----------
    y_pred : array of raw (pre-sigmoid) scores.
    dtrain : object exposing the 0/1 targets as `.label`.
    alpha : weight applied to positive rows (negatives get 1 - alpha).
    gamma : focusing exponent applied to (1 - p_t).

    Returns
    -------
    (grad, hess) : first and second derivatives of the loss with respect to
    the raw score, estimated numerically.

    The derivatives use three-point central differences with step 1e-6 — the
    same stencils `scipy.misc.derivative` applied by default. That helper was
    removed in SciPy 1.12, so the differences are computed directly here and
    the function no longer needs it.
    """
    a, g = alpha, gamma
    y_true = dtrain.label
    dx = 1e-6

    def fl(x, t):
        p = 1/(1+np.exp(-x))
        return -( a*t + (1-a)*(1-t) ) * (( 1 - ( t*p + (1-t)*(1-p)) )**g) * ( t*np.log(p)+(1-t)*np.log(1-p) )

    f_minus = fl(y_pred - dx, y_true)
    f_mid = fl(y_pred, y_true)
    f_plus = fl(y_pred + dx, y_true)
    # Central first difference: (f(x+dx) - f(x-dx)) / (2*dx).
    grad = (f_plus - f_minus) / (2.0 * dx)
    # Central second difference: (f(x-dx) - 2*f(x) + f(x+dx)) / dx**2.
    hess = ((f_minus - 2.0 * f_mid) + f_plus) / (dx * dx)
    return grad, hess

def focal_loss_lgb_eval_error(y_pred, dtrain, alpha, gamma):
    """Mean focal loss as a LightGBM evaluation metric.

    `y_pred` holds raw scores and `dtrain.label` the 0/1 targets. Returns
    ('focal_loss', mean loss, False); the False marks lower as better.
    """
    a, g = alpha, gamma
    y_true = dtrain.label
    p = 1/(1+np.exp(-y_pred))
    class_weight = a*y_true + (1-a)*(1-y_true)
    # Probability assigned to the true class of each row.
    p_true = y_true*p + (1-y_true)*(1-p)
    log_likelihood = y_true*np.log(p) + (1-y_true)*np.log(1-p)
    loss = -class_weight * ((1 - p_true)**g) * log_likelihood
    return 'focal_loss', np.mean(loss), False


def focal_loss_lgb_f1_score(preds, lgbDataset):
    """F1 metric for models trained with the focal-loss objective.

    Raw scores are passed through the sigmoid and cut at 0.5.
    Returns ('f1', score, True); the True marks higher as better.
    """
    probabilities = expit(preds)
    binary_preds = (np.asarray(probabilities) > 0.5).astype(int)
    y_true = lgbDataset.get_label()
    return 'f1', f1_score(y_true, binary_preds), True

def focal_loss_lgb_fbeta_score(preds, lgbDataset):
    """F-beta (beta = 1.5) metric for models trained with the focal-loss objective.

    Raw scores are passed through the sigmoid and cut at 0.5.
    Returns ('fbeta', score, True); the True marks higher as better.
    """
    probabilities = expit(preds)
    binary_preds = (np.asarray(probabilities) > 0.5).astype(int)
    y_true = lgbDataset.get_label()
    return 'fbeta', fbeta_score(y_true, binary_preds, beta = 1.5), True

def focal_loss(x, y):
    """Focal-loss objective with alpha = 0.25 and gamma = 1 fixed."""
    return focal_loss_lgb(x, y, 0.25, 1.)


def eval_error(x, y):
    """Focal-loss evaluation metric with alpha = 0.25 and gamma = 1 fixed."""
    return focal_loss_lgb_eval_error(x, y, 0.25, 1.)

## 4.3 貝式參數最佳化 Bayesian Optimization for LightGBM

In [None]:
def hyp_lgbm(num_leaves, max_depth, min_data_in_leaf, learning_rate, lambda_l1, lambda_l2):
    """Bayesian-optimisation target using the built-in binary objective.

    Integer-valued hyperparameters are rounded to ints and the continuous ones
    to two decimals. A model is trained on `d_train` with early stopping, and
    the negated best validation log-loss is returned so that maximising the
    return value minimises the loss.
    """
    params = {
        'objective': 'binary',
        'bagging_fraction': 0.8,
        'feature_fraction': 0.8,
        'bagging_freq': 1,
        'bagging_seed': 2020,
        'min_gain_to_split': 0.0001,
        'num_threads': 6,
        'verbosity': -1,
        'random_state': 123,
        'num_leaves': int(round(num_leaves)),
        'max_depth': int(round(max_depth)),
        'min_data_in_leaf': int(round(min_data_in_leaf)),
        'learning_rate': round(learning_rate, 2),
        'lambda_l1': round(lambda_l1, 2),
        'lambda_l2': round(lambda_l2, 2),
    }

    rs = {}
    lgb.train(
        params,
        d_train,
        valid_sets=[d_train, d_valid],
        valid_names=['train', 'valid'],
        num_boost_round=1000,
        evals_result=rs,
        verbose_eval=200,
        early_stopping_rounds=200,
    )

    return -np.min(rs['valid']['binary_logloss'])

In [None]:
def hyp_lgbm_focal(num_leaves, max_depth, min_data_in_leaf, learning_rate, lambda_l1, lambda_l2):
    """Bayesian-optimisation target using the custom focal-loss objective.

    Integer-valued hyperparameters are rounded to ints and the continuous ones
    to two decimals. A model is trained on `d_train` with `focal_loss` as the
    objective and `eval_error` as the metric, and the negated best validation
    focal loss is returned so that maximising it minimises the loss.
    """
    params = {
        'objective': 'binary',
        'bagging_fraction': 0.8,
        'feature_fraction': 0.8,
        'bagging_freq': 1,
        'bagging_seed': 2020,
        'min_gain_to_split': 0.01,
        'num_threads': 6,
        'verbosity': -1,
        'random_state': 123,
        'num_leaves': int(round(num_leaves)),
        'max_depth': int(round(max_depth)),
        'min_data_in_leaf': int(round(min_data_in_leaf)),
        'learning_rate': round(learning_rate, 2),
        'lambda_l1': round(lambda_l1, 2),
        'lambda_l2': round(lambda_l2, 2),
    }

    rs = {}
    lgb.train(
        params,
        d_train,
        valid_sets=[d_train, d_valid],
        valid_names=['train', 'valid'],
        num_boost_round=1000,
        fobj=focal_loss,
        feval=eval_error,
        evals_result=rs,
        verbose_eval=200,
        early_stopping_rounds=200,
    )

    return -np.min(rs['valid']['focal_loss'])

In [None]:
# Search space: (lower, upper) bounds for each tuned hyperparameter.
pds = dict(
    num_leaves=(20, 70),
    max_depth=(5, 12),
    min_data_in_leaf=(20, 150),
    learning_rate=(0.01, 0.05),
    lambda_l1=(0.01, 1),
    lambda_l2=(0.01, 1),
)

# Surrogate model over the focal-loss training function.
optimizer = BayesianOptimization(hyp_lgbm_focal, pds, random_state = 1)

# Three random starting points followed by one guided iteration.
optimizer.maximize(init_points = 3, n_iter = 1)
best_params = optimizer.max['params']
print(optimizer.max)

## 4.4 根據貝式參數最佳化結果更新參數 Update Hyperparameter of LightGBM Based on Bayesian Optimization

In [None]:
# Fixed settings first, then the tuned values from the optimiser, rounded the
# same way as inside the search functions. The tuned min_data_in_leaf replaces
# the default of 50.
params = {
    'objective': 'binary',
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8,
    'bagging_freq': 1,
    'bagging_seed': 2020,
    'min_data_in_leaf': 50,
    'num_threads': 6,
    'verbosity': -1,
    'random_state': 123,
}
params.update({
    'num_leaves': int(round(best_params['num_leaves'])),
    'max_depth': int(round(best_params['max_depth'])),
    'min_data_in_leaf': int(round(best_params['min_data_in_leaf'])),
    'learning_rate': round(best_params['learning_rate'], 2),
    'lambda_l1': round(best_params['lambda_l1'], 2),
    'lambda_l2': round(best_params['lambda_l2'], 2),
})

In [None]:
# Rebuild the LightGBM datasets from the unscaled yearly frames; these replace
# the scaled datasets used during the hyperparameter search.
keep_raw = {'free_raw_data': False}
d_train = lgb.Dataset(tr_byyear, label=tr_y, **keep_raw)
d_valid = lgb.Dataset(va_byyear, label=va_y, **keep_raw)
d_all = lgb.Dataset(tv_byyear, label=tv_y)

In [None]:
# Built-in binary objective, monitored with the custom F-beta metric.
fb_vrs = {}
fb_clf = lgb.train(
    params,
    d_train,
    num_boost_round=1000,
    valid_sets=[d_train, d_valid],
    valid_names=['train', 'valid'],
    feval=lgb_fbeta_score,
    evals_result=fb_vrs,
    early_stopping_rounds=200,
    verbose_eval=200,
)

In [None]:
# Focal-loss objective, monitored with the focal-loss metric.
fc_vrs = {}
fc_clf = lgb.train(
    params,
    d_train,
    num_boost_round=1000,
    valid_sets=[d_train, d_valid],
    valid_names=['train', 'valid'],
    fobj=focal_loss,
    feval=eval_error,
    evals_result=fc_vrs,
    early_stopping_rounds=200,
    verbose_eval=200,
)

In [None]:
# Focal-loss objective, monitored with the F1 metric.
fcf_vrs = {}
fcf_clf = lgb.train(
    params,
    d_train,
    num_boost_round=1000,
    valid_sets=[d_train, d_valid],
    valid_names=['train', 'valid'],
    fobj=focal_loss,
    feval=focal_loss_lgb_f1_score,
    evals_result=fcf_vrs,
    early_stopping_rounds=200,
    verbose_eval=200,
)

## 4.5 全部訓練資料重新訓練模型 Retrain with All Train Data

In [None]:
# Fresh datasets for the final fit, again from the unscaled yearly frames:
# d_all (train + validation) is fitted, d_train / d_valid are only monitored.
keep_raw = {'free_raw_data': False}
d_train = lgb.Dataset(tr_byyear, label=tr_y, **keep_raw)
d_valid = lgb.Dataset(va_byyear, label=va_y, **keep_raw)
d_all = lgb.Dataset(tv_byyear, label=tv_y)

In [None]:
# Final fit on train + validation with the binary objective and F-beta metric.
# NOTE(review): both monitored sets are part of d_all, so early stopping here
# is judged on rows the model has trained on.
iters = 1000
frs = {}
model = lgb.train(
    params,
    d_all,
    num_boost_round=iters,
    valid_sets=[d_train, d_valid],
    valid_names=['train', 'valid'],
    feval=lgb_fbeta_score,
    evals_result=frs,
    early_stopping_rounds=200,
    verbose_eval=200,
)

In [None]:
# Final fit on train + validation with the focal-loss objective for a fixed
# 92 rounds; this replaces `model` from the previous cell.
iters = 92
frs = {}
model = lgb.train(
    params,
    d_all,
    num_boost_round=iters,
    valid_sets=[d_train, d_valid],
    valid_names=['train', 'valid'],
    fobj=focal_loss,
    feval=eval_error,
    evals_result=frs,
    early_stopping_rounds=200,
    verbose_eval=200,
)

# 5. 測試資料評分 Scoring Test Data

In [None]:
# Score the train + validation rows and the 2018 test rows with the final model.
train_pred, test_pred = (
    model.predict(frame) for frame in (tv_byyear, te_byyear)
)

In [None]:
# A negative minimum means the model returned raw scores (custom objective),
# so they go through the sigmoid and use a 0.1 cutoff; otherwise the values
# are used as probabilities with a 0.05 cutoff.
raw_scores = test_pred.min() < 0
print(test_pred.min(), '<0' if raw_scores else '>0')
if raw_scores:
    te_pred = expit(test_pred)
    th = 0.1
else:
    te_pred = test_pred
    th = 0.05

# NOTE(review): PerNo comes from the submission template, which assumes its
# row order matches the test rows — confirm.
sumit = pd.DataFrame({'PerNo': sm['PerNo'], 'PerStatus': te_pred})
sumit.to_csv('submit_prob.csv', index = False)

# Both masks are taken before overwriting; the cutoffs are below 1, so rows
# set to 1 can never fall under the threshold.
flag_leave = sumit['PerStatus'] >= th
flag_stay = sumit['PerStatus'] < th
sumit.loc[flag_leave, 'PerStatus'] = 1
sumit.loc[flag_stay, 'PerStatus'] = 0
sumit.to_csv('submit.csv', index = False)