In [1]:
!pip install xgboost pandas sklearn matplotlib

[33mYou are using pip version 19.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
import os, copy
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from datetime import date
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
font = FontProperties(fname='font_ch.ttf', size=14)

from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve, accuracy_score, brier_score_loss, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# 因為擬合(fit)與編碼(transform)需要分開, 因此不使用.get_dummy, 而採用 sklearn 的 OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score

import xgboost as xgb
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
df_train = pd.read_csv('./T-BrainAI_dataset/train.csv', encoding='big5')
Y1_mapping = {
    'Y': 1,
    'N': 0
}
df_train['Y1'] = df_train['Y1'].map(Y1_mapping)
train_Y = df_train[['Y1']]
train_Y # 0:1 = 98000:2000

train_X = df_train.drop(['CUS_ID','Y1'], axis=1)
train_X

Unnamed: 0,Y1
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


Unnamed: 0,GENDER,AGE,CHARGE_CITY_CD,CONTACT_CITY_CD,EDUCATION_CD,MARRIAGE_CD,LAST_A_CCONTACT_DT,L1YR_A_ISSUE_CNT,LAST_A_ISSUE_DT,L1YR_B_ISSUE_CNT,...,IF_ADD_INSD_IND,L1YR_GROSS_PRE_AMT,CUST_9_SEGMENTS_CD,FINANCETOOLS_A,FINANCETOOLS_B,FINANCETOOLS_C,FINANCETOOLS_D,FINANCETOOLS_E,FINANCETOOLS_F,FINANCETOOLS_G
0,M,低,A1,A1,,,Y,0,N,0,...,N,0.000174,C,,,,,,,
1,M,低,A1,A1,,,Y,0,N,0,...,N,0.008724,A,,,,,,,
2,M,低,A1,A1,1.0,0.0,Y,1,Y,0,...,N,0.005359,A,Y,N,N,N,N,N,N
3,M,低,A1,A1,,0.0,Y,0,N,0,...,N,0.000000,C,,,,,,,
4,M,低,A1,A1,1.0,0.0,N,0,N,0,...,N,0.000603,C,,,,,,,
5,M,低,A1,A1,,0.0,Y,0,N,0,...,N,0.000243,C,,,,,,,
6,M,低,A1,A1,,0.0,N,0,N,0,...,N,0.002600,A,,,,,,,
7,M,低,A1,A1,,0.0,Y,0,N,0,...,N,0.002887,A,,,,,,,
8,M,低,A1,A1,1.0,0.0,N,0,N,0,...,N,0.000140,C,,,,,,,
9,M,低,A1,A1,,0.0,N,0,N,0,...,N,0.000182,C,,,,,,,


In [4]:
df_test = pd.read_csv('./T-BrainAI_dataset/test.csv', encoding='big5')
test_X = df_test.drop('CUS_ID', axis=1)
test_X

Unnamed: 0,GENDER,AGE,CHARGE_CITY_CD,CONTACT_CITY_CD,EDUCATION_CD,MARRIAGE_CD,LAST_A_CCONTACT_DT,L1YR_A_ISSUE_CNT,LAST_A_ISSUE_DT,L1YR_B_ISSUE_CNT,...,IF_ADD_INSD_IND,L1YR_GROSS_PRE_AMT,CUST_9_SEGMENTS_CD,FINANCETOOLS_A,FINANCETOOLS_B,FINANCETOOLS_C,FINANCETOOLS_D,FINANCETOOLS_E,FINANCETOOLS_F,FINANCETOOLS_G
0,M,低,A1,A1,2.0,0.0,Y,4,Y,0,...,Y,0.000151,C,,,,,,,
1,M,低,A1,A1,,0.0,N,0,N,0,...,N,0.000000,C,,,,,,,
2,M,低,A1,A1,1.0,0.0,N,0,N,0,...,N,0.000090,C,,,,,,,
3,M,低,A1,A1,,0.0,N,0,N,0,...,N,0.000000,C,,,,,,,
4,M,低,A1,A1,1.0,0.0,Y,0,Y,0,...,N,0.000179,C,,,,,,,
5,M,低,A1,A1,,0.0,Y,0,N,0,...,N,0.001452,A,,,,,,,
6,M,低,A1,A1,,0.0,N,0,N,0,...,N,0.000314,C,,,,,,,
7,M,低,A1,A1,,0.0,N,0,N,0,...,N,0.000000,C,,,,,,,
8,M,低,A1,A1,,0.0,Y,0,N,0,...,N,0.006626,A,,,,,,,
9,M,低,A1,A1,,0.0,N,0,N,0,...,N,0.000000,C,,,,,,,


In [5]:
# 補缺值
nonNum_features = []
Num_features = []
for feature in train_X.columns:
    try:
        if train_X[feature].dtype == object:
            nonNum_features.append(feature)
    except TypeError as e:
        print('nonNum_features: ', e)

    try:
        if (train_X[feature].dtype == int) or (train_X[feature].dtype == float):
            Num_features.append(feature)
    except TypeError as e:
        print('Num_features: ', e)

print(f'{len(nonNum_features)} Non-num Features : {nonNum_features}\n')
print(f'{len(Num_features)} Num Features : {Num_features}\n')


col_keep_null = ['IF_ADD_INSD_R_IND', 'IF_ADD_INSD_Q_IND', 'IF_ADD_INSD_L_IND',
       'IF_ADD_INSD_G_IND', 'IF_ADD_INSD_F_IND', 'ANNUAL_PREMIUM_AMT',
       'FINANCETOOLS_F', 'FINANCETOOLS_A', 'FINANCETOOLS_B', 'FINANCETOOLS_C',
       'FINANCETOOLS_D', 'FINANCETOOLS_E', 'FINANCETOOLS_G', 'C_IND', 'B_IND',
       'A_IND', 'L1YR_C_CNT' , 'APC_1ST_AGE', 'RFM_R', 'REBUY_TIMES_CNT']

for i in col_keep_null:
    try:
        train_X[i] = train_X[i].fillna('None')
    except:
        print('train_X: ', i)
        
    try:
        test_X[i] = test_X[i].fillna('None')
    except:
        print('test_X: ', i)

        
# fill_values = {
#     'INSD_LAST_YEARDIF_CNT': 0.1724875816240972, 'EDUCATION_CD': 3.0, 'APC_1ST_YEARDIF': 0.2587685984593603, 
#     'LEVEL': 5.0, 'TERMINATION_RATE': 12.090984696216369
# }

# for i,j in fill_values.items():
#     try:
#         if (len(train_X[i]) > 0):
#             train_X[i] = train_X[i].fillna(value=fill_values[i])
#     except:
#         print('train_X: ', i)

#     try:
#         if (len(test_X[i]) > 0):
#             test_X[i] = test_X[i].fillna(value=fill_values[i])
#     except:
#         print('test_X: ', i)
# 因為train和test資料分布不同，不能拿一樣的值來補，可能是這個原因test資料的AUC比valid相差很多

        
nonNum_values = {
    'AGE': '低', 'APC_1ST_AGE': '中高', 'A_IND': 'N', 'B_IND': 'N', 'CHARGE_CITY_CD': 'B1', 'CONTACT_CITY_CD': 'A1', 
    'CUST_9_SEGMENTS_CD': 'C', 'C_IND': 'N', 'FINANCETOOLS_A': 'Y', 'FINANCETOOLS_B': 'N', 'FINANCETOOLS_C': 'N', 
    'FINANCETOOLS_D': 'N', 'FINANCETOOLS_E': 'N', 'FINANCETOOLS_F': 'N', 'FINANCETOOLS_G': 'N', 'GENDER': 'F', 
    'IF_2ND_GEN_IND': 'Y', 'IF_ADD_F_IND': 'N', 'IF_ADD_G_IND': 'N', 'IF_ADD_IND': 'N', 'IF_ADD_INSD_F_IND': 'None', 
    'IF_ADD_INSD_G_IND': 'N', 'IF_ADD_INSD_IND': 'N', 'IF_ADD_INSD_L_IND': 'Y', 'IF_ADD_INSD_Q_IND': 'Y', 
    'IF_ADD_INSD_R_IND': 'N', 'IF_ADD_L_IND': 'N', 'IF_ADD_Q_IND': 'N', 'IF_ADD_R_IND': 'N', 'IF_HOUSEHOLD_CLAIM_IND': 'Y', 
    'IF_ISSUE_A_IND': 'N', 'IF_ISSUE_B_IND': 'N', 'IF_ISSUE_C_IND': 'N', 'IF_ISSUE_D_IND': 'N', 'IF_ISSUE_E_IND': 'N', 
    'IF_ISSUE_F_IND': 'N', 'IF_ISSUE_G_IND': 'N', 'IF_ISSUE_H_IND': 'N', 'IF_ISSUE_INSD_A_IND': 'N', 
    'IF_ISSUE_INSD_B_IND': 'N', 'IF_ISSUE_INSD_C_IND': 'N', 'IF_ISSUE_INSD_D_IND': 'N', 'IF_ISSUE_INSD_E_IND': 'N', 
    'IF_ISSUE_INSD_F_IND': 'N', 'IF_ISSUE_INSD_G_IND': 'N', 'IF_ISSUE_INSD_H_IND': 'N', 'IF_ISSUE_INSD_I_IND': 'N', 
    'IF_ISSUE_INSD_J_IND': 'N', 'IF_ISSUE_INSD_K_IND': 'N', 'IF_ISSUE_INSD_L_IND': 'N', 'IF_ISSUE_INSD_M_IND': 'N', 
    'IF_ISSUE_INSD_N_IND': 'N', 'IF_ISSUE_INSD_O_IND': 'N', 'IF_ISSUE_INSD_P_IND': 'N', 'IF_ISSUE_INSD_Q_IND': 'N', 
    'IF_ISSUE_I_IND': 'N', 'IF_ISSUE_J_IND': 'N', 'IF_ISSUE_K_IND': 'N', 'IF_ISSUE_L_IND': 'N', 'IF_ISSUE_M_IND': 'N', 
    'IF_ISSUE_N_IND': 'N', 'IF_ISSUE_O_IND': 'N', 'IF_ISSUE_P_IND': 'N', 'IF_ISSUE_Q_IND': 'N', 'IF_S_REAL_IND': 'N', 
    'IF_Y_REAL_IND': 'N', 'IM_IS_A_IND': 'N', 'IM_IS_B_IND': 'N', 'IM_IS_C_IND': 'N', 'IM_IS_D_IND': 'N', 
    'INSD_1ST_AGE': '中', 'L1YR_LAPSE_IND': 'N', 'L1YR_PAYMENT_REMINDER_IND': 'N', 'LAST_A_CCONTACT_DT': 'N', 
    'LAST_A_ISSUE_DT': 'N', 'LAST_B_CONTACT_DT': 'N', 'LAST_B_ISSUE_DT': 'N', 'LAST_C_DT': 'N', 'LIFE_CNT': '低', 
    'REBUY_TIMES_CNT': '低', 'RFM_R': '低', 'X_A_IND': 'N', 'X_B_IND': 'N', 'X_C_IND': 'N', 'X_D_IND': 'N', 'X_E_IND': 'N', 
    'X_F_IND': 'N', 'X_G_IND': 'N', 'X_H_IND': 'N'
}

for i,j in nonNum_values.items():
    try:
        if train_X[i][0]:
            train_X[i] = train_X[i].fillna(value=nonNum_values[i])
    except KeyError as e:
        print('train_X: ', e)

    try:
        if test_X[i][0]:
            test_X[i] = test_X[i].fillna(value=nonNum_values[i])
    except KeyError as e:
        print('test_X: ', e)


# 塞平均數的表現沒有太多影響
# for i in Num_features:
#     try:
#         if train_X[i][0]:
#             train_X[i] = train_X[i].fillna(np.mean(train_X[i]))
#     except KeyError as e:
#         print('train_X: ', e)

#     try:
#         if test_X[i][0]:
#             test_X[i] = test_X[i].fillna(np.mean(test_X[i]))
#     except KeyError as e:
#         print('test_X: ', e)
        

# Y1_1 vs Y1_0  # 不篩反而比較好
# variance_col = ['AGE', 'CHARGE_CITY_CD', 'LAST_A_CCONTACT_DT', 'LAST_A_ISSUE_DT', 'APC_1ST_AGE', 'INSD_1ST_AGE', 
#                 'IF_2ND_GEN_IND', 'RFM_R', 'REBUY_TIMES_CNT', 'LIFE_CNT', 'IF_ISSUE_N_IND', 'IF_ISSUE_P_IND', 
#                 'IF_ISSUE_Q_IND', 'IF_ADD_L_IND', 'IF_ADD_Q_IND', 'IF_ADD_R_IND', 'IF_ADD_IND', 'LAST_C_DT', 'IF_S_REAL_IND',
#                 'IM_IS_B_IND', 'IM_IS_D_IND', 'X_A_IND', 'X_B_IND', 'X_C_IND', 'X_E_IND', 'X_H_IND', 'IF_ISSUE_INSD_I_IND', 
#                 'IF_ISSUE_INSD_J_IND', 'IF_ISSUE_INSD_P_IND', 'IF_ISSUE_INSD_Q_IND', 'IF_ADD_INSD_R_IND', 'IF_ADD_INSD_Q_IND', 
#                 'IF_ADD_INSD_L_IND', 'CUST_9_SEGMENTS_CD']

# train_X = train_X[variance_col]
# test_X = test_X[variance_col]

# exclude RF feature importance lower columns
exc = ['DIEBENEFIT_AMT', 'X_E_IND', 'IF_ISSUE_INSD_I_IND', 'RFM_M_LEVEL',
       'ACCIDENT_HOSPITAL_REC_AMT', 'OUTPATIENT_SURGERY_AMT',
       'PAY_LIMIT_MED_MISC_AMT', 'IF_ADD_L_IND',
       'DISEASES_HOSPITAL_REC_AMT', 'INPATIENT_SURGERY_AMT', 'C_IND',
       'RFM_R', 'BANK_NUMBER_CNT', 'ANNUAL_INCOME_AMT', 'IM_CNT',
       'FINANCETOOLS_A', 'MONTHLY_CARE_AMT', 'CHARGE_CITY_CD',
       'FINANCETOOLS_F', 'IF_ADD_INSD_L_IND', 'POLICY_VALUE_AMT',
       'FINANCETOOLS_G', 'FINANCETOOLS_E', 'FIRST_CANCER_AMT',
       'FINANCETOOLS_B', 'APC_1ST_AGE', 'MARRIAGE_CD',
       'LONG_TERM_CARE_AMT', 'FINANCETOOLS_D', 'A_IND', 'ANNUITY_AMT',
       'ILL_ACCELERATION_AMT', 'IF_2ND_GEN_IND', 'FINANCETOOLS_C',
       'CONTACT_CITY_CD', 'TERMINATION_RATE', 'ILL_ADDITIONAL_AMT',
       'IF_ISSUE_J_IND', 'IF_S_REAL_IND', 'B_IND', 'AG_CNT', 'LAST_C_DT',
       'IM_IS_D_IND', 'IF_ADD_INSD_R_IND', 'IF_ISSUE_INSD_P_IND',
       'IF_ADD_INSD_Q_IND', 'APC_CNT', 'IF_ADD_INSD_F_IND',
       'IF_ADD_F_IND', 'EXPIRATION_AMT', 'L1YR_LAPSE_IND',
       'IF_ISSUE_INSD_Q_IND', 'X_D_IND', 'IF_HOUSEHOLD_CLAIM_IND',
       'GENDER', 'IF_ISSUE_INSD_J_IND', 'IF_ADD_R_IND', 'X_C_IND',
       'IF_ISSUE_D_IND', 'IF_ADD_INSD_IND', 'IM_IS_C_IND', 'IM_IS_B_IND',
       'IF_ISSUE_INSD_N_IND', 'IF_ISSUE_INSD_D_IND', 'IF_ISSUE_A_IND',
       'IF_ADD_INSD_G_IND', 'LAST_B_CONTACT_DT', 'IF_Y_REAL_IND',
       'IF_ISSUE_P_IND', 'IF_ISSUE_N_IND', 'IF_ISSUE_INSD_G_IND',
       'IF_ISSUE_INSD_C_IND', 'IF_ISSUE_INSD_A_IND', 'IF_ISSUE_G_IND',
       'IF_ISSUE_C_IND', 'L1YR_PAYMENT_REMINDER_IND', 'IM_IS_A_IND',
       'IF_ISSUE_INSD_F_IND', 'IF_ISSUE_INSD_B_IND', 'IF_ADD_G_IND',
       'LAST_B_ISSUE_DT', 'IF_ISSUE_INSD_E_IND', 'X_G_IND', 'X_F_IND',
       'X_A_IND', 'L1YR_B_ISSUE_CNT', 'IF_ISSUE_O_IND', 'IF_ISSUE_M_IND',
       'IF_ISSUE_L_IND', 'IF_ISSUE_K_IND', 'IF_ISSUE_INSD_O_IND',
       'IF_ISSUE_INSD_M_IND', 'IF_ISSUE_INSD_L_IND',
       'IF_ISSUE_INSD_K_IND', 'IF_ISSUE_INSD_H_IND', 'IF_ISSUE_H_IND',
       'IF_ISSUE_F_IND', 'IF_ISSUE_E_IND', 'IF_ISSUE_B_IND',
       'CHANNEL_B_POL_CNT']

train_X = train_X[train_X.columns.difference(exc)]
test_X = test_X[test_X.columns.difference(exc)]

train_X.isnull().sum().sort_values()
test_X.isnull().sum().sort_values()

89 Non-num Features : ['GENDER', 'AGE', 'CHARGE_CITY_CD', 'CONTACT_CITY_CD', 'LAST_A_CCONTACT_DT', 'LAST_A_ISSUE_DT', 'LAST_B_ISSUE_DT', 'APC_1ST_AGE', 'INSD_1ST_AGE', 'IF_2ND_GEN_IND', 'RFM_R', 'REBUY_TIMES_CNT', 'LIFE_CNT', 'IF_ISSUE_A_IND', 'IF_ISSUE_B_IND', 'IF_ISSUE_C_IND', 'IF_ISSUE_D_IND', 'IF_ISSUE_E_IND', 'IF_ISSUE_F_IND', 'IF_ISSUE_G_IND', 'IF_ISSUE_H_IND', 'IF_ISSUE_I_IND', 'IF_ISSUE_J_IND', 'IF_ISSUE_K_IND', 'IF_ISSUE_L_IND', 'IF_ISSUE_M_IND', 'IF_ISSUE_N_IND', 'IF_ISSUE_O_IND', 'IF_ISSUE_P_IND', 'IF_ISSUE_Q_IND', 'IF_ADD_F_IND', 'IF_ADD_L_IND', 'IF_ADD_Q_IND', 'IF_ADD_G_IND', 'IF_ADD_R_IND', 'IF_ADD_IND', 'L1YR_PAYMENT_REMINDER_IND', 'L1YR_LAPSE_IND', 'LAST_B_CONTACT_DT', 'A_IND', 'B_IND', 'C_IND', 'LAST_C_DT', 'IF_S_REAL_IND', 'IF_Y_REAL_IND', 'IM_IS_A_IND', 'IM_IS_B_IND', 'IM_IS_C_IND', 'IM_IS_D_IND', 'X_A_IND', 'X_B_IND', 'X_C_IND', 'X_D_IND', 'X_E_IND', 'X_F_IND', 'X_G_IND', 'X_H_IND', 'IF_HOUSEHOLD_CLAIM_IND', 'IF_ISSUE_INSD_A_IND', 'IF_ISSUE_INSD_B_IND', 'IF_ISSUE_IN

AGE                          0
TOOL_VISIT_1YEAR_CNT         0
REBUY_TIMES_CNT              0
LIFE_INSD_CNT                0
LIFE_CNT                     0
LAST_A_ISSUE_DT              0
LAST_A_CCONTACT_DT           0
L1YR_GROSS_PRE_AMT           0
L1YR_C_CNT                   0
L1YR_A_ISSUE_CNT             0
INSD_CNT                     0
X_B_IND                      0
IF_ISSUE_Q_IND               0
INSD_1ST_AGE                 0
IF_ADD_Q_IND                 0
IF_ADD_IND                   0
CUST_9_SEGMENTS_CD           0
CLC_CUR_NUM                  0
CHANNEL_A_POL_CNT            0
ANNUAL_PREMIUM_AMT           0
AG_NOW_CNT                   0
IF_ISSUE_I_IND               0
X_H_IND                      0
INSD_LAST_YEARDIF_CNT      171
OCCUPATION_CLASS_CD       3960
BMI                      16645
EDUCATION_CD             20562
DIEACCIDENT_AMT          27540
APC_1ST_YEARDIF          43282
LEVEL                    43305
dtype: int64

AGE                          0
TOOL_VISIT_1YEAR_CNT         0
REBUY_TIMES_CNT              0
LIFE_INSD_CNT                0
LIFE_CNT                     0
LAST_A_ISSUE_DT              0
LAST_A_CCONTACT_DT           0
L1YR_GROSS_PRE_AMT           0
L1YR_C_CNT                   0
L1YR_A_ISSUE_CNT             0
INSD_CNT                     0
X_B_IND                      0
IF_ISSUE_Q_IND               0
INSD_1ST_AGE                 0
IF_ADD_Q_IND                 0
IF_ADD_IND                   0
CUST_9_SEGMENTS_CD           0
CLC_CUR_NUM                  0
CHANNEL_A_POL_CNT            0
ANNUAL_PREMIUM_AMT           0
AG_NOW_CNT                   0
IF_ISSUE_I_IND               0
X_H_IND                      0
INSD_LAST_YEARDIF_CNT       14
OCCUPATION_CLASS_CD       4925
BMI                      23692
EDUCATION_CD             30295
DIEACCIDENT_AMT          41465
APC_1ST_YEARDIF          64214
LEVEL                    64216
dtype: int64

In [6]:
LEncoder = LabelEncoder()
MMEncoder = MinMaxScaler()
for c in train_X.columns:
    train_X[c] = train_X[c].fillna(-1)
    if train_X[c].dtype == 'object':
        train_X[c] = LEncoder.fit_transform(list(train_X[c].values))

#     if (train_X[c].dtype == int) or (train_X[c].dtype == float):
#         train_X[c] = train_X[c].fillna(-1)
    train_X[c] = MMEncoder.fit_transform(train_X[c].values.reshape(-1, 1))

# train_X[c] = train_X[c].fillna('None')
# t = pd.get_dummies(train_X)
# train_X = pd.DataFrame(t)
train_X.head()


for c in test_X.columns:
    test_X[c] = test_X[c].fillna(-1)
    if test_X[c].dtype == 'object':
        test_X[c] = LEncoder.fit_transform(list(test_X[c].values))
#     if (test_X[c].dtype == int) or (test_X[c].dtype == float):
#         test_X[c] = test_X[c].fillna(-1)
#         test_X[c] = MMEncoder.fit_transform(test_X[c].values.reshape(-1, 1))
    test_X[c] = MMEncoder.fit_transform(test_X[c].values.reshape(-1, 1))

# test_X[c] = test_X[c].fillna('None')
# t = pd.get_dummies(test_X)
# test_X = pd.DataFrame(t)
test_X.head()

Unnamed: 0,AGE,AG_NOW_CNT,ANNUAL_PREMIUM_AMT,APC_1ST_YEARDIF,BMI,CHANNEL_A_POL_CNT,CLC_CUR_NUM,CUST_9_SEGMENTS_CD,DIEACCIDENT_AMT,EDUCATION_CD,...,LAST_A_CCONTACT_DT,LAST_A_ISSUE_DT,LEVEL,LIFE_CNT,LIFE_INSD_CNT,OCCUPATION_CLASS_CD,REBUY_TIMES_CNT,TOOL_VISIT_1YEAR_CNT,X_B_IND,X_H_IND
0,0.666667,0.0,0.104821,0.542169,0.584416,0.0,0.0,0.285714,0.0,0.0,...,1.0,0.0,0.666667,0.0,0.016949,0.285714,0.75,0.0,0.0,0.0
1,0.666667,0.0,0.943052,0.542169,0.61039,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.016949,0.285714,0.75,0.0,0.0,0.0
2,0.666667,0.0,0.232142,0.518072,0.636364,0.010309,0.125,0.0,0.734078,0.4,...,1.0,1.0,1.0,0.0,0.033898,0.285714,0.75,0.0,0.0,0.0
3,0.666667,0.0,1.0,0.60241,0.597403,0.0,0.0,0.285714,0.0,0.0,...,1.0,0.0,1.0,0.0,0.016949,0.285714,0.75,0.0,0.0,0.0
4,0.666667,0.1,0.514275,0.578313,0.61039,0.010309,0.125,0.285714,0.734078,0.4,...,0.0,0.0,0.5,0.0,0.067797,0.285714,0.75,0.0,1.0,0.0


Unnamed: 0,AGE,AG_NOW_CNT,ANNUAL_PREMIUM_AMT,APC_1ST_YEARDIF,BMI,CHANNEL_A_POL_CNT,CLC_CUR_NUM,CUST_9_SEGMENTS_CD,DIEACCIDENT_AMT,EDUCATION_CD,...,LAST_A_CCONTACT_DT,LAST_A_ISSUE_DT,LEVEL,LIFE_CNT,LIFE_INSD_CNT,OCCUPATION_CLASS_CD,REBUY_TIMES_CNT,TOOL_VISIT_1YEAR_CNT,X_B_IND,X_H_IND
0,0.666667,0.111111,0.174754,0.53012,0.6625,0.04,0.111111,0.285714,0.0,0.6,...,1.0,1.0,1.0,1.0,0.071429,0.285714,0.75,0.0,0.0,0.0
1,0.666667,0.0,1.0,0.626506,0.5375,0.0,0.0,0.285714,0.0,0.0,...,0.0,0.0,0.333333,0.0,0.017857,0.285714,0.75,0.0,0.0,0.0
2,0.666667,0.222222,0.054719,0.566265,0.575,0.01,0.111111,0.285714,0.828863,0.4,...,0.0,0.0,0.5,0.0,0.017857,0.285714,0.75,0.0,0.0,0.0
3,0.666667,0.0,1.0,0.614458,0.6,0.0,0.0,0.285714,0.0,0.0,...,0.0,0.0,0.333333,0.0,0.017857,0.285714,0.75,0.0,0.0,0.0
4,0.666667,0.111111,0.234252,0.554217,0.5625,0.01,0.111111,0.285714,0.828792,0.4,...,1.0,1.0,1.0,0.0,0.017857,0.285714,0.75,0.004525,0.0,0.0


In [7]:
train_X_values = train_X.values
train_x, val_x, train_y, val_y = train_test_split(train_X_values, train_Y, test_size=0.25, random_state=37)
train_x

test_x = test_X.values
test_x

array([[0.        , 0.1       , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.33333333, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.66666667, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.66666667, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.33333333, 0.1       , 0.63966723, ..., 0.00568182, 1.        ,
        1.        ]])

array([[0.66666667, 0.11111111, 0.17475443, ..., 0.        , 0.        ,
        0.        ],
       [0.66666667, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.66666667, 0.22222222, 0.0547192 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.11111111, 0.1939195 , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.11111111, 1.        , ..., 0.        , 0.        ,
        0.        ]])

In [8]:
# use RandomizedSearchCV best result
rfc = RandomForestClassifier(n_estimators=54, min_samples_split=12, min_samples_leaf=20, max_depth=6, max_features='sqrt', class_weight={0: 1.0, 1:13.0})
rfc.fit(train_x, train_y)
rfc_predict = rfc.predict(val_x)
rfc_cv_score = cross_val_score(rfc, train_X_values, train_Y, cv=10, scoring='roc_auc')
print("=== Confusion Matrix ===")
print(confusion_matrix(val_y, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(val_y, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

RandomForestClassifier(bootstrap=True, class_weight={0: 1.0, 1: 13.0},
                       criterion='gini', max_depth=6, max_features='sqrt',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=20,
                       min_samples_split=12, min_weight_fraction_leaf=0.0,
                       n_estimators=54, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

=== Confusion Matrix ===
[[23995   506]
 [  407    92]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     24501
           1       0.15      0.18      0.17       499

    accuracy                           0.96     25000
   macro avg       0.57      0.58      0.57     25000
weighted avg       0.97      0.96      0.97     25000



=== All AUC Scores ===
[0.71491556 0.70601199 0.95465969 0.97565434 0.88079031 0.77378189
 0.80681046 0.80427041 0.77472781 0.81987194]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8211494387755103


In [23]:
len(val_y['Y1'].values)

25000

In [22]:
# # use val threshold
for test in [0.5,0.6,0.7,0.8,0.9,1.0]:
    val_y_rfc_pred = rfc.predict_proba(val_x)
    df_val_x_y = pd.DataFrame({'val_y': val_y['Y1'].values, 'val_y_rfc_pred_proba': val_y_rfc_pred[:, 1]})

    df_val_x_y['val_y_rfc_pred'] = df_val_x_y['val_y_rfc_pred_proba'].map(lambda x:1 if x>test else 0)
#     df_val_x_y.groupby('val_y').count()
#     df_val_x_y.groupby('val_y_rfc_pred').count()
    # df_val_x_y = df_val_x_y.drop('val_y_rfc_pred',axis=1)

    df_val_x_y['rfc_result'] = df_val_x_y['val_y'] - df_val_x_y['val_y_rfc_pred']
    print(f'threshold {test} - accuracy: ', len(df_val_x_y.query('rfc_result == [0]'))/len(df_val_x_y))
    print(len(df_val_x_y.query('val_y_rfc_pred == [0]')))

threshold 0.5 - accuracy:  0.96348
24402
threshold 0.6 - accuracy:  0.97852
24914
threshold 0.7 - accuracy:  0.98
24997
threshold 0.8 - accuracy:  0.98004
25000
threshold 0.9 - accuracy:  0.98004
25000
threshold 1.0 - accuracy:  0.98004
25000


In [43]:
# RFC output result
test_Y_rfc_pred = rfc.predict_proba(test_x)
# df_test['rfc_predict_proba'] = test_Y_rfc_pred[:, 1]
# df_test[['CUS_ID', 'rfc_predict_proba']]

# output = pd.concat((df_test[['CUS_ID']], df_test['rfc_predict_proba']), axis=1)
# output['predict'] = output['rfc_predict_proba'].map(lambda x:1 if x>0.5 else 0)
# print(output.shape)
# output.groupby('predict').agg('count')

# out = output.groupby('CUS_ID', as_index=False).mean()
# out = out[['CUS_ID', 'predict_proba']]
# out.columns = ['CUS_ID', 'Ypred']
# out.to_csv('output_190927_15_RFC_RandomizedSearchCV.csv')
# out

In [39]:
# XGBoost
xgbc = XGBClassifier()
xgbc.fit(train_x, train_y)

val_y_xgbc_pred = xgbc.predict_proba(val_x)
auc_score = roc_auc_score(y_true=val_y.Y1, y_score=val_y_xgbc_pred[:, 1])
acc = accuracy_score(y_true=val_y.Y1, y_pred=val_y_xgbc_pred.argmax(axis=1))
print('Validation AUC: {:.3f}, Accuracy: {:.3f}'.format(auc_score, acc))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

Validation AUC: 0.840, Accuracy: 0.980


In [42]:
crossv_xgbc = cross_val_score(xgbc, train_X_values, train_Y, cv=10, scoring='roc_auc')
crossv_xgbc
print(crossv_xgbc.mean())

array([0.73435612, 0.71100102, 0.95975408, 0.9611523 , 0.87708903,
       0.77003087, 0.81634923, 0.81075332, 0.78215102, 0.82580077])

0.8248437755102038


In [34]:
# use val threshold
len(df_val_x_y.query('val_y == [0]'))
for test in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    val_y_xgbc_pred = xgbc.predict_proba(val_x)
    df_val_x_y['val_y_xgbc_pred_proba'] = val_y_xgbc_pred[:, 1]
    df_val_x_y['val_y_xgbc_pred'] = df_val_x_y['val_y_xgbc_pred_proba'].map(lambda x:1 if x>test else 0)
    
    df_val_x_y['xgbc_result'] = df_val_x_y['val_y'] - df_val_x_y['val_y_xgbc_pred']
    print(f'threshold {test} - accuracy: ', len(df_val_x_y.query('xgbc_result == [0]'))/len(df_val_x_y))
    print(len(df_val_x_y.query('val_y_xgbc_pred == [0]')))

24501

threshold 0.1 - accuracy:  0.9618
24310
threshold 0.2 - accuracy:  0.9768
24839
threshold 0.3 - accuracy:  0.979
24954
threshold 0.4 - accuracy:  0.97948
24978
threshold 0.5 - accuracy:  0.97996
24994
threshold 0.6 - accuracy:  0.98008
24999
threshold 0.7 - accuracy:  0.98004
25000
threshold 0.8 - accuracy:  0.98004
25000
threshold 0.9 - accuracy:  0.98004
25000
threshold 1.0 - accuracy:  0.98004
25000


In [36]:
# blending
for test in [0.5,0.6,0.7,0.8,0.9,1.0]:
    blending_pred = val_y_rfc_pred*0.35 + val_y_xgbc_pred*0.65
    df_val_x_y['blending_pred_proba'] = blending_pred[:, 1]
    df_val_x_y['blending_pred'] = df_val_x_y['blending_pred_proba'].map(lambda x:1 if x>test else 0)
    
    df_val_x_y['blending_result'] = df_val_x_y['val_y'] - df_val_x_y['blending_pred']
    print(f'threshold {test} - accuracy: ', len(df_val_x_y.query('blending_result == [0]'))/len(df_val_x_y))
    print(len(df_val_x_y.query('blending_pred == [0]')))

threshold 0.5 - accuracy:  0.97972
24986
threshold 0.6 - accuracy:  0.98004
24998
threshold 0.7 - accuracy:  0.98004
25000
threshold 0.8 - accuracy:  0.98004
25000
threshold 0.9 - accuracy:  0.98004
25000
threshold 1.0 - accuracy:  0.98004
25000


In [44]:
# XGBoost output result
test_Y_xgbc_pred = xgbc.predict_proba(test_x)

# df_test['xgbc_predict_proba'] = test_Y_xgbc_pred[:, 1]
# df_test[['CUS_ID', 'xgbc_predict_proba']]

# output = pd.concat((df_test[['CUS_ID']], df_test['predict_proba']), axis=1)
# output['predict'] = output['predict_proba'].map(lambda x:1 if x>0.5 else 0)
# print(output.shape)
# output.groupby('predict').agg('count')

# out = output.groupby('CUS_ID', as_index=False).mean()
# out = out[['CUS_ID', 'predict_proba']]
# out.columns = ['CUS_ID', 'Ypred']
# out.to_csv('output_190927_15_RFC_RandomizedSearchCV.csv')
# out

In [48]:
# blending output result
blending_y_pred = (test_Y_rfc_pred*0.35 + test_Y_xgbc_pred*0.65)*1.6
df_test['blending_y_pred_proba'] = blending_y_pred[:,1]
df_test[['CUS_ID', 'blending_y_pred_proba']]

output = pd.concat((df_test[['CUS_ID']], df_test['blending_y_pred_proba']), axis=1)
output['predict'] = output['blending_y_pred_proba'].map(lambda x:1 if x>0.5 else 0)
print(output.shape)
output.groupby('predict').agg('count')

out = output.groupby('CUS_ID', as_index=False).mean()
out = out[['CUS_ID', 'blending_y_pred_proba']]
out.columns = ['CUS_ID', 'Ypred']
out.to_csv('output_190929_17_RFC_XGBC_Blending_probaMax.csv')
out

Unnamed: 0,CUS_ID,blending_y_pred_proba
0,1193,0.308957
1,1727,0.011712
2,3293,0.102825
3,4328,0.018424
4,5555,0.259773
5,7193,0.059616
6,7337,0.052061
7,8142,0.021319
8,8531,0.058383
9,8849,0.011228


(150000, 3)


Unnamed: 0_level_0,CUS_ID,blending_y_pred_proba
predict,Unnamed: 1_level_1,Unnamed: 2_level_1
0,149109,149109
1,891,891


Unnamed: 0,CUS_ID,Ypred
0,3,0.170353
1,5,0.523782
2,20,0.216437
3,48,0.162022
4,49,0.495304
5,55,0.192376
6,62,0.186401
7,68,0.281050
8,72,0.185041
9,73,0.180486
