In [1]:
# 경고(Warning)가 출력되지 않도록 설정.
import warnings
warnings.filterwarnings(action='ignore')

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data/data-v01.csv')
data.shape

(150000, 11)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   seriousdlqin2yrs                      150000 non-null  int64  
 1   revolvingutilizationofunsecuredlines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   numberoftime30-59dayspastduenotworse  150000 non-null  int64  
 4   debtratio                             150000 non-null  float64
 5   monthlyincome                         150000 non-null  float64
 6   numberofopencreditlinesandloans       150000 non-null  int64  
 7   numberoftimes90dayslate               150000 non-null  int64  
 8   numberrealestateloansorlines          150000 non-null  int64  
 9   numberoftime60-89dayspastduenotworse  150000 non-null  int64  
 10  numberofdependents                    150000 non-null  float64
dtype

In [4]:
data.head()

Unnamed: 0,seriousdlqin2yrs,revolvingutilizationofunsecuredlines,age,numberoftime30-59dayspastduenotworse,debtratio,monthlyincome,numberofopencreditlinesandloans,numberoftimes90dayslate,numberrealestateloansorlines,numberoftime60-89dayspastduenotworse,numberofdependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,12645.0,7,0,1,0,0.0


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score

import pickle

In [6]:
# X, y 분리
y = data.seriousdlqin2yrs
X = data.drop('seriousdlqin2yrs', axis='columns')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [9]:
y_train.shape,y_test.shape

((112500,), (37500,))

In [10]:
# y의 label별 비율
np.unique(y_train, return_counts=True)[1]/y_train.size

array([0.93315556, 0.06684444])

## Feature Scaler생성

In [11]:
scaler = StandardScaler()
# scaler = MinMaxScaler()

## Base-line 모델 정의

In [12]:
# knn. losgistic은 scaler 적용
knn = make_pipeline(scaler, KNeighborsClassifier())
lr = make_pipeline(scaler, LogisticRegression(max_iter=2000, random_state=0))
rf = RandomForestClassifier(random_state=0)
grb = GradientBoostingClassifier(random_state=0)
xgb = XGBClassifier(random_state=0)

In [13]:
knn.fit(X_train, y_train)
lr.fit(X_train, y_train)
xgb.fit(X_train, y_train)
grb.fit(X_train, y_train)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [14]:
base_line = [knn, lr, xgb, grb, rf]
model_names = ['KNN', 'LogisticRegression', 'XGBoost', 'GradientBoosting', 'RandomForest']

In [15]:
for model, name in zip(base_line, model_names):

    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    
    pred_train_proba = model.predict_proba(X_train)
    pred_test_proba = model.predict_proba(X_test)
    
    acc_train = np.round(accuracy_score(y_train,pred_train),3)
    acc_test = np.round(accuracy_score(y_test, pred_test), 3)
    
    auc_train = np.round(roc_auc_score(y_train, pred_train_proba[:, 1]), 3)
    auc_test = np.round(roc_auc_score(y_test, pred_test_proba[:, 1]), 3)
    
    print(f'{name}')
    print(f'train정확도:{acc_train}, Test정확도:{acc_test}\t train AUC:{auc_train}, Test AUC:{auc_test}')
    print('='*50)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN
train정확도:0.942, Test정확도:0.933	 train AUC:0.951, Test AUC:0.694
LogisticRegression
train정확도:0.934, Test정확도:0.934	 train AUC:0.804, Test AUC:0.803
XGBoost
train정확도:0.949, Test정확도:0.936	 train AUC:0.917, Test AUC:0.861
GradientBoosting
train정확도:0.939, Test정확도:0.936	 train AUC:0.868, Test AUC:0.866
RandomForest
train정확도:0.999, Test정확도:0.935	 train AUC:1.0, Test AUC:0.842


# GridSearchCV를 이용한 하이퍼파라미터 튜닝

### XGBoost

In [16]:
param = {
    'learning_rate':[0.01,0.1,0.5,1],
    'n_estimators':[100,200,300,400,500],
    'max_depth':range(1,6),
    'subsample':[0.6,0.7,0.8,0.9,1],
}

In [17]:
rs_xgb = RandomizedSearchCV(XGBClassifier(random_state=0), 
                            param, 
                            n_iter=60, 
                            scoring='roc_auc',
                            cv=5,
                            n_jobs=-1 )

In [18]:
rs_xgb.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, gpu_id=None,
                                           grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate...
                                           max_delta_step=None, max_depth=None,
               

In [19]:
rs_xgb.best_params_

{'subsample': 0.8, 'n_estimators': 400, 'max_depth': 2, 'learning_rate': 0.1}

In [20]:
rs_df = pd.DataFrame(rs_xgb.cv_results_)
rs_df.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,21.746735,0.830167,0.122645,0.013457,0.8,400,2,0.1,"{'subsample': 0.8, 'n_estimators': 400, 'max_d...",0.861043,0.861962,0.86441,0.862929,0.869406,0.86395,0.002947,1
18,27.415235,0.553589,0.132146,0.004968,0.6,500,2,0.1,"{'subsample': 0.6, 'n_estimators': 500, 'max_d...",0.8612,0.861852,0.864695,0.862592,0.869279,0.863923,0.002924,2
37,25.698649,0.792798,0.148192,0.016542,0.9,300,3,0.1,"{'subsample': 0.9, 'n_estimators': 300, 'max_d...",0.860806,0.862249,0.864745,0.862478,0.868887,0.863833,0.002824,3
2,14.074668,0.202105,0.088636,0.003208,1.0,300,2,0.1,"{'subsample': 1, 'n_estimators': 300, 'max_dep...",0.86083,0.861043,0.863726,0.862772,0.869237,0.863522,0.003055,4
23,30.928662,0.576916,0.161731,0.004889,0.9,400,3,0.1,"{'subsample': 0.9, 'n_estimators': 400, 'max_d...",0.860171,0.861566,0.864486,0.861742,0.868454,0.863284,0.002939,5


In [21]:
# 상세하게 찾기
param = {
    "subsample":[0.6,0.7,0.8,0.9,1], 
    "max_depth":[2,3,4]
}
gs_xgb = GridSearchCV(XGBClassifier(n_estimators=400, learning_rate=0.1, random_state=0), 
                      param, 
                      scoring='roc_auc', 
                      cv=5,
                      n_jobs=-1 )

gs_xgb.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_bin=None,
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_

In [22]:
gs_xgb.best_params_

{'max_depth': 2, 'subsample': 0.6}

In [23]:
gs_xgb.best_score_

0.8639777510812596

In [24]:
gs_df = pd.DataFrame(gs_xgb.cv_results_)
gs_df.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,24.86754,0.940949,0.133453,0.01984,2,0.6,"{'max_depth': 2, 'subsample': 0.6}",0.861307,0.861856,0.864509,0.862717,0.869499,0.863978,0.002966,1
2,24.247563,0.677032,0.122826,0.011768,2,0.8,"{'max_depth': 2, 'subsample': 0.8}",0.861043,0.861962,0.86441,0.862929,0.869406,0.86395,0.002947,2
3,25.049569,0.927565,0.143926,0.014422,2,0.9,"{'max_depth': 2, 'subsample': 0.9}",0.861912,0.861382,0.864254,0.862435,0.869265,0.86385,0.002875,3
1,24.604692,0.915433,0.128743,0.0069,2,0.7,"{'max_depth': 2, 'subsample': 0.7}",0.861375,0.861462,0.864095,0.862721,0.869425,0.863816,0.002974,4
4,22.311286,1.122252,0.121923,0.006668,2,1.0,"{'max_depth': 2, 'subsample': 1}",0.861013,0.861343,0.863914,0.86292,0.869556,0.863749,0.003089,5


In [25]:
best_model_xgb = gs_xgb.best_estimator_

In [21]:
# 모델저장
import os
save_dir = 'saved_model'
os.makedirs(save_dir, exist_ok=True) #디렉토리 만들기(없으면 만들고 있으면 안 만든다.)


xgb_file_path = os.path.join(save_dir, 'xgb_best.pkl')
best_model_xgb.save_model(xgb_file_path) #xgb객체.save_model(경로) => 모델을 파일로 저장.

In [22]:
# Load Model
saved_xgb = XGBClassifier()
saved_xgb.load_model(xgb_file_path)

### GradientBoosting

In [26]:
param_gb = {
    'learning_rate':[0.001, 0.01,0.1,0.5,1,10],
    'n_estimators':[100,200,300,400,500],
    'max_depth':range(1,6),
    'subsample':[0.6,0.7,0.8,0.9,1],
}
rs_gb = RandomizedSearchCV(GradientBoostingClassifier(random_state=0), 
                           param_distributions=param_gb, 
                           n_iter=60, 
                           cv=5, 
                           scoring='roc_auc', 
                           n_jobs=-1)

In [27]:
rs_gb.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=0),
                   n_iter=60, n_jobs=-1,
                   param_distributions={'learning_rate': [0.001, 0.01, 0.1, 0.5,
                                                          1, 10],
                                        'max_depth': range(1, 6),
                                        'n_estimators': [100, 200, 300, 400,
                                                         500],
                                        'subsample': [0.6, 0.7, 0.8, 0.9, 1]},
                   scoring='roc_auc')

In [28]:
rs_gb.best_params_

{'subsample': 0.8, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}

In [29]:
rs_gb.best_score_

0.8628826716740372

In [30]:
rs_df2 = pd.DataFrame(rs_gb.cv_results_)
rs_df2.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,35.538048,1.267787,0.063381,0.002326,0.8,100,5,0.1,"{'subsample': 0.8, 'n_estimators': 100, 'max_d...",0.860344,0.862704,0.863764,0.860199,0.867403,0.862883,0.002642,1
33,18.655808,0.236207,0.04933,0.002141,0.6,100,3,0.1,"{'subsample': 0.6, 'n_estimators': 100, 'max_d...",0.859014,0.861523,0.86217,0.860825,0.868612,0.862429,0.003267,2
28,147.195277,1.700289,0.250741,0.008951,0.8,500,4,0.01,"{'subsample': 0.8, 'n_estimators': 500, 'max_d...",0.857962,0.861562,0.8626,0.860257,0.869364,0.862349,0.003835,3
51,176.314855,2.802965,0.247478,0.00923,1.0,500,4,0.01,"{'subsample': 1, 'n_estimators': 500, 'max_dep...",0.857775,0.860747,0.861882,0.86016,0.86949,0.862011,0.003973,4
49,31.857775,1.399209,0.059154,0.004438,0.8,200,2,0.5,"{'subsample': 0.8, 'n_estimators': 200, 'max_d...",0.858951,0.861123,0.862291,0.860526,0.864862,0.861551,0.001975,5


In [31]:
param={
    "subsample":[0.6, 0.7, 0.8], 
    "n_estimators":[300,400,500],
    "max_depth":[2,3,4]
}
gs_gb = GridSearchCV(GradientBoostingClassifier(learning_rate=0.1, random_state=0),
                    param_grid=param, 
                    scoring='roc_auc',
                    cv=5,
                    n_jobs=-1)

In [62]:
gs_gb.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=0),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4],
                         'n_estimators': [300, 400, 500],
                         'subsample': [0.6, 0.7, 0.8]},
             scoring='roc_auc')

In [63]:
gs_gb.best_params_

{'max_depth': 3, 'n_estimators': 400, 'subsample': 0.8}

In [64]:
gs_gb.best_score_

0.8651901976338401

In [65]:
gs_df2 = pd.DataFrame(gs_gb.cv_results_)
gs_df2.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
14,94.270927,0.613004,0.218747,1.1e-05,3,400,0.8,"{'max_depth': 3, 'n_estimators': 400, 'subsamp...",0.874113,0.866089,0.863399,0.860763,0.861586,0.86519,0.004821,1
17,118.147506,0.748996,0.265629,0.009872,3,500,0.8,"{'max_depth': 3, 'n_estimators': 500, 'subsamp...",0.874049,0.865394,0.863408,0.860701,0.861606,0.865032,0.004787,2
11,76.431602,1.126797,0.177856,0.008967,3,300,0.8,"{'max_depth': 3, 'n_estimators': 300, 'subsamp...",0.874041,0.866068,0.86322,0.860193,0.861567,0.865018,0.004918,3
16,105.908822,0.339065,0.262501,0.006252,3,500,0.7,"{'max_depth': 3, 'n_estimators': 500, 'subsamp...",0.874683,0.865589,0.863736,0.860127,0.860571,0.864941,0.005273,4
13,85.046958,0.418646,0.212501,0.007652,3,400,0.7,"{'max_depth': 3, 'n_estimators': 400, 'subsamp...",0.874596,0.865694,0.863766,0.860195,0.860377,0.864925,0.005263,5


In [74]:
best_model_gb = gs_gb.best_estimator_

In [95]:
# 모델 저장
gb_file_path = os.path.join(save_dir, 'gradient_boosting_best.pkl')
with open(gb_file_path, 'wb') as fw:
    pickle.dump(best_model_gb, fw) 

In [24]:
gb_file_path = os.path.join(save_dir, 'gradient_boosting_best.pkl')
with open(gb_file_path, 'rb') as fr:
    saved_gb = pickle.load(fr) 

### RandomForest

In [56]:
param_rf = {
    'n_estimators':[100,200,300,400,500],
    'max_depth':range(1,5),
    'max_features':range(5,11)
}
rs_rf = RandomizedSearchCV(RandomForestClassifier(random_state=0), 
                                 param_distributions=param_rf, 
                                 n_iter=60, 
                                 cv=5, 
                                 scoring='roc_auc', 
                                 n_jobs=-1)

In [57]:
rs_rf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0),
                   n_iter=60, n_jobs=-1,
                   param_distributions={'max_depth': range(1, 5),
                                        'max_features': range(5, 11),
                                        'n_estimators': [100, 200, 300, 400,
                                                         500]},
                   scoring='roc_auc')

In [58]:
rs_rf.best_params_

{'n_estimators': 500, 'max_features': 5, 'max_depth': 4}

In [59]:
rs_rf.best_score_

0.8574212595493437

In [60]:
rs_df3 = pd.DataFrame(rs_rf.cv_results_)
rs_df3.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,73.531847,2.457526,1.36875,0.021194,500,5,4,"{'n_estimators': 500, 'max_features': 5, 'max_...",0.867495,0.857701,0.857847,0.851686,0.852378,0.857421,0.005658,1
24,40.256247,0.420611,0.824998,0.006249,300,5,4,"{'n_estimators': 300, 'max_features': 5, 'max_...",0.867615,0.857822,0.857946,0.851409,0.852292,0.857417,0.005776,2
9,80.320407,1.818594,1.550004,0.121598,500,6,4,"{'n_estimators': 500, 'max_features': 6, 'max_...",0.866967,0.856554,0.857327,0.851474,0.852273,0.856919,0.005522,3
1,30.761284,1.485742,0.549998,0.011696,200,6,4,"{'n_estimators': 200, 'max_features': 6, 'max_...",0.867609,0.856124,0.8568,0.851451,0.852574,0.856912,0.005722,4
26,61.568321,0.602533,1.09062,0.006253,400,6,4,"{'n_estimators': 400, 'max_features': 6, 'max_...",0.866947,0.856386,0.857085,0.85153,0.852166,0.856823,0.005522,5


In [79]:
param = {
    "n_estimators":[400,500,600,700], 
    "max_features":[3,4,5,6],
    "max_depth":[3,4,5,6]
}

gs_rf = GridSearchCV(RandomForestClassifier(random_state=0),
                    param_grid=param, 
                    scoring='roc_auc',
                    cv=5,
                    n_jobs=-1)

In [80]:
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6],
                         'max_features': [3, 4, 5, 6],
                         'n_estimators': [400, 500, 600, 700, 700]},
             scoring='roc_auc')

In [81]:
gs_rf.best_params_

{'max_depth': 6, 'max_features': 3, 'n_estimators': 400}

In [82]:
gs_rf.best_score_

0.861018496990928

In [83]:
gs_df3 = pd.DataFrame(gs_rf.cv_results_)
gs_df3.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
60,51.003557,0.36124,1.300001,0.011693,6,3,400,"{'max_depth': 6, 'max_features': 3, 'n_estimat...",0.871344,0.861713,0.861344,0.854574,0.856118,0.861018,0.005878,1
61,64.879531,0.439918,1.649999,0.03062,6,3,500,"{'max_depth': 6, 'max_features': 3, 'n_estimat...",0.871227,0.861697,0.861102,0.854534,0.856227,0.860958,0.005825,2
62,78.449902,0.316428,1.962504,0.015931,6,3,600,"{'max_depth': 6, 'max_features': 3, 'n_estimat...",0.871321,0.861691,0.861041,0.85455,0.856165,0.860954,0.005865,3
63,91.26952,0.806867,2.28334,0.007059,6,3,700,"{'max_depth': 6, 'max_features': 3, 'n_estimat...",0.871279,0.861667,0.861067,0.854563,0.856156,0.860946,0.005848,4
64,90.540809,0.366533,2.275007,0.015934,6,3,700,"{'max_depth': 6, 'max_features': 3, 'n_estimat...",0.871279,0.861667,0.861067,0.854563,0.856156,0.860946,0.005848,4


In [84]:
best_model_rf = gs_rf.best_estimator_

In [85]:
rf_file_path = os.path.join(save_dir, "random_forest_best.pkl")
with open(rf_file_path, 'wb') as fo:
    pickle.dump(best_model_rf, fo)

In [25]:
rf_file_path = os.path.join(save_dir, "random_forest_best.pkl")
with open(rf_file_path, 'rb') as fo:
    saved_rf = pickle.load(fo)


## VotingClassifier
- best model들 사용

In [106]:
from sklearn.ensemble import VotingClassifier
estimators = [('xgb', saved_xgb), ('gradient boost', saved_gb), ('random forest', saved_rf)]
voting_clf = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)

In [107]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1,
                                            enable_categorical=False, gamma=0,
                                            gpu_id=-1, importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.1, max_delta_step=0,
                                            max_depth=2, min_child_weight=1,
                                            missing=nan,
                                            monotone_constraints='()',
                                            n_estimators=40...
                                            predictor='auto', random_state=0,
                                     

In [109]:
pred_proba  = voting_clf.predict_proba(X_test)
auc_score = roc_auc_score(y_test, pred_proba[:, 1])
auc_score

0.8618094979187302

- xgboost : 0.8657477556440014
- grandient boosting : 0.8651901976338401
- RandomForest: 0.861018496990928
- voting: 0.8618094979187302

## Test Set 으로 검증

In [26]:
def test(estimator, X, y):
    pred_prob = estimator.predict_proba(X)
    return roc_auc_score(y, pred_prob[:, 1])

In [27]:
# XGB
test(saved_xgb, X_test, y_test)

0.870387089730717

In [28]:
# Gradient Boosting
test(saved_gb, X_test, y_test)

0.8734470316904399

### Test set 최종 검증결과
- xgboost : 0.870387089730717
- grandient boosting : 0.8734470316904399