# XGBoost 라벨인코딩 모든 변수

In [1]:
# 필요한 sklearn import 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# train data set
loan_train = pd.read_csv('../label_loan_train.csv')

In [4]:
# train data set의 컬럼 확인 
loan_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62847 entries, 0 to 62846
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Loan Amount                   62847 non-null  int64  
 1   Funded Amount                 62847 non-null  int64  
 2   Funded Amount Investor        62847 non-null  float64
 3   Term                          62847 non-null  int64  
 4   Interest Rate                 62847 non-null  float64
 5   Grade                         62847 non-null  int64  
 6   Sub Grade                     62847 non-null  int64  
 7   Home Ownership                62847 non-null  int64  
 8   Employment Duration           62847 non-null  float64
 9   Verification Status           62847 non-null  int64  
 10  Loan Title                    62847 non-null  int64  
 11  Debit to Income               62847 non-null  float64
 12  Delinquency - two years       62847 non-null  int64  
 13  I

In [5]:
# 독립변수와 종속변수 선택하기
X = loan_train.drop("Loan Status", axis = 1)
y = loan_train["Loan Status"]

In [6]:
# 타겟의 비대칭 데이터 문제를 해결하기 위하여 SMOTETomek를 사용

#데이터 클래스 비율이 너무 차이가 나면(highly-imbalanced data) 
#단순히 우세한 클래스를 택하는 모형의 정확도가 높아지므로 모형의 성능판별이 어려워진다. 
#즉, 정확도(accuracy)가 높아도 데이터 갯수가 적은 클래스의 재현율(recall-rate)이 급격히 작아지는 현상이 발생할 수 있다.

from imblearn.combine import SMOTETomek
from collections import Counter
Counter(y)

os=SMOTETomek(sampling_strategy='auto',random_state = 20)
X_train_ns,y_train_ns=os.fit_resample(X,y)
print("The number of classes before fit {}".format(Counter(y)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({0: 61222, 1: 1625})
The number of classes after fit Counter({0: 61196, 1: 61196})


In [7]:
# 데이터 셋 분할
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_ns, y_train_ns, test_size = 0.25, random_state = 20, stratify = y_train_ns)
y_train_ns.value_counts(True), y_test.value_counts(True)

(0    0.5
 1    0.5
 Name: Loan Status, dtype: float64,
 0    0.5
 1    0.5
 Name: Loan Status, dtype: float64)

## XGBoost 예측 및 시행

In [8]:
from xgboost import XGBClassifier

xgb = XGBClassifier( random_state = 20)
xgb.fit(X_train, y_train, eval_metric = "logloss",eval_set = [(X_test, y_test)],  verbose = True)

[0]	validation_0-logloss:0.58787
[1]	validation_0-logloss:0.52124
[2]	validation_0-logloss:0.47697
[3]	validation_0-logloss:0.44176
[4]	validation_0-logloss:0.41259
[5]	validation_0-logloss:0.39155
[6]	validation_0-logloss:0.37223
[7]	validation_0-logloss:0.35646
[8]	validation_0-logloss:0.34258
[9]	validation_0-logloss:0.32918
[10]	validation_0-logloss:0.32000
[11]	validation_0-logloss:0.31036
[12]	validation_0-logloss:0.30273
[13]	validation_0-logloss:0.29396
[14]	validation_0-logloss:0.28597
[15]	validation_0-logloss:0.27781
[16]	validation_0-logloss:0.27171
[17]	validation_0-logloss:0.26251
[18]	validation_0-logloss:0.25730
[19]	validation_0-logloss:0.25367
[20]	validation_0-logloss:0.24848
[21]	validation_0-logloss:0.24444
[22]	validation_0-logloss:0.24081
[23]	validation_0-logloss:0.23716
[24]	validation_0-logloss:0.23277
[25]	validation_0-logloss:0.23054
[26]	validation_0-logloss:0.22648
[27]	validation_0-logloss:0.22164
[28]	validation_0-logloss:0.21970
[29]	validation_0-loglos

In [9]:

def get_eval_score(y_test, y_pred, y_pred_proba):
    from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix,accuracy_score, precision_score, recall_score
    confusion = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    print("-"* 100)
    print("오차행렬:")
    print(confusion)
    print("정확도: {:.7f} 정밀도: {:.7f} 재현율: {:.7f} F1:{:.7f} AUC: {:.7f}".\
          format(acc, prec,recall, f1, roc_auc ))
    return


In [10]:
get_eval_score(y_train, xgb.predict(X_train), xgb.predict_proba(X_train)[:,1])
get_eval_score(y_test, xgb.predict(X_test), xgb.predict_proba(X_test)[:,1])

----------------------------------------------------------------------------------------------------
오차행렬:
[[42919  2978]
 [  447 45450]]
정확도: 0.9626882 정밀도: 0.9385066 재현율: 0.9902608 F1:0.9636894 AUC: 0.9933230
----------------------------------------------------------------------------------------------------
오차행렬:
[[13931  1368]
 [  278 15021]]
정확도: 0.9462056 정밀도: 0.9165294 재현율: 0.9818289 F1:0.9480560 AUC: 0.9863311
