# CatBoost 라벨인코딩 모든변수

In [13]:
# 필요한 sklearn import 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [15]:
# train data set
loan_train = pd.read_csv('../label_loan_train.csv')

In [16]:
# train data set의 컬럼 확인 
loan_train.columns

Index(['Loan Amount', 'Funded Amount', 'Funded Amount Investor', 'Term',
       'Interest Rate', 'Grade', 'Sub Grade', 'Home Ownership',
       'Employment Duration', 'Verification Status', 'Loan Title',
       'Debit to Income', 'Delinquency - two years', 'Inquires - six months',
       'Open Account', 'Public Record', 'Revolving Balance',
       'Revolving Utilities', 'Total Accounts', 'Initial List Status',
       'Total Received Interest', 'Total Received Late Fee', 'Recoveries',
       'Collection Recovery Fee', 'Collection 12 months Medical',
       'Application Type', 'Last week Pay', 'Accounts Delinquent',
       'Total Collection Amount', 'Total Current Balance',
       'Total Revolving Credit Limit', 'Loan Status'],
      dtype='object')

In [17]:
# 독립변수와 종속변수 선택하기
X = loan_train.drop("Loan Status", axis = 1)
y = loan_train["Loan Status"]

In [18]:
# 타겟의 비대칭 데이터 문제를 해결하기 위하여 SMOTETomek를 사용

#데이터 클래스 비율이 너무 차이가 나면(highly-imbalanced data) 
#단순히 우세한 클래스를 택하는 모형의 정확도가 높아지므로 모형의 성능판별이 어려워진다. 
#즉, 정확도(accuracy)가 높아도 데이터 갯수가 적은 클래스의 재현율(recall-rate)이 급격히 작아지는 현상이 발생할 수 있다.

from imblearn.combine import SMOTETomek
from collections import Counter
Counter(y)

os=SMOTETomek(sampling_strategy='auto', random_state = 20)
X_train_ns,y_train_ns=os.fit_resample(X,y)
print("The number of classes before fit {}".format(Counter(y)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({0: 61222, 1: 1625})
The number of classes after fit Counter({0: 61200, 1: 61200})


In [19]:
# 데이터 셋 분할
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_ns, y_train_ns, test_size = 0.25, random_state = 20, stratify = y_train_ns)
y_train_ns.value_counts(True), y_test.value_counts(True)

(0    0.5
 1    0.5
 Name: Loan Status, dtype: float64,
 0    0.5
 1    0.5
 Name: Loan Status, dtype: float64)

## CatBoost 모델 예측 및 시행

In [20]:
import catboost

In [21]:
from catboost import CatBoostClassifier

cb = CatBoostClassifier(random_state = 20)
cb.fit(X_train, y_train)

Learning rate set to 0.070969
0:	learn: 0.6350457	total: 74.5ms	remaining: 1m 14s
1:	learn: 0.5945976	total: 137ms	remaining: 1m 8s
2:	learn: 0.5603185	total: 195ms	remaining: 1m 4s
3:	learn: 0.5324362	total: 255ms	remaining: 1m 3s
4:	learn: 0.5092285	total: 312ms	remaining: 1m 2s
5:	learn: 0.4862850	total: 361ms	remaining: 59.8s
6:	learn: 0.4716713	total: 415ms	remaining: 58.9s
7:	learn: 0.4583861	total: 465ms	remaining: 57.6s
8:	learn: 0.4463752	total: 513ms	remaining: 56.5s
9:	learn: 0.4358699	total: 575ms	remaining: 56.9s
10:	learn: 0.4259248	total: 636ms	remaining: 57.2s
11:	learn: 0.4154129	total: 698ms	remaining: 57.4s
12:	learn: 0.4079015	total: 747ms	remaining: 56.7s
13:	learn: 0.3995323	total: 806ms	remaining: 56.7s
14:	learn: 0.3924618	total: 858ms	remaining: 56.4s
15:	learn: 0.3867291	total: 915ms	remaining: 56.3s
16:	learn: 0.3810735	total: 975ms	remaining: 56.4s
17:	learn: 0.3768829	total: 1.03s	remaining: 56.3s
18:	learn: 0.3722645	total: 1.09s	remaining: 56.4s
19:	learn

<catboost.core.CatBoostClassifier at 0x1cab4a55c40>

In [22]:

def get_eval_score(y_test, y_pred, y_pred_proba):
    from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix,accuracy_score, precision_score, recall_score
    confusion = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    print("-"* 100)
    print("오차행렬:")
    print(confusion)
    print("정확도: {:.7f} 정밀도: {:.7f} 재현율: {:.7f} F1:{:.7f} AUC: {:.7f}".\
          format(acc, prec,recall, f1, roc_auc ))
    return


In [23]:
get_eval_score(y_train, cb.predict(X_train), cb.predict_proba(X_train)[:,1])
get_eval_score(y_test, cb.predict(X_test), cb.predict_proba(X_test)[:,1])

----------------------------------------------------------------------------------------------------
오차행렬:
[[43990  1910]
 [  237 45663]]
정확도: 0.9766122 정밀도: 0.9598512 재현율: 0.9948366 F1:0.9770308 AUC: 0.9974066
----------------------------------------------------------------------------------------------------
오차행렬:
[[14303   997]
 [  223 15077]]
정확도: 0.9601307 정밀도: 0.9379744 재현율: 0.9854248 F1:0.9611143 AUC: 0.9933352
