# CatBoost 라벨인코딩 변수 선택

In [53]:
# 필요한 sklearn import 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [55]:
# Load the training data set from CSV
# (file name suggests the features are already label-encoded — TODO confirm).
loan_train = pd.read_csv('../label_loan_train.csv')

In [56]:
# Inspect the column names of the training data set.
loan_train.columns

Index(['Loan Amount', 'Funded Amount', 'Funded Amount Investor', 'Term',
       'Interest Rate', 'Grade', 'Sub Grade', 'Home Ownership',
       'Employment Duration', 'Verification Status', 'Loan Title',
       'Debit to Income', 'Delinquency - two years', 'Inquires - six months',
       'Open Account', 'Public Record', 'Revolving Balance',
       'Revolving Utilities', 'Total Accounts', 'Initial List Status',
       'Total Received Interest', 'Total Received Late Fee', 'Recoveries',
       'Collection Recovery Fee', 'Collection 12 months Medical',
       'Application Type', 'Last week Pay', 'Accounts Delinquent',
       'Total Collection Amount', 'Total Current Balance',
       'Total Revolving Credit Limit', 'Loan Status'],
      dtype='object')

In [57]:
# Number of columns in the training data set (features plus target).
loan_train.shape[1]

32

In [58]:
# Separate the target column from the predictor columns.
y = loan_train["Loan Status"]
X = loan_train.drop(columns=["Loan Status"])

In [59]:
# Use SMOTETomek to address the class imbalance of the target.
#
# When the class ratio is highly imbalanced, a model that simply predicts the
# majority class reaches a high accuracy, which makes model quality hard to
# judge: accuracy can be high while the recall of the minority class drops
# sharply.
#
# NOTE(review): the resampler is fitted on the full X / y here, before any
# train/test split, so the hold-out set created afterwards is drawn from the
# resampled rows as well — confirm this is intended.

from imblearn.combine import SMOTETomek
from collections import Counter

# NOTE: the name `os` is kept as-is; it would shadow the stdlib `os` module
# if that module were imported in this session.
os = SMOTETomek(random_state=20, sampling_strategy='auto')
X_train_ns, y_train_ns = os.fit_resample(X, y)

# Class frequencies before and after resampling.
class_counts_before = Counter(y)
class_counts_after = Counter(y_train_ns)
print("The number of classes before fit {}".format(class_counts_before))
print("The number of classes after fit {}".format(class_counts_after))

The number of classes before fit Counter({0: 61222, 1: 1625})
The number of classes after fit Counter({0: 61196, 1: 61196})


In [60]:
# Split the resampled data into a training part (75%) and a hold-out part
# (25%), stratified on the target so both parts keep the same class ratio.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_train_ns,
    y_train_ns,
    test_size=0.25,
    random_state=20,
    stratify=y_train_ns,
)

# Report the class ratios of the two splits. The original inspected
# `y_train_ns` (the unsplit target) instead of `y_train`, and the bare
# expression was discarded because it was not the last line of the cell;
# printing makes the check visible.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

# Number of feature columns going into feature selection.
len(X_train.columns)

31

# SelectFromModel 사용

In [61]:

from sklearn.feature_selection import SelectFromModel
from catboost import CatBoostClassifier

# Fit a CatBoost model inside SelectFromModel to decide which features to keep.
# The upper bound on selected features is taken from the data instead of the
# hard-coded 31, so the cell keeps working if the column set changes.
f_select = SelectFromModel(
    CatBoostClassifier(random_state=20),
    max_features=X_train.shape[1],
)
f_select.fit(X_train, y_train)

# Empty frame; the attribute names and the selection mask are filled in by the
# following cell.
f_scores = pd.DataFrame()

Learning rate set to 0.070967
0:	learn: 0.6321651	total: 52.1ms	remaining: 52s
1:	learn: 0.5905975	total: 103ms	remaining: 51.4s
2:	learn: 0.5563901	total: 169ms	remaining: 56.2s
3:	learn: 0.5296140	total: 224ms	remaining: 55.8s
4:	learn: 0.5101872	total: 273ms	remaining: 54.4s
5:	learn: 0.4894777	total: 337ms	remaining: 55.8s
6:	learn: 0.4758141	total: 386ms	remaining: 54.8s
7:	learn: 0.4612460	total: 449ms	remaining: 55.7s
8:	learn: 0.4489714	total: 501ms	remaining: 55.1s
9:	learn: 0.4394454	total: 556ms	remaining: 55.1s
10:	learn: 0.4289515	total: 615ms	remaining: 55.3s
11:	learn: 0.4188955	total: 674ms	remaining: 55.5s
12:	learn: 0.4080984	total: 735ms	remaining: 55.8s
13:	learn: 0.4017891	total: 790ms	remaining: 55.7s
14:	learn: 0.3929168	total: 833ms	remaining: 54.7s
15:	learn: 0.3863325	total: 899ms	remaining: 55.3s
16:	learn: 0.3784585	total: 953ms	remaining: 55.1s
17:	learn: 0.3739917	total: 1.01s	remaining: 55.3s
18:	learn: 0.3688128	total: 1.06s	remaining: 55s
19:	learn: 0.3

In [62]:
# Pair every training column with SelectFromModel's boolean selection mask.
f_scores['attribute'] = X_train.columns
f_scores['support'] = f_select.get_support()

# 'support' is already boolean, so use it directly as the row mask instead of
# comparing it to True.
from_model = f_scores.loc[f_scores['support'], 'attribute'].values
from_model

array(['Funded Amount Investor', 'Employment Duration',
       'Delinquency - two years', 'Inquires - six months', 'Open Account',
       'Revolving Balance', 'Total Received Interest',
       'Total Received Late Fee', 'Total Collection Amount',
       'Total Current Balance', 'Total Revolving Credit Limit'],
      dtype=object)

# SelectKBest 사용

In [63]:
# Hard-coded list of feature names for the SelectKBest step.
# NOTE(review): no SelectKBest call is visible here — presumably this list was
# copied from a separate run; confirm it matches the current data.
kbest = [
    'Delinquency - two years',
    'Open Account',
    'Total Received Interest',
    'Home Ownership',
    'Loan Title',
    'Revolving Balance',
    'Verification Status',
    'Initial List Status',
    'Total Collection Amount',
    'Total Received Late Fee',
    'Funded Amount Investor',
    'Employment Duration',
    'Total Revolving Credit Limit',
    'Inquires - six months',
    'Term',
    'Recoveries',
    'Total Current Balance',
    'Public Record',
    'Grade',
    'Funded Amount',
    'Last week Pay',
    'Debit to Income',
    'Interest Rate',
    'Collection Recovery Fee',
    'Sub Grade',
    'Total Accounts',
    'Revolving Utilities',
]

In [64]:
# Union of the two feature selections.
# dict.fromkeys de-duplicates while keeping first-seen order (kbest first, then
# any extra names from from_model). list(set(...)) produced an order that can
# change between interpreter sessions because string hashing is randomized,
# which made the column order of the model input non-reproducible.
select = list(dict.fromkeys([*kbest, *from_model]))
len(select)

27

In [65]:
# Restrict the resampled features to the selected columns; the resampled
# target is reused unchanged.
X, y = X_train_ns[select], y_train_ns

In [66]:
# Split the selected-feature data into a training part (75%) and a hold-out
# part (25%), stratified on the target.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=20,
    stratify=y,
)

# Class ratios of the two splits. The original reported `y_train_ns` (the
# unsplit target) in place of `y_train`.
y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

(0    0.5
 1    0.5
 Name: Loan Status, dtype: float64,
 0    0.5
 1    0.5
 Name: Loan Status, dtype: float64)

# CatBoost 예측 및 시행

In [67]:
# CatBoost classifier with a fixed random seed; all other settings are left at
# the library defaults.
cb_clf = CatBoostClassifier(random_state= 20)

# Fit on the training split
cb_clf.fit(X_train,y_train)

Learning rate set to 0.070967
0:	learn: 0.6319527	total: 61.5ms	remaining: 1m 1s
1:	learn: 0.5864606	total: 107ms	remaining: 53.4s
2:	learn: 0.5520363	total: 163ms	remaining: 54.3s
3:	learn: 0.5276642	total: 221ms	remaining: 55s
4:	learn: 0.5025588	total: 271ms	remaining: 53.9s
5:	learn: 0.4859937	total: 326ms	remaining: 54s
6:	learn: 0.4703709	total: 370ms	remaining: 52.4s
7:	learn: 0.4584998	total: 433ms	remaining: 53.7s
8:	learn: 0.4437031	total: 498ms	remaining: 54.8s
9:	learn: 0.4329557	total: 547ms	remaining: 54.2s
10:	learn: 0.4214573	total: 622ms	remaining: 55.9s
11:	learn: 0.4125328	total: 676ms	remaining: 55.6s
12:	learn: 0.4049366	total: 742ms	remaining: 56.3s
13:	learn: 0.3969762	total: 791ms	remaining: 55.7s
14:	learn: 0.3895906	total: 851ms	remaining: 55.9s
15:	learn: 0.3829530	total: 909ms	remaining: 55.9s
16:	learn: 0.3785873	total: 967ms	remaining: 55.9s
17:	learn: 0.3742516	total: 1.02s	remaining: 55.8s
18:	learn: 0.3684944	total: 1.08s	remaining: 55.8s
19:	learn: 0.3

<catboost.core.CatBoostClassifier at 0x282a53718e0>

In [68]:
from getscore import get_eval_score


In [69]:
# Score the fitted model on the training split: hard predictions plus the
# probability of the second class column (index 1).
train_pred = cb_clf.predict(X_train)
train_proba = cb_clf.predict_proba(X_train)[:, 1]
get_eval_score(y_train, train_pred, train_proba)

# Same evaluation on the hold-out split.
# NOTE(review): X_test is carved out of the SMOTETomek-resampled data, so these
# numbers are not measured on untouched original rows — confirm this is intended.
test_pred = cb_clf.predict(X_test)
test_proba = cb_clf.predict_proba(X_test)[:, 1]
get_eval_score(y_test, test_pred, test_proba)

----------------------------------------------------------------------------------------------------
오차행렬:
[[43877  2020]
 [  239 45658]]
정확도: 0.9753905 정밀도: 0.9576325 재현율: 0.9947927 F1:0.9758589 AUC: 0.9972366
----------------------------------------------------------------------------------------------------
오차행렬:
[[14220  1079]
 [  186 15113]]
정확도: 0.9586574 정밀도: 0.9333622 재현율: 0.9878423 F1:0.9598298 AUC: 0.9920447
