# LGBM 라벨인코딩 모든 변수

In [11]:
# 필요한 sklearn import 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in this notebook — confirm that is intended.
warnings.filterwarnings('ignore')

In [13]:
# Load the training data set from CSV
loan_train = pd.read_csv(filepath_or_buffer='../label_loan_train.csv')

In [14]:
# Inspect the column names of the training data set
loan_train.columns

Index(['Loan Amount', 'Funded Amount', 'Funded Amount Investor', 'Term',
       'Interest Rate', 'Grade', 'Sub Grade', 'Home Ownership',
       'Employment Duration', 'Verification Status', 'Loan Title',
       'Debit to Income', 'Delinquency - two years', 'Inquires - six months',
       'Open Account', 'Public Record', 'Revolving Balance',
       'Revolving Utilities', 'Total Accounts', 'Initial List Status',
       'Total Received Interest', 'Total Received Late Fee', 'Recoveries',
       'Collection Recovery Fee', 'Collection 12 months Medical',
       'Application Type', 'Last week Pay', 'Accounts Delinquent',
       'Total Collection Amount', 'Total Current Balance',
       'Total Revolving Credit Limit', 'Loan Status'],
      dtype='object')

In [15]:
# Separate the target (dependent variable) from the features (independent variables)
y = loan_train.loc[:, "Loan Status"]
X = loan_train.drop(columns="Loan Status")

In [16]:
# Use SMOTETomek to address the class imbalance of the target.
#
# When the class ratio is heavily skewed (highly imbalanced data), a model that
# simply predicts the majority class scores a high accuracy, which makes model
# performance hard to judge. In other words, even with a high accuracy the
# recall of the minority class can drop sharply.
#
# NOTE(review): the resampler below is fitted on the full X / y. If a hold-out
# split is taken from the resampled data afterwards, synthetic minority samples
# end up in the evaluation set and the reported scores become optimistic —
# consider splitting first and resampling only the training part.

from imblearn.combine import SMOTETomek
from collections import Counter

# NOTE(review): the name `os` shadows the stdlib module of the same name if it
# is ever imported in this session; it is kept unchanged here so that any other
# cell referring to it keeps working.
os = SMOTETomek(sampling_strategy='auto', random_state=20)
X_train_ns, y_train_ns = os.fit_resample(X, y)

# Report the class counts before and after resampling.
print(f"The number of classes before fit {Counter(y)}")
print(f"The number of classes after fit {Counter(y_train_ns)}")

The number of classes before fit Counter({0: 61222, 1: 1625})
The number of classes after fit Counter({0: 61196, 1: 61196})


In [17]:
# Split the data set into a training part (75%) and a test part (25%),
# stratified on the labels so both parts keep the same class proportions.
# NOTE(review): the inputs are the resampled X_train_ns / y_train_ns, so the
# test part is drawn from resampled data rather than from untouched
# observations — confirm this is the intended evaluation protocol.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_train_ns,
    y_train_ns,
    test_size=0.25,
    random_state=20,
    stratify=y_train_ns,
)
# Check the class proportions of the two splits that were just produced.
y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

(0    0.5
 1    0.5
 Name: Loan Status, dtype: float64,
 0    0.5
 1    0.5
 Name: Loan Status, dtype: float64)

# LGBM 예측 및 성능지표

In [18]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

lgbm = LGBMClassifier(n_estimators=400, random_state=20)

# Early stopping and per-iteration logging are passed as callbacks: the
# `early_stopping_rounds=` and `verbose=` keyword arguments of fit() were
# deprecated and later removed from the scikit-learn API of LightGBM.
# NOTE(review): the evaluation set used for early stopping is the test split
# (X_test, y_test); if the same split is used to report final metrics, those
# metrics are tuned on the data they are measured on — consider a separate
# validation split.
lgbm.fit(
    X_train,
    y_train,
    eval_metric='logloss',
    eval_set=[(X_test, y_test)],
    callbacks=[
        early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        log_evaluation(period=1),  # print the metric at every boosting round
    ],
)

[1]	valid_0's binary_logloss: 0.63681
[2]	valid_0's binary_logloss: 0.5906
[3]	valid_0's binary_logloss: 0.551993
[4]	valid_0's binary_logloss: 0.518694
[5]	valid_0's binary_logloss: 0.491187
[6]	valid_0's binary_logloss: 0.465153
[7]	valid_0's binary_logloss: 0.444552
[8]	valid_0's binary_logloss: 0.424628
[9]	valid_0's binary_logloss: 0.40966
[10]	valid_0's binary_logloss: 0.394475
[11]	valid_0's binary_logloss: 0.380988
[12]	valid_0's binary_logloss: 0.37046
[13]	valid_0's binary_logloss: 0.357882
[14]	valid_0's binary_logloss: 0.348022
[15]	valid_0's binary_logloss: 0.338123
[16]	valid_0's binary_logloss: 0.330756
[17]	valid_0's binary_logloss: 0.324744
[18]	valid_0's binary_logloss: 0.316679
[19]	valid_0's binary_logloss: 0.310071
[20]	valid_0's binary_logloss: 0.30525
[21]	valid_0's binary_logloss: 0.297446
[22]	valid_0's binary_logloss: 0.291865
[23]	valid_0's binary_logloss: 0.288382
[24]	valid_0's binary_logloss: 0.282857
[25]	valid_0's binary_logloss: 0.277593
[26]	valid_0's 

In [19]:

def get_eval_score(y_test, y_pred, y_pred_proba):
    """Print the confusion matrix and headline classification metrics.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, compared against ``y_test``.
        y_pred_proba: Scores passed to ``roc_auc_score`` together with
            ``y_test``.

    Returns:
        None. The confusion matrix and the accuracy, precision, recall, F1
        and ROC AUC values are written to standard output.
    """
    from sklearn import metrics

    separator = "-" * 100
    summary = "정확도: {:.7f} 정밀도: {:.7f} 재현율: {:.7f} F1:{:.7f} AUC: {:.7f}"

    matrix = metrics.confusion_matrix(y_test, y_pred)
    # Order matches the placeholders of the summary line above.
    scores = (
        metrics.accuracy_score(y_test, y_pred),
        metrics.precision_score(y_test, y_pred),
        metrics.recall_score(y_test, y_pred),
        metrics.f1_score(y_test, y_pred),
        metrics.roc_auc_score(y_test, y_pred_proba),
    )

    print(separator)
    print("오차행렬:")
    print(matrix)
    print(summary.format(*scores))


In [20]:
# Report the metrics on the training split first ...
get_eval_score(
    y_test=y_train,
    y_pred=lgbm.predict(X_train),
    y_pred_proba=lgbm.predict_proba(X_train)[:, -1],  # last probability column
)
# ... and then on the test split.
get_eval_score(
    y_test=y_test,
    y_pred=lgbm.predict(X_test),
    y_pred_proba=lgbm.predict_proba(X_test)[:, -1],  # last probability column
)

----------------------------------------------------------------------------------------------------
오차행렬:
[[44539  1358]
 [  137 45760]]
정확도: 0.9837135 정밀도: 0.9711787 재현율: 0.9970151 F1:0.9839273 AUC: 0.9989899
----------------------------------------------------------------------------------------------------
오차행렬:
[[14282  1017]
 [  157 15142]]
정확도: 0.9616315 정밀도: 0.9370629 재현율: 0.9897379 F1:0.9626804 AUC: 0.9935473
