# LGBM 라벨인코딩 변수선택

In [69]:
# 필요한 sklearn import 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings raised by the ML libraries.
warnings.filterwarnings('ignore')

In [71]:
# Load the training data set from a CSV located one directory above the
# current working directory.
# NOTE(review): the file name suggests the categorical columns were already
# label-encoded upstream — confirm.
loan_train = pd.read_csv('../label_loan_train.csv')

In [72]:
# Inspect the column names of the training data set.
loan_train.columns

Index(['Loan Amount', 'Funded Amount', 'Funded Amount Investor', 'Term',
       'Interest Rate', 'Grade', 'Sub Grade', 'Home Ownership',
       'Employment Duration', 'Verification Status', 'Loan Title',
       'Debit to Income', 'Delinquency - two years', 'Inquires - six months',
       'Open Account', 'Public Record', 'Revolving Balance',
       'Revolving Utilities', 'Total Accounts', 'Initial List Status',
       'Total Received Interest', 'Total Received Late Fee', 'Recoveries',
       'Collection Recovery Fee', 'Collection 12 months Medical',
       'Application Type', 'Last week Pay', 'Accounts Delinquent',
       'Total Collection Amount', 'Total Current Balance',
       'Total Revolving Credit Limit', 'Loan Status'],
      dtype='object')

In [73]:
# Check the pairwise correlation between columns (currently disabled).
# plt.figure(figsize = (14, 14))
# sns.heatmap(loan_train.corr(), annot = True, annot_kws={"size": 5})
# plt.savefig('corr.png');

**object의 컬럼을 label encoding 할 필요성이 있어 보임  
null 값이 하나도 없음**

In [74]:
# Separate the target column ("Loan Status") from the predictor columns.
y = loan_train["Loan Status"]
X = loan_train.drop(columns="Loan Status")

In [75]:
# Balance the heavily skewed target with SMOTETomek.
#
# When the class ratio is highly imbalanced, a model that simply predicts the
# majority class reaches a high accuracy, which makes model quality hard to
# judge: accuracy can look good while recall on the minority class collapses.
#
# NOTE(review): the resampling below runs on the full X / y, before any
# train/test split, so every hold-out set carved from X_train_ns / y_train_ns
# contains resampled rows. Scores measured on such a split are likely
# optimistic; consider splitting first and resampling only the training part.

from collections import Counter
from imblearn.combine import SMOTETomek

# NOTE(review): the name `os` shadows the standard-library module of the same
# name; it is kept here because other cells may refer to it.
os = SMOTETomek(sampling_strategy='auto', random_state=20)
X_train_ns, y_train_ns = os.fit_resample(X, y)

# Class counts before and after resampling.
counts_before = Counter(y)
counts_after = Counter(y_train_ns)
print("The number of classes before fit {}".format(counts_before))
print("The number of classes after fit {}".format(counts_after))

The number of classes before fit Counter({0: 61222, 1: 1625})
The number of classes after fit Counter({0: 61196, 1: 61196})


In [76]:
# Split the resampled data into train / test partitions (75% / 25%),
# stratified on the target so both partitions keep the same class ratio.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_train_ns,
    y_train_ns,
    test_size=0.25,
    random_state=20,
    stratify=y_train_ns,
)

# Class proportions of the two partitions that were just created. They are
# printed explicitly because only a cell's last expression is displayed.
print(
    y_train.value_counts(normalize=True),
    y_test.value_counts(normalize=True),
    sep="\n",
)

# Number of predictor columns available for feature selection.
len(X_train.columns)

31

# SelectFromModel 사용

In [77]:
from sklearn.feature_selection import SelectFromModel

# Fit a LightGBM model and keep the features whose importance passes
# SelectFromModel's default threshold. max_features is tied to the actual
# column count, so it never caps the selection and stays valid when the
# number of columns changes (a fixed 31 is rejected once the frame has fewer
# columns than that).
f_select = SelectFromModel(
    LGBMClassifier(random_state=20),
    max_features=X_train.shape[1],
)
f_select.fit(X_train, y_train)

# Empty frame that will hold the per-column selection result.
f_scores = pd.DataFrame()

In [78]:
# Record, for every column, whether SelectFromModel kept it.
f_scores['attribute'] = X_train.columns
f_scores['support'] = f_select.get_support()

# Names of the kept columns; the boolean 'support' column is used directly
# as the row mask.
from_model = f_scores.loc[f_scores['support'], 'attribute'].values
from_model

array(['Funded Amount', 'Funded Amount Investor', 'Interest Rate',
       'Employment Duration', 'Loan Title', 'Debit to Income',
       'Open Account', 'Revolving Balance', 'Revolving Utilities',
       'Total Accounts', 'Total Received Interest',
       'Total Received Late Fee', 'Recoveries', 'Collection Recovery Fee',
       'Last week Pay', 'Total Collection Amount',
       'Total Current Balance', 'Total Revolving Credit Limit'],
      dtype=object)

# SelectKBest 사용

In [79]:
# Hard-coded list of feature names.
# NOTE(review): presumably the result of a SelectKBest run performed
# elsewhere — confirm, and record the scoring function and k that produced it.
kbest = [
    'Delinquency - two years',
    'Open Account',
    'Total Received Interest',
    'Home Ownership',
    'Loan Title',
    'Revolving Balance',
    'Verification Status',
    'Initial List Status',
    'Total Collection Amount',
    'Total Received Late Fee',
    'Funded Amount Investor',
    'Employment Duration',
    'Total Revolving Credit Limit',
    'Inquires - six months',
    'Term',
    'Recoveries',
    'Total Current Balance',
    'Public Record',
    'Grade',
    'Funded Amount',
    'Last week Pay',
    'Debit to Income',
    'Interest Rate',
    'Collection Recovery Fee',
    'Sub Grade',
    'Total Accounts',
    'Revolving Utilities',
]

In [80]:
# Union of the two feature lists. The result is sorted so the column order is
# identical on every run; plain set iteration order for strings can differ
# between interpreter sessions because of hash randomisation, which would
# make the downstream feature order non-reproducible.
select = sorted(set(kbest) | set(from_model))
len(select)

27

In [81]:
# Restrict the resampled predictors to the selected columns; the resampled
# target is reused unchanged.
X, y = X_train_ns[select], y_train_ns

In [82]:
# Train / test split (75% / 25%) of the reduced feature set, stratified on y.
# NOTE(review): X / y are derived from the resampled X_train_ns / y_train_ns,
# so the test partition also contains resampled rows.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=20,
    stratify=y,
)

# Class proportions of each partition (displayed as the cell result).
y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

(0    0.5
 1    0.5
 Name: Loan Status, dtype: float64,
 0    0.5
 1    0.5
 Name: Loan Status, dtype: float64)

# LGBM 예측 및 시행

In [83]:
# LightGBM classifier with 1000 estimators, a fixed seed and n_jobs=-1.
lgbm_clf = LGBMClassifier(
    n_estimators=1000,
    random_state=20,
    n_jobs=-1,
)

# Train on the training partition.
lgbm_clf.fit(X_train, y_train)

In [84]:
# Evaluate on the training partition first, then on the held-out partition.
# get_eval_score is defined outside this section; it receives the true
# labels, the predicted labels and column 1 of predict_proba.
train_pred = lgbm_clf.predict(X_train)
train_proba = lgbm_clf.predict_proba(X_train)[:, 1]
get_eval_score(y_train, train_pred, train_proba)

test_pred = lgbm_clf.predict(X_test)
test_proba = lgbm_clf.predict_proba(X_test)[:, 1]
get_eval_score(y_test, test_pred, test_proba)

----------------------------------------------------------------------------------------------------
오차행렬:
[[45780   117]
 [    0 45897]]
정확도: 0.9987254 정밀도: 0.9974573 재현율: 1.0000000 F1:0.9987270 AUC: 0.9999987
----------------------------------------------------------------------------------------------------
오차행렬:
[[14546   753]
 [   54 15245]]
정확도: 0.9736257 정밀도: 0.9529316 재현율: 0.9964704 F1:0.9742148 AUC: 0.9974812
