## KBest 모델

In [47]:
# 필요한 sklearn import 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used in this notebook - consider narrowing it to a category.
warnings.filterwarnings('ignore')

In [49]:
# Load the training data set from the CSV file one directory up.
loan_train = pd.read_csv(filepath_or_buffer='../label_loan_train.csv')

In [50]:
# Inspect the column names of the training data set.
loan_train.columns

Index(['Loan Amount', 'Funded Amount', 'Funded Amount Investor', 'Term',
       'Interest Rate', 'Grade', 'Sub Grade', 'Home Ownership',
       'Employment Duration', 'Verification Status', 'Loan Title',
       'Debit to Income', 'Delinquency - two years', 'Inquires - six months',
       'Open Account', 'Public Record', 'Revolving Balance',
       'Revolving Utilities', 'Total Accounts', 'Initial List Status',
       'Total Received Interest', 'Total Received Late Fee', 'Recoveries',
       'Collection Recovery Fee', 'Collection 12 months Medical',
       'Application Type', 'Last week Pay', 'Accounts Delinquent',
       'Total Collection Amount', 'Total Current Balance',
       'Total Revolving Credit Limit', 'Loan Status'],
      dtype='object')

In [51]:
# Target vector: the "Loan Status" column.
y = loan_train["Loan Status"]
# Feature matrix: every remaining column.
X = loan_train.drop(columns=["Loan Status"])

In [52]:
# Use SMOTETomek to address the class imbalance in the target.

# When the class ratio is heavily skewed (highly-imbalanced data), a model that
# simply predicts the majority class already scores a high accuracy, which
# makes it hard to judge model quality.
# In other words, accuracy can be high while the recall of the minority class
# drops sharply.

from imblearn.combine import SMOTETomek
from collections import Counter

# random_state=20 matches the seed used by the other seeded calls in this
# notebook, so the resampled data (and everything derived from it) is
# reproducible between runs.
# NOTE(review): the name `os` shadows the standard-library module of the same
# name; it is kept only so that any other cell referring to it keeps working.
os = SMOTETomek(sampling_strategy='auto', random_state=20)

# NOTE(review): the full X, y are resampled here, before any train/test split.
# A hold-out set carved out of the resampled data will therefore contain
# synthetic samples, which tends to make evaluation scores optimistic.
# Consider splitting first and resampling only the training portion.
X_train_ns, y_train_ns = os.fit_resample(X, y)

# Report the class counts before and after resampling.
print(f"The number of classes before fit {Counter(y)}")
print(f"The number of classes after fit {Counter(y_train_ns)}")

The number of classes before fit Counter({0: 61222, 1: 1625})
The number of classes after fit Counter({0: 61198, 1: 61198})


In [53]:
# Split the resampled data into a training set and a hold-out set (75% / 25%),
# stratified on the target so both parts keep the same class proportions.
# NOTE(review): X_train_ns / y_train_ns were resampled from the full data set
# before this split, so the hold-out set also contains synthetic samples and
# scores measured on it will likely be optimistic - confirm this is intended.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_train_ns,
    y_train_ns,
    test_size=0.25,
    random_state=20,
    stratify=y_train_ns,
)
# Class proportions of the two partitions. This compares y_train (not the
# unsplit y_train_ns) with y_test. Only the last expression of a cell is
# rendered, so this tuple is evaluated but not displayed.
y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)
# Number of feature columns in the training partition.
len(X_train.columns)

31

# SelectKBest 사용

In [54]:
# Estimate the mutual information between each feature and the target, using
# the training partition only; the seed keeps the estimate repeatable.
# NOTE(review): with the default discrete_features setting the columns may be
# treated as continuous - confirm that suits any label-encoded columns.
from sklearn.feature_selection import mutual_info_classif

mutual_info = mutual_info_classif(X=X_train, y=y_train, random_state=20)

# Label each score with its column name so the scores can be ranked by feature.
mutual_data = pd.Series(data=mutual_info, index=X_train.columns)


In [69]:
# Rank the features from the highest to the lowest mutual-information score.
feature = mutual_data.sort_values(ascending=False)
# Names of the features whose score is above 0.01 (shown as the cell output).
feature.index[feature.to_numpy() > 0.01]

Index(['Delinquency - two years', 'Open Account', 'Total Received Interest',
       'Home Ownership', 'Loan Title', 'Revolving Balance',
       'Verification Status', 'Initial List Status', 'Total Collection Amount',
       'Total Received Late Fee', 'Funded Amount Investor',
       'Employment Duration', 'Total Revolving Credit Limit',
       'Inquires - six months', 'Term', 'Recoveries', 'Total Current Balance',
       'Public Record', 'Grade', 'Funded Amount', 'Last week Pay',
       'Debit to Income', 'Interest Rate', 'Collection Recovery Fee',
       'Sub Grade', 'Total Accounts', 'Revolving Utilities'],
      dtype='object')