# Classification 종합실습

## 신용대출 심사

* 고객사는 ## 은행입니다. 신용평가 업무를 인공지능으로 전환하고자 여러분에게 모델링을 의뢰하였습니다.
* 대출업무는
    * 은행 창구에서 신청을 받고
    * 본사의 심사부서에서는 신용평가를 통해 대출 신청에 대한 승인 여부를 결정해 왔습니다.

* 현장의 요구
    * 경쟁사의 공격적인 대출상품 판매로, 본사에서는 자사 은행의 대출 실적이 줄어들고 있는 것에 부담을 느끼고 있습니다.
    * 그런데, 자사 은행에서는 신용평가 결과의 정확성에 의문을 품고 있으며, 신용평가 기준을 완화하여 가급적 대출승인 범위를 더 확대해 주기를 요구합니다. 

* 신용평가 업무를 인공지능으로 전환
    * 현장의 요구를 감안하여, 과거 사람이 하던 평가방식을 개선하고자 인공지능에 의한 예측 모델을 만들고, 정확도를 높이고자 합니다.
    * 최적의 모델을 생성해 봅시다.


## 1.환경준비

### (1) import

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 모델링
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import * 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

### (2) 데이터 준비

In [2]:
# Download the credit dataset and read it into a DataFrame.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/credit_all.csv'
data = pd.read_csv(filepath_or_buffer=path)

# Merge Payment code 4 into code 3 so the column only carries codes up to 3.
data.loc[data['Payment'].eq(4), 'Payment'] = 3

# Preview the first rows.
data.head()

Unnamed: 0,Creditability,AccountBalance,Duration,Payment,Purpose,CreditAmount,Employment,SexMarital,CurrentAddress,MostValuableAsset,Age,Apartment,CreditCount,Occupation,Dependents,Telephone,ForeignWorker
0,1,3,24,2,0,1249,2,4,2,1,28,2,1,3,1,1,1
1,1,2,9,2,0,276,3,4,4,1,22,1,1,2,1,1,1
2,1,1,18,3,2,1049,2,2,4,2,21,1,1,3,1,1,1
3,1,1,24,3,1,6419,5,2,4,4,44,3,2,4,2,2,1
4,1,3,12,2,2,1424,5,2,4,1,55,2,1,4,1,2,1


|	칼럼명	|	설명	|	 	|	값 의미	|
|	-----	|	-----	|	-----	|	-----	|
|	Creditability	|	Creditability(Target)	|	향후 신용도	|	0 : Bad, 1 : Good	|
|	AccountBalance	|	Account Balance	|	은행잔고	|	1: No account, 2 : None (No balance), 3 : Some Balance	|
|	Duration	|	Duration of Credit (month)	|	신청한 대출기간(월)	|	숫자	|
|	Payment	|	Payment Status of Previous Credit	|	과거 대출 납입 상태	|	0 : 연체, 1 : 기타신용, 2 : 완납, 3 : 정상 대출상환 중 |
|	Purpose	|	Purpose	|	신청한 대출목적	|	1 : New Car , 2 : Used Car , 3 : Furniture , 4 : TV , 5 : Appliances , 6 : Repair , 8 : Vacation , 9 :Retraining , 10 : Business , 0 : Other	|
|	CreditAmount	|	Credit Amount($)	|	신청한 대출금액	|		|
|	Employment	|	Length of current employment(Month)	|	현 직업 근무 기간	|	1: Unemployed,  2: <1 Year,  3: [1, 4),  4: [4, 7),  5: Above 7	|
|	SexMarital	|	Sex & Marital Status	|	성별 & 결혼상태	|	1: Male, Divorced, 2: Male, Single , 3: Male, Married/Widowed , 4: Female	|
|	CurrentAddress	|	Duration in Current address	|	현 거주지 거주기간	|	1: <1 Year , 2: [1, 4) , 3: [4, 7) , 4: Above 7	|
|	MostValuableAsset	|	Most valuable available asset	|	가장 가치있는 자산	|	1: None , 2: Car , 3: Life Insurance , 4: Real Estate	|
|	Age	|	Age (years)	|	나이	|		|
|	Apartment	|	Type of apartment	|	주거환경	|	1: free apartment, 2: Rented, 3: Owned	|
|	CreditCount	|	No of Credits at this Bank	|	현재 총 대출 건수	|	1 : one, 2 : 2 ~ 3, 3 : 4 ~ 5, 4 : 6 ~	|
|	Occupation	|	Occupation	|	직업	|	1: Unemployed, unskilled, 2: Unskilled Permanent Resident, 3: Skilled, 4: Executive	|
|	Telephone	|	Telephone	|	전화기 소유 여부	|	2: Yes , 1: No	|
|	ForeignWorker	|	Foreign Worker	|	외국인 근로자 여부	|	2: Yes , 1: No	|


## 2.데이터 준비

### (1) 데이터 정리

### (2) 데이터분할1 : x, y 나누기

In [9]:
# Separate the label column from the feature columns.
target = 'Creditability'
y = data[target]
x = data.drop(columns=target)

### (3) NA 조치

### (4) 가변수화

In [10]:
# Categorical columns to one-hot encode; the first level of each is dropped
# to serve as the reference category.
dummy_vars = [
    'Employment',
    'CurrentAddress',
    'CreditCount',
    'Dependents',
    'Telephone',
    'AccountBalance',
    'Payment',
    'Purpose',
    'SexMarital',
    'MostValuableAsset',
    'Apartment',
    'Occupation',
    'ForeignWorker',
]
x = pd.get_dummies(data=x, columns=dummy_vars, drop_first=True)
x.head()

Unnamed: 0,Duration,CreditAmount,Age,Employment_2,Employment_3,Employment_4,Employment_5,CurrentAddress_2,CurrentAddress_3,CurrentAddress_4,...,SexMarital_4,MostValuableAsset_2,MostValuableAsset_3,MostValuableAsset_4,Apartment_2,Apartment_3,Occupation_2,Occupation_3,Occupation_4,ForeignWorker_2
2,18,1049,21,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
3,24,6419,44,0,0,0,1,0,0,1,...,0,0,0,1,0,1,0,0,1,0
4,12,1424,55,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0
5,30,1715,26,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
6,10,781,63,0,0,0,1,0,0,1,...,0,0,0,1,0,1,0,1,0,0


### (5) 데이터분할2 : train : validation 나누기

In [11]:
# Hold out 30% of the rows for validation; the seed keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2022,
)

### (6) Scaling

In [12]:
# Learn the min-max ranges from the training split only, then apply the same
# transformation to both splits so no validation information leaks in.
scaler = MinMaxScaler().fit(x_train)
x_train_s = scaler.transform(x_train)
x_val_s = scaler.transform(x_val)

## 3.모델링

* 사용 알고리즘 : LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier, SVC
* 3가지의 알고리즘을 선정하고 성능을 튜닝해 봅시다.

### (1) 로지스틱 회귀

* 함수 생성

In [44]:
# 아래 함수는 로지스틱 회귀를 위한 전진선택법 함수 입니다.
import statsmodels.api as sm

def forward_stepwise_logistic(x_train, y_train):
    """Select logistic-regression features by forward stepwise AIC search.

    Starting from an empty model, each step fits one ``sm.Logit`` per
    remaining column (the already-selected columns plus that candidate) and
    keeps the candidate whose model has the lowest AIC. The search stops as
    soon as the best AIC of a step is worse than the best AIC seen so far,
    or when no candidates remain.

    Args:
        x_train: pandas DataFrame whose columns are the candidate features.
            The columns are cast to float before fitting so that mixed
            bool/int dummy columns are accepted by statsmodels.
        y_train: Target values aligned with ``x_train``.

    Returns:
        A tuple ``(selected, step_df)``. ``selected`` is the list of chosen
        column names in selection order. ``step_df`` is a DataFrame with the
        columns ``step``, ``feature`` (the full column list of each fitted
        candidate model) and ``aic``, holding every candidate of every
        accepted step, sorted by AIC within each step.

    Note:
        No intercept column is added here; each model is fitted on exactly
        the listed columns of ``x_train``.
    """
    columns = ['step', 'feature', 'aic']

    # Cast once up front instead of on every candidate fit.
    design = x_train.astype(float)
    remaining = list(design)
    selected = []
    accepted_steps = []
    # Explicit running best, so the first step needs no NaN-comparison trick.
    best_aic = float('inf')

    # The range is fixed at the initial column count; `remaining` shrinks by
    # one per accepted step, so it is never empty inside the loop.
    for step in range(1, len(remaining) + 1):
        rows = []

        # Try adding each remaining column to the current selection.
        for candidate in remaining:
            trial = selected + [candidate]
            fitted = sm.Logit(y_train, design[trial]).fit(disp=False)
            rows.append({'step': step, 'feature': trial, 'aic': fitted.aic})

        # Rank this step's candidate models by AIC (best first).
        ranked = (
            pd.DataFrame(rows, columns=columns)
            .sort_values('aic')
            .reset_index(drop=True)
        )

        # Stop when this step's best AIC is worse than the best seen so far.
        step_best = ranked['aic'].min()
        if best_aic < step_best:
            break
        # min() keeps the previous best if step_best is NaN.
        best_aic = min(best_aic, step_best)
        accepted_steps.append(ranked)

        # The newly added column is always the last entry of the trial list.
        winner = ranked.loc[0, 'feature'][-1]
        remaining.remove(winner)
        selected.append(winner)

    # Concatenate once at the end; avoids concatenating onto an empty frame.
    if not accepted_steps:
        return selected, pd.DataFrame({'step': [], 'feature': [], 'aic': []})
    step_df = pd.concat(accepted_steps, axis=0).reset_index(drop=True)

    return selected, step_df

* 전진선택법 수행

In [45]:
# Run forward stepwise selection on the unscaled training split.
# `vars` receives the selected column names and `result` the per-step AIC table.
# NOTE(review): `vars` shadows the Python builtin of the same name; later cells
# read it, so it is left as-is here.
vars, result = forward_stepwise_logistic(x_train, y_train)

* 선택된 변수

In [14]:
# Display the column names chosen by forward stepwise selection.
vars

['AccountBalance_3',
 'Apartment_2',
 'Duration',
 'Age',
 'Purpose_1',
 'Employment_4',
 'Employment_2',
 'Purpose_3',
 'Purpose_2',
 'SexMarital_2',
 'AccountBalance_2',
 'CurrentAddress_2',
 'ForeignWorker_2',
 'Payment_3',
 'CreditCount_2',
 'CreditCount_4',
 'CurrentAddress_4']

* 전체 변수로 모델링

In [21]:
# Baseline: logistic regression fitted on every feature column.
# NOTE(review): this uses the unscaled split (x_train, not x_train_s) and
# warnings are silenced file-wide, so a solver convergence warning would not
# be visible here - confirm the fit converges.
model_log1 = LogisticRegression().fit(x_train, y_train)
pred_log1 = model_log1.predict(x_val)

# Validation report: confusion matrix first, then per-class metrics.
print(confusion_matrix(y_true=y_val, y_pred=pred_log1))
print(classification_report(y_true=y_val, y_pred=pred_log1))

[[ 31  68]
 [ 14 187]]
              precision    recall  f1-score   support

           0       0.69      0.31      0.43        99
           1       0.73      0.93      0.82       201

    accuracy                           0.73       300
   macro avg       0.71      0.62      0.63       300
weighted avg       0.72      0.73      0.69       300



* 전진선택법 변수로 모델링

In [22]:
# Logistic regression restricted to the columns picked by forward selection.
model_log2 = LogisticRegression().fit(x_train[vars], y_train)
pred_log2 = model_log2.predict(x_val[vars])

# Validation report: confusion matrix first, then per-class metrics.
print(confusion_matrix(y_true=y_val, y_pred=pred_log2))
print(classification_report(y_true=y_val, y_pred=pred_log2))

[[ 44  55]
 [ 21 180]]
              precision    recall  f1-score   support

           0       0.68      0.44      0.54        99
           1       0.77      0.90      0.83       201

    accuracy                           0.75       300
   macro avg       0.72      0.67      0.68       300
weighted avg       0.74      0.75      0.73       300



### (2) 의사결정나무

In [27]:
# Decision-tree search grid: depths 1..15 and leaf sizes 1, 6, ..., 96.
params1 = {
    'max_depth': range(1, 16),
    'min_samples_leaf': range(1, 101, 5),
}

In [31]:
# Seed the tree so the grid-search result is reproducible between runs;
# 2022 matches the seed already used for the train/validation split.
model2 = DecisionTreeClassifier(random_state=2022)
# Exhaustive 5-fold cross-validated search over the `params1` grid.
model_de = GridSearchCV(model2, params1, cv=5)

In [32]:
# Run the decision-tree grid search on the unscaled training split.
model_de.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 16),
                         'min_samples_leaf': range(1, 101, 5)})

In [33]:
# Display the mean cross-validated score of the best parameter combination.
model_de.best_score_

0.7277903391572456

In [34]:
# Display the parameter combination that achieved the best score.
model_de.best_params_

{'max_depth': 3, 'min_samples_leaf': 66}

In [35]:
# Predict validation labels with the fitted grid search (unscaled features,
# matching how the tree was trained).
pred1 = model_de.predict(x_val)

In [36]:
# Validation report for the tuned decision tree.
print(confusion_matrix(y_true=y_val, y_pred=pred1))
print(classification_report(y_true=y_val, y_pred=pred1))

[[ 25  74]
 [ 21 180]]
              precision    recall  f1-score   support

           0       0.54      0.25      0.34        99
           1       0.71      0.90      0.79       201

    accuracy                           0.68       300
   macro avg       0.63      0.57      0.57       300
weighted avg       0.65      0.68      0.64       300



### (3) KNN

In [39]:
# KNN search grid: odd neighbour counts 3..29 under two distance metrics.
params2 = {
    'n_neighbors': range(3, 31, 2),
    'metric': ['euclidean', 'manhattan'],
}

In [40]:
# Exhaustive 5-fold cross-validated search over the `params2` grid for KNN.
model3 = KNeighborsClassifier()
model_knn = GridSearchCV(estimator=model3, param_grid=params2, cv=5)

In [41]:
# Run the KNN grid search on the min-max scaled training split.
model_knn.fit(x_train_s, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': range(3, 31, 2)})

In [42]:
# Show the winning KNN parameters and their mean cross-validated score,
# one per line.
print(model_knn.best_params_, model_knn.best_score_, sep='\n')

# Predict validation labels from the scaled validation features.
pred2 = model_knn.predict(x_val_s)

{'metric': 'manhattan', 'n_neighbors': 13}
0.734994861253854


In [43]:
# Validation report for the tuned KNN model.
print(confusion_matrix(y_true=y_val, y_pred=pred2))
print(classification_report(y_true=y_val, y_pred=pred2))

[[ 20  79]
 [  7 194]]
              precision    recall  f1-score   support

           0       0.74      0.20      0.32        99
           1       0.71      0.97      0.82       201

    accuracy                           0.71       300
   macro avg       0.73      0.58      0.57       300
weighted avg       0.72      0.71      0.65       300



### (4) SVM

In [54]:
# SVC search grid: two kernels and a range of C values starting at 0.1 in
# steps of 0.3.
# NOTE(review): np.arange with a non-integer step is subject to floating-point
# rounding, so the exact number of C values (whether a point near 2.2 is
# included) should be checked; np.linspace gives an explicit point count.
params4 = {'kernel' : ['linear', 'rbf'], 'C':np.arange(0.1, 2.2, 0.3)}

In [55]:
# Exhaustive 5-fold cross-validated search over the `params4` grid for SVC.
model4 = SVC()
model_SVC = GridSearchCV(estimator=model4, param_grid=params4, cv=5)

In [None]:
# Run the SVC grid search on the min-max scaled training split.
model_SVC.fit(x_train_s, y_train)

In [None]:
# Show the winning SVC parameters and their mean cross-validated score,
# one per line.
print(model_SVC.best_params_, model_SVC.best_score_, sep='\n')

# Predict validation labels from the scaled validation features.
pred3 = model_SVC.predict(x_val_s)