# 분류 모델링_실습


# 1.환경준비

* 라이브러리 Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.preprocessing import  MinMaxScaler

# 2.Classification

## (1) 데이터 전처리
* 데이터 준비
* 가변수화
* 스케일링(필요하다면)
* 데이터 분할

### 1) 데이터 준비

In [2]:
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/credit_all.csv'
data = pd.read_csv(path)
data.loc[data['Payment'] == 4, 'Payment'] = 3
data.head()

Unnamed: 0,Creditability,AccountBalance,Duration,Payment,Purpose,CreditAmount,Employment,SexMarital,CurrentAddress,MostValuableAsset,Age,Apartment,CreditCount,Occupation,Dependents,Telephone,ForeignWorker
0,1,3,24,2,0,1249,2,4,2,1,28,2,1,3,1,1,1
1,1,2,9,2,0,276,3,4,4,1,22,1,1,2,1,1,1
2,1,1,18,3,2,1049,2,2,4,2,21,1,1,3,1,1,1
3,1,1,24,3,1,6419,5,2,4,4,44,3,2,4,2,2,1
4,1,3,12,2,2,1424,5,2,4,1,55,2,1,4,1,2,1


|	칼럼명	|	설명	|	 	|	값 의미	|
|	-----	|	-----	|	-----	|	-----	|
|	Creditability	|	Creditability(Target)	|	향후 신용도	|	0 : Bad, 1 : Good	|
|	AccountBalance	|	Account Balance	|	은행잔고	|	1: No account, 2 : None (No balance), 3 : Some Balance	|
|	CreditDuration	|	Duration of Credit (month)	|	신청한 대출기간(월)	|	숫자	|
|	Payment	|	Payment Status of Previous Credit	|	과거 대출 납입 상태	|	0 : 연체, 1 : 기타신용, 2 : 완납, 3 : 정상 대출상환 중 |
|	Purpose	|	Purpose	|	신청한 대출목적	|	1 : New Car , 2 : Used Car , 3 : Furniture , 4 : TV , 5 : Appliances , 6 : Repair , 8 : Vacation , 9 :Retraining , 10 : Business , 0 : Other	|
|	CreditAmount	|	Credit Amount($)	|	신청한 대출금액	|		|
|	Employment	|	Length of current employment(Month)	|	현 직업 근무 기간	|	1: Unemployed,  2: <1 Year,  3: [1, 4),  4: [4, 7),  5: Above 7	|
|	SexMarital	|	Sex & Marital Status	|	성별 & 결혼상태	|	1: Male, Divorced, 2: Male, Single , 3: Male, Married/Widowed , 4: Female	|
|	CurrentAddress	|	Duration in Current address	|	현 거주지 거주기간	|	1: <1 Year , 2: [1, 4) , 3: [4, 7) , 4: Above 7	|
|	MostValuable	|	Most valuable available asset	|	가장 가치있는 자산	|	1: None , 2: Car , 3: Life Insurance , 4: Real Estate	|
|	Age	|	Age (years)	|	나이	|		|
|	AppartmentType	|	Type of apartment	|	주거환경	|	1: free apartment, 2: Rented, 3: Owned	|
|	NoCredits	|	No of Credits at this Bank	|	현재 총 대출 건수	|	1 : one, 2 : 2 ~ 3, 3 : 4 ~ 5, 4 : 6 ~	|
|	Occupation	|	Occupation	|	직업	|	1: Unemployed, unskilled, 2: Unskilled Permanent Resident, 3: Skilled, 4: Executive	|
|	Telephone	|	Telephone	|	전화기 소유 여부	|	2: Yes , 1: No	|
|	ForeignWorker	|	Foreign Worker	|	외국인 근로자 여부	|	2: Yes , 1: No	|


In [3]:
target = 'Creditability'
x = data.drop([target], axis=1)
y = data.loc[:, target]

### 2) 데이터분할

In [4]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=.3, random_state = 20)

In [5]:
validation = x_val.copy()
validation[target] = y_val

### 3) 가변수화

In [6]:
cat_cols = ['Employment', 'CurrentAddress', 'CreditCount', 'Dependents', 'Telephone', 'AccountBalance', 'Payment', 'Purpose', 'SexMarital', 'MostValuableAsset', 'Apartment','Occupation','ForeignWorker']

x_train = pd.get_dummies(x_train, columns = cat_cols, drop_first = True)
x_val = pd.get_dummies(x_val, columns = cat_cols, drop_first = True)

### 4) Scaling

In [7]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

## (2) 모델링
* 기본코드를 작성해 놓았습니다.
* 여러분은 아래 모델들의 성능을 최적화 해 봅시다. 항목 중 하나를 시도하세요.
    * 1) GridSearchCV로 튜닝
    * 2) 특정 하이퍼파리미터 값 지정
    * 3) 알고리즘을 변경

In [8]:
# 함수들 불러오기
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

* 모델1

In [9]:
model1 = LogisticRegression()
model1.fit(x_train, y_train)
pred1 = model1.predict(x_val)

print(confusion_matrix(y_val, pred1))
print(classification_report(y_val, pred1))

[[ 37  62]
 [ 16 185]]
              precision    recall  f1-score   support

           0       0.70      0.37      0.49        99
           1       0.75      0.92      0.83       201

    accuracy                           0.74       300
   macro avg       0.72      0.65      0.66       300
weighted avg       0.73      0.74      0.71       300



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


* 모델2

In [10]:
model2 = KNeighborsClassifier()
model2.fit(x_train_s, y_train)
pred2 = model2.predict(x_val_s)

print(confusion_matrix(y_val, pred2))
print(classification_report(y_val, pred2))

[[ 25  74]
 [ 18 183]]
              precision    recall  f1-score   support

           0       0.58      0.25      0.35        99
           1       0.71      0.91      0.80       201

    accuracy                           0.69       300
   macro avg       0.65      0.58      0.58       300
weighted avg       0.67      0.69      0.65       300



* 모델3

In [11]:
model3 = RandomForestClassifier()
model3.fit(x_train, y_train)
pred3 = model3.predict(x_val)

print(confusion_matrix(y_val, pred3))
print(classification_report(y_val, pred3))

[[ 38  61]
 [ 11 190]]
              precision    recall  f1-score   support

           0       0.78      0.38      0.51        99
           1       0.76      0.95      0.84       201

    accuracy                           0.76       300
   macro avg       0.77      0.66      0.68       300
weighted avg       0.76      0.76      0.73       300



* 모델4

In [12]:
model4 = XGBClassifier()
model4.fit(x_train, y_train)
pred4 = model4.predict(x_val)

print(confusion_matrix(y_val, pred4))
print(classification_report(y_val, pred4))

[[ 45  54]
 [ 27 174]]
              precision    recall  f1-score   support

           0       0.62      0.45      0.53        99
           1       0.76      0.87      0.81       201

    accuracy                           0.73       300
   macro avg       0.69      0.66      0.67       300
weighted avg       0.72      0.73      0.72       300



In [21]:
# 1. 하이터파라미터 값의 범위 설정
params = {'n_estimators': range(10, 101, 10),
          'learning_rate': np.linspace(0.01, 0.1, 10)} # linspace: range 함수와 같은 기능. 소수점

# 2. 그리드서치 선언
model4 = GridSearchCV(XGBClassifier(), params, cv=3)
model4.fit(x_train, y_train)

print(model4.best_params_, model4.best_score_) # 끝에 _ 언더바 붙으면 값

pred4 = model4.predict(x_val)

print(confusion_matrix(y_val, pred4))
print(classification_report(y_val, pred4))

{'learning_rate': 0.030000000000000006, 'n_estimators': 10} 0.7271193279776971
[[ 34  65]
 [ 28 173]]
              precision    recall  f1-score   support

           0       0.55      0.34      0.42        99
           1       0.73      0.86      0.79       201

    accuracy                           0.69       300
   macro avg       0.64      0.60      0.61       300
weighted avg       0.67      0.69      0.67       300



## (3) 결과 저장하고 csv로 내보내기 

In [22]:
pred = [pred1, pred2, pred3, pred4]
model = ['pred_LR','pred_KNN','pred_RF','pred_XGB']
for i in range(4) :
    colname = model[i]
    validation[colname] = pred[i]

validation.head() 

Unnamed: 0,AccountBalance,Duration,Payment,Purpose,CreditAmount,Employment,SexMarital,CurrentAddress,MostValuableAsset,Age,...,CreditCount,Occupation,Dependents,Telephone,ForeignWorker,Creditability,pred_LR,pred_KNN,pred_RF,pred_XGB
890,3,10,3,3,1347,4,3,2,2,27,...,2,3,1,2,1,1,1,0,1,1
694,3,9,0,3,1337,2,3,2,3,34,...,2,4,1,2,1,0,1,1,1,1
798,3,15,2,2,2221,3,2,4,3,20,...,1,3,1,1,1,1,1,0,1,1
147,1,36,2,1,8229,3,3,2,2,26,...,1,3,2,1,1,0,0,1,1,0
858,3,24,3,2,4151,3,3,3,2,35,...,2,3,1,1,1,1,1,1,1,1


In [23]:
validation.to_csv('result3.csv', index = False)