# Classification : LogisticRegression


## 00 환경준비

### 1) Import

In [1]:
# Load the libraries used in this notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

### 2) data loading

In [2]:
# Read the credit data set from its remote CSV and discard the columns this
# exercise does not use, then preview the result.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/credit_all.csv'
data = pd.read_csv(path).drop(
    columns=['Purpose', 'MostValuableAsset', 'CreditCount', 'Occupation', 'Telephone']
)
data.head()

Unnamed: 0,Creditability,AccountBalance,Duration,Payment,CreditAmount,Employment,SexMarital,CurrentAddress,Age,Apartment,Dependents,ForeignWorker
0,1,3,24,2,1249,2,4,2,28,2,1,1
1,1,2,9,2,276,3,4,4,22,1,1,1
2,1,1,18,4,1049,2,2,4,21,1,1,1
3,1,1,24,4,6419,5,2,4,44,3,2,1
4,1,3,12,2,1424,5,2,4,55,2,1,1


|	칼럼명	|	설명	|		|	값 설명	|
|	----	|	----	|	----	|	----	|
|	**Creditability**	|	Creditability	|	신용도	|	0 : 낮은 신용도, 1 : 높은 신용도, **Target**	|
|	AccountBalance	|	Account Balance	|	은행잔고	|	1: No account, 2 : None (No balance), 3 : Some Balance	|
|	Duration	|	Duration of Credit (month)	|	 신청한 대출기간(월)	|	숫자	|
|	Payment	|	Payment Status of Previous Credit	|	과거 대출 납입 상태	|	0 : Delayed, 1 : Other Credits, 2 : Paid Up, 3 : No Problem with Current Credits, 4 : Previous Credits Paid	|
|	CreditAmount	|	Credit Amount($)	|	신청한 대출금액	|	숫자	|
|	Employment	|	Length of current employment (Year)	|	현 직업 근무 기간	|	1: Unemployed, 2: <1 Year, 3: [1, 4), 4: [4, 7), 5: Above 7	|
|	SexMarital	|	Sex & Marital Status	|	성별 & 결혼상태	|	1: Male, Divorced, 2: Male, Single, 3: Male, Married/Widowed, 4: Female	|
|	CurrentAddress	|	Duration in Current address	|	현 거주지 거주기간	|	1: <1 Year , 2: [1, 4), 3: [4, 7), 4: Above 7	|
|	Age	|	Age (years)	|	나이	|	숫자	|
|	Apartment	|	Type of apartment	|	주거환경	|	1: free apartment, 2: Rented, 3: Owned	|
|	Occupation	|	Occupation	|	직업	|	1: Unemployed, unskilled, 2: Unskilled Permanent Resident, 3: Skilled, 4: Executive	|
|	Telephone	|	Telephone	|	전화기 소유 여부	|	1: No, 2: Yes 	|
|	ForeignWorker	|	Foreign Worker	|	외국인 근로자 여부	|	1: No, 2: Yes	|

## 10.데이터 이해

### 1)둘러보기

In [3]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,Creditability,AccountBalance,Duration,Payment,CreditAmount,Employment,SexMarital,CurrentAddress,Age,Apartment,Dependents,ForeignWorker
0,1,3,24,2,1249,2,4,2,28,2,1,1
1,1,2,9,2,276,3,4,4,22,1,1,1
2,1,1,18,4,1049,2,2,4,21,1,1,1
3,1,1,24,4,6419,5,2,4,44,3,2,1
4,1,3,12,2,1424,5,2,4,55,2,1,1


In [4]:
# Check the dtype and non-null count of each column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Creditability   1000 non-null   int64
 1   AccountBalance  1000 non-null   int64
 2   Duration        1000 non-null   int64
 3   Payment         1000 non-null   int64
 4   CreditAmount    1000 non-null   int64
 5   Employment      1000 non-null   int64
 6   SexMarital      1000 non-null   int64
 7   CurrentAddress  1000 non-null   int64
 8   Age             1000 non-null   int64
 9   Apartment       1000 non-null   int64
 10  Dependents      1000 non-null   int64
 11  ForeignWorker   1000 non-null   int64
dtypes: int64(12)
memory usage: 93.9 KB


### 2) EDA & CDA
여기에서는 다루지 않습니다.

## 20.데이터 준비

### 1) 데이터분할 : x, y 나누기

In [5]:
# Separate the label column (y) from the predictor columns (x).
target = 'Creditability'
y = data[target]
x = data.drop(columns=[target])

### 2) 변수정리
불필요한 변수가 있으면 제거합니다.

### 3) NA 조치

In [6]:
# Count the missing values in every predictor column.
x.isnull().sum()

AccountBalance    0
Duration          0
Payment           0
CreditAmount      0
Employment        0
SexMarital        0
CurrentAddress    0
Age               0
Apartment         0
Dependents        0
ForeignWorker     0
dtype: int64

### 4) 가변수화

가변수화를 수행하시오.

In [7]:
# One-hot encode the integer-coded categorical predictors; drop_first=True
# omits the first level of each so the dummies are not redundant.
cat_cols = [
    'AccountBalance', 'Payment', 'Employment', 'SexMarital',
    'CurrentAddress', 'Apartment', 'Dependents', 'ForeignWorker',
]
x = pd.get_dummies(data=x, columns=cat_cols, drop_first=True)
x.head()

Unnamed: 0,Duration,CreditAmount,Age,AccountBalance_2,AccountBalance_3,Payment_1,Payment_2,Payment_3,Payment_4,Employment_2,...,SexMarital_2,SexMarital_3,SexMarital_4,CurrentAddress_2,CurrentAddress_3,CurrentAddress_4,Apartment_2,Apartment_3,Dependents_2,ForeignWorker_2
0,24,1249,28,0,1,0,1,0,0,1,...,0,0,1,1,0,0,1,0,0,0
1,9,276,22,1,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,18,1049,21,0,0,0,0,0,1,1,...,1,0,0,0,0,1,0,0,0,0
3,24,6419,44,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,1,1,0
4,12,1424,55,0,1,0,1,0,0,0,...,1,0,0,0,0,1,1,0,0,0


### 5) 데이터분할 : train : validation

In [8]:
# Hold out 30% of the rows for validation. Fixing random_state makes the split,
# and therefore the fitted coefficients and every metric computed from it,
# reproducible from one run to the next.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=1)

### 6) Scaling

## 30.모델링

### 1) 필요한 함수 불러오기

In [9]:
# For modeling.
from sklearn.linear_model import LogisticRegression

# For evaluation.
# NOTE(review): the wildcard import puts every public metric in the namespace;
# the cells below rely on accuracy_score, confusion_matrix,
# classification_report, precision_score, recall_score and f1_score.
from sklearn.metrics import *

### 2) 선언

In [10]:
# Declare a logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): the Scaling step above is empty, so the features reach the
# solver unscaled — confirm the fit converges within the default iteration limit.
model = LogisticRegression()

### 3) 모델링(학습)

In [11]:
# Fit the classifier on the training split.
model.fit(x_train, y_train)

LogisticRegression()

* 모델의 coefficient, intercept 확인

In [12]:
# Learned coefficients: one value per column of x_train, in column order.
model.coef_

array([[-3.73457217e-02, -4.31213950e-06,  1.88702274e-03,
         1.24026042e-01,  1.52269055e+00, -4.48729776e-01,
         1.99061278e-01, -3.88781841e-02,  7.05560688e-01,
        -3.65077101e-01,  2.24616491e-02,  6.77327950e-01,
        -2.99260826e-02, -5.05028400e-02,  3.67949818e-01,
         1.98048690e-01, -3.27024515e-01,  1.15739732e-02,
         3.52081615e-01,  4.06388389e-01, -2.60830014e-02,
         1.47959447e-02,  1.80160082e-01]])

In [13]:
# Feature names, listed so they can be matched against the coefficients.
x_train.columns.tolist()

['Duration',
 'CreditAmount',
 'Age',
 'AccountBalance_2',
 'AccountBalance_3',
 'Payment_1',
 'Payment_2',
 'Payment_3',
 'Payment_4',
 'Employment_2',
 'Employment_3',
 'Employment_4',
 'Employment_5',
 'SexMarital_2',
 'SexMarital_3',
 'SexMarital_4',
 'CurrentAddress_2',
 'CurrentAddress_3',
 'CurrentAddress_4',
 'Apartment_2',
 'Apartment_3',
 'Dependents_2',
 'ForeignWorker_2']

In [14]:
# Learned intercept (bias term) of the fitted model.
model.intercept_

array([0.29029292])

### 4) 검증 : 예측

In [15]:
# Predict class labels for the validation split and show the first five.
pred = model.predict(x_val)
pred[:5]

array([0, 1, 1, 0, 1])

In [16]:
# To keep the predictions as probabilities instead of class labels...
# Each row holds one probability per class.
pred_prob = model.predict_proba(x_val)
pred_prob

array([[0.53778096, 0.46221904],
       [0.12161976, 0.87838024],
       [0.19743578, 0.80256422],
       [0.64823043, 0.35176957],
       [0.09682389, 0.90317611],
       [0.18118737, 0.81881263],
       [0.47469524, 0.52530476],
       [0.06463323, 0.93536677],
       [0.06189442, 0.93810558],
       [0.82117553, 0.17882447],
       [0.20912946, 0.79087054],
       [0.6084938 , 0.3915062 ],
       [0.07382243, 0.92617757],
       [0.0559035 , 0.9440965 ],
       [0.38877698, 0.61122302],
       [0.75354302, 0.24645698],
       [0.15521385, 0.84478615],
       [0.61404182, 0.38595818],
       [0.15271438, 0.84728562],
       [0.03690082, 0.96309918],
       [0.13657294, 0.86342706],
       [0.07830425, 0.92169575],
       [0.06048993, 0.93951007],
       [0.18814454, 0.81185546],
       [0.28296879, 0.71703121],
       [0.1392285 , 0.8607715 ],
       [0.01967367, 0.98032633],
       [0.37780704, 0.62219296],
       [0.20530802, 0.79469198],
       [0.34412027, 0.65587973],
       [0.

### 5) 검증 : 평가

In [17]:
# Share of validation rows whose predicted label equals the true label.
accuracy_score(y_true=y_val, y_pred=pred)

0.73

## 40.분류 모델 평가

### 1) Confusion Matrix
https://en.wikipedia.org/wiki/Confusion_matrix

In [18]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_val, y_pred=pred)

array([[ 40,  63],
       [ 18, 179]])

### 2) classification_report
* 분류문제 평가지표 종합판
* print 문으로 결과를 출력해야 제대로 보임.

In [19]:
# Without print() the notebook shows the repr of the returned string, so the
# line breaks appear as literal "\n" characters.
classification_report(y_val, pred)

'              precision    recall  f1-score   support\n\n           0       0.69      0.39      0.50       103\n           1       0.74      0.91      0.82       197\n\n    accuracy                           0.73       300\n   macro avg       0.71      0.65      0.66       300\nweighted avg       0.72      0.73      0.71       300\n'

In [20]:
# Printing the report string renders it as an aligned table.
print(classification_report(y_true=y_val, y_pred=pred))

              precision    recall  f1-score   support

           0       0.69      0.39      0.50       103
           1       0.74      0.91      0.82       197

    accuracy                           0.73       300
   macro avg       0.71      0.65      0.66       300
weighted avg       0.72      0.73      0.71       300



### 3) accuracy_score, precision_score, recall_score, f1_score
* 개별 평가지표 도출
* https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html

In [21]:
# Precision with class 1 as the positive class (pos_label=1 is the default).
precision_score(y_val, pred, pos_label=1)

0.7396694214876033

In [22]:
# Precision with class 0 treated as the positive class.
precision_score(y_val, pred, pos_label=0)

0.6896551724137931

In [23]:
# Recall with class 1 as the positive class (pos_label=1 is the default).
recall_score(y_val, pred, pos_label=1)

0.9086294416243654

In [24]:
# Recall with class 0 treated as the positive class.
recall_score(y_val, pred, pos_label=0)

0.3883495145631068

In [25]:
# F1 score with class 1 as the positive class (pos_label=1 is the default).
f1_score(y_val, pred, pos_label=1)

0.815489749430524

In [26]:
# F1 score with class 0 treated as the positive class.
f1_score(y_val, pred, pos_label=0)

0.49689440993788825