### 로지스틱 회귀
: 이름에 '회귀'가 들어 있지만, 실제로는 선형 방정식을 사용해 분류(Classification)에 대한 확률을 계산하는 모델

In [100]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [101]:
# Load the fish measurements from a CSV file into a DataFrame.
fish = pd.read_csv('../Data/fishes.csv')
# Preview the first rows.
fish.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [102]:
# List the distinct species labels present in the data.
fish['Species'].unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [103]:
# Print column names, non-null counts and dtypes of the DataFrame.
fish.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Species   159 non-null    object 
 1   Weight    159 non-null    float64
 2   Length    159 non-null    float64
 3   Diagonal  159 non-null    float64
 4   Height    159 non-null    float64
 5   Width     159 non-null    float64
dtypes: float64(5), object(1)
memory usage: 7.6+ KB


In [104]:
# Summary statistics (count, mean, std, quartiles, ...) of the DataFrame columns.
fish.describe()

Unnamed: 0,Weight,Length,Diagonal,Height,Width
count,159.0,159.0,159.0,159.0,159.0
mean,398.326415,28.415723,31.227044,8.970994,4.417486
std,357.978317,10.716328,11.610246,4.286208,1.685804
min,0.0,8.4,8.8,1.7284,1.0476
25%,120.0,21.0,23.15,5.9448,3.38565
50%,273.0,27.3,29.4,7.786,4.2485
75%,650.0,35.5,39.65,12.3659,5.5845
max,1650.0,63.4,68.0,18.957,8.142


# Feature와 Target

In [105]:
# Feature matrix: the five measurement columns converted to a NumPy array.
fish_input = fish.loc[
    :, ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
].to_numpy()
# Show the first five feature rows.
fish_input[:5]

array([[242.    ,  25.4   ,  30.    ,  11.52  ,   4.02  ],
       [290.    ,  26.3   ,  31.2   ,  12.48  ,   4.3056],
       [340.    ,  26.5   ,  31.1   ,  12.3778,   4.6961],
       [363.    ,  29.    ,  33.5   ,  12.73  ,   4.4555],
       [430.    ,  29.    ,  34.    ,  12.444 ,   5.134 ]])

In [106]:
# Target vector: the species label of every row as a NumPy array.
fish_target = fish.loc[:, 'Species'].to_numpy()
fish_target

array(['Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream',
       'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream',
       'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream',
       'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream',
       'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream',
       'Roach', 'Roach', 'Roach', 'Roach', 'Roach', 'Roach', 'Roach',
       'Roach', 'Roach', 'Roach', 'Roach', 'Roach', 'Roach', 'Roach',
       'Roach', 'Roach', 'Roach', 'Roach', 'Roach', 'Roach', 'Whitefish',
       'Whitefish', 'Whitefish', 'Whitefish', 'Whitefish', 'Whitefish',
       'Parkki', 'Parkki', 'Parkki', 'Parkki', 'Parkki', 'Parkki',
       'Parkki', 'Parkki', 'Parkki', 'Parkki', 'Parkki', 'Perch', 'Perch',
       'Perch', 'Perch', 'Perch', 'Perch', 'Perch', 'Perch', 'Perch',
       'Perch', 'Perch', 'Perch', 'Perch', 'Perch', 'Perch', 'Perch',
       'Perch', 'Perch', 'Perch', 'Perch', 'Perch', 'Perch', 'Perch',
       'Perc

# Train, Test

In [107]:
from sklearn.model_selection import train_test_split

# Split features and labels into train and test parts; the fixed seed keeps
# the split reproducible between runs.
(train_input, test_input,
 train_target, test_target) = train_test_split(
    fish_input, fish_target, random_state=42
)

# 회귀분석을 위한 표준화 작업

In [108]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training features
# only; labels are not passed because StandardScaler.fit ignores `y`. The
# statistics learned from the training set are then reused for the test set
# so that both sets share the same scale.
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

In [109]:
# Show the first five standardized rows of the training set, then of the test set.
print(train_scaled[:5], test_scaled[:5], sep='\n')

[[ 0.91965782  0.60943175  0.81041221  1.85194896  1.00075672]
 [ 0.30041219  1.54653445  1.45316551 -0.46981663  0.27291745]
 [-1.0858536  -1.68646987 -1.70848587 -1.70159849 -2.0044758 ]
 [-0.79734143 -0.60880176 -0.67486907 -0.82480589 -0.27631471]
 [-0.71289885 -0.73062511 -0.70092664 -0.0802298  -0.7033869 ]]
[[-0.88741352 -0.91804565 -1.03098914 -0.90464451 -0.80762518]
 [-1.06924656 -1.50842035 -1.54345461 -1.58849582 -1.93803151]
 [-0.54401367  0.35641402  0.30663259 -0.8135697  -0.65388895]
 [-0.34698097 -0.23396068 -0.22320459 -0.11905019 -0.12233464]
 [-0.68475132 -0.51509149 -0.58801052 -0.8998784  -0.50124996]]


### KNN을 이용한 확률 예측

In [110]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 3 closest training samples,
# fitted on the standardized training data.
kn = KNeighborsClassifier(n_neighbors=3).fit(train_scaled, train_target)

# Report the score on the training and on the test split.
print("Train Score:", kn.score(train_scaled, train_target))
print("Test Score:", kn.score(test_scaled, test_target))

Train Score: 0.8907563025210085
Test Score: 0.85


In [111]:
# Predict the species of the first five test samples.
kn.predict(test_scaled[:5])

array(['Perch', 'Smelt', 'Pike', 'Perch', 'Perch'], dtype=object)

In [112]:
# Per-class probabilities for the first five test samples.
kn.predict_proba(test_scaled[:5])

array([[0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.66666667, 0.        , 0.33333333,
        0.        , 0.        ],
       [0.        , 0.        , 0.66666667, 0.        , 0.33333333,
        0.        , 0.        ]])

In [113]:
# Show the predicted probabilities of the first five test samples as percentages.
proba = kn.predict_proba(test_scaled[:5])
print((proba * 100).round(decimals=4))  # rounded to 4 decimal places

[[  0.       0.     100.       0.       0.       0.       0.    ]
 [  0.       0.       0.       0.       0.     100.       0.    ]
 [  0.       0.       0.     100.       0.       0.       0.    ]
 [  0.       0.      66.6667   0.      33.3333   0.       0.    ]
 [  0.       0.      66.6667   0.      33.3333   0.       0.    ]]


---
### 로지스틱 회귀를 이용한 확률 예측
- 데이터의 각 컬럼을 새로운 변수로 사용하는 다중회귀를 이용한 확률
- 회귀식을 사용해서 로지스틱 회귀라고 하나 실질적으로 확률을 예측하는 모델
- 선형회귀식에서 출발하였고 딥러닝 모델의 기반임.

z = a*Weight + b*Length + c*Diagonal + d*Height + e*Width + f  
위의 식으로 계산한 z는 -무한대~+무한대 범위의 값을 가지므로 그대로는 확률로 사용할 수 없다.  
이를 해결하기 위해 Sigmoid 함수를 사용한다.  
Sigmoid 함수는 0~1 사이의 값만 출력하므로 0.5를 기준으로 양성(True)과 음성(False)으로 분류할 수 있다.  
로지스틱 회귀에서는 Sigmoid 함수를 로지스틱 함수라고도 부른다.  


In [114]:
# Sigmoid Graph 출력해보기
import matplotlib.pyplot as plt

# Grid of z values from -5 up to (but not including) 5, in steps of 0.1.
z = np.arange(-5, 5, 0.1)
# Column of 0.5 values, one row per z sample: the threshold separating the
# positive and negative class. Sized from z instead of a hard-coded 100 so the
# two arrays cannot drift apart if the grid is changed.
y = np.full((z.size, 1), 0.5)
# Sigmoid (logistic) function evaluated on every z.
phi = 1 / (1 + np.exp(-z))

In [115]:
# Print the sigmoid value computed for each z.
print(phi)

[0.00669285 0.00739154 0.00816257 0.0090133  0.0099518  0.01098694
 0.01212843 0.01338692 0.01477403 0.0163025  0.01798621 0.01984031
 0.02188127 0.02412702 0.02659699 0.02931223 0.03229546 0.03557119
 0.03916572 0.04310725 0.04742587 0.05215356 0.05732418 0.06297336
 0.06913842 0.07585818 0.0831727  0.09112296 0.09975049 0.10909682
 0.11920292 0.13010847 0.14185106 0.15446527 0.16798161 0.18242552
 0.19781611 0.21416502 0.23147522 0.24973989 0.26894142 0.2890505
 0.31002552 0.33181223 0.35434369 0.37754067 0.40131234 0.42555748
 0.450166   0.47502081 0.5        0.52497919 0.549834   0.57444252
 0.59868766 0.62245933 0.64565631 0.66818777 0.68997448 0.7109495
 0.73105858 0.75026011 0.76852478 0.78583498 0.80218389 0.81757448
 0.83201839 0.84553473 0.85814894 0.86989153 0.88079708 0.89090318
 0.90024951 0.90887704 0.9168273  0.92414182 0.93086158 0.93702664
 0.94267582 0.94784644 0.95257413 0.95689275 0.96083428 0.96442881
 0.96770454 0.97068777 0.97340301 0.97587298 0.97811873 0.980159

---
### 로지스틱 회귀로 이진 분류 수행하기

In [116]:
# Keep only the training samples labelled 'Bream' or 'Smelt' (two-class subset).
bream_smelt_indexes = np.isin(train_target, ['Bream', 'Smelt'])
# Apply the same boolean mask to the features and to the labels.
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]

In [117]:
# Fit a logistic regression on the two-class (Bream / Smelt) subset.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_bream_smelt, target_bream_smelt)

In [118]:
# Predict the class of the first five samples of the two-class subset.
lr.predict(train_bream_smelt[:5])

array(['Bream', 'Smelt', 'Bream', 'Bream', 'Bream'], dtype=object)

In [119]:
# Per-class probabilities for the same five samples (one column per class).
print(lr.predict_proba(train_bream_smelt[:5]))

[[0.99759855 0.00240145]
 [0.02735183 0.97264817]
 [0.99486072 0.00513928]
 [0.98584202 0.01415798]
 [0.99767269 0.00232731]]


### 확률분포 검증

In [120]:
# Class labels learned by the model.
lr.classes_

array(['Bream', 'Smelt'], dtype=object)

In [121]:
# Inspect the coefficients (weights) and the intercept of the fitted linear equation.
print(lr.coef_,lr.intercept_)

[[-0.4037798  -0.57620209 -0.66280298 -1.01290277 -0.73168947]] [-2.16155132]


In [122]:
# Evaluate the model's decision function (the z values) for the first five samples.
decisions = lr.decision_function(train_bream_smelt[:5])

In [123]:
# Pass the z values through the sigmoid function (scipy's expit) to obtain
# probabilities.
from scipy.special import expit
print(expit(decisions))

[0.00240145 0.97264817 0.00513928 0.01415798 0.00232731]


---
### 로지스틱 회귀로 다중 분류 수행하기

In [124]:
# Fit a new logistic regression on the full training set (all species) and
# report its score on the training and test splits. This rebinds `lr`,
# replacing the two-class model fitted earlier.
lr = LogisticRegression()
lr.fit(train_scaled,train_target)
print("Train:",lr.score(train_scaled,train_target))
print("Test:",lr.score(test_scaled,test_target))

Train: 0.8067226890756303
Test: 0.85


In [125]:
# Predict the species of the first five test samples.
lr.predict(test_scaled[:5])

array(['Perch', 'Smelt', 'Pike', 'Perch', 'Perch'], dtype=object)

In [126]:
# Per-class probabilities for the first five test samples, shown to 3 decimals.
proba = lr.predict_proba(test_scaled[:5])
proba.round(decimals=3)

array([[0.004, 0.092, 0.545, 0.008, 0.281, 0.061, 0.01 ],
       [0.   , 0.061, 0.126, 0.002, 0.087, 0.722, 0.001],
       [0.009, 0.009, 0.223, 0.569, 0.17 , 0.006, 0.013],
       [0.061, 0.077, 0.526, 0.036, 0.257, 0.003, 0.04 ],
       [0.004, 0.03 , 0.664, 0.026, 0.245, 0.018, 0.013]])

In [127]:
# Inspect the coefficients and intercepts of the fitted multiclass model.
print(lr.coef_,lr.intercept_)

[[-1.48635407e-01 -8.16338300e-02  6.40677025e-01  2.81886254e+00
  -3.23791215e-01]
 [-3.35559709e-01 -8.21454187e-01 -8.35625183e-01  1.91834624e+00
  -8.55673174e-01]
 [ 1.30777936e+00  3.20627792e-01 -1.31830404e+00 -1.71471286e+00
   1.64054397e+00]
 [ 1.33816721e-03  1.72779398e+00  1.75766272e+00 -1.28638162e+00
  -6.35345781e-01]
 [-8.14258430e-01 -5.24809489e-01  3.53859503e-01 -4.21963234e-01
   6.26572878e-01]
 [-3.55031912e-01 -6.41091506e-01 -6.57484132e-01 -1.46560568e+00
  -1.51685481e+00]
 [ 3.44367925e-01  2.05672435e-02  5.92141133e-02  1.51454608e-01
   1.06454813e+00]] [ 0.38862776 -0.11439729  2.40795555  0.04859162  1.15910169 -3.78185807
 -0.10802126]


---
### 확률분포 검증

In [128]:
# Decision-function (z) values for the first five test samples,
# displayed rounded to 2 decimals.
decisions = lr.decision_function(test_scaled[:5])
np.round(decisions,decimals=2)

array([[-2.35,  0.75,  2.54, -1.67,  1.87,  0.35, -1.49],
       [-4.17,  1.38,  2.11, -2.  ,  1.73,  3.85, -2.9 ],
       [-1.44, -1.48,  1.73,  2.66,  1.46, -1.83, -1.09],
       [ 0.02,  0.26,  2.18, -0.52,  1.46, -3.  , -0.39],
       [-2.22, -0.27,  2.84, -0.4 ,  1.84, -0.74, -1.06]])

> 계산할 항목(클래스)이 여러 개이므로 각 z값에 Sigmoid를 따로 적용하면 클래스별 확률이 독립적으로 계산되어 확률의 합이 1을 넘는 경우가 발생한다   
> 다중 분류인 경우에는 확률의 합이 1이 되도록 정규화하는 Softmax를 사용한다

In [129]:
from scipy.special import softmax

# Apply softmax along each row of z values to turn them into per-class
# probabilities, then display them rounded to 2 decimals.
proba = softmax(decisions, axis=1)
proba.round(decimals=2)

array([[0.  , 0.09, 0.54, 0.01, 0.28, 0.06, 0.01],
       [0.  , 0.06, 0.13, 0.  , 0.09, 0.72, 0.  ],
       [0.01, 0.01, 0.22, 0.57, 0.17, 0.01, 0.01],
       [0.06, 0.08, 0.53, 0.04, 0.26, 0.  , 0.04],
       [0.  , 0.03, 0.66, 0.03, 0.24, 0.02, 0.01]])