# 제 6 장 __지도학습의 이해__
___

## __사전설정__
---

(1) 저장소 데이터 가져오기

In [None]:
# Delete any earlier checkout of the course repository (shell escape).
!rm -rf /content/BizDataAnalysis/

In [None]:
# Clone the course repository; later cells read its files from /content/BizDataAnalysis.
!git clone https://github.com/BizStat/BizDataAnalysis.git


(2) matplotlib 환경에서 한글 사용

In [None]:
# Install the Nanum Korean fonts system-wide.
!sudo apt-get install -y fonts-nanum
# Rebuild the fontconfig cache (-f: force, -v: verbose).
!sudo fc-cache -f -v
# Remove matplotlib's cache directory; presumably so its font list is rebuilt after the restart.
!rm ~/.cache/matplotlib -rf

런타임 메뉴에서 '세션 다시 시작' 후 다음의 명령문 실행

In [None]:
from matplotlib import rc
# Set the default font family for all matplotlib text.
# NOTE(review): confirm that the 'fonts-nanum' package installed above provides
# 'NanumGothicCoding'; it may ship in a separate package, in which case
# matplotlib falls back to a font without Hangul glyphs.
rc('font', family='NanumGothicCoding')
# Render minus signs with the ASCII hyphen instead of the Unicode minus.
rc('axes', unicode_minus=False)

(3) 구글 드라이브 연결

In [None]:
from google.colab import drive
# Mount the user's Google Drive at /content/gdrive/ (Colab-only helper).
drive.mount('/content/gdrive/')

___

## 6.4 __손글씨 숫자 인식__

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### (1) MNIST 데이터

* 데이터 압축풀기

In [None]:
!unzip /content/BizDataAnalysis/DATA/mnist.zip # extracted into the /content folder

In [None]:
# Load the extracted comma-separated file; it has no header row,
# so pandas numbers the columns 0, 1, 2, ...
mnist = pd.read_csv('/content/mnist.csv', header=None)

In [None]:
# Show the first five rows of the raw table.
mnist.head()

In [None]:
# (rows, columns) of the raw table.
mnist.shape

* 데이터 살펴보기

In [None]:
# Column 0 holds the digit label; the remaining columns are the pixels of a
# 28 x 28 image (the reshape(28, 28) calls below rely on that size).
# y is kept as a one-column DataFrame here.
X, y = mnist.iloc[:, 1:], mnist.iloc[:, [0]]

In [None]:
X.head()

In [None]:
np.array(X.iloc[0]).reshape(28,28)

In [None]:
plt.imshow(np.array(X.iloc[0]).reshape(28,28), cmap=plt.cm.gray_r, interpolation = "nearest")
plt.grid()

In [None]:
y.head()

* 데이터 분리 : 순서대로 학습 데이터 60,000개와 평가 데이터 10,000개 분리

In [None]:
from sklearn.model_selection import train_test_split

# Ordered split without shuffling: the first 60,000 rows become the training
# set and all remaining rows the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=60000,
    shuffle=False,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the test features.
X_test.shape

* 데이터 전처리 : 이미지에서 나타나는 공통 여백 제거

In [None]:
# Boolean Series, one entry per pixel column: True when every training image
# has the same value in that pixel as the row labelled 0.
chk = (X_train == X_train.loc[0]).all(axis=0)

In [None]:
# Show the constant-pixel mask as a 28 x 28 image (1 = constant pixel, drawn dark).
plt.imshow(
    chk.to_numpy(dtype=int).reshape(28, 28),
    cmap='gray_r',
    interpolation='nearest',
)
plt.grid()

In [None]:
# Zero-based positions of the pixel columns that are NOT constant.
np.flatnonzero(~chk.to_numpy())

In [None]:
X_train = X_train.iloc[:,np.arange(784)[~chk]]

In [None]:
X_test = X_test.iloc[:,np.arange(784)[~chk]]

* 종속변수를 포함하고 있는 데이터프레임을 1차원 배열로 변환

In [None]:
# Replace each one-column label DataFrame by that column as a 1-D Series.
y_train, y_test = y_train.iloc[:, 0], y_test.iloc[:, 0]

---
__로지스틱 회귀모형__

* 로지스틱 회귀모형의 적합

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Logistic regression with a high iteration cap (max_iter=10000) and all CPU cores (n_jobs=-1).
LR = LogisticRegression(max_iter=10000,n_jobs=-1)
# Fit on the training split; as the last expression, the fitted estimator is displayed.
LR.fit(X_train,y_train)

* 로지스틱 회귀모형을 이용한 분류 결과 살펴보기

In [None]:
# Mean accuracy on the training split.
LR.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
LR.score(X_test,y_test)

In [None]:
# Store predictions for both splits; the cells below reuse these names.
y_pred = LR.predict(X_test)
y_train_pred = LR.predict(X_train)

In [None]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_train, y_train_pred))

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the training split (rows: true label, columns: predicted label).
confusion_matrix(y_train, y_train_pred)

In [None]:
# Confusion matrix for the test split.
confusion_matrix(y_test, y_pred)

* 혼돈행렬 백분율로 나타내기

In [None]:
# Test split: true label (rows) vs. prediction (columns); normalize='index' makes each row sum to 1.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

In [None]:
# Same row-normalised table for the training split.
pd.crosstab(index=y_train,columns=y_train_pred,normalize='index')

* 로지스틱 회귀모형에 의해 잘못 인식된 최초 16개 사례

---
__kNN 분류__

* kNN 분류

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# k-nearest-neighbours classifier with k=7, using all CPU cores.
KC = KNeighborsClassifier(n_neighbors=7,n_jobs=-1)
KC.fit(X_train,y_train)

In [None]:
# Mean accuracy on the training split.
KC.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
KC.score(X_test,y_test)

In [None]:
# Store predictions for both splits; this overwrites the earlier y_pred / y_train_pred.
y_pred = KC.predict(X_test)
y_train_pred = KC.predict(X_train)

In [None]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_train, y_train_pred))

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the training split.
confusion_matrix(y_train, y_train_pred)

In [None]:
# Confusion matrix for the test split.
confusion_matrix(y_test, y_pred)

In [None]:
# Row-normalised confusion table for the training split.
pd.crosstab(index=y_train,columns=y_train_pred,normalize='index')

In [None]:
# Row-normalised confusion table for the test split.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

In [None]:
# Fit kNN for n_neighbors = 1..niter and record the accuracy on both splits.
# optVal columns: [n_neighbors, training accuracy, test accuracy].
niter = 20
optVal = np.zeros((niter, 3))
for k in range(1, niter + 1):
    KC = KNeighborsClassifier(n_neighbors=k, weights='uniform', n_jobs=-1).fit(X_train, y_train)
    optVal[k - 1] = [k, KC.score(X_train, y_train), KC.score(X_test, y_test)]

# Accuracy curves for the training and test splits against n_neighbors.
plt.plot(optVal[:, 0], optVal[:, 1], label='학습데이터', color='navy')
plt.plot(optVal[:, 0], optVal[:, 2], label='평가데이터', color='darkorange')
plt.gca().set(
    title='n_neighbors 값의 변화에 따른 적합결과 변화',
    xlabel='n_neighbors',
    ylabel='Accuracy Rate',
)
plt.legend()
plt.show()

In [None]:
# Row(s) of the sweep table whose test accuracy (column 2) is the highest.
optVal[optVal[:, 2] == optVal[:, 2].max()]

* kNN 최적 모형

In [None]:
# Refit kNN with k=4 — presumably the value picked from the sweep above; verify against its output.
KC = KNeighborsClassifier(n_neighbors=4,n_jobs=-1)
KC.fit(X_train,y_train)

In [None]:
# Mean accuracy on the training split.
KC.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
KC.score(X_test,y_test)

In [None]:
# Store predictions for both splits.
y_pred = KC.predict(X_test)
y_train_pred = KC.predict(X_train)

In [None]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_train, y_train_pred))

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the training split.
confusion_matrix(y_train, y_train_pred)

In [None]:
# Confusion matrix for the test split.
confusion_matrix(y_test, y_pred)

In [None]:
# Row-normalised confusion table for the training split.
pd.crosstab(index=y_train,columns=y_train_pred,normalize='index')

In [None]:
# Row-normalised confusion table for the test split.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

In [None]:
#@title Print results
# One-cell summary for the current kNN model. A notebook only displays the
# value of a cell's LAST expression, so the intermediate tables are printed
# explicitly; as bare expressions they were computed and silently discarded.
y_pred = KC.predict(X_test)
y_train_pred = KC.predict(X_train)
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_pred))
print(pd.crosstab(index=y_train,columns=y_train_pred,normalize='index'))
# Last expression: rendered by the notebook as a rich table.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

---
__분류나무__

* 분류나무 사례

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix

In [None]:
# Classification tree with default size limits; random_state fixes the seed for reproducibility.
DC = DecisionTreeClassifier(random_state=3)
DC.fit(X_train,y_train)

In [None]:
# Mean accuracy on the training split.
DC.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
DC.score(X_test,y_test)

In [None]:
# Store predictions for both splits.
y_pred = DC.predict(X_test)
y_train_pred = DC.predict(X_train)

In [None]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_train, y_train_pred))

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the training split.
confusion_matrix(y_train, y_train_pred)

In [None]:
# Confusion matrix for the test split.
confusion_matrix(y_test, y_pred)

In [None]:
# Row-normalised confusion table for the training split.
pd.crosstab(index=y_train,columns=y_train_pred,normalize='index')

In [None]:
# Row-normalised confusion table for the test split.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

In [None]:
# @title Classification-tree accuracy as max_depth varies
# Fit one tree per max_depth in 2..niter+1 and record the accuracy on both splits.
# optVal columns: [max_depth, training accuracy, test accuracy].
niter = 50
optVal = np.zeros((niter, 3))
for k in range(2, niter + 2):
    DC = DecisionTreeClassifier(max_depth=k, random_state=3).fit(X_train, y_train)
    optVal[k - 2] = [k, DC.score(X_train, y_train), DC.score(X_test, y_test)]

# Accuracy curves for the training and test splits against max_depth.
plt.plot(optVal[:, 0], optVal[:, 1], label='학습데이터', color='navy')
plt.plot(optVal[:, 0], optVal[:, 2], label='평가데이터', color='darkorange')
plt.gca().set(
    title='max_depth 값의 변화에 따른 적합결과 변화',
    xlabel='max_depth',
    ylabel='Accuracy Rate',
)
plt.legend()
plt.show()

In [None]:
# Row(s) of the sweep table whose test accuracy (column 2) is the highest.
optVal[optVal[:, 2] == optVal[:, 2].max()]

In [None]:
#@title Best model
# Refit with max_depth=19 — presumably the value picked from the sweep above; verify against its output.
DC = DecisionTreeClassifier(max_depth=19,random_state=3)
DC.fit(X_train,y_train)

In [None]:
# Mean accuracy on the training split.
DC.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
DC.score(X_test,y_test)

In [None]:
# Predict with the decision tree (DC) fitted in this section. The cell used
# KC, the kNN model, so the reports that follow described the wrong classifier.
y_pred = DC.predict(X_test)
y_train_pred = DC.predict(X_train)

In [None]:
# Classification reports and confusion matrices for both splits.
print("## 학습 데이터에 대한 분류 정확도")
print(classification_report(y_train, y_train_pred))
print("## 평가 데이터에 대한 분류 정확도")
print(classification_report(y_test, y_pred))
print("## 학습 데이터에 대한 혼돈행렬")
print(confusion_matrix(y_train, y_train_pred))
# Header corrected: this matrix is computed from the TEST split (y_test, y_pred),
# but was announced as the training-data matrix.
print("## 평가 데이터에 대한 혼돈행렬")
print(confusion_matrix(y_test, y_pred))

In [None]:
# Row-normalised confusion table for the training split.
pd.crosstab(index=y_train,columns=y_train_pred,normalize='index')

In [None]:
# Row-normalised confusion table for the test split.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

In [None]:
# @title Classification-tree accuracy as min_samples_split varies
# Fit one tree per min_samples_split in 2..niter+1 and record the accuracy on both splits.
# optVal columns: [min_samples_split, training accuracy, test accuracy].
niter = 50
optVal = np.zeros((niter, 3))
for k in range(2, niter + 2):
    DC = DecisionTreeClassifier(min_samples_split=k, random_state=3).fit(X_train, y_train)
    optVal[k - 2] = [k, DC.score(X_train, y_train), DC.score(X_test, y_test)]

# Accuracy curves for the training and test splits against min_samples_split.
plt.plot(optVal[:, 0], optVal[:, 1], label='학습데이터', color='navy')
plt.plot(optVal[:, 0], optVal[:, 2], label='평가데이터', color='darkorange')
plt.gca().set(
    title='min_samples_split 값의 변화에 따른 적합결과 변화',
    xlabel='min_samples_split',
    ylabel='Accuracy Rate',
)
plt.legend()
plt.show()

In [None]:
 optVal[optVal[:,2] == max(optVal[:,2]),]

In [None]:
#@title Best model
# Refit with min_samples_split=8 — presumably the value picked from the sweep above; verify against its output.
DC = DecisionTreeClassifier(min_samples_split=8,random_state=3)
DC.fit(X_train,y_train)

In [None]:
# Mean accuracy on the training split.
DC.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
DC.score(X_test,y_test)

In [None]:
# Predict on both splits with the current tree, then print a classification
# report and a confusion matrix for each split under its own header.
y_pred, y_train_pred = DC.predict(X_test), DC.predict(X_train)
print("## 학습 데이터에 대한 분류 정확도", classification_report(y_train, y_train_pred), sep="\n")
print("## 평가 데이터에 대한 분류 정확도", classification_report(y_test, y_pred), sep="\n")
print("## 학습 데이터에 대한 혼돈행렬", confusion_matrix(y_train, y_train_pred), sep="\n")
print("## 평가 데이터에 대한 혼돈행렬", confusion_matrix(y_test, y_pred), sep="\n")

In [None]:
# Row-normalised confusion table for the training split.
pd.crosstab(index=y_train,columns=y_train_pred,normalize='index')

In [None]:
# Row-normalised confusion table for the test split.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

In [None]:
# @title Classification-tree accuracy as min_samples_leaf varies
# Fit one tree per min_samples_leaf in 2..niter+1 and record the accuracy on both splits.
# optVal columns: [min_samples_leaf, training accuracy, test accuracy].
niter = 50
optVal = np.zeros((niter, 3))
for k in range(2, niter + 2):
    DC = DecisionTreeClassifier(min_samples_leaf=k, random_state=3).fit(X_train, y_train)
    optVal[k - 2] = [k, DC.score(X_train, y_train), DC.score(X_test, y_test)]

# Accuracy curves for the training and test splits against min_samples_leaf.
plt.plot(optVal[:, 0], optVal[:, 1], label='학습데이터', color='navy')
plt.plot(optVal[:, 0], optVal[:, 2], label='평가데이터', color='darkorange')
plt.gca().set(
    title='min_samples_leaf 값의 변화에 따른 적합결과 변화',
    xlabel='min_samples_leaf',
    ylabel='Accuracy Rate',
)
plt.legend()
plt.show()

In [None]:
# Row(s) of the sweep table whose test accuracy (column 2) is the highest.
optVal[optVal[:, 2] == optVal[:, 2].max()]

In [None]:
#@title Best model
# Refit with min_samples_leaf=4 — presumably the value picked from the sweep above; verify against its output.
DC = DecisionTreeClassifier(min_samples_leaf=4,random_state=3)
DC.fit(X_train,y_train)

In [None]:
# Mean accuracy on the training split.
DC.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
DC.score(X_test,y_test)

In [None]:
# Predict on both splits with the current tree, then print a classification
# report and a confusion matrix for each split.
y_pred = DC.predict(X_test)
y_train_pred = DC.predict(X_train)
print("## 학습 데이터에 대한 분류 정확도")
print(classification_report(y_train, y_train_pred))
print("## 평가 데이터에 대한 분류 정확도")
print(classification_report(y_test, y_pred))
print("## 학습 데이터에 대한 혼돈행렬")
print(confusion_matrix(y_train, y_train_pred))
# Header corrected: this matrix is computed from the TEST split (y_test, y_pred),
# but was announced as the training-data matrix.
print("## 평가 데이터에 대한 혼돈행렬")
print(confusion_matrix(y_test, y_pred))

In [None]:
# Row-normalised confusion table for the training split.
pd.crosstab(index=y_train,columns=y_train_pred,normalize='index')

In [None]:
# Row-normalised confusion table for the test split.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

* 분류 나무 기반의 앙상블 모형

In [None]:
#@title Bagging based Classification Tree
# Base learner: a classification tree with default size limits.
DC = DecisionTreeClassifier(random_state=3)
from sklearn.ensemble import BaggingClassifier
BC = BaggingClassifier(
            estimator = DC, # base learner cloned for every bootstrap sample
            n_estimators=500, # number of bootstrap samples (one tree each)
            # Bootstrap sampling ratio. It must be the float 1.0 (100% of the rows):
            # the integer 1 is read as an absolute count, i.e. ONE row per tree.
            max_samples=1.0,
            bootstrap=True, # sample with replacement (the default)
            oob_score=True, # also score on the out-of-bag rows
            random_state=3
        )
BC.fit(X_train,y_train)

In [None]:
# Mean accuracy of the bagging ensemble on the training split.
BC.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
BC.score(X_test,y_test)

In [None]:
# Store predictions for both splits.
y_pred = BC.predict(X_test)
y_train_pred = BC.predict(X_train)

In [None]:
# Row-normalised confusion table for the test split.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

In [None]:
#@title Random Forest
from sklearn.ensemble import RandomForestClassifier  # random forest
RFC = RandomForestClassifier(
        n_estimators= 1000, # number of trees in the forest
        criterion = 'gini',
# Optional tree-size limits, currently disabled:
#        max_depth = 5,
#        min_samples_split = 20,
#        min_samples_leaf = 5,
        random_state=3)
RFC.fit(X_train,y_train)

In [None]:
# Mean accuracy of the random forest on the training split.
RFC.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
RFC.score(X_test,y_test)

In [None]:
# Store predictions for both splits.
y_pred = RFC.predict(X_test)
y_train_pred = RFC.predict(X_train)

In [None]:
# Row-normalised confusion table for the test split.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

In [None]:
from xgboost import XGBClassifier # 종속변수가 숫자로 입력

In [None]:
#@title Gradient Boosting
# The target is the digit label (ten classes), so the multi-class objective
# and metric are requested explicitly. The cell previously asked for
# 'binary:logistic' / 'logloss', which are two-class settings.
# NOTE(review): recent xgboost releases appear to replace a binary objective
# with a multi-class one when they see more than two classes — confirm for
# the installed version; the explicit settings do not rely on that.
GB = XGBClassifier(random_state=3,
                      eval_metric='mlogloss', # multi-class log loss
                      booster = 'gbtree', # boosting algorithm (alternatives: dart, gblinear)
                      objective = 'multi:softprob', # one probability per digit class
                      max_depth = 2, # maximum depth of each tree
                      learning_rate = 0.1, # learning rate, at most 1 (lambda on p. 84 of the lecture notes)
                      n_estimators = 500, # number of boosting rounds (B on p. 84 of the lecture notes)
                      subsample = 1, # fraction of rows sampled per round
                      colsample_bytree = 1 # fraction of features sampled per tree
)
GB.fit(X_train,y_train)

In [None]:
# Mean accuracy of the boosted model on the training split.
GB.score(X_train,y_train)

In [None]:
# Mean accuracy on the test split.
GB.score(X_test,y_test)

In [None]:
# Store predictions for both splits.
y_pred = GB.predict(X_test)
y_train_pred = GB.predict(X_train)

In [None]:
# Row-normalised confusion table for the test split.
pd.crosstab(index=y_test,columns=y_pred,normalize='index')

---
* 부스팅 라운드 횟수에 따른 성능 비교
---

In [None]:
# @title Classification accuracy of GBM as the number of boosting rounds varies
# Fit one model per n_estimators in 100, 200, ..., 100*niter and record the
# accuracy on both splits.
# optVal columns: [n_estimators, training accuracy, test accuracy].
# The multi-class objective/metric replace the two-class 'binary:logistic' /
# 'logloss' settings, because the target has ten digit classes.
niter = 15
optVal = np.zeros((niter,3))
for k in range(1, niter+1):
    GB = XGBClassifier(random_state=3,
                      eval_metric='mlogloss', # multi-class log loss
                      booster = 'gbtree', # boosting algorithm (alternatives: dart, gblinear)
                      objective = 'multi:softprob', # one probability per digit class
                      max_depth = 2, # maximum depth of each tree
                      learning_rate = 0.1, # learning rate, at most 1 (lambda on p. 84 of the lecture notes)
                      n_estimators = 100*k, # number of boosting rounds (B on p. 84 of the lecture notes)
                      subsample = 1, # fraction of rows sampled per round
                      colsample_bytree = 1 # fraction of features sampled per tree
    )
    GB.fit(X_train, y_train)
    optVal[k-1,0] = 100*k
    optVal[k-1,1] = GB.score(X_train, y_train)
    optVal[k-1,2] = GB.score(X_test, y_test)

# Accuracy curves for the training and test splits against the number of rounds.
plt.plot(optVal[:,0],optVal[:,1],color='navy',label='학습데이터')
plt.plot(optVal[:,0],optVal[:,2],color='darkorange',label='평가데이터')
plt.title('부스팅 라운드 횟수에 따른 분류 정확도의 변화')
# Axis label typo fixed ("boostin" -> "boosting").
plt.xlabel('Number of boosting rounds')
plt.ylabel('Accuracy Rate')
plt.legend()
plt.show()