# 제 6 장 __지도학습의 이해__
___

## __사전설정__
---

(1) 저장소 데이터 가져오기

In [None]:
!rm -rf /content/BizDataAnalysis/

In [None]:
!git clone https://github.com/BizStat/BizDataAnalysis.git


(2) matplotlib 환경에서 한글 사용

In [None]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -f -v
!rm ~/.cache/matplotlib -rf

런타임 메뉴에서 '세션 다시 시작' 후 다음의 명령문 실행

In [None]:
# Use a Nanum font so that Korean text renders in matplotlib figures.
# NOTE(review): the install cell above only installs the `fonts-nanum` package; confirm
# that it actually provides 'NanumGothicCoding' (otherwise matplotlib falls back to its
# default font and Korean glyphs will not render).
from matplotlib import rc
rc('font', family='NanumGothicCoding')
# Render the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
rc('axes', unicode_minus=False)

(3) 구글 드라이브 연결

In [None]:
# Mount Google Drive at /content/gdrive/ inside the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

___

## 6.3 __분류모형의 이해와 활용__

### (1) Default 데이터

* 데이터 가져오기

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the Default data set (comma-separated) from the cloned repository.
default = pd.read_csv('/content/BizDataAnalysis/DATA/Default.csv')

In [None]:
default

* 데이터 살펴보기

In [None]:
default.iloc[:,[3,4]].describe()

In [None]:
default.iloc[:,[3,4]].hist(bins=20)

In [None]:
# Pairwise plots of every column from position 1 onward, colored by the 'default' column.
import seaborn as sns
sns.pairplot(default.iloc[:,1:], hue="default")
plt.show()

In [None]:
pd.crosstab(index=default['default'],columns=default['student'])

* 데이터 분할 : 학습데이터와 평가데이터

In [None]:
default.iloc[:,[1]]

In [None]:
default.iloc[:,2:]

In [None]:
# Predictors: every column from position 2 onward.
X = default.iloc[:,2:]
# Target: column 1, selected with a list so that y stays a one-column DataFrame.
y = default.iloc[:,[1]]

* 가변수로 변환

In [None]:
# Dummy-code the categorical predictors as 0/1 integers, dropping the first level of each.
X = pd.get_dummies(X,drop_first=True,dtype=int)

In [None]:
# Dummy-code the target the same way; the cells below read the result as column 'default_Yes'.
y = pd.get_dummies(y,drop_first=True,dtype=int)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Plain random 70/30 train/test split (no stratification).
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=3,train_size=0.7)

In [None]:
# y_train / y_test are one-column 0/1 DataFrames at this point. Reduce them to a plain
# integer count; formatting DataFrame.sum() directly would print a whole Series repr
# (column name and dtype) inside the message.
print(f"학습 데이터의 수 : {len(y_train)}, 학습 데이터에서 연체개체 수 : {int(y_train.to_numpy().sum())}")
print(f"평가 데이터의 수 : {len(y_test)}, 평가 데이터에서 연체개체 수 : {int(y_test.to_numpy().sum())}")

In [None]:
# Redo the 70/30 split, this time stratified on the original 'default' labels.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=13,train_size=0.7,stratify=default['default'])

In [None]:
# y_train / y_test are one-column 0/1 DataFrames at this point. Reduce them to a plain
# integer count; formatting DataFrame.sum() directly would print a whole Series repr
# (column name and dtype) inside the message.
print(f"학습 데이터의 수 : {len(y_train)}, 학습 데이터에서 연체개체 수 : {int(y_train.to_numpy().sum())}")
print(f"평가 데이터의 수 : {len(y_test)}, 평가 데이터에서 연체개체 수 : {int(y_test.to_numpy().sum())}")

In [None]:
# Case: the target WAS dummy-coded. Pull the single 'default_Yes' column out of the
# one-column DataFrames so the estimators receive 1-D targets.
y_train = y_train['default_Yes']
y_test = y_test['default_Yes']

In [None]:
# Case: the target was NOT dummy-coded (its column is still named 'default').
# NOTE(review): this is an alternative to the previous cell, not a follow-up. After the
# get_dummies step the column is 'default_Yes', so this lookup is expected to raise
# KeyError — run only one of the two cells.
y_train = y_train['default']
y_test = y_test['default']

In [None]:
y_test

---
__로지스틱 회귀모형__

* 로지스틱 회귀모형의 적합

In [None]:
# Fit a logistic regression with scikit-learn's default settings on the training data.
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train,y_train)

* 로지스틱 회귀모형의 추정 결과 살펴보기

In [None]:
# Print the fitted intercept, then one coefficient per input feature.
print(f'절편 : {LR.intercept_[0]}')
for name, beta in zip(LR.feature_names_in_, LR.coef_[0]):
  print(f'{name} : {beta}')

In [None]:
import numpy as np
# Print exp(intercept) and exp(coefficient) for each feature, i.e. the odds-ratio scale.
print(f'절편 : {np.exp(LR.intercept_[0])}')
for name, beta in zip(LR.feature_names_in_, LR.coef_[0]):
  print(f'{name} : {np.exp(beta)}')

* 로지스틱 회귀모형 적합 결과를 이용한 예측

In [None]:
y_pred = LR.predict(X_test)

In [None]:
y_pred

In [None]:
y_pred_prob = LR.predict_proba(X_test)

In [None]:
y_pred_prob

In [None]:
y_pred_prob[y_pred_prob[:,0]<y_pred_prob[:,1]]

In [None]:
# Collect actual labels, predicted labels and a predicted probability per test row.
# NOTE(review): despite its name, 'probabilty_class_0' stores column 1 of predict_proba
# (the second entry of LR.classes_). The key is left unchanged here because the
# threshold cells further down read it by this exact name.
pred_df = pd.DataFrame({
    'actual_class': y_test,
    'predicted_class': y_pred,
    'probabilty_class_0': y_pred_prob[:,1]
})

In [None]:
pred_df.head()

In [None]:
pd.crosstab(index=pred_df['actual_class'],columns=pred_df['predicted_class'])

* 로지스틱 회귀모형의 성능평가

In [None]:
LR.score(X_train,y_train)

In [None]:
LR.score(X_test,y_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['default_no','default_yes']))

* 분류기준값의 변화에 따른 모형의 정확도

In [None]:
def tmp(x, threshold=0.2):
    """Turn a predicted probability into a 0/1 class label.

    Args:
        x: Predicted probability for a single observation.
        threshold: Classification cutoff; defaults to 0.2, the value this
            cell previously hard-coded.

    Returns:
        1 if ``x >= threshold``, otherwise 0.
    """
    return 1 if x >= threshold else 0

pred_df['predicted_class'] = pred_df['probabilty_class_0'].apply(tmp)
pd.crosstab(index=pred_df['actual_class'],columns=pred_df['predicted_class'])

In [None]:
def tmp(x, threshold=0.8):
    """Turn a predicted probability into a 0/1 class label.

    Args:
        x: Predicted probability for a single observation.
        threshold: Classification cutoff; defaults to 0.8, the value this
            cell previously hard-coded.

    Returns:
        1 if ``x >= threshold``, otherwise 0.
    """
    return 1 if x >= threshold else 0

pred_df['predicted_class'] = pred_df['probabilty_class_0'].apply(tmp)
pd.crosstab(index=pred_df['actual_class'],columns=pred_df['predicted_class'])

* ROC 곡선

In [None]:
# ROC points and AUC for the logistic regression. Label 0 is treated as the positive
# class (pos_label=0) and is scored with column 0 of predict_proba.
import sklearn.metrics as sm
fpr, tpr, threshold = sm.roc_curve(y_test, y_pred_prob[:, 0], pos_label=0)
sm.auc(fpr,tpr)

In [None]:
# Draw the ROC curve (blue) with the diagonal reference line (red, dashed) on the unit square.
plt.figure(figsize = (8, 8))
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

---
__kNN 분류__

* kNN 분류 사례 (기본값 적용)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
KC = KNeighborsClassifier()
KC.fit(X_train,y_train)

In [None]:
KC.score(X_train,y_train)

In [None]:
KC.score(X_test,y_test)

In [None]:
# Store the kNN class probabilities under `y_pred_prob`, the name the ROC/AUC cell below
# reads. Under the old name `y_pred_proba` that cell silently reused the probabilities
# left over from the logistic regression.
y_pred_prob = KC.predict_proba(X_test)

In [None]:
y_pred = KC.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
import sklearn.metrics as sm

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
fpr, tpr, threshold = sm.roc_curve(y_test, y_pred_prob[:, 0], pos_label=0)
sm.auc(fpr,tpr)

* kNN 분류 : k-값 변화에 따른 적합 결과

In [None]:
# Sweep n_neighbors over 1..niter; each row of optVal is (k, train accuracy, test accuracy).
niter = 100
optVal = np.zeros((niter, 3))
for k in range(1, niter + 1):
    KC = KNeighborsClassifier(n_neighbors=k, weights='uniform').fit(X_train, y_train)
    optVal[k - 1] = (k, KC.score(X_train, y_train), KC.score(X_test, y_test))

# Plot both accuracy curves against n_neighbors.
k_values, train_acc, test_acc = optVal.T
plt.plot(k_values, train_acc, color='navy', label='학습데이터')
plt.plot(k_values, test_acc, color='darkorange', label='평가데이터')
plt.title('n_neighbors 값의 변화에 따른 적합결과 변화')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy Rate')
plt.legend()
plt.show()

In [None]:
optVal[optVal[:,2] == max(optVal[:,2]),]

In [None]:
KC = KNeighborsClassifier(n_neighbors=7)
KC.fit(X_train,y_train)

In [None]:
y_pred_prob = KC.predict_proba(X_test)

In [None]:
# Recompute the predictions with the current (n_neighbors=7) model first; otherwise
# `y_pred` still holds the predictions of the default-k model fitted earlier.
y_pred = KC.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
fpr, tpr, threshold = sm.roc_curve(y_test, y_pred_prob[:, 0], pos_label=0)
sm.auc(fpr,tpr)

In [None]:
# Sweep the even n_neighbors values 2, 4, ..., 2*niter; each row of optVal is
# (n_neighbors, train accuracy, test accuracy).
niter = 50
optVal = np.zeros((niter, 3))
for k in range(1, niter + 1):
    n_neighbors = k * 2
    KC = KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform').fit(X_train, y_train)
    optVal[k - 1] = (n_neighbors, KC.score(X_train, y_train), KC.score(X_test, y_test))

# Plot both accuracy curves against n_neighbors.
k_values, train_acc, test_acc = optVal.T
plt.plot(k_values, train_acc, color='navy', label='학습데이터')
plt.plot(k_values, test_acc, color='darkorange', label='평가데이터')
plt.title('n_neighbors 값의 변화에 따른 적합결과 변화')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy Rate')
plt.legend()
plt.show()

In [None]:
optVal[optVal[:,2] == max(optVal[:,2]),]

In [None]:
# Sweep the odd n_neighbors values 1, 3, ..., 2*niter-1; each row of optVal is
# (n_neighbors, train accuracy, test accuracy).
niter = 50
optVal = np.zeros((niter, 3))
for k in range(1, niter + 1):
    n_neighbors = k * 2 - 1
    KC = KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform').fit(X_train, y_train)
    optVal[k - 1] = (
        n_neighbors,
        KC.score(X_train, y_train.to_numpy().ravel()),
        KC.score(X_test, y_test.to_numpy().ravel()),
    )

# Plot both accuracy curves against n_neighbors.
k_values, train_acc, test_acc = optVal.T
plt.plot(k_values, train_acc, color='navy', label='학습데이터')
plt.plot(k_values, test_acc, color='darkorange', label='평가데이터')
plt.title('n_neighbors 값의 변화에 따른 적합결과 변화')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy Rate')
plt.legend()
plt.show()

In [None]:
optVal[optVal[:,2] == max(optVal[:,2]),]

In [None]:
# Same odd-k sweep as before, but with p=1 for the Minkowski distance; each row of
# optVal is (n_neighbors, train accuracy, test accuracy).
niter = 50
optVal = np.zeros((niter, 3))
for k in range(1, niter + 1):
    n_neighbors = k * 2 - 1
    KC = KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform', p=1).fit(X_train, y_train)
    optVal[k - 1] = (
        n_neighbors,
        KC.score(X_train, y_train.to_numpy().ravel()),
        KC.score(X_test, y_test.to_numpy().ravel()),
    )

# Plot both accuracy curves against n_neighbors.
k_values, train_acc, test_acc = optVal.T
plt.plot(k_values, train_acc, color='navy', label='학습데이터')
plt.plot(k_values, test_acc, color='darkorange', label='평가데이터')
plt.title('n_neighbors 값의 변화에 따른 적합결과 변화')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy Rate')
plt.legend()
plt.show()

In [None]:
optVal[optVal[:,2] == max(optVal[:,2]),]

* 데이터 표준화

In [None]:
# Standardize the predictors with statistics taken from the training data only, and
# reuse those same statistics for the test data.
train_mean = np.mean(X_train, axis=0)
# Bug fix: the scale must be the per-column standard deviation. It was computed with
# np.mean, which made every "scaled" column equal to (x - mean) / mean.
train_std = np.std(X_train, axis=0)
X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

In [None]:
KC = KNeighborsClassifier()
KC.fit(X_train_scaled,y_train)
KC.score(X_test_scaled,y_test)

In [None]:
y_pred = KC.predict(X_test_scaled)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Odd-k sweep on the standardized predictors; each row of optVal is
# (n_neighbors, train accuracy, test accuracy).
niter = 50
optVal = np.zeros((niter, 3))
for k in range(1, niter + 1):
    n_neighbors = k * 2 - 1
    KC = KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform').fit(X_train_scaled, y_train)
    optVal[k - 1] = (
        n_neighbors,
        KC.score(X_train_scaled, y_train),
        KC.score(X_test_scaled, y_test),
    )

# Plot both accuracy curves against n_neighbors.
k_values, train_acc, test_acc = optVal.T
plt.plot(k_values, train_acc, color='navy', label='학습데이터')
plt.plot(k_values, test_acc, color='darkorange', label='평가데이터')
plt.title('n_neighbors 값의 변화에 따른 적합결과 변화')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy Rate')
plt.legend()
plt.show()

In [None]:
optVal[optVal[:,2] == max(optVal[:,2]),]

* kNN 분류의 최적모형

In [None]:
KC = KNeighborsClassifier(n_neighbors=9)
KC.fit(X_train_scaled,y_train)

In [None]:
y_pred_prob = KC.predict_proba(X_test_scaled)

* kNN 분류의 ROC 곡선 그리기

In [None]:
fpr, tpr, threshold = sm.roc_curve(y_test, y_pred_prob[:, 0], pos_label=0)
sm.auc(fpr,tpr)

In [None]:
# Draw the kNN ROC curve (blue) with the diagonal reference line (red, dashed) on the unit square.
plt.figure(figsize = (8, 8))
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

---
__분류나무__

* 분류나무 사례 (기본값 적용)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix

In [None]:
DC = DecisionTreeClassifier(random_state=3)
DC.fit(X_train,y_train)

In [None]:
DC.score(X_train,y_train)

In [None]:
DC.score(X_test,y_test)

In [None]:
y_pred = DC.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier
DC = DecisionTreeClassifier(random_state=3)
DC.fit(X_train_scaled,y_train)

In [None]:
DC.score(X_test_scaled,y_test)

In [None]:
y_pred = DC.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

In [None]:
# @title Classification-tree performance as max_depth varies
# Sweep max_depth over 2..niter+1; each row of optVal is (max_depth, train accuracy, test accuracy).
niter = 100
optVal = np.zeros((niter,3))
for k in range(2, niter+2):
    RT = DecisionTreeClassifier(max_depth=k,random_state=3)
    RT.fit(X_train, y_train)
    optVal[k-2,0] = k
    optVal[k-2,1] = RT.score(X_train, y_train)
    optVal[k-2,2] = RT.score(X_test, y_test)

plt.plot(optVal[:,0],optVal[:,1],color='navy',label='학습데이터')
plt.plot(optVal[:,0],optVal[:,2],color='darkorange',label='평가데이터')
plt.title('max_depth 값의 변화에 따른 적합결과 변화')
# Bug fix: the x-axis shows max_depth; it was labelled 'min_samples_split'.
plt.xlabel('max_depth')
plt.ylabel('Accuracy Rate')
plt.legend()
plt.show()

In [None]:
optVal[optVal[:,2] == max(optVal[:,2]),]

In [None]:
# @title Classification-tree performance as min_samples_split varies
# Sweep min_samples_split over 2..niter+1; each row of optVal is
# (min_samples_split, train accuracy, test accuracy).
niter = 100
optVal = np.zeros((niter, 3))
for k in range(2, niter + 2):
    RT = DecisionTreeClassifier(min_samples_split=k, random_state=3).fit(X_train, y_train)
    optVal[k - 2] = (k, RT.score(X_train, y_train), RT.score(X_test, y_test))

# Plot both accuracy curves against the swept parameter.
param_values, train_acc, test_acc = optVal.T
plt.plot(param_values, train_acc, color='navy', label='학습데이터')
plt.plot(param_values, test_acc, color='darkorange', label='평가데이터')
plt.title('min_samples_split 값의 변화에 따른 적합결과 변화')
plt.xlabel('min_samples_split')
plt.ylabel('Accuracy Rate')
plt.legend()
plt.show()

In [None]:
optVal[optVal[:,2] == max(optVal[:,2]),]

In [None]:
# @title Classification-tree performance as min_samples_leaf varies
# Sweep min_samples_leaf over 2..niter+1; each row of optVal is
# (min_samples_leaf, train accuracy, test accuracy).
niter = 100
optVal = np.zeros((niter, 3))
for k in range(2, niter + 2):
    RT = DecisionTreeClassifier(min_samples_leaf=k, random_state=3).fit(X_train, y_train)
    optVal[k - 2] = (k, RT.score(X_train, y_train), RT.score(X_test, y_test))

# Plot both accuracy curves against the swept parameter.
param_values, train_acc, test_acc = optVal.T
plt.plot(param_values, train_acc, color='navy', label='학습데이터')
plt.plot(param_values, test_acc, color='darkorange', label='평가데이터')
plt.title('min_samples_leaf 값의 변화에 따른 적합결과 변화')
plt.xlabel('min_samples_leaf')
plt.ylabel('Accuracy Rate')
plt.legend()
plt.show()

In [None]:
optVal[optVal[:,2] == max(optVal[:,2]),]

* 최적 분류나무 선택을 위한 AUC 비교

In [None]:
# Candidate 1: tree limited to max_depth=2. Keep its ROC points (label 0 as the positive
# class) in fpr1/tpr1, then show the fitted tree as a figure and as text rules.
DC = DecisionTreeClassifier(max_depth=2,random_state=3)
DC.fit(X_train,y_train)
y_pred_prob = DC.predict_proba(X_test)
fpr1, tpr1, threshold = sm.roc_curve(y_test, y_pred_prob[:, 0], pos_label=0)
tree.plot_tree(DC)
print(tree.export_text(DC,feature_names=DC.feature_names_in_.tolist()))

In [None]:
# Candidate 2: tree with min_samples_split=46. Keep its ROC points (label 0 as the
# positive class) in fpr2/tpr2, then show the fitted tree as a figure and as text rules.
DC = DecisionTreeClassifier(min_samples_split=46,random_state=3)
DC.fit(X_train,y_train)
y_pred_prob = DC.predict_proba(X_test)
fpr2, tpr2, threshold = sm.roc_curve(y_test, y_pred_prob[:, 0], pos_label=0)
tree.plot_tree(DC)
print(tree.export_text(DC,feature_names=DC.feature_names_in_.tolist()))

In [None]:
# Candidate 3: tree with min_samples_leaf=12. Keep its ROC points (label 0 as the
# positive class) in fpr3/tpr3, then show the fitted tree as a figure and as text rules.
DC = DecisionTreeClassifier(min_samples_leaf=12,random_state=3)
DC.fit(X_train,y_train)
y_pred_prob = DC.predict_proba(X_test)
fpr3, tpr3, threshold = sm.roc_curve(y_test, y_pred_prob[:, 0], pos_label=0)
tree.plot_tree(DC)
print(tree.export_text(DC,feature_names=DC.feature_names_in_.tolist()))

In [None]:
sm.auc(fpr1,tpr1)

In [None]:
sm.auc(fpr2,tpr2)

In [None]:
sm.auc(fpr3,tpr3)

In [None]:
# Overlay the ROC curves of the three candidate trees, plus the diagonal reference line
# (red, dashed), on the unit square.
plt.figure(figsize = (8, 8))
plt.plot(fpr1, tpr1, 'b',label='max_depth=2')
plt.plot(fpr2, tpr2, 'b--',label='min_samples_split=46')
plt.plot(fpr3, tpr3, 'grey',label='min_samples_leaf=12')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend()
plt.show()

* 분류나무의 최적 모형

In [None]:
DC = DecisionTreeClassifier(max_depth=2,random_state=3)
DC.fit(X_train,y_train)
y_pred_prob = DC.predict_proba(X_test)
y_pred = DC.predict(X_test)

* 혼동행렬 출력

In [None]:
confusion_matrix(y_test, y_pred)

* AUC와 ROC 곡선 그리기

In [None]:
fpr, tpr, threshold = sm.roc_curve(y_test, y_pred_prob[:, 0], pos_label=0)
sm.auc(fpr,tpr)

In [None]:
# Draw the selected tree's ROC curve (blue) with the diagonal reference line (red, dashed)
# on the unit square.
plt.figure(figsize = (8, 8))
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

* 나무 모형을 이용한 앙상블 모형

In [None]:
#@title Applying the bagging algorithm
from sklearn.ensemble import BaggingClassifier
BC = BaggingClassifier(
            estimator = DC, # base learner: the decision tree configured in the cells above
            n_estimators=1000, # number of bootstrap samples (one base estimator each)
            # Bug fix: an int is read as an absolute row count, so max_samples=1 trained
            # every tree on a single row. The float 1.0 is the intended ratio (100%).
            max_samples=1.0, # fraction of the training set drawn per bootstrap sample
            bootstrap=True, # sample with replacement (default)
            oob_score=True, # score the ensemble on the out-of-bag rows
            random_state=3
        )
BC.fit(X_train,y_train)
y_pred = BC.predict(X_test)
y_pred_prob = BC.predict_proba(X_test)

In [None]:
BC.score(X_train,y_train)

In [None]:
BC.score(X_test,y_test)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
#@title Applying the random forest algorithm
from sklearn.ensemble import RandomForestClassifier  # random forest
RFC = RandomForestClassifier(
        n_estimators= 1000, # number of bootstrap samples (trees)
        criterion = 'gini',
        max_depth = 2,
        min_samples_split = 20,
        min_samples_leaf = 10,
        random_state=3)
RFC.fit(X_train,y_train)
# Keep both the hard predictions and the class probabilities for the test data.
y_pred = RFC.predict(X_test)
y_pred_prob = RFC.predict_proba(X_test)

In [None]:
RFC.score(X_train,y_train)

In [None]:
RFC.score(X_test,y_test)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
#@title Gradient Boosting
from xgboost import XGBClassifier # the target must be supplied as numbers
GB = XGBClassifier(random_state=3,
                      eval_metric='logloss', # loss function used as the evaluation metric
                      booster = 'gbtree', # boosting algorithm (alternatives: dart, gblinear)
                      objective = 'binary:logistic',
                      max_depth = 2, # maximum tree depth
                      learning_rate = 0.05, # learning rate, at most 1 (lambda on p.84 of the lecture notes)
                      n_estimators = 1000, # number of boosting rounds / trees (B on p.84 of the lecture notes)
                      subsample = 1, # fraction of training rows sampled per tree
                      colsample_bytree = 1 # fraction of predictors sampled per tree
)
GB.fit(X_train,y_train)
# Keep both the hard predictions and the class probabilities for the test data.
y_pred = GB.predict(X_test)
y_pred_prob = GB.predict_proba(X_test)

In [None]:
GB.score(X_train,y_train)

In [None]:
GB.score(X_test,y_test)

In [None]:
confusion_matrix(y_test, y_pred)

---
__3개 집단 분류 사례 : 붓꽃 데이터__

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
#@title Load the data
# Read the comma-separated iris file from the cloned repository and preview the first rows.
iris = pd.read_csv('/content/BizDataAnalysis/DATA/iris.csv')
iris.head()

In [None]:
#@title Explore the data
# Pairwise plots of all iris columns, colored by the 'species' column.
import seaborn as sns
sns.pairplot(iris, hue="species")
plt.show()

In [None]:
# @title Split the data
# Predictors: every column except the last; target: the last column.
X = iris.iloc[:,0:-1]
y = iris.iloc[:,-1]
from sklearn.model_selection import train_test_split
# 70/30 train/test split, stratified on the 'species' column.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=13,train_size=0.7,stratify=iris['species'])

In [None]:
#@title Logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train,y_train)
y_pred = LR.predict(X_test)
y_pred_prob = LR.predict_proba(X_test)

# Print the odds ratios (exp of intercept and coefficients) for EVERY class.
# The model has one row of coefficients per class; previously only row 0 was printed,
# without saying which class it belonged to.
print("===========================================")
for cls, intercept, coefs in zip(LR.classes_, LR.intercept_, LR.coef_):
  print(f'[{cls}]')
  print(f'절편 : {np.exp(intercept)}')
  for name, beta in zip(LR.feature_names_in_, coefs):
    print(f'{name} : {np.exp(beta)}')
# Print the accuracy rate on the training and test data
print("===========================================")
print(f"- 학습데이터 : {np.round(LR.score(X_train, y_train),5)} \n- 평가데이터 : {LR.score(X_test, y_test)}")
print("===========================================")
# Print the confusion matrix
print(confusion_matrix(y_test, y_pred))

# Print the classification report
print("===========================================")
print(classification_report(y_test, y_pred, target_names=['Setosa','Versicolor','Virginica']))

In [None]:
# Encode the species labels as integer codes (0, 1, 2) for estimators that need a
# numeric target. The original cell was an unfinished draft: `np.zeros((,1))` is a
# syntax error and the loop body was cut off.
# The codes go into new variables so that `y`, `y_train` and `y_test` keep their string
# labels for the cells that follow.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder().fit(y_train)
y_train_num = label_encoder.transform(y_train)
y_test_num = label_encoder.transform(y_test)


In [None]:
#@title Gradient Boosting Machine
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Fit the histogram-based gradient boosting classifier and predict on the test data.
GB = HistGradientBoostingClassifier(max_iter=100)
GB.fit(X_train, y_train)
y_pred = GB.predict(X_test)
y_pred_prob = GB.predict_proba(X_test)

# Accuracy rate on the training data (rounded to 5 decimals) and on the test data.
train_accuracy = np.round(GB.score(X_train, y_train), 5)
test_accuracy = GB.score(X_test, y_test)
print("===========================================")
print(f"- 학습데이터 : {train_accuracy} \n- 평가데이터 : {test_accuracy}")
print("===========================================")

# Confusion matrix
print(confusion_matrix(y_test, y_pred))

# Classification report
print("===========================================")
print(classification_report(y_test, y_pred, target_names=['Setosa','Versicolor','Virginica']))