# 제 6 장 __지도학습의 이해__
___

## __사전설정__
---

(1) 저장소 데이터 가져오기

In [None]:
# Remove any earlier checkout of the course repository so the clone below starts clean.
!rm -rf /content/BizDataAnalysis/

In [None]:
# Clone the course repository. Later cells read /content/BizDataAnalysis/DATA/Boston.txt,
# so this presumably runs with /content as the working directory -- confirm.
!git clone https://github.com/BizStat/BizDataAnalysis.git


(2) matplotlib 환경에서 한글 사용

In [None]:
# Install the Nanum font package (needed for Korean text in matplotlib figures).
!sudo apt-get install -y fonts-nanum
# Rebuild the system font cache (-f: force, -v: verbose).
!sudo fc-cache -f -v
# Delete matplotlib's cache directory; presumably so the new fonts are picked up
# after the session restart requested below.
!rm ~/.cache/matplotlib -rf

런타임 메뉴에서 '세션 다시 시작' 후 다음의 명령문 실행

In [None]:
from matplotlib import rc
# Use the NanumMyeongjo font family for all figure text.
rc('font', family='NanumMyeongjo')
# Render minus signs with the ASCII hyphen instead of the Unicode minus glyph.
rc('axes', unicode_minus=False)

(3) 구글 드라이브 연결

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive/ (Colab-only).
drive.mount('/content/gdrive/')

___

## 6.2 __예측모형의 이해와 활용__

### (1) Boston house-price 데이터

* 데이터 가져오기

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the Boston house-price table; fields are separated by runs of whitespace.
# The separator is a raw string so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
boston = pd.read_table('/content/BizDataAnalysis/DATA/Boston.txt', sep=r'\s+')

In [None]:
# Display the loaded DataFrame.
boston

* 데이터 살펴보기

In [None]:
# Summary statistics for each column.
boston.describe()

In [None]:
# Histogram of every column, 15 bins each.
boston.hist(bins=15)

In [None]:
import seaborn as sns

In [None]:
# Pairwise scatter-plot matrix of all columns.
sns.pairplot(data=boston)

* 데이터 분할 : 학습데이터와 평가데이터

In [None]:
# Last column selected with a list indexer: the result stays a one-column DataFrame.
boston.iloc[:,[-1]]

In [None]:
# Last column selected with a scalar indexer: the result is a Series.
boston.iloc[:,-1]

In [None]:
# First 13 columns (positions 0..12).
boston.iloc[:,0:13]

In [None]:
# Feature matrix: the first 13 columns. Target: the last column, kept as a
# one-column DataFrame via the list indexer.
X = boston.iloc[:, :13]
y = boston.iloc[:, [-1]]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Random train/test split with default settings; no random_state is given,
# so the split changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
# Mean of the target in the current training split.
y_train.mean()

In [None]:
# Same split, now seeded (random_state=3) so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=3)

In [None]:
# Seeded split with 70% of the rows assigned to training; this is the last
# assignment of X_train/X_test/y_train/y_test in this section.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=3,train_size=0.7)

In [None]:
# Display the training features.
X_train

---
__선형회귀 알고리즘 설명__

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Display the LSTAT column (used below as the single predictor).
boston['LSTAT']

In [None]:
import numpy as np

# Small 1-D integer array (0..9) used to demonstrate reshape in the next cells.
tmp = np.arange(start=0, stop=10, step=1)
tmp

In [None]:
# Reshape the 10 elements into 2 rows x 5 columns.
tmp.reshape(2,5)

In [None]:
# Reshape into a 10 x 1 column.
tmp.reshape(10,1)

In [None]:
# Same column shape; -1 lets NumPy infer the row count from the array size.
tmp.reshape(-1,1)

In [None]:
# Single predictor (LSTAT) as an (n, 1) column, the 2-D layout passed to
# estimator.fit below; response (MEDV) stays 1-D.
indVar = np.array(boston['LSTAT']).reshape(-1,1)
depVar = np.array(boston['MEDV'])
plt.scatter(indVar,depVar,color="darkgrey")
plt.title('지역별 하위계층비율(LSTAT)과 주택가격(MEDV)의 산점도')
# Axis label corrected to the actual column name (was the typo 'LSAT').
plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.show()

In [None]:
# Simple linear regression of MEDV on LSTAT with an intercept term;
# fit() returns the estimator itself, so construction and fitting are chained.
SLR = LinearRegression(fit_intercept=True).fit(indVar, depVar)
# Fitted values on the same observations.
fitVal = SLR.predict(indVar)

In [None]:
# Observed points plus the fitted regression line from the intercept model.
plt.scatter(indVar[:,0],depVar,color="darkgrey",label='실제값')
plt.plot(indVar[:,0],fitVal,color='black',label='예측값')
plt.title('지역별 하위계층비율(LSTAT)과 주택가격(MEDV)의 산점도와 단순선형회귀적합 결과')
# Axis label corrected to the actual column name (was the typo 'LSAT').
plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.legend()
plt.show()

In [None]:
# Refit the same regression with the intercept forced to 0. This rebinds SLR,
# replacing the intercept-including model fitted earlier.
SLR = LinearRegression(fit_intercept=False).fit(indVar, depVar)
# Fitted values from the no-intercept model, kept separately for comparison.
fitVal1 = SLR.predict(indVar)

In [None]:
# Compare the intercept-including fit (black) with the zero-intercept fit (orange).
plt.scatter(indVar[:,0],depVar,color="darkgrey",label='실제값')
plt.plot(indVar[:,0],fitVal,color='black',label='절편을 포함한 예측값')
plt.plot(indVar[:,0],fitVal1,color='darkorange',label='절편이 0인 경우의 예측값')
# plt.title('지역별 하위계층비율(LSTAT)과 주택가격(MEDV)의 산점도와 단순선형회귀적합 결과')
# Axis label corrected to the actual column name (was the typo 'LSAT').
plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.legend()
plt.show()

In [None]:
# NOTE(review): SLR was last refit with fit_intercept=False, so this shows that
# model's intercept, not the intercept-including fit -- confirm this is intended.
SLR.intercept_ # estimated intercept

In [None]:
# Estimated slope(s) of the most recently fitted SLR model.
SLR.coef_

* 선형회귀모형 학습 및 평가

In [None]:
# Multiple linear regression (with intercept) on all training features.
LR = LinearRegression(fit_intercept=True)
LR.fit(X_train, y_train)

In [None]:
# Estimated intercept of the multiple regression.
LR.intercept_

In [None]:
# Estimated coefficients, one per feature column.
LR.coef_

In [None]:
# Predictions for the training rows.
LR.predict(X_train)

In [None]:
# Predictions for the held-out test rows.
LR.predict(X_test)

In [None]:
# R^2 on the training data.
LR.score(X_train, y_train)

In [None]:
# R^2 on the held-out test data.
LR.score(X_test, y_test)

---
__kNN 회귀 알고리즘 설명__

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# kNN regressor using the 5 nearest neighbours.
SKR = KNeighborsRegressor(n_neighbors=5)

In [None]:
# Same k, with the weighting scheme spelled out: 'uniform' weights all neighbours equally.
SKR = KNeighborsRegressor(n_neighbors=5,weights='uniform')

In [None]:
# Larger neighbourhood (k=30), still equally weighted.
SKR = KNeighborsRegressor(n_neighbors=30,weights='uniform')

In [None]:
# k=30 with 'distance' weighting (closer neighbours count more). When the cells are
# run in order, this is the instance that gets fitted in the next cell.
SKR = KNeighborsRegressor(n_neighbors=30,weights='distance')

In [None]:
# Fit the kNN regressor on the single LSTAT predictor.
SKR.fit(indVar,depVar)

In [None]:
# Smallest observed LSTAT value (lower end of the prediction grid built below).
indVar.min()

In [None]:
# Largest observed LSTAT value (upper end of the prediction grid built below).
indVar.max()

In [None]:
# 100 evenly spaced LSTAT values covering the observed range, shaped as a
# (100, 1) column so they can be passed to predict().
term = np.linspace(indVar.min(), indVar.max(), num=100).reshape(-1, 1)
term

In [None]:
# kNN predictions over the grid; this rebinds fitVal, which previously held the
# linear-regression fitted values.
fitVal = SKR.predict(term)

In [None]:
# Observed points plus the kNN prediction curve evaluated on the grid.
plt.scatter(indVar[:,0],depVar,color="darkgrey",label='실제값')
plt.plot(term,fitVal,color='black',label='예측값')
# plt.title('지역별 하위계층비율(LSTAT)과 주택가격(MEDV)의 산점도와 kNN회귀 적합 결과')
# Axis label corrected to the actual column name (was the typo 'LSAT').
plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.legend()
plt.show()

In [None]:
# Print the in-sample R^2 of the single-predictor kNN fit for k = 1..99.
for k in range(1, 100):
    # fit() returns the estimator, so SKR ends up bound to the fitted model.
    SKR = KNeighborsRegressor(n_neighbors=k).fit(indVar, depVar)
    print(f'Score for k={k}: {SKR.score(indVar,depVar)}')

In [None]:
# Result table for the k sweep below: one row per k, columns = (k, R^2).
optVal = np.zeros(shape=(100, 2))

In [None]:
# Record (k, in-sample R^2) for k = 1..100; row k-1 of optVal holds the result for k.
for k in range(1, 101):
    SKR = KNeighborsRegressor(n_neighbors=k).fit(indVar, depVar)
    # Write only the first two columns of the row.
    optVal[k-1, :2] = (k, SKR.score(indVar, depVar))

In [None]:
# In-sample R^2 (column 1 of optVal) against k (column 0).
plt.plot(optVal[:,0],optVal[:,1],color='black')
# plt.title('n_neighbors 값의 변화에 따른 적합결과 변화')
plt.xlabel('n_neighbors')
plt.ylabel('R^2')
plt.show()

* kNN 회귀모형 학습 및 평가

In [None]:
# kNN regressor (k=5) trained on all features of the training split.
KR = KNeighborsRegressor(n_neighbors=5)
KR.fit(X_train,y_train)

In [None]:
# Print the training-set R^2 for k = 1..19.
for k in range(1, 20):
    # fit() returns the estimator, so KR ends up bound to the fitted model.
    KR = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    print(f'Score for k={k}: {KR.score(X_train, y_train)}')

In [None]:
# Sweep k = 1..niter and record (k, train R^2, test R^2) per row of optVal.
niter = 100
wgt_nm = 'distance'
optVal = np.zeros(shape=(niter, 3))
for k in range(1, niter+1):
    KR = KNeighborsRegressor(n_neighbors=k, weights=wgt_nm).fit(X_train, y_train)
    # Row k-1 holds the results for k neighbours.
    optVal[k-1, :] = (
        k,
        KR.score(X_train, y_train),
        KR.score(X_test, y_test),
    )

In [None]:
# Display the sweep results: columns are k, train R^2, test R^2.
optVal

In [None]:
# Train (column 1) and test (column 2) R^2 as functions of k (column 0).
plt.plot(optVal[:,0], optVal[:,1], label='학습데이터', color='navy')
plt.plot(optVal[:,0], optVal[:,2], label='검증데이터', color='darkorange')
plt.xlabel('n_neighbors')
plt.ylabel('R^2')
plt.title('n_neighbors 값의 변화에 따른 적합결과 변화')
plt.legend()
plt.show()

In [None]:
# Training-set predictions. When cells are run in order, KR is the last model
# left by the sweep above (k = niter, weights = wgt_nm).
KR.predict(X_train)

In [None]:
# R^2 of the current KR model on the training data.
KR.score(X_train, y_train)

In [None]:
# R^2 of the current KR model on the held-out test data.
KR.score(X_test, y_test)

In [None]:
# Print the test-set R^2 for k = 1..19 (default neighbour weighting).
for k in range(1, 20):
    # fit() returns the estimator, so KR ends up bound to the fitted model.
    KR = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    print(f'Score for k={k}: {KR.score(X_test, y_test)}')

* 회귀나무

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Regression tree limited to depth 3 with at least 10 samples per leaf;
# random_state is fixed so the fit is repeatable.
TR = DecisionTreeRegressor(
    max_depth=3,                # maximum depth of the tree
    min_samples_leaf=10,        # minimum number of samples in a leaf node
    criterion='squared_error',  # alternatives: 'friedman_mse', 'absolute_error', 'poisson'
    splitter='best',            # alternative: 'random'
    random_state=100,
)
TR.fit(X_train, y_train)

In [None]:
# Print each feature name next to its importance in the fitted tree.
for col, importance in zip(X.columns, TR.feature_importances_):
    print(f'{col} 중요도 : {importance}')

In [None]:
# Tree predictions for the training rows.
TR.predict(X_train)

In [None]:
# R^2 of the tree on the training data.
TR.score(X_train, y_train)

In [None]:
# R^2 of the tree on the held-out test data.
TR.score(X_test, y_test)

In [None]:
from sklearn.tree import plot_tree
# Large white canvas so the node boxes of the tree stay legible.
fig = plt.figure(figsize=(15, 10), facecolor='white')
# Pass the column names as a plain list: some scikit-learn releases validate
# feature_names strictly and reject a pandas Index.
plot_tree(TR, feature_names=list(X.columns))
plt.show()