# Chapter 5 Scikit-Learn:  Machine Learning

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Scikit-Learn Estimator API Example

# Scikit-Learn 분석 사례: supervised learning (회귀분석 모형 구축)

In [None]:
# Synthetic regression data: 50 points scattered around the line y = 2x - 1.
rng = np.random.RandomState(42)       # fixed seed so the sample is reproducible
x = rng.rand(50) * 10                 # uniform draws scaled to [0, 10)
y = (2 * x - 1) + rng.randn(50)       # add standard-normal noise
plt.scatter(x, y)

In [None]:
# 1. Choose a class of model
from sklearn.linear_model import LinearRegression   

In [None]:
# 2. Choose model hyperparameters  (Instantiation)
# fit_intercept=True: estimate an intercept term in addition to the slope
model = LinearRegression(fit_intercept=True)        
model

In [None]:
#  3. Arrange data into a feature matrix and target vector
# np.newaxis appends an axis, turning the 1-D x into a single-column 2-D array
X = x[:, np.newaxis]                               
X.shape

In [None]:
# For comparison: shape of the original 1-D array
x.shape

In [None]:
#  4. Fit the model to your data
model.fit(X,y)                                   

In [None]:
# Estimated slope (the data was generated with slope 2)
model.coef_

In [None]:
# Estimated intercept (the data was generated with intercept -1)
model.intercept_

In [None]:
#  Draw the fitted regression line together with the scatter plot
x_2 = np.arange(11)              # integer grid 0..10 on which the line is evaluated
X_2 = x_2[:, np.newaxis]         # single-column 2-D array, same layout as X
y_2 = model.predict(X_2)
plt.scatter(x, y)
plt.plot(x_2, y_2)

In [None]:
#  Compute the mean squared error (MSE) on the data the model was fitted on
y_pred = model.predict(X)
from sklearn.metrics import mean_squared_error
mean_squared_error(y, y_pred)

In [None]:
# Prepare new data (same generating process, 10 points) to measure prediction error
x_test = 10 * rng.rand(10)                  
y_test = 2 * x_test - 1 + rng.randn(10)
X_test = x_test[:, np.newaxis]

In [None]:
# Plot the predictions for the new data set (red line) against the observed values
# NOTE: y_pred is rebound here; it now holds the predictions for X_test
y_pred = model.predict(X_test)                 
plt.scatter(x_test, y_test)
plt.plot(x_test, y_pred, color="red")

In [None]:
# Compute the mean squared error (MSE) on the new data
mean_squared_error(y_test, y_pred)

##  iris data 읽어들여 분석하기

In [None]:
# Load the 'iris' example data set through seaborn and display it
iris = sns.load_dataset('iris')
iris

In [None]:
# Summary of the loaded table as reported by its info() method
iris.info()

In [None]:
# Pairwise plots of the columns, coloured by the 'species' column
sns.pairplot(iris, hue='species')

# 지도학습 (supervised learning): Iris classification

In [None]:
# 1. Choose a class of model : GaussianNB
from sklearn.naive_bayes import GaussianNB

In [None]:
# 2. Instantiate the model (no hyperparameters are set)
GNB = GaussianNB()

In [None]:
#  3. Arrange data set: split training and test sets
X_iris = iris.drop('species', axis =1)     # feature table: every column except 'species'
y_iris = iris['species']                   # target: the 'species' column

In [None]:
from sklearn.model_selection import train_test_split
# No random_state is passed, so the split is not reproducible across runs
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris)

In [None]:
# 4. Fit the model to the training data
GNB.fit(Xtrain, ytrain)

In [None]:
# 5. Predict labels for unknown(test) data 
ypred = GNB.predict(Xtest)
from sklearn.metrics import accuracy_score
accuracy_score(ytest, ypred)

# 비지도학습(unsupervised learning): dimension reduction

In [None]:
from sklearn.decomposition import PCA     # Choose the model class

In [None]:
PCA = PCA(n_components=2)                 # Instantiate the model with hyperparameters

In [None]:
PCA.fit(X_iris)                           # Fit the model with data. Notice y is not specified!

In [None]:
X_2D = PCA.transform(X_iris)             # Transform the data to two dimensions

In [None]:
iris['PCA1'] = X_2D[:,0]
iris['PCA2'] = X_2D[:,1]                 # iris 데이터셋에 새로 정의한 PCA 차원을 할당함
sns.lmplot('PCA1','PCA2', hue='species', data= iris, fit_reg=False)

In [None]:
iris

# 비지도학습(unsupervised learning): Iris Clustering

In [None]:
from sklearn.mixture import GaussianMixture               # Choose the model class

In [None]:
# Three mixture components; covariance_type='full' selects a full covariance matrix per component
GM = GaussianMixture(n_components=3, covariance_type='full')    # Instantiate a model with hyperparameters

In [None]:
GM.fit(X_iris)                                              # Fit to data. Notice y is not specified!

In [None]:
# NOTE: y_pred is rebound here; it now holds cluster labels, not regression predictions
y_pred = GM.predict(X_iris)                                 # Determine cluster labels

In [None]:
# Attach the predicted cluster label to each row of the iris table
iris['cluster'] = y_pred         

In [None]:
iris

In [None]:
# One panel per predicted cluster (col='cluster'), points coloured by species.
# x and y are passed by keyword: positional x/y was deprecated in seaborn 0.11
# and is rejected from 0.12 on, where only `data` may be positional.
sns.lmplot(x="PCA1", y="PCA2", data=iris, hue='species', col='cluster', fit_reg=False)

## 모형의 타당성 검토(Model Validation)

### classification of Iris dataset without test data set

In [None]:
from sklearn.neighbors import KNeighborsClassifier    # choose model class: the KNN method is used for classification

In [None]:
knn = KNeighborsClassifier()                            # Initiate a model: n_neighbors =5 (default)

In [None]:
knn.fit(X_iris, y_iris)                               # fit the model with data with labels!

In [None]:
# Predict on the same rows the model was fitted on (no held-out data in this section)
y_pred = knn.predict(X_iris)                         
y_pred

# 모델의 성능을 검증하기 위하여 metrics모듈의 accuracy_score 함수를 사용함

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the predictions against the true labels, measured on the fitting data itself
accuracy_score(y_iris, y_pred)

# hyperparameter n_neighbors=1로 설정한 경우 모형의 성능을 비교함

In [None]:
# k = 1, scored on the very rows the model was fitted on
knn = KNeighborsClassifier(n_neighbors=1).fit(X_iris, y_iris)   # fit() returns the estimator itself
y_pred = knn.predict(X_iris)
accuracy_score(y_true=y_iris, y_pred=y_pred)

# hyperparameter n_neighbors=15로 설정한 경우 모형의 성능을 비교함

In [None]:
# k = 15, scored on the very rows the model was fitted on
knn = KNeighborsClassifier(n_neighbors=15).fit(X_iris, y_iris)   # fit() returns the estimator itself
y_pred = knn.predict(X_iris)
accuracy_score(y_true=y_iris, y_pred=y_pred)

# Model Validation by Train-Test data split

# train 데이터셋과 test 데이터셋을 구분하기 위하여 model_selection 모듈의 train_test_split 함수를 사용함

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing. No random_state is given, so the split
# is not reproducible across runs. X_test / y_test, which earlier held the
# regression test data, are rebound to iris data here.
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=0.4)

# hyperparameter n_neighbors=5로 설정한 경우

In [None]:
# k = 5: fit on the training part, score on the held-out part
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)   # fit() returns the estimator itself
y_pred = knn.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

# hyperparameter n_neighbors=1로 설정한 경우

In [None]:
# k = 1: fit on the training part, score on the held-out part
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)   # fit() returns the estimator itself
y_pred = knn.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

# hyperparameter n_neighbors=15로 설정한 경우

In [None]:
# k = 15: fit on the training part, score on the held-out part
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)   # fit() returns the estimator itself
y_pred = knn.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

# hyperparameter n_neighbors=5로 설정한 경우 (여러번 반복할 경우 accuracy값이 다르게 나타난다.)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=0.4)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=0.4)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=0.4)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# k = 5: one score per fold of a 5-fold cross-validation
knn = KNeighborsClassifier(n_neighbors=5)
cross_val_score(knn, X_iris, y_iris, cv=5)

In [None]:
# Same estimator: mean score over a 10-fold cross-validation
cross_val_score(knn, X_iris, y_iris, cv=10).mean()

In [None]:
# k = 1: one score per fold of a 5-fold cross-validation
knn = KNeighborsClassifier(n_neighbors=1)
cross_val_score(knn, X_iris, y_iris, cv=5)

In [None]:
# Same estimator: mean score over a 10-fold cross-validation
cross_val_score(knn, X_iris, y_iris, cv=10).mean()

In [None]:
# k = 15: one score per fold of a 5-fold cross-validation
knn = KNeighborsClassifier(n_neighbors=15)
cross_val_score(knn, X_iris, y_iris, cv=5)

In [None]:
# Same estimator: mean score over a 10-fold cross-validation
cross_val_score(knn, X_iris, y_iris, cv=10).mean()

## Decision Tree

In [None]:
from sklearn import tree

iris 데이터셋에 decision tree 방식을 적용함

In [None]:
# Fit a decision tree on the complete iris data (no held-out rows)
DT = tree.DecisionTreeClassifier()
DT.fit(X_iris, y_iris)

In [None]:
# Score on the same data the tree was fitted on
DT.score(X_iris, y_iris)

In [None]:
# Draw the fitted tree
tree.plot_tree(DT)

In [None]:
# Optional Graphviz rendering is disabled: the import below is commented out
#import graphviz

In [None]:
# The Graphviz export is wrapped in a string literal, so none of it executes
'''
dot_data = tree.export_graphviz(DT, out_file=None)
graph = graphviz.Source(dot_data)
graph
'''

train-test 데이터셋을 구분하여 예측 정확도를 파악함

In [None]:
# Fit a new tree on the training part only
# (X_train / y_train come from the most recent train_test_split call above)
DT = tree.DecisionTreeClassifier()
DT.fit(X_train, y_train)

In [None]:
# Score on the training part
DT.score(X_train, y_train)

In [None]:
# Score on the held-out test part
DT.score(X_test, y_test)

Cross Validation  방법을 이용한 결과

In [None]:
# Unfitted tree handed to cross_val_score: one score per fold of a 5-fold cross-validation
DT = tree.DecisionTreeClassifier()
cross_val_score(DT, X_iris, y_iris, cv=5)

In [None]:
# Cross-validation is run again here and its fold scores are averaged
cross_val_score(DT, X_iris, y_iris, cv=5).mean()

##  RandomForests

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random forest on the complete iris data (no held-out rows)
RF = RandomForestClassifier()
RF.fit(X_iris, y_iris)

In [None]:
# Score on the same data the forest was fitted on
RF.score(X_iris, y_iris)

train-test 데이터셋을 이용한 경우

In [None]:
# Fit a new forest on the training part only
# (X_train / y_train come from the most recent train_test_split call above)
RF= RandomForestClassifier()
RF.fit(X_train, y_train)

In [None]:
# Score on the held-out test part
RF.score(X_test, y_test)

Cross Validation 방식을 이용한 경우

In [None]:
# Unfitted forest handed to cross_val_score: one score per fold of a 5-fold cross-validation
RF = RandomForestClassifier()
cross_val_score(RF, X_iris, y_iris, cv=5)

In [None]:
# Cross-validation is run again here and its fold scores are averaged
cross_val_score(RF, X_iris, y_iris, cv=5).mean()