## 패키지 import 및 dataset load

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "3_classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
X, y = mnist["data"], mnist["target"] # X, y: pandas DataFrame
print(X.shape, X.dtype)
print(y.shape, y.dtype)

(70000, 784) float64
(70000,) object


In [4]:
y = y.astype(np.uint8)
print(y.dtype)

uint8


In [5]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [6]:
# Import classifiers

from sklearn.neural_network import MLPClassifier # Neural network!
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# classifier generation

mlp_clf = MLPClassifier(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
et_clf = ExtraTreesClassifier(random_state=42)
svm_clf = SVC(random_state=42, probability= True)
# dt_clf = DecisionTreeClassifier(random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

In [8]:
# Training  classifiers
from sklearn.metrics import accuracy_score

# 개별 모델의 학습/예측/평가. 
estimators = [mlp_clf, rf_clf, et_clf, svm_clf, knn_clf]
for estimator in estimators:
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    # Print scoresv
    print(estimator.__class__.__name__, accuracy_score(y_test, y_pred))

MLPClassifier 0.9668
RandomForestClassifier 0.9705
ExtraTreesClassifier 0.9722
SVC 0.9792
KNeighborsClassifier 0.9705


## model 1. Voting Classiffier

In [9]:
# Define named_estimators for ensemble learning
named_estimators = [
    ('mlp', mlp_clf), ('rf', rf_clf), ('et', et_clf), ('svm', svm_clf), ('knn', knn_clf)
]

In [10]:
# Define voting_clf
from sklearn.ensemble import VotingClassifier

# 보통 성능이 더 좋은 soft voting을 이용
voting_clf = VotingClassifier(named_estimators, voting='soft')

In [11]:
# VotingClassifier 학습/예측/평가. 
voting_clf.fit(X_train , y_train) 
pred = voting_clf.predict(X_test) 
print(voting_clf.__class__.__name__, accuracy_score(y_test, y_pred))

VotingClassifier 0.9705


## model 2. Gradient Boost_XGBoost_myModel

In [12]:
# Gradient Boost - LightGBM

In [13]:
# Gradient Boost - CatBoost
# 일반적으로 성능 cat>Light>XGB

# model 3. 실습 XGBoost

In [14]:
from xgboost import XGBClassifier

In [15]:
xgb_clf = XGBClassifier(random_state=42)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
print(xgb_clf.__class__.__name__, accuracy_score(y_test, y_pred))



XGBClassifier 0.978


# test set 성능 평가

In [16]:
# scores of each estimator for test sets
clf = [mlp_clf, rf_clf, et_clf, svm_clf, knn_clf, voting_clf, xgb_clf]
for clf in clf:
    print(clf.__class__.__name__, clf.score(X_test, y_test))

MLPClassifier 0.9668
RandomForestClassifier 0.9705
ExtraTreesClassifier 0.9722
SVC 0.9792
KNeighborsClassifier 0.9705
VotingClassifier 0.9809
XGBClassifier 0.978
