<a href="https://colab.research.google.com/github/Anelis17/Ensemble/blob/main/ensemble2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Basic setup
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the version numerically: a plain string comparison is lexicographic,
# so e.g. "0.9" >= "0.20" would wrongly pass.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None, "Unrecognized scikit-learn version: " + sklearn.__version__
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20), "Scikit-Learn >= 0.20 is required"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension) for the figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [None]:
# Load the MNIST dataset from OpenML
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Convert the labels to unsigned 8-bit integers
mnist["target"] = mnist["target"].astype(np.uint8)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the samples into three sets (train / validation / test):
# hold out 10,000 samples for testing, then another 10,000 for validation.
_split_opts = {"test_size": 10000, "random_state": 42}
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, **_split_opts)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, **_split_opts)

In [None]:
#필요한 Classifier 모듈 import

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier, BaggingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import  RidgeClassifier

In [None]:
# Classifiers to use

# Random forest
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# Extra-trees
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=42)
# Linear SVM
svm_clf = LinearSVC(random_state=42, max_iter=100, tol=20)
# Multi-layer perceptron (neural network)
mlp_clf = MLPClassifier(random_state=42)
# k-nearest neighbours
knn_clf = KNeighborsClassifier(n_jobs=-1)
# AdaBoost
ada_clf = AdaBoostClassifier(random_state=42)
# Bagging
bc_clf = BaggingClassifier(random_state=42)
# XGBoost
xgb_clf = XGBClassifier(random_state=42, n_jobs=-1)
# Decision tree
dt_clf = DecisionTreeClassifier(random_state=42)
# Ridge classifier
rd_clf = RidgeClassifier(random_state=42)

In [None]:
# Fit every candidate classifier on the training set

estimators = [
    random_forest_clf, extra_trees_clf, svm_clf, mlp_clf, knn_clf,
    ada_clf, bc_clf, xgb_clf, dt_clf, rd_clf,
]
for clf in estimators:
    print("Training the", clf)
    clf.fit(X_train, y_train)

Training the RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
Training the ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
        

In [None]:
# After training, report each classifier's score on the validation set

for clf in estimators:
    val_score = clf.score(X_val, y_val)
    print(type(clf).__name__, val_score)

RandomForestClassifier 0.9692
ExtraTreesClassifier 0.9715
LinearSVC 0.8662
MLPClassifier 0.9639
KNeighborsClassifier 0.9702
AdaBoostClassifier 0.709
BaggingClassifier 0.9257
XGBClassifier 0.9349
DecisionTreeClassifier 0.8684
RidgeClassifier 0.8455


In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# (name, estimator) pairs handed to the VotingClassifier

named_estimators = list(zip(
    ("random_forest_clf", "extra_trees_clf", "svm_clf", "mlp_clf", "knn_clf",
     "ada_clf", "bc_clf", "xgb_clf", "dt_clf", "rd_clf"),
    (random_forest_clf, extra_trees_clf, svm_clf, mlp_clf, knn_clf,
     ada_clf, bc_clf, xgb_clf, dt_clf, rd_clf),
))

In [None]:
# Build the voting ensemble from the named estimators (other settings left at their defaults)
voting_clf = VotingClassifier(estimators=named_estimators)

In [None]:
# Train the VotingClassifier on the training set.
# The fitted member estimators are read back later through `voting_clf.estimators_`.

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_lea

In [None]:
# Score the VotingClassifier on the validation set and compare it with its member classifiers

ensemble_val_score = voting_clf.score(X_val, y_val)
print('VotingClassifier :', ensemble_val_score)
for member in voting_clf.estimators_:
    member_val_score = member.score(X_val, y_val)
    print(type(member).__name__, ':', member_val_score)


VotingClassifier : 0.967
RandomForestClassifier : 0.9692
ExtraTreesClassifier : 0.9715
LinearSVC : 0.8662
MLPClassifier : 0.9639
KNeighborsClassifier : 0.9702
AdaBoostClassifier : 0.709
BaggingClassifier : 0.9257
XGBClassifier : 0.9349
DecisionTreeClassifier : 0.8684
RidgeClassifier : 0.8455


In [None]:
# Remove some of the low-scoring classifiers from the fitted ensemble and check the result

print(voting_clf.estimators_)

# Filter by class name instead of deleting by position: positional `del` calls shift
# the remaining indices and are not idempotent, so re-running the cell would remove
# the wrong estimators (and eventually raise IndexError).
# NOTE(review): only the fitted list `estimators_` is edited here; the `estimators`
# parameter still lists every classifier, so refitting would presumably bring them back.
low_scoring = {
    "LinearSVC",
    "AdaBoostClassifier",
    "DecisionTreeClassifier",
    "RidgeClassifier",
}
# Slice assignment keeps the same list object, as the original in-place deletes did
voting_clf.estimators_[:] = [
    est for est in voting_clf.estimators_
    if type(est).__name__ not in low_scoring
]
print(voting_clf.estimators_)

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False), ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=F

In [None]:
# Compare scores after removing the low-scoring classifiers

# The ensemble score is computed once; the previous extra bare call discarded its
# result and only repeated the full prediction pass over the validation set.
print('VotingClassifier :', voting_clf.score(X_val, y_val) )
for estimator in voting_clf.estimators_:
    print(estimator.__class__.__name__, ':', estimator.score(X_val, y_val))


VotingClassifier : 0.9726
RandomForestClassifier : 0.9692
ExtraTreesClassifier : 0.9715
MLPClassifier : 0.9639
KNeighborsClassifier : 0.9702
BaggingClassifier : 0.9257
XGBClassifier : 0.9349


In [None]:
# Remove more of the lower-scoring classifiers from the fitted ensemble and check the result

print(voting_clf.estimators_)

# Filter by class name instead of deleting by position so that re-running this cell
# is harmless (repeated positional `del` calls would raise IndexError or remove
# estimators that should stay).
low_scoring = {"BaggingClassifier", "XGBClassifier"}
# Slice assignment keeps the same list object, as the original in-place deletes did
voting_clf.estimators_[:] = [
    est for est in voting_clf.estimators_
    if type(est).__name__ not in low_scoring
]

print(voting_clf.estimators_)

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False), ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=F

In [None]:
# Compare scores after removing the low-scoring classifiers

# The ensemble score is computed once; the previous extra bare call discarded its
# result and only repeated the full prediction pass over the validation set.
print('VotingClassifier :', voting_clf.score(X_val, y_val) )
for estimator in voting_clf.estimators_:
    print(estimator.__class__.__name__, ':', estimator.score(X_val, y_val))

VotingClassifier : 0.9778
RandomForestClassifier : 0.9692
ExtraTreesClassifier : 0.9715
MLPClassifier : 0.9639
KNeighborsClassifier : 0.9702


In [None]:
# Try soft voting on the already-fitted ensemble and score it on the validation set

voting_clf.voting = "soft"
soft_val_score = voting_clf.score(X_val, y_val)
print('VotingClassifier_Soft  :', soft_val_score)

VotingClassifier_Soft  : 0.9792


In [None]:
# Final check on the test set: score the VotingClassifier with both soft and hard voting,
# then compare against each remaining member classifier

for mode, label in (("soft", 'VotingClassifier_Soft :'), ("hard", 'VotingClassifier_Hard :')):
    voting_clf.voting = mode
    print(label, voting_clf.score(X_test, y_test))

for member in voting_clf.estimators_:
    print(type(member).__name__, ':', member.score(X_test, y_test))


VotingClassifier_Soft : 0.976
VotingClassifier_Hard : 0.9712
RandomForestClassifier : 0.9645
ExtraTreesClassifier : 0.9691
MLPClassifier : 0.9604
KNeighborsClassifier : 0.9672
