## Data

In [42]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [43]:
cancer_data = load_breast_cancer()

In [44]:
X_data = cancer_data.data
y_label = cancer_data.target

In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_label,
    test_size=0.2,
    random_state=0
)

## Models

- 개별 ML 모델 생성

In [46]:
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier()
ada_clf = AdaBoostClassifier(n_estimators=100)

- 스태킹으로 만들어진 데이터셋을 학습할 최종 모델 생성

In [47]:
lr_final = LogisticRegression()

## Train

- 개별 ML 모델 학습

In [48]:
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=100)

In [49]:
knn_pred = knn_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)
ada_pred = ada_clf.predict(X_test)

- 각 개별 모델의 정확도 확인

In [50]:
accuracy_score(y_test, knn_pred)

0.9210526315789473

In [51]:
accuracy_score(y_test, rf_pred)

0.9649122807017544

In [52]:
accuracy_score(y_test, dt_pred)

0.9122807017543859

In [53]:
accuracy_score(y_test, ada_pred)

0.956140350877193

## 기본 스태킹 모델

- 개별 ML 모델의 예측 결과를 피처로 만듦

In [55]:
pred = np.array([knn_pred, rf_pred, dt_pred, ada_pred]) ## 예측 결과를 행 형태로 붙임
pred.shape

(4, 114)

In [57]:
pred = np.transpose(pred)
pred.shape

(114, 4)

In [59]:
pred[:5]

array([[0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 1, 1, 1],
       [1, 1, 1, 1]])

- 최종 메타 모델인 로지스틱 회귀 학습 및 평가 수행

In [60]:
lr_final.fit(pred, y_test)
final = lr_final.predict(pred)

In [61]:
## 최종 메타 모델의 예측 정확도
accuracy_score(y_test, final)

0.9736842105263158

## CV 세트 기반의 스태킹 모델