# 기본 스태킹 모델

In [1]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [3]:
# Load the breast cancer dataset through scikit-learn's load_breast_cancer helper.
cancer_data = load_breast_cancer()

In [4]:
cancer_data

 'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'feature_names': array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
        'mean smoothness', 'mean compactness', 'mean concavity',
        'mean concave points', 'mean symmetry', 'mean fractal dimension',
        'radius error', 'texture error', 'perimeter error', 'area error',
        'smoothness error', 'compactness error', 'concavity error',
        'concave points error', 'symmetry error',
        'fractal di

In [5]:
# Pull the feature matrix and the target vector out of the loaded dataset.
X, y = cancer_data.data, cancer_data.target

In [6]:
# Hold out 20% of the samples as the test split; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
# Create the individual base models.
# NOTE(review): dt_clf and ada_clf are built without random_state (unlike rf_clf),
# so their scores may differ between runs -- confirm whether that is intended.
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier()
ada_clf = AdaBoostClassifier(n_estimators=100)

# Final meta-model that is trained on, and predicts from, the stacked dataset.
lr_final = LogisticRegression(C=10)

In [8]:
# Fit every base model on the training split.
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=None)

In [9]:
# Collect each fitted base model's predictions on the test split, then report
# a ROC-AUC score per model.
# NOTE(review): roc_auc_score is given the hard labels returned by predict(),
# not probability scores -- confirm that this is the intended metric input.
knn_pred, rf_pred, dt_pred, ada_pred = (
    clf.predict(X_test) for clf in (knn_clf, rf_clf, dt_clf, ada_clf)
)

print(f'ROC-AUC Score of KNeighborsClassifier: {roc_auc_score(y_test, knn_pred)}')
print(f'ROC-AUC Score of RandomForestClassifier: {roc_auc_score(y_test, rf_pred)}')
print(f'ROC-AUC Score of DecisionTreeClassifier: {roc_auc_score(y_test, dt_pred)}')
print(f'ROC-AUC Score of AdaBoostClassifier: {roc_auc_score(y_test, ada_pred)}')

ROC-AUC Score of KNeighborsClassifier: 0.9233089869799938
ROC-AUC Score of RandomForestClassifier: 0.9669736424261671
ROC-AUC Score of DecisionTreeClassifier: 0.900920927278501
ROC-AUC Score of AdaBoostClassifier: 0.9531597332486503


In [10]:
# Stack the four prediction vectors row-wise: one row per base model.
pred = np.vstack((knn_pred, rf_pred, dt_pred, ada_pred))
pred.shape

(4, 114)

In [11]:
# Swap the axes so each row is a sample and each column is one base model's prediction.
pred = np.transpose(pred)
pred.shape

(114, 4)

In [12]:
# Train the meta-model on the stacked base predictions and score its output.
# NOTE(review): the meta-model is fit on (pred, y_test) and then evaluated on
# that same pred / y_test pair, so this is not an out-of-sample score.
final = lr_final.fit(pred, y_test).predict(pred)

print(f'ROC-AUC Score of final meta model: {roc_auc_score(y_test, final)}')

ROC-AUC Score of final meta model: 0.9669736424261671


# CV 세트 기반의 스태킹

In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [14]:
 # Positional row selection shared by the stacking helper below.
 def _take_rows(data, index):
    """Return the rows of *data* located at the integer positions in *index*."""
    # pandas objects treat data[index] as a label/column lookup, so go through
    # .iloc when it exists; ndarrays (and anything else) are indexed directly.
    return data.iloc[index] if hasattr(data, 'iloc') else data[index]


 # Builds, from one base model, the train/test data the final meta-model will use.
 def get_stacking_base_datasets(model, X_train, y_train, X_test, n_folds):
    """Create out-of-fold stacking features from a single base model.

    The training data is split into ``n_folds`` folds with ``KFold`` (no
    shuffling). For every fold the model is fit on the remaining folds, then
    used to predict both the held-out fold and the full ``X_test``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit
            in place on every fold, so after the call it holds the fit from
            the last fold only.
        X_train: Training features; needs ``.shape`` and positional row
            selection (ndarray, or a pandas object via ``.iloc``).
        y_train: Training targets, row-aligned with ``X_train``.
        X_test: Test features passed to ``model.predict`` once per fold.
        n_folds: Number of KFold splits.

    Returns:
        A tuple ``(train_fold_pred, test_pred_mean)``:
        ``train_fold_pred`` has shape ``(X_train.shape[0], 1)`` and holds each
        training row's prediction from the fold in which it was held out;
        ``test_pred_mean`` has shape ``(X_test.shape[0], 1)`` and holds the
        per-row mean of the ``n_folds`` test predictions.
    """
    # KFold with the requested number of splits, keeping the original row order
    kf = KFold(n_splits=n_folds, shuffle=False)
    # Buffers for the meta-model's training column and the per-fold test predictions
    train_fold_pred = np.zeros((X_train.shape[0], 1))
    test_pred = np.zeros((X_test.shape[0], n_folds))
    print('Training:', model.__class__.__name__)

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train)):
        print('Training Fold set', folder_counter)
        # Carve this fold's fit/validation rows out of the training data
        X_tr = _take_rows(X_train, train_index)
        y_tr = _take_rows(y_train, train_index)
        X_te = _take_rows(X_train, valid_index)

        # Fit on the in-fold rows only
        model.fit(X_tr, y_tr)
        # Held-out predictions become this model's meta-feature for those rows
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)
        # Predict the untouched test set with the model fit on this fold
        test_pred[:, folder_counter] = model.predict(X_test)

    # Average the per-fold test predictions into a single test meta-feature column
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    # train_fold_pred feeds the meta-model's training; test_pred_mean feeds its testing
    return train_fold_pred, test_pred_mean

In [15]:
# Build the 7-fold stacking train/test columns for each base model, in the
# order KNN -> random forest -> decision tree -> AdaBoost.
(knn_train, knn_test), (rf_train, rf_test), (dt_train, dt_test), (ada_train, ada_test) = (
    get_stacking_base_datasets(clf, X_train, y_train, X_test, 7)
    for clf in (knn_clf, rf_clf, dt_clf, ada_clf)
)

Training: KNeighborsClassifier
Training Fold set 0
Training Fold set 1
Training Fold set 2
Training Fold set 3
Training Fold set 4
Training Fold set 5
Training Fold set 6
Training: RandomForestClassifier
Training Fold set 0
Training Fold set 1
Training Fold set 2
Training Fold set 3
Training Fold set 4
Training Fold set 5
Training Fold set 6
Training: DecisionTreeClassifier
Training Fold set 0
Training Fold set 1
Training Fold set 2
Training Fold set 3
Training Fold set 4
Training Fold set 5
Training Fold set 6
Training: AdaBoostClassifier
Training Fold set 0
Training Fold set 1
Training Fold set 2
Training Fold set 3
Training Fold set 4
Training Fold set 5
Training Fold set 6


In [16]:
# Place the per-model columns side by side to form the meta-model's
# training and test feature matrices (one column per base model).
stack_final_X_train = np.hstack((knn_train, rf_train, dt_train, ada_train))
stack_final_X_test = np.hstack((knn_test, rf_test, dt_test, ada_test))

# Report the original vs. stacked feature-matrix shapes.
print(f'원본 학습 피쳐 데이터 shape: {X_train.shape}, 원본 테스트 피쳐 데이터 shape: {X_test.shape}')
print(f'스태킹 학습 피쳐 데이터 shape: {stack_final_X_train.shape}, 스태킹 테스트 피쳐 데이터 shape: {stack_final_X_test.shape}')

원본 학습 피쳐 데이터 shape: (455, 30), 원본 테스트 피쳐 데이터 shape: (114, 30)
스태킹 학습 피쳐 데이터 shape: (455, 4), 스태킹 테스트 피쳐 데이터 shape: (114, 4)


In [17]:
# Fit a fresh meta-model on the out-of-fold training features, then predict
# from the averaged test features and score against the test labels.
lr_final = LogisticRegression()
stack_final = lr_final.fit(stack_final_X_train, y_train).predict(stack_final_X_test)

print(f'ROC-AUC Score of final meta model: {roc_auc_score(y_test, stack_final)}')

ROC-AUC Score of final meta model: 0.9744363289933311
