## 앙상블 모델과 단일 모델 성능 비교 <hr>
- 데이터 : sklearn.datasets 의 breast_cancer
- 유형 : 지도학습 + 분류
- 방법 : LogisticRegression, DecisionTree, SVC
- 학습 데이터셋 : 동일한 데이터셋으로 3개의 모델로 학습 진행

(1) 모듈 로딩 및 데이터 준비

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

In [2]:
x, y = cancerDF = load_breast_cancer(as_frame=True, return_X_y=True)

In [3]:
x.shape, y.shape

((569, 30), (569,))

In [4]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5, stratify=y)

(2) 학습 진행

(2-1) 선형 회귀 LogisticRegression

In [5]:
# LogisticRegression 인스턴스
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(x_train, y_train)

(2-2) Decsision Tree

In [6]:
# Decesion Tree 
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)

(2-3) Random Forest

In [7]:
from sklearn.svm import SVC

svc_model = SVC(probability=True)   # probability=True 를 줘야 predict_proba() 가능
svc_model.fit(x_train,y_train)

In [8]:
from sklearn.ensemble import VotingClassifier

# 동일 데이터셋으로 병렬 학습 진행할 모델 리스트 선정 및 결과
vt_models = VotingClassifier(estimators=[('lr_model', lr_model),
                                        ('dt_model', dt_model),
                                        ('svc_model', svc_model)],
                            verbose=True)

In [9]:
# 동일 데이터셋을 전달해서 3개의 모델 동시에 학습 진행
vt_models.fit(x_train, y_train)

[Voting] ................. (1 of 3) Processing lr_model, total=   0.0s
[Voting] ................. (2 of 3) Processing dt_model, total=   0.0s
[Voting] ................ (3 of 3) Processing svc_model, total=   0.0s


In [11]:
# 예측하기
vt_models.predict(x_test)

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1])

In [12]:
vt_models.estimators_[0].predict(x_test)

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0], dtype=int64)

In [19]:
# 예측하기
new_data = pd.DataFrame([x_test.iloc[0]], columns=x_test.columns)
new_data
# x_test.iloc[0].to_frame().T

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
525,8.571,13.1,54.53,221.3,0.1036,0.07632,0.02565,0.0151,0.1678,0.07126,...,9.473,18.45,63.3,275.6,0.1641,0.2235,0.1754,0.08512,0.2983,0.1049


In [20]:
# 예측하기
vt_models.predict(new_data)
# vt_models.predict_proba(new_data)

array([1])

In [22]:
# 보팅 인스턴스 내의 학습기들 => 접근 방법 (1)
# 인덱싱 방식으로 접근해야 함 => 리스트
vt_models.estimators_

[LogisticRegression(solver='liblinear'),
 DecisionTreeClassifier(),
 SVC(random_state=5)]

In [28]:
# 보팅 인스턴스 내의 학습기들 => 접근 방법 (2)
# 키(Key)로 접근해야 함 => 딕셔너리
vt_models.named_estimators_

{'lr_model': LogisticRegression(solver='liblinear'),
 'dt_model': DecisionTreeClassifier(),
 'svc_model': SVC(random_state=5)}

In [31]:
for key, value in vt_models.named_estimators_.items():
    print(key, value.predict(new_data)[0])

lr_model 1
dt_model 1
svc_model 1
