# 앙상블(Ensemble)

## Bagging meta-estimator

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_boston
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

In [3]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [4]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

### Bagging을 사용한 분류

```
params = {
    "random_state" : SEED,
    "base_estimator" : base_model, #base로 사용할 Model 
    "n_estimators": 100, # base_estimator  개수
    "max_features":0.5, # 한 model이 사용할 최대 feature의 비율,
    "bootstrap_features": Flase, # Feature의 중복추출 허용
    "bootstrap" : True # Dataset 중복추출 허용 여부
}
```

#### 데이터셋 불러오기

In [5]:
iris = load_iris()
wine = load_wine()
cancer = load_breast_cancer()

#### KNN + Bagging

##### 붓꽃 데이터

In [6]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=.8, max_features=.8)

In [7]:
cross_val = cross_validate(
    estimator = base_model,
    X=iris.data, y = iris.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0008244514465332031
avg score time: 0.0013092041015625
avg test score: 0.96


In [8]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=iris.data, y = iris.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.012445545196533203
avg score time: 0.004494810104370117
avg test score: 0.9666666666666666


In [9]:
# 실습 와인 유방암 데이터 set을 적용해 보세요 

##### 와인 데이터

In [10]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [11]:
cross_val = cross_validate(
    estimator = base_model,
    X=wine.data, y = wine.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0014878273010253905
avg score time: 0.0021875858306884765
avg test score: 0.9493650793650794


In [12]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=wine.data, y = wine.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.014239978790283204
avg score time: 0.00451502799987793
avg test score: 0.972063492063492


##### 유방암 데이터

In [13]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=.5, max_features=.8)

In [14]:
cross_val = cross_validate(
    estimator = base_model,
    X=cancer.data, y = cancer.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.002014923095703125
avg score time: 0.006245088577270508
avg test score: 0.9648501785437045


In [15]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=cancer.data, y = cancer.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.017145633697509766
avg score time: 0.010694026947021484
avg test score: 0.9683744760130415


#### SVC + Bagging

##### 붓꽃 데이터

In [16]:
base_model = make_pipeline(
    StandardScaler(),
    SVC()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [17]:
cross_val = cross_validate(
    estimator = base_model,
    X=iris.data, y = iris.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0011839866638183594
avg score time: 0.00039210319519042967
avg test score: 0.9666666666666666


In [18]:

cross_val = cross_validate(
    estimator = bagging_model,
    X=iris.data, y = iris.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.020819473266601562
avg score time: 0.002507305145263672
avg test score: 0.9533333333333334


##### 와인 데이터

In [19]:
base_model = make_pipeline(
    StandardScaler(),
    SVC()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [20]:
cross_val = cross_validate(
    estimator = base_model,
    X=wine.data, y = wine.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0012569427490234375
avg score time: 0.00038976669311523436
avg test score: 0.9833333333333334


In [21]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=wine.data, y = wine.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.017487382888793944
avg score time: 0.0024641990661621095
avg test score: 0.961111111111111


##### 유방암 데이터

In [22]:
base_model = make_pipeline(
    StandardScaler(),
    SVC()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [23]:
cross_val = cross_validate(
    estimator = base_model,
    X=cancer.data, y = cancer.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.003923177719116211
avg score time: 0.001213979721069336
avg test score: 0.9736376339077782


In [24]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=cancer.data, y = cancer.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.02431516647338867
avg score time: 0.007056760787963867
avg test score: 0.9630802670392796


#### Decision Tree + Bagging

##### 붓꽃 데이터

In [25]:
base_model = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [26]:
cross_val = cross_validate(
    estimator = base_model,
    X=iris.data, y = iris.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0008026123046875
avg score time: 0.0003253936767578125
avg test score: 0.9533333333333334


In [27]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=iris.data, y = iris.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.018235397338867188
avg score time: 0.0017558574676513673
avg test score: 0.9466666666666667


##### 와인 데이터

In [28]:
base_model = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [29]:
cross_val = cross_validate(
    estimator = base_model,
    X=wine.data, y = wine.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.001415586471557617
avg score time: 0.00031299591064453124
avg test score: 0.8709523809523809


In [30]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=wine.data, y = wine.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0146575927734375
avg score time: 0.0017647266387939454
avg test score: 0.9273015873015874


##### 유방암 데이터

In [31]:
base_model = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier()
)

bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [32]:
cross_val = cross_validate(
    estimator = base_model,
    X=cancer.data, y = cancer.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.005768728256225586
avg score time: 0.00044550895690917967
avg test score: 0.9190964136003726


In [33]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=cancer.data, y = cancer.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.023106861114501952
avg score time: 0.001876688003540039
avg test score: 0.9279925477410339


### Bagging을 사용한 회귀

#### 데이터셋 불러오기

In [34]:
boston = load_boston()
diabetes = load_diabetes()

#### KNN + Bagging

##### 보스턴 주택 가격 데이터

In [35]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor()
)

bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [36]:
cross_val = cross_validate(
    estimator = base_model,
    X=boston.data, y = boston.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0013090133666992187
avg score time: 0.001387166976928711
avg test score: 0.47357748833823543


In [37]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=boston.data, y = boston.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.012032699584960938
avg score time: 0.005208301544189453
avg test score: 0.5581329876240896


##### 당뇨병 데이터

In [38]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor()
)

bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [39]:
cross_val = cross_validate(
    estimator = base_model,
    X=diabetes.data, y = diabetes.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0010829925537109374
avg score time: 0.0011868000030517579
avg test score: 0.3689720650295623


In [40]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=diabetes.data, y = diabetes.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.020571184158325196
avg score time: 0.006197690963745117
avg test score: 0.36841925121437524


#### SVR + Bagging

##### 보스턴 주택 가격 데이터

In [41]:
base_model = make_pipeline(
    StandardScaler(),
    SVR()
)

bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [42]:
cross_val = cross_validate(
    estimator = base_model,
    X=boston.data, y = boston.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.008684873580932617
avg score time: 0.002401113510131836
avg test score: 0.17631266230186618


In [43]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=boston.data, y = boston.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.026160669326782227
avg score time: 0.007461404800415039
avg test score: 0.15907645199864495


##### 당뇨병 데이터

In [44]:
base_model = make_pipeline(
    StandardScaler(),
    SVR()
)

bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [45]:
cross_val = cross_validate(
    estimator = base_model,
    X=diabetes.data, y = diabetes.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.005448722839355468
avg score time: 0.0014922618865966797
avg test score: 0.14659936199629434


In [46]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=diabetes.data, y = diabetes.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.021876430511474608
avg score time: 0.006628751754760742
avg test score: 0.07200056972228237


#### Decision Tree + Bagging

##### 보스턴 주택 가격 데이터

In [47]:
base_model = make_pipeline(
    StandardScaler(),
    DecisionTreeRegressor()
)

bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [48]:
cross_val = cross_validate(
    estimator = base_model,
    X=boston.data, y = boston.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0029764175415039062
avg score time: 0.0004002571105957031
avg test score: 0.11917692272930411


In [49]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=boston.data, y = boston.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.017389440536499025
avg score time: 0.0014261245727539063
avg test score: 0.5507792713777818


##### 당뇨병 데이터

In [50]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor()
)

bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=.5, max_features=.5)

In [51]:
cross_val = cross_validate(
    estimator = base_model,
    X=diabetes.data, y = diabetes.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.0014103889465332032
avg score time: 0.001667928695678711
avg test score: 0.3689720650295623


In [52]:
cross_val = cross_validate(
    estimator = bagging_model,
    X=diabetes.data, y = diabetes.target,
    cv=5
)
print(f"avg fit time: {cross_val['fit_time'].mean()}")
print(f"avg score time: {cross_val['score_time'].mean()}")
print(f"avg test score: {cross_val['test_score'].mean()}")

avg fit time: 0.011734867095947265
avg score time: 0.006138896942138672
avg test score: 0.43024968500302607
