# ◆ Ensemble
## * Bagging
> * Bootstrapping(복원추출)된 샘플들로 학습시킨 뒤 학습된 모델의 결과를 집계하여 최종결과를 도출하는 과정
> 
## * Boosting
> * 약한 분류기를 결합하여 강한 분류기를 만드는 과정<br>
> * ex) accuracy가 0.3인 분류기 A를 보완하여 분류기B,<br> 
B를 보완하여 분류기 C를 만들어 accuracy가 0.7인 최종 분류기를 만드는 과정

<img src="https://t1.daumcdn.net/cfile/tistory/995D67335C46BA4114" width = "700" height="700"/>

![](https://www.researchgate.net/profile/Ryan-Byrne-2/publication/334404567/figure/fig4/AS:862185249071106@1582572390782/Illustrations-of-A-bagging-and-B-boosting-ensemble-algorithms.ppm)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('./data_ml/titanic_train.csv')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
df.isnull().sum()/len(df)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [4]:
df.drop('Cabin',axis=1,inplace=True)

In [5]:
df['Age'] = df.Age.fillna(df.Age.mean())
df['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [7]:
y = df.Survived
x = df.drop(columns = ['Survived','PassengerId','Name','Ticket'])

In [8]:
x

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.000000,1,0,7.2500,S
1,1,female,38.000000,1,0,71.2833,C
2,3,female,26.000000,0,0,7.9250,S
3,1,female,35.000000,1,0,53.1000,S
4,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.000000,0,0,13.0000,S
887,1,female,19.000000,0,0,30.0000,S
888,3,female,29.699118,1,2,23.4500,S
889,1,male,26.000000,0,0,30.0000,C


In [9]:
x = pd.get_dummies(x)
x

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.000000,1,0,7.2500,0,1,0,0,1
1,1,38.000000,1,0,71.2833,1,0,1,0,0
2,3,26.000000,0,0,7.9250,1,0,0,0,1
3,1,35.000000,1,0,53.1000,1,0,0,0,1
4,3,35.000000,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,2,27.000000,0,0,13.0000,0,1,0,0,1
887,1,19.000000,0,0,30.0000,1,0,0,0,1
888,3,29.699118,1,2,23.4500,1,0,0,0,1
889,1,26.000000,0,0,30.0000,0,1,1,0,0


In [10]:
y.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [11]:
# train,test data split

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y ,test_size=0.2, random_state=123, stratify= y) #y의 비율과 맞게 뽑아

## ◆ VotingClassifier

In [13]:
# 어떤 알고리즘이 최적이냐 

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import VotingClassifier

In [15]:
import warnings
warnings.filterwarnings(action='ignore')

In [16]:
tree = DecisionTreeClassifier()
log = LogisticRegression()
knn = KNeighborsClassifier()
nb = GaussianNB()
svc = SVC(probability=True)

vot_h = VotingClassifier(estimators = [('log',log),('tree',tree),('knn',knn),('nb',nb),('svc',svc)], voting='hard')

vot_s = VotingClassifier(estimators = [('log',log),('tree',tree),('knn',knn),('nb',nb),('svc',svc)], voting='soft')

In [17]:
models = [tree,log,knn,nb,svc,vot_h,vot_s]

In [18]:
for m in models:
    m.fit(x_train,y_train)
    accuracy = m.score(x_test,y_test)
    print(m.__class__.__name__,':',accuracy)
    print('-------------------')

DecisionTreeClassifier : 0.7318435754189944
-------------------
LogisticRegression : 0.8100558659217877
-------------------
KNeighborsClassifier : 0.6703910614525139
-------------------
GaussianNB : 0.8044692737430168
-------------------
SVC : 0.6983240223463687
-------------------
VotingClassifier : 0.7821229050279329
-------------------
VotingClassifier : 0.776536312849162
-------------------


In [19]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [20]:
x = cancer.data
y = cancer.target

In [21]:
len(x)

569

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2 ,random_state=123 , stratify=y)

In [23]:
tree = DecisionTreeClassifier()
log = LogisticRegression()
knn = KNeighborsClassifier()
nb = GaussianNB()
svc = SVC(probability=True)

vot_h = VotingClassifier(estimators = [('log',log),('tree',tree),('knn',knn),('nb',nb),('svc',svc)], voting='hard')

vot_s = VotingClassifier(estimators = [('log',log),('tree',tree),('knn',knn),('nb',nb),('svc',svc)], voting='soft')

In [24]:
models = [tree,log,knn,nb,svc,vot_h,vot_s]

In [25]:
for m in models:
    m.fit(x_train,y_train)
    accuracy = m.score(x_test,y_test)
    print(m.__class__.__name__,':',accuracy)
    print('-------------------')

DecisionTreeClassifier : 0.956140350877193
-------------------
LogisticRegression : 0.956140350877193
-------------------
KNeighborsClassifier : 0.9298245614035088
-------------------
GaussianNB : 0.956140350877193
-------------------
SVC : 0.9385964912280702
-------------------
VotingClassifier : 0.9649122807017544
-------------------
VotingClassifier : 0.9824561403508771
-------------------


## ◆ Randomforest

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
rfc = RandomForestClassifier()

In [28]:
rfc.fit(x_train,y_train)

RandomForestClassifier()

In [29]:
rfc.score(x_test,y_test)

0.9649122807017544

## ◆ Adaboost

In [31]:
from sklearn.ensemble import AdaBoostClassifier

In [32]:
ada = AdaBoostClassifier()

In [33]:
ada.fit(x_train,y_train)
ada.score(x_test,y_test)

0.9736842105263158

In [36]:
# load datasets 데이터 사용

In [38]:
df = pd.read_csv('./data_ml/loan_data.csv')
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [39]:
y = df['not.fully.paid']
x = df.drop(columns = 'not.fully.paid')

In [40]:
x = pd.get_dummies(x)


In [41]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=123,stratify=y)

In [42]:
tree = DecisionTreeClassifier()
log = LogisticRegression()
knn = KNeighborsClassifier()
nb = GaussianNB()
svc = SVC(probability=True)

vot_h = VotingClassifier(estimators = [('log',log),('tree',tree),('knn',knn),('nb',nb),('svc',svc)], voting='hard')

vot_s = VotingClassifier(estimators = [('log',log),('tree',tree),('knn',knn),('nb',nb),('svc',svc)], voting='soft')

models = [tree,log,knn,nb,svc,vot_h,vot_s]

In [43]:
for m in models:
    m.fit(x_train,y_train)
    accuracy = m.score(x_test,y_test)
    print(m.__class__.__name__,':',accuracy)
    print('-------------------')

DecisionTreeClassifier : 0.7411273486430062
-------------------
LogisticRegression : 0.8397703549060542
-------------------
KNeighborsClassifier : 0.8115866388308977
-------------------
GaussianNB : 0.824634655532359
-------------------
SVC : 0.8397703549060542
-------------------
VotingClassifier : 0.8392484342379958
-------------------
VotingClassifier : 0.8387265135699373
-------------------


## ◆ gradientboost

In [45]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(x_train,y_train)
gbc.score(x_train, y_train)

0.8519968676585747

In [46]:
!pip install xgboost



In [47]:
!pip install lightgbm



In [48]:
!pip install catboost



In [49]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [50]:
xgb = XGBClassifier()
xgb.fit(x_train,y_train)
xgb.score(x_test,y_test)



0.824634655532359

In [51]:
lgbm = LGBMClassifier()
lgbm.fit(x_train,y_train)
lgbm.score(x_test,y_test)

0.8366388308977035

In [52]:
catboost = CatBoostClassifier()
catboost.fit(x_train,y_train)
catboost.score(x_test,y_test)

Learning rate set to 0.024578
0:	learn: 0.6780198	total: 145ms	remaining: 2m 24s
1:	learn: 0.6643225	total: 151ms	remaining: 1m 15s
2:	learn: 0.6504015	total: 161ms	remaining: 53.4s
3:	learn: 0.6373942	total: 167ms	remaining: 41.6s
4:	learn: 0.6256968	total: 171ms	remaining: 34s
5:	learn: 0.6137008	total: 175ms	remaining: 29s
6:	learn: 0.6029378	total: 179ms	remaining: 25.3s
7:	learn: 0.5930166	total: 182ms	remaining: 22.5s
8:	learn: 0.5843993	total: 184ms	remaining: 20.3s
9:	learn: 0.5754519	total: 187ms	remaining: 18.5s
10:	learn: 0.5667295	total: 189ms	remaining: 17s
11:	learn: 0.5587053	total: 191ms	remaining: 15.7s
12:	learn: 0.5509949	total: 193ms	remaining: 14.7s
13:	learn: 0.5440525	total: 196ms	remaining: 13.8s
14:	learn: 0.5373656	total: 197ms	remaining: 13s
15:	learn: 0.5312762	total: 199ms	remaining: 12.2s
16:	learn: 0.5253954	total: 200ms	remaining: 11.6s
17:	learn: 0.5199595	total: 202ms	remaining: 11s
18:	learn: 0.5147442	total: 203ms	remaining: 10.5s
19:	learn: 0.509352

170:	learn: 0.3922758	total: 502ms	remaining: 2.43s
171:	learn: 0.3920815	total: 504ms	remaining: 2.42s
172:	learn: 0.3919420	total: 506ms	remaining: 2.42s
173:	learn: 0.3918442	total: 508ms	remaining: 2.41s
174:	learn: 0.3916389	total: 510ms	remaining: 2.4s
175:	learn: 0.3915542	total: 512ms	remaining: 2.4s
176:	learn: 0.3913517	total: 514ms	remaining: 2.39s
177:	learn: 0.3912907	total: 516ms	remaining: 2.38s
178:	learn: 0.3911312	total: 518ms	remaining: 2.38s
179:	learn: 0.3909719	total: 521ms	remaining: 2.37s
180:	learn: 0.3908016	total: 523ms	remaining: 2.36s
181:	learn: 0.3906672	total: 524ms	remaining: 2.36s
182:	learn: 0.3905197	total: 526ms	remaining: 2.35s
183:	learn: 0.3904490	total: 529ms	remaining: 2.34s
184:	learn: 0.3902674	total: 531ms	remaining: 2.34s
185:	learn: 0.3901121	total: 533ms	remaining: 2.33s
186:	learn: 0.3899948	total: 535ms	remaining: 2.33s
187:	learn: 0.3899006	total: 537ms	remaining: 2.32s
188:	learn: 0.3896562	total: 539ms	remaining: 2.31s
189:	learn: 0.

353:	learn: 0.3667978	total: 861ms	remaining: 1.57s
354:	learn: 0.3666286	total: 864ms	remaining: 1.57s
355:	learn: 0.3665353	total: 866ms	remaining: 1.57s
356:	learn: 0.3663618	total: 868ms	remaining: 1.56s
357:	learn: 0.3661809	total: 870ms	remaining: 1.56s
358:	learn: 0.3660091	total: 872ms	remaining: 1.56s
359:	learn: 0.3659365	total: 874ms	remaining: 1.55s
360:	learn: 0.3659314	total: 875ms	remaining: 1.55s
361:	learn: 0.3657981	total: 877ms	remaining: 1.55s
362:	learn: 0.3656548	total: 879ms	remaining: 1.54s
363:	learn: 0.3655473	total: 881ms	remaining: 1.54s
364:	learn: 0.3653581	total: 883ms	remaining: 1.54s
365:	learn: 0.3650878	total: 885ms	remaining: 1.53s
366:	learn: 0.3649133	total: 887ms	remaining: 1.53s
367:	learn: 0.3648406	total: 889ms	remaining: 1.53s
368:	learn: 0.3647161	total: 891ms	remaining: 1.52s
369:	learn: 0.3646062	total: 893ms	remaining: 1.52s
370:	learn: 0.3644387	total: 895ms	remaining: 1.52s
371:	learn: 0.3642284	total: 897ms	remaining: 1.51s
372:	learn: 

535:	learn: 0.3425908	total: 1.22s	remaining: 1.05s
536:	learn: 0.3425034	total: 1.22s	remaining: 1.05s
537:	learn: 0.3423547	total: 1.22s	remaining: 1.05s
538:	learn: 0.3421541	total: 1.23s	remaining: 1.05s
539:	learn: 0.3420442	total: 1.23s	remaining: 1.04s
540:	learn: 0.3419274	total: 1.23s	remaining: 1.04s
541:	learn: 0.3417325	total: 1.23s	remaining: 1.04s
542:	learn: 0.3416110	total: 1.23s	remaining: 1.04s
543:	learn: 0.3414773	total: 1.24s	remaining: 1.03s
544:	learn: 0.3413360	total: 1.24s	remaining: 1.03s
545:	learn: 0.3411796	total: 1.24s	remaining: 1.03s
546:	learn: 0.3410712	total: 1.24s	remaining: 1.03s
547:	learn: 0.3410622	total: 1.24s	remaining: 1.02s
548:	learn: 0.3409169	total: 1.24s	remaining: 1.02s
549:	learn: 0.3407615	total: 1.25s	remaining: 1.02s
550:	learn: 0.3407222	total: 1.25s	remaining: 1.02s
551:	learn: 0.3406445	total: 1.25s	remaining: 1.01s
552:	learn: 0.3405387	total: 1.25s	remaining: 1.01s
553:	learn: 0.3404469	total: 1.25s	remaining: 1.01s
554:	learn: 

718:	learn: 0.3215220	total: 1.58s	remaining: 617ms
719:	learn: 0.3213626	total: 1.58s	remaining: 615ms
720:	learn: 0.3212532	total: 1.58s	remaining: 612ms
721:	learn: 0.3211500	total: 1.58s	remaining: 610ms
722:	learn: 0.3210598	total: 1.59s	remaining: 608ms
723:	learn: 0.3209450	total: 1.59s	remaining: 606ms
724:	learn: 0.3209300	total: 1.59s	remaining: 603ms
725:	learn: 0.3208397	total: 1.59s	remaining: 601ms
726:	learn: 0.3206451	total: 1.59s	remaining: 599ms
727:	learn: 0.3205628	total: 1.6s	remaining: 597ms
728:	learn: 0.3204096	total: 1.6s	remaining: 594ms
729:	learn: 0.3202976	total: 1.6s	remaining: 592ms
730:	learn: 0.3201864	total: 1.6s	remaining: 590ms
731:	learn: 0.3201814	total: 1.6s	remaining: 587ms
732:	learn: 0.3200601	total: 1.61s	remaining: 585ms
733:	learn: 0.3199950	total: 1.61s	remaining: 583ms
734:	learn: 0.3198942	total: 1.61s	remaining: 581ms
735:	learn: 0.3197241	total: 1.61s	remaining: 579ms
736:	learn: 0.3196082	total: 1.61s	remaining: 577ms
737:	learn: 0.319

898:	learn: 0.3024173	total: 1.94s	remaining: 217ms
899:	learn: 0.3022904	total: 1.94s	remaining: 215ms
900:	learn: 0.3022017	total: 1.94s	remaining: 213ms
901:	learn: 0.3020286	total: 1.94s	remaining: 211ms
902:	learn: 0.3019836	total: 1.94s	remaining: 209ms
903:	learn: 0.3018503	total: 1.95s	remaining: 207ms
904:	learn: 0.3017333	total: 1.95s	remaining: 205ms
905:	learn: 0.3016412	total: 1.95s	remaining: 202ms
906:	learn: 0.3015319	total: 1.95s	remaining: 200ms
907:	learn: 0.3014464	total: 1.95s	remaining: 198ms
908:	learn: 0.3013539	total: 1.96s	remaining: 196ms
909:	learn: 0.3012780	total: 1.96s	remaining: 194ms
910:	learn: 0.3012162	total: 1.96s	remaining: 191ms
911:	learn: 0.3011389	total: 1.96s	remaining: 189ms
912:	learn: 0.3009994	total: 1.96s	remaining: 187ms
913:	learn: 0.3009446	total: 1.97s	remaining: 185ms
914:	learn: 0.3008660	total: 1.97s	remaining: 183ms
915:	learn: 0.3007844	total: 1.97s	remaining: 181ms
916:	learn: 0.3006807	total: 1.97s	remaining: 179ms
917:	learn: 

0.8376826722338204