# Chapter 15

# Improve Performance with Ensembles

## Bagging Algorithms

### Bagged Decision Trees

In [2]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Score a bagged ensemble of decision trees with shuffled k-fold cross-validation.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # DataFrame -> NumPy array
X = array[:, 0:8]  # input features: the first 8 columns
Y = array[:, 8]  # target: column index 8
num_folds = 10
seed = 7  # one seed for both the fold shuffle and the ensemble, so runs repeat
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
cart = DecisionTreeClassifier()  # base learner that gets bagged
num_trees = 100  # number of trees in the bag
model = BaggingClassifier(estimator=cart, n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)  # one score per fold
print(results.mean())  # average score over all folds

0.7578263841421736


### Random Forest

In [5]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Score a random forest classifier with shuffled k-fold cross-validation.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # DataFrame -> NumPy array
X = array[:, 0:8]  # input features: the first 8 columns
Y = array[:, 8]  # target: column index 8
num_folds = 10
seed = 7  # one seed for both the fold shuffle and the forest, so runs repeat
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_trees = 100  # number of trees in the forest
model = RandomForestClassifier(n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)  # one score per fold
print(results.mean())  # average score over all folds

0.7721462747778538


### Extra Trees

In [6]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
# Score an extra-trees classifier with shuffled k-fold cross-validation.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # DataFrame -> NumPy array
X = array[:, 0:8]  # input features: the first 8 columns
Y = array[:, 8]  # target: column index 8
num_folds = 10
seed = 7  # one seed for both the fold shuffle and the ensemble, so runs repeat
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_trees = 100  # number of trees in the ensemble
model = ExtraTreesClassifier(n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)  # one score per fold
print(results.mean())  # average score over all folds

0.7656356801093642


## Boosting Algorithms

### AdaBoost

In [8]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import AdaBoostClassifier
# Score an AdaBoost classifier with shuffled k-fold cross-validation.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # DataFrame -> NumPy array
X = array[:, 0:8]  # input features: the first 8 columns
Y = array[:, 8]  # target: column index 8
num_folds = 10
seed = 7  # one seed for both the fold shuffle and the booster, so runs repeat
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_trees = 30  # number of boosting rounds (passed as n_estimators)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)  # one score per fold
print(results.mean())  # average score over all folds

0.7552460697197538


### Stochastic Gradient Boosting

In [9]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
# Score a gradient boosting classifier with shuffled k-fold cross-validation.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # DataFrame -> NumPy array
X = array[:, 0:8]  # input features: the first 8 columns
Y = array[:, 8]  # target: column index 8
num_folds = 10
seed = 7  # one seed for both the fold shuffle and the booster, so runs repeat
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_trees = 100  # number of boosting stages (passed as n_estimators)
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)  # one score per fold
print(results.mean())  # average score over all folds

0.7578947368421053


## Voting Ensemble

In [10]:
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
# Score a voting ensemble of three different classifiers with shuffled
# k-fold cross-validation.
filename = 'diabetes.csv'
data = read_csv(filename)
array = data.values  # DataFrame -> NumPy array
X = array[:, 0:8]  # input features: the first 8 columns
Y = array[:, 8]  # target: column index 8
num_folds = 10
seed = 7  # shared by the fold shuffle and the decision tree below
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)  # create k-fold cross-validation
# (name, model) pairs that each cast a vote in the ensemble.
estimators = [
    ('logistic', LogisticRegression(solver='liblinear')),
    # Seeded explicitly: VotingClassifier has no random_state to hand down,
    # and an unseeded tree can pick different splits from run to run, which
    # would make the printed score drift between executions.
    ('cart', DecisionTreeClassifier(random_state=seed)),
    ('svm', SVC()),
]
ensemble = VotingClassifier(estimators)  # create the ensemble
results = cross_val_score(ensemble, X, Y, cv=kfold)  # evaluate the ensemble, one score per fold
print(results.mean())  # print the mean accuracy

0.7695659603554341
