In [1]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Ensemble learning

- Data https://archive.ics.uci.edu/ml/datasets/letter+recognition
- URL https://www.pluralsight.com/guides/ensemble-modeling-scikit-learn

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

### Load data

In [3]:
df = pd.read_csv('data/letter-recognition.data')
print(df.shape)
df.head(10)

(19999, 17)


Unnamed: 0,T,2,8,3,5,1,8.1,13,0,6,6.1,10,8.2,0.1,8.3,0.2,8.4
0,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
1,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
2,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
3,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
4,S,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7
5,B,4,2,5,4,4,8,7,6,6,7,6,6,2,8,7,10
6,A,1,1,3,2,1,8,2,2,2,8,2,8,1,6,2,7
7,J,2,2,4,4,2,10,6,2,6,12,4,8,1,6,1,7
8,M,11,15,13,9,7,13,2,6,2,12,1,9,8,1,1,8
9,X,3,9,5,7,4,8,7,3,8,5,6,8,2,8,6,7


### Create arrays for the features and the response variable

In [4]:
y = df['T'].values
x = df.drop('T', axis=1).values

### Logistic Regression

In [5]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
 
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[210   0   0   1   0   0   0   0   0   2   3   0   2   1   0   0   1   2
    5   3   1   0   0   0   1   0]
 [  1 180   1   7   1   5   1   4   3   0   2   0   0   0   3   0   3   8
   12   0   0   3   0   0   0   2]
 [  0   0 166   0   3   1  15   1   0   0  15   4   0   0   1   0   2   0
    0   2   2   0   0   0   0   0]
 [  0   8   0 195   0   1   1   3   0   1   3   0   2   3   1   1   2   7
    0   3   0   0   0   3   0   1]
 [  0   8   2   0 194   1  15   0   0   0   1   1   0   0   0   0   3   3
    9   1   0   0   0   3   0   3]
 [  0   1   1   3   7 182   9   1   1   1   1   0   0   3   0   6   0   1
    9   7   1   0   0   3   3   1]
 [  2   3  31   3   1   1 126   3   0   0  10   2   1   0   5   3  22   6
    2   0   0   5   0   0   0   0]
 [  3   4   0  15   0   5   4 105   0   1  13   1   2   8  27   1   4  13
    0   0  11   4   0   3   2   0]
 [  0   5   0   4   0   2   0   0 173   6   1   2   0   0   1   0   0   0
    2   0   0   0   0   8   0   3]
 [  8   0   0   3  

### Bagged Decision Trees for Classification

In [6]:
kfold = model_selection.KFold(n_splits=10, shuffle=True)
model_1 = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=10)
results_1 = model_selection.cross_val_score(model_1, x, y, cv=kfold)
print(results_1.mean())

0.9479973986993496


### Random Forest Classification

In [7]:
kfold_rf = model_selection.KFold(n_splits=10, shuffle=True)
model_rf = RandomForestClassifier(n_estimators=100, max_features=5)
results_rf = model_selection.cross_val_score(model_rf, x, y, cv=kfold_rf)
print(results_rf.mean())

0.9668980990495248


### Adaptive Boosting or AdaBoost

In [8]:
kfold_ada = model_selection.KFold(n_splits=10, shuffle=True)
model_ada = AdaBoostClassifier(n_estimators=30, random_state=10)
results_ada = model_selection.cross_val_score(model_ada, x, y, cv=kfold_ada)
print(results_ada.mean())

0.2999153576788395


### Stochastic Gradient Boosting

In [9]:
kfold_sgb = model_selection.KFold(n_splits=10, shuffle=True)
model_sgb = GradientBoostingClassifier(n_estimators=100, random_state=10)
results_sgb = model_selection.cross_val_score(model_sgb, x, y, cv=kfold_sgb)
print(results_sgb.mean())

0.9181956728364181


### Stacking/Voting Ensemble

In [10]:
kfold_vc = model_selection.KFold(n_splits=10, shuffle=True)
 
estimators = []
mod_lr = LogisticRegression(max_iter=5000)
estimators.append(('logistic', mod_lr))
mod_dt = DecisionTreeClassifier()
estimators.append(('cart', mod_dt))
mod_sv = SVC()
estimators.append(('svm', mod_sv))
 
ensemble = VotingClassifier(estimators)
results_vc = model_selection.cross_val_score(ensemble, x, y, cv=kfold_vc)
print(results_vc.mean())

0.9098955977988993
