In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Ensemble learning

- Data: UCI Letter Recognition dataset — https://archive.ics.uci.edu/ml/datasets/letter+recognition
- Reference guide: https://www.pluralsight.com/guides/ensemble-modeling-scikit-learn

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

### Load data

In [None]:
# Load the letter-recognition data into a DataFrame.
#
# NOTE(review): this assumes data/letter-recognition.data is the unmodified UCI
# download, which has no header row. The cell below selects the label column as
# df['T'], i.e. the column names previously came from the first *data* row
# (a sample of the letter "T"): with the default header=0 that sample was
# silently dropped and the feature columns were named after its values.
# Reading with header=None keeps every row.
#
# The label column is deliberately still called 'T' so the cell that builds
# x / y keeps working unchanged. The feature names follow the UCI attribute
# list — TODO confirm against letter-recognition.names.
column_names = [
    'T',
    'x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar',
    'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx',
]
df = pd.read_csv('data/letter-recognition.data', header=None, names=column_names)
print(df.shape)
df.head(10)

### Create arrays for the features and the response variable

In [None]:
# Feature matrix: every column of df except the label column 'T'.
x = df.drop(columns=['T']).values
# Response vector: the 'T' column on its own.
y = df.loc[:, 'T'].values

### Logistic Regression

In [None]:
# Baseline: a single logistic-regression model scored on a 30% hold-out set.
# The split is seeded with the same random_state=10 that the ensemble models
# in this notebook use, so the matrix and report below are identical on every
# Restart & Run All; without a seed each run drew a different split.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=10
)

# max_iter=10000 is a generous iteration cap — presumably so the solver can
# converge on these unscaled features; verify if convergence warnings appear.
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Confusion matrix first (rows: true class, columns: predicted class),
# then per-class precision / recall / F1.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### Bagged Decision Trees for Classification

In [None]:
# 10-fold cross-validation of 100 bagged decision trees.
# shuffle=True without a seed reshuffles the folds on every run; random_state
# pins the fold assignment so the printed mean score is reproducible.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=10)
# The tree is passed positionally on purpose: the keyword for this first
# parameter was renamed from `base_estimator` to `estimator` in scikit-learn
# 1.2 and the old keyword was removed in 1.4, so `base_estimator=` raises a
# TypeError on current releases. The positional form works with both names.
model_1 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100, random_state=10)
results_1 = model_selection.cross_val_score(model_1, x, y, cv=kfold)
# Mean of the ten per-fold scores.
print(results_1.mean())

### Random Forest Classification

In [None]:
# 10-fold cross-validation of a 100-tree random forest that considers
# 5 features at each split.
# Both the fold shuffling and the forest are seeded (random_state=10, matching
# the other seeded models in this notebook); previously neither was, so the
# printed score changed from run to run.
kfold_rf = model_selection.KFold(n_splits=10, shuffle=True, random_state=10)
model_rf = RandomForestClassifier(n_estimators=100, max_features=5, random_state=10)
results_rf = model_selection.cross_val_score(model_rf, x, y, cv=kfold_rf)
# Mean of the ten per-fold scores.
print(results_rf.mean())

### Adaptive Boosting or AdaBoost

In [None]:
# 10-fold cross-validation of AdaBoost with 30 boosting rounds.
# The model was already seeded, but the shuffled folds were not, so the score
# still varied between runs; random_state on KFold pins the fold assignment.
kfold_ada = model_selection.KFold(n_splits=10, shuffle=True, random_state=10)
model_ada = AdaBoostClassifier(n_estimators=30, random_state=10)
results_ada = model_selection.cross_val_score(model_ada, x, y, cv=kfold_ada)
# Mean of the ten per-fold scores.
print(results_ada.mean())

### Stochastic Gradient Boosting

In [None]:
# 10-fold cross-validation of gradient boosting with 100 boosting stages.
# The model was already seeded, but the shuffled folds were not, so the score
# still varied between runs; random_state on KFold pins the fold assignment.
kfold_sgb = model_selection.KFold(n_splits=10, shuffle=True, random_state=10)
model_sgb = GradientBoostingClassifier(n_estimators=100, random_state=10)
results_sgb = model_selection.cross_val_score(model_sgb, x, y, cv=kfold_sgb)
# Mean of the ten per-fold scores.
print(results_sgb.mean())

### Voting Ensemble (hard majority vote)

In [None]:
# 10-fold cross-validation of a voting ensemble over three different models.
# The fold shuffling is seeded so the printed score is reproducible; it was
# previously unseeded and changed on every run.
kfold_vc = model_selection.KFold(n_splits=10, shuffle=True, random_state=10)

# Base learners as (name, estimator) pairs, in the same order as before.
# The decision tree is seeded as well, since an unseeded tree is another
# source of run-to-run variation.
mod_lr = LogisticRegression(max_iter=1000)
mod_dt = DecisionTreeClassifier(random_state=10)
mod_sv = SVC()
estimators = [
    ('logistic', mod_lr),
    ('cart', mod_dt),
    ('svm', mod_sv),
]

# VotingClassifier is left on its default voting mode.
ensemble = VotingClassifier(estimators)
results_vc = model_selection.cross_val_score(ensemble, x, y, cv=kfold_vc)
# Mean of the ten per-fold scores.
print(results_vc.mean())