# Ensemble Voting

In [None]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier

### Prepare Dataset

In [None]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv("data/diabetes.csv")

# Features & Target
# The first 8 columns are used as features; the label is the 'class' column.
# NOTE(review): assumes 'class' is not among the first 8 columns -- confirm against the CSV.
X = df.iloc[:, :8].values
y = df['class'].values

# Split dataset into train and test sets (70% / 30%).
# random_state pins the shuffle so Restart & Run All reproduces the same split
# (and therefore the same scores); stratify=y keeps the class proportions the
# same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [None]:
from sklearn.ensemble import VotingClassifier

# Candidate models. A fixed random_state is set on every estimator that accepts
# one so that repeated runs of the notebook give the same numbers.
LR = LogisticRegression()
RF = RandomForestClassifier(n_estimators=100, random_state=0)
SVM = SVC(random_state=0, probability=True)  # probability=True enables predict_proba (needed for soft voting)
KNN = KNeighborsClassifier()
DT = DecisionTreeClassifier(random_state=0)
AdaBoost = AdaBoostClassifier(n_estimators=100, random_state=0)
Bagging = BaggingClassifier(n_estimators=100, random_state=0)
GBC = GradientBoostingClassifier(n_estimators=100, random_state=0)

labelled_models = [
    ('Logistic Regression', LR),
    ('Random Forest', RF),
    ('Support Vector Machine', SVM),
    ('KNeighbors', KNN),
    ('Decision Tree', DT),
    ('Ada Boost', AdaBoost),
    ('Bagging', Bagging),
    ('Gradient Boosting', GBC),
]

clfs = []  # fitted models, in the order they were evaluated
print('5-fold cross validation:\n')
for label, clf in labelled_models:
    # Cross-validated accuracy estimated on the training partition only.
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    # Fit on the training partition ONLY. Fitting on the full (X, y) would let
    # the model see the test rows and make the test accuracy below meaningless.
    clf.fit(X_train, y_train)
    clfs.append(clf)

    # accuracy_score expects (y_true, y_pred).
    y_pred = clf.predict(X_test)
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(y_test, y_pred)))

From the benchmarking above, we see that some models achieve better accuracy than others. Let's combine dissimilar models to build a more robust ensemble that generalizes better.



In [None]:
# Re-evaluate only the best-performing models from the benchmark.
best_models = [
    ('Decision Tree', DT),
    ('Support Vector Machine', SVM),
    ('Random Forest', RF),
    ('Bagging', Bagging),
    ('Gradient Boosting', GBC),
]

clfs = []  # reset: fitted models for the selected subset
print('5-fold cross validation:\n')
for label, clf in best_models:
    # Cross-validated accuracy estimated on the training partition only.
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    # Fit on the training partition ONLY; fitting on the full (X, y) would leak
    # the test rows into training and inflate the test accuracy.
    clf.fit(X_train, y_train)
    clfs.append(clf)

    # accuracy_score expects (y_true, y_pred).
    y_pred = clf.predict(X_test)
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(y_test, y_pred)))

# Hard Voting vs Soft Voting

In [None]:
# Base estimators shared by both ensembles.
# NOTE(review): the key 'gnb' labels the GradientBoostingClassifier; it is kept
# unchanged here so any later lookup by that name keeps working.
base_estimators = [('dt', DT), ('svm', SVM), ('rf', RF), ('bag', Bagging), ('gnb', GBC)]

# Hard voting: majority vote over predicted class labels.
VoteH = VotingClassifier(estimators=list(base_estimators), voting='hard')
# Soft voting: argmax of the (equally weighted) averaged predicted probabilities.
VoteS = VotingClassifier(estimators=list(base_estimators), voting='soft', weights=[1, 1, 1, 1, 1])

for label, clf in [('Ensemble Hard Voting', VoteH), ('Ensemble Soft Voting', VoteS)]:
    # Cross-validated accuracy estimated on the training partition only.
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    # Fit on the training partition ONLY; fitting on the full (X, y) would leak
    # the test rows into training and inflate the test accuracy.
    clf.fit(X_train, y_train)
    clfs.append(clf)  # extends the `clfs` list created in an earlier cell

    # accuracy_score expects (y_true, y_pred).
    y_pred = clf.predict(X_test)
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(y_test, y_pred)))