In [2]:
# Note: some scikit-learn classifiers do not implement the predict_proba method, which soft voting relies on.

In [3]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [4]:
# Fix NumPy's global random state so repeated runs draw the same numbers
np.random.seed(seed=2017)

In [5]:
# Load the dataset from disk, then separate it into a feature matrix
# (the first eight columns) and the target column 'Outcome'.
df = pd.read_csv("diabetes.csv")
X = df.iloc[:, 0:8]           # predictors: columns 0 through 7
y = df.loc[:, 'Outcome']      # response variable

In [6]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2017,
)

In [10]:
# Base learners that the voting ensembles combine. Each one is given the same
# fixed random_state.
LR = LogisticRegression(
    random_state=2017,
    max_iter=200,
)
RF = RandomForestClassifier(
    random_state=2017,
    n_estimators=100,
)
GBC = GradientBoostingClassifier(
    random_state=2017,
    n_estimators=100,
)

In [11]:
# Two voting ensembles over the same three base learners:
#   ECH - hard voting
#   ECS - soft voting with equal weights for the three learners
ECH = EnsembleVoteClassifier(
    clfs=[LR, RF, GBC],
    voting='hard',
)
ECS = EnsembleVoteClassifier(
    clfs=[LR, RF, GBC],
    voting='soft',
    weights=[1, 1, 1],
)

In [12]:
# Evaluate both voting ensembles: 5-fold cross-validated accuracy on the
# training split, followed by accuracy on the held-out test split.
print('5-fold cross-validation:\n')
for clf, label in zip([ECH, ECS],
                      ['Ensemble Hard Voting',
                       'Ensemble Soft Voting']):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    # Report the mean fold accuracy and the standard deviation across folds.
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    # Fit on the whole training split before scoring on the test split.
    clf.fit(X_train, y_train)
    # accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric,
    # so the value is the same either way, but keeping the documented order
    # means this line stays correct if the metric is swapped for an
    # asymmetric one such as precision or recall.
    test_accuracy = metrics.accuracy_score(y_test, clf.predict(X_test))
    print("Test Accuracy: %0.2f [%s]" % (test_accuracy, label))
    print()

5-fold cross-validation:

Train CV Accuracy: 0.76 (+/- 0.02) [Ensemble Hard Voting]
Test Accuracy: 0.80 [Ensemble Hard Voting]

Train CV Accuracy: 0.76 (+/- 0.03) [Ensemble Soft Voting]
Test Accuracy: 0.78 [Ensemble Soft Voting]

