In [1]:
import pandas as pd
import numpy as np
import os 
# Get some classifiers to evaluate
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
# score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
print(os.listdir('../input'))
#read in the dataset
df = pd.read_csv('../input/diabetes_data.csv')

#take a look at the data
df.head()
#check dataset size
df.shape
#split data into inputs and targets
X = df.drop(columns = ['diabetes'])
y = df['diabetes']
print(y.shape)
print("first 10 labels")
print(y[:10])
#split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

['diabetes_data.csv']
(768,)
first 10 labels
0    1
1    0
2    1
3    0
4    1
5    0
6    1
7    0
8    1
9    1
Name: diabetes, dtype: int64
(537, 8)
(231, 8)
(537,)
(231,)


In [3]:
seed = 1075
np.random.seed(seed)

# Create classifiers
rf = RandomForestClassifier(n_estimators=150, max_depth=4, min_samples_split=10)
et = ExtraTreesClassifier(n_estimators=150, max_depth=4, min_samples_split=10)
knn = KNeighborsClassifier()
svc = SVC(gamma='scale')
rg = RidgeClassifier()
gb = GradientBoostingClassifier(n_estimators=100, max_depth=4, min_samples_split=8)

clf_array = [rf, et, knn, svc, rg, gb]

for clf in clf_array:
    vanilla_scores = cross_val_score(clf, X, y, cv=10, n_jobs=-1)
    bagging_clf = BaggingClassifier(clf, max_samples=0.7, random_state=seed)
    bagging_scores = cross_val_score(bagging_clf, X, y, cv=10, 
       n_jobs=-1)
    
    bag_model=BaggingClassifier(clf,bootstrap=True)
    bag_model=bag_model.fit(X_train,y_train)
    ytest_pred=bag_model.predict(X_test)
    print("score on test data:", accuracy_score(ytest_pred, y_test))
    
    print ("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]".format(clf.__class__.__name__, 
                                                              vanilla_scores.mean(), vanilla_scores.std()))
    print ("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(clf.__class__.__name__, 
                                                                        bagging_scores.mean(), bagging_scores.std()))

score on test data: 0.7922077922077922
Mean of: 0.772, std: (+/-) 0.043 [RandomForestClassifier]
Mean of: 0.763, std: (+/-) 0.044 [Bagging RandomForestClassifier]

score on test data: 0.70995670995671
Mean of: 0.700, std: (+/-) 0.028 [ExtraTreesClassifier]
Mean of: 0.720, std: (+/-) 0.036 [Bagging ExtraTreesClassifier]

score on test data: 0.7489177489177489
Mean of: 0.721, std: (+/-) 0.044 [KNeighborsClassifier]
Mean of: 0.729, std: (+/-) 0.043 [Bagging KNeighborsClassifier]

score on test data: 0.7705627705627706
Mean of: 0.758, std: (+/-) 0.030 [SVC]
Mean of: 0.762, std: (+/-) 0.030 [Bagging SVC]

score on test data: 0.7835497835497836
Mean of: 0.773, std: (+/-) 0.034 [RidgeClassifier]
Mean of: 0.767, std: (+/-) 0.031 [Bagging RidgeClassifier]

score on test data: 0.8008658008658008
Mean of: 0.751, std: (+/-) 0.051 [GradientBoostingClassifier]
Mean of: 0.766, std: (+/-) 0.052 [Bagging GradientBoostingClassifier]



In [4]:
# Example of hard voting 
from sklearn.ensemble import VotingClassifier
clf = [rf, et, knn, svc, rg, gb]
eclf = VotingClassifier(estimators=[('Random Forests', rf), ('Extra Trees', et), ('KNeighbors', knn), ('SVC', svc), ('Ridge Classifier', rg)], voting='hard')
for clf, label in zip([rf, et, knn, svc, rg, gb, eclf], ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier', 'GradientBoosting', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.76 (+/- 0.05) [Random Forest]
Accuracy: 0.71 (+/- 0.03) [Extra Trees]
Accuracy: 0.72 (+/- 0.04) [KNeighbors]
Accuracy: 0.76 (+/- 0.03) [SVC]
Accuracy: 0.77 (+/- 0.03) [Ridge Classifier]
Accuracy: 0.75 (+/- 0.05) [GradientBoosting]
Accuracy: 0.76 (+/- 0.04) [Ensemble]


In [5]:
# Set up ensemble voting for bagging
ebclf_array = []

for clf in clf_array:
    ebclf_array.append(BaggingClassifier(clf, max_samples=0.7, random_state=seed))
for clf, label in (zip(ebclf_array, ['Bagging Random Forest', 'Bagging Extra Trees', 'Bagging KNeighbors',
                              'Bagging SVC', 'BaggingRidge Classifier', 'GradientBoostingBagged'])):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy',error_score='raise')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))
## Set up voting
v_eclf = VotingClassifier(estimators=[('Bagging Random Forest', ebclf_array[0]), ('Bagging Extra Trees', ebclf_array[1]), 
                                    ('Bagging KNeighbors', ebclf_array[2]), ('Bagging SVC', ebclf_array[3]), ('Bagging Ridge Classifier', ebclf_array[4])], voting='hard')
scores = cross_val_score(v_eclf, X, y, cv=10, scoring='accuracy',error_score='raise')
print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), 'Bagging Ensemble'))

Mean: 0.763, std: (+/-) 0.044 [Bagging Random Forest]
Mean: 0.720, std: (+/-) 0.036 [Bagging Extra Trees]
Mean: 0.729, std: (+/-) 0.043 [Bagging KNeighbors]
Mean: 0.762, std: (+/-) 0.030 [Bagging SVC]
Mean: 0.767, std: (+/-) 0.031 [BaggingRidge Classifier]
Mean: 0.766, std: (+/-) 0.052 [GradientBoostingBagged]
Mean: 0.769, std: (+/-) 0.035 [Bagging Ensemble]


In [6]:
from mlxtend.classifier import EnsembleVoteClassifier
import warnings
from xgboost import plot_importance
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

warnings.filterwarnings('ignore')

# Create boosting classifiers
ada_boost = AdaBoostClassifier()
grad_boost = GradientBoostingClassifier()
xgb_boost = XGBClassifier()

boost_array = [ada_boost, grad_boost, xgb_boost, gb]

eclf = EnsembleVoteClassifier(clfs=[ada_boost, grad_boost, xgb_boost, gb], voting='hard')

labels = ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble', 'Gradient Boosting']

for clf, label in zip([ada_boost, grad_boost, xgb_boost, eclf,], labels):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

Mean: 0.755, std: (+/-) 0.057 [Ada Boost]
Mean: 0.762, std: (+/-) 0.052 [Grad Boost]
Mean: 0.769, std: (+/-) 0.059 [XG Boost]
Mean: 0.770, std: (+/-) 0.050 [Ensemble]


In [7]:
from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

lr = LogisticRegression()

seed = 1075

ensemble = SuperLearner(scorer = accuracy_score, 
                        random_state=seed, 
                        folds=10,
                        verbose = 2)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

# Build the first layer
ensemble.add([rf, et, knn, rg])
# Attach the final meta estimator
ensemble.add_meta(lr)

ensemble.fit(X_train, y_train)
preds = ensemble.predict(X_test)
print("Fit data:\n%r" % ensemble.data)
print("Accuracy score:", accuracy_score(preds, y_test))


Fitting 2 layers
Processing layer-1             

[MLENS] backend: threading


done | 00:00:03
Processing layer-2             done | 00:00:00
Fit complete                        | 00:00:03

Predicting 2 layers
Processing layer-1             done | 00:00:00
Processing layer-2             done | 00:00:00
Predict complete                    | 00:00:00
Fit data:
                                   score-m  score-s  ft-m  ft-s  pt-m  pt-s
layer-1  extratreesclassifier         0.73     0.04  0.27  0.03  0.02  0.00
layer-1  kneighborsclassifier         0.72     0.07  0.00  0.00  0.01  0.00
layer-1  randomforestclassifier       0.76     0.05  0.28  0.06  0.04  0.03
layer-1  ridgeclassifier              0.77     0.05  0.00  0.00  0.00  0.00

Accuracy score: 0.7637795275590551


In [8]:
from itertools import combinations

names = ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier']

def zip_stacked_classifiers(*args):
    to_zip = []
    for arg in args:
        temp_list = []
        for i in range(len(arg) + 1):
            temp = list(map(list, combinations(arg, i)))
            temp_list.append(temp)
        combined_items = sum(temp_list, [])
#         print(map(list(combinations(arg, 2))))
#         print (len(combined_items),combined_items)
#         combined_items = sum([map(list(), combinations(arg, i)) for i in range(len(arg) + 1)], [])
        combined_items = filter(lambda x: len(x) > 0, combined_items)
#         print (list(combined_items))
        to_zip.append(combined_items) 
#     print("to_zip[0]",list(to_zip[0]))
#     print("to_zip[1]",list(to_zip[1]))
    return zip(to_zip[0], to_zip[1])

stacked_clf_list = zip_stacked_classifiers(clf_array, names)
# for clf in stacked_clf_list:
#     print("clf", clf[1])
best_combination = [0.00, ""]

for clf in stacked_clf_list:
    
    ensemble = SuperLearner(scorer = accuracy_score, 
                            random_state = seed, 
                            folds = 10)
    ensemble.add(clf[0])
    ensemble.add_meta(lr)
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict(X_test)
    accuracy = accuracy_score(preds, y_test)
    
    if accuracy > best_combination[0]:
        best_combination[0] = accuracy
        best_combination[1] = clf[1]
    
    print("Accuracy score: ", accuracy, clf[1])

print("\nBest stacking model is {} with accuracy of: ",best_combination[1], best_combination[0])

Accuracy score:  0.7834645669291339 ['Random Forest']
Accuracy score:  0.7401574803149606 ['Extra Trees']
Accuracy score:  0.7047244094488189 ['KNeighbors']
Accuracy score:  0.7519685039370079 ['SVC']
Accuracy score:  0.7559055118110236 ['Ridge Classifier']
Accuracy score:  0.7677165354330708 ['Random Forest', 'Extra Trees']
Accuracy score:  0.7874015748031497 ['Random Forest', 'KNeighbors']
Accuracy score:  0.7834645669291339 ['Random Forest', 'SVC']
Accuracy score:  0.7716535433070866 ['Random Forest', 'Ridge Classifier']
Accuracy score:  0.7559055118110236 ['Extra Trees', 'KNeighbors']
Accuracy score:  0.7519685039370079 ['Extra Trees', 'SVC']
Accuracy score:  0.7165354330708661 ['Extra Trees', 'Ridge Classifier']
Accuracy score:  0.7637795275590551 ['KNeighbors', 'SVC']
Accuracy score:  0.7559055118110236 ['KNeighbors', 'Ridge Classifier']
Accuracy score:  0.7677165354330708 ['SVC', 'Ridge Classifier']
Accuracy score:  0.7519685039370079 ['Random Forest', 'Extra Trees', 'KNeighbors