In [1]:
import pandas as pd

from stack_ensembler import stack_ensemble

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# generate toy dataset
x_train, y_train = make_classification(n_samples=1000, n_features=5, n_informative=2, 
                                       n_redundant=3, random_state=42)

# create train-test split
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.25)

# print dataset
pd.DataFrame(x_train).head()

Unnamed: 0,0,1,2,3,4
0,-0.56957,-1.241575,-1.43758,-0.364237,0.870827
1,-1.519048,0.533777,-1.436711,-1.090573,0.525579
2,0.010513,0.721858,0.462312,-0.014935,-0.342712
3,-1.254097,1.130708,-0.755899,-0.921738,0.111433
4,0.803684,-1.502468,-0.000564,0.614797,0.292106


In [3]:
# use logistic regression for baseline
model = LogisticRegression()
preds = model.fit(x_train, y_train).predict(x_test)

# print results
print(classification_report(preds, y_test))
print(accuracy_score(preds, y_test))

             precision    recall  f1-score   support

          0       0.77      0.82      0.79       111
          1       0.85      0.81      0.83       139

avg / total       0.81      0.81      0.81       250

0.812


In [4]:
# define first-level models
models = [ 
    RandomForestClassifier(), 
    KNeighborsClassifier(),
    MLPClassifier()
]

# get data with meta-features
x_train_stack, x_test_stack = stack_ensemble(models, x_train, y_train, x_test)

# print new dataframe
pd.DataFrame(x_train_stack).head()

Getting predictions from RandomForestClassifier..
Getting predictions from KNeighborsClassifier..
Getting predictions from MLPClassifier..




Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.56957,-1.241575,-1.43758,-0.364237,0.870827,0.0,0.0,0.0
1,-1.519048,0.533777,-1.436711,-1.090573,0.525579,0.0,0.0,0.0
2,0.010513,0.721858,0.462312,-0.014935,-0.342712,1.0,1.0,1.0
3,-1.254097,1.130708,-0.755899,-0.921738,0.111433,0.0,0.0,0.0
4,0.803684,-1.502468,-0.000564,0.614797,0.292106,1.0,1.0,1.0


In [5]:
# use logistic regression on stack ensembled data
preds = model.fit(x_train_stack, y_train).predict(x_test_stack)

# print results
print(classification_report(preds, y_test))
print(accuracy_score(preds, y_test))

             precision    recall  f1-score   support

          0       0.89      0.88      0.88       120
          1       0.89      0.90      0.89       130

avg / total       0.89      0.89      0.89       250

0.888
