In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

%pylab inline
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
plt.style.use('seaborn-poster')
sns.set_palette('Set1', 10, desat=0.75)

%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


## Create data

In [2]:
data, y = make_classification(
    n_samples=20000, n_features=10, n_informative=5, n_redundant=2, flip_y=0.15)
data = pd.DataFrame(data=data, columns=['f'+str(i+1) for i in range(data.shape[1])])
data['target'] = y

In [3]:
features = [c for c in data.columns if c != 'target']
train, test, ytrain, ytest = train_test_split(data, data.target, test_size=0.25)

scaler = StandardScaler()
train = pd.DataFrame(data=scaler.fit_transform(train[features]),  columns=features)
test  = pd.DataFrame(data=scaler.transform(test[features]),  columns=features)

## Fit first level models

In [4]:
from stacker import Stacker

S = Stacker(
    train, ytrain,
    metric=log_loss,
    test=test, ytest=None,
    features=features, features_to_encode=[],
    split_by=None, stratify_folds=False, n_splits=5, split_seed=0
)
# We are not giving ytest to model yet, just to be sure.

In [5]:
xgb_params ={
    'objective': 'binary:logistic', 
    'eval_metric': 'auc', 
    #'seed': 0, 
    'booster': 'gbtree', 
    'eta': 0.05,
    #'alpha': 1,
    #'colsample_bylevel': 1,
    #'colsample_bytree': 0.75,
    #'gamma': 1,
    #'lambda': 16,
    #'max_bin': 1000,
    #'max_depth': 5,
    #'min_child_weight': 750,
    #'scale_pos_weight': 1
}

lgb_params = {
    'metric': 'auc', 
    'objective': 'binary',
    'learning_rate' : 0.025, 
    #'bagging_fraction': 0.75,
    #'bagging_freq': 5,
    #'feature_fraction': 1,
    #'lambda_l1': 1,
    #'lambda_l2': 5,
    #'min_data_in_leaf': 5,
    #'min_split_gain': 10,
    #'num_leaves': 8,
    #'scale_pos_weight': 1/data.target.mean()
}

# Sklearn models.
lr = LogisticRegression(C=0.5)
nb = BernoulliNB()
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, max_depth=32, criterion='gini', random_state=0)
etc = ExtraTreesClassifier(n_estimators=200, n_jobs=-1, max_depth=10)
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(24,8), random_state=1, early_stopping=True)
knn = KNeighborsClassifier(n_neighbors=50, n_jobs=4, leaf_size=100, algorithm='kd_tree')

In [6]:
S.fit(
    model='lgb',
    colname='lgb_1',
    model_params=lgb_params,
    valid_size=0.25)
S.fit('xgb', 'xgb_1', xgb_params, 0.25)
S.fit(lr, 'lr_1')
S.fit(nb, 'nb_1')
S.fit(knn, 'knn_1')

Metric on test fold:  0.3655
Metric on test fold:  0.3602
Metric on test fold:  0.3586
Metric on test fold:  0.3652
Metric on test fold:  0.3512
Iteration OOF score: 0.3602

Metric on test fold:  0.3709
Metric on test fold:  0.3418
Metric on test fold:  0.3599
Metric on test fold:  0.3684
Metric on test fold:  0.3576
Iteration OOF score: 0.3597

Metric on test fold:  0.5278
Metric on test fold:  0.52
Metric on test fold:  0.5405
Metric on test fold:  0.5309
Metric on test fold:  0.5101
Iteration OOF score: 0.5259

Metric on test fold:  0.6203
Metric on test fold:  0.6119
Metric on test fold:  0.6383
Metric on test fold:  0.5925
Metric on test fold:  0.5887
Iteration OOF score: 0.6103

Metric on test fold:  0.3989
Metric on test fold:  0.4167
Metric on test fold:  0.394
Metric on test fold:  0.4156
Metric on test fold:  0.39
Iteration OOF score: 0.403



## First level results

In [7]:
S.train_meta.head()

Unnamed: 0,lgb_1,xgb_1,lr_1,nb_1,knn_1
0,0.561616,0.619793,0.491191,0.177352,0.42
1,0.881164,0.855368,0.744002,0.912788,1.0
2,0.273214,0.259756,0.436971,0.073697,0.28
3,0.12499,0.120153,0.558395,0.187555,0.22
4,0.088565,0.074827,0.28912,0.362987,0.04


In [8]:
S.test_meta.head()

Unnamed: 0,lgb_1,xgb_1,lr_1,nb_1,knn_1
0,0.895354,0.903486,0.492098,0.322522,0.868
1,0.111129,0.092397,0.196257,0.078667,0.188
2,0.133736,0.127299,0.174184,0.064883,0.164
3,0.150134,0.136195,0.199756,0.070803,0.08
4,0.149761,0.170025,0.199551,0.175806,0.144


Results on test are a bit better due to blending, because for test data each model predictions are blended n_splits times:

In [9]:
# OOFs on train data.
S.get_metrics()

0.3602 - lgb_1 on train
0.3597 - xgb_1 on train
0.5259 - lr_1 on train
0.6103 - nb_1 on train
0.403 - knn_1 on train


In [10]:
# OOFs on test data. 
S.get_metrics(ytest)

0.3512 - lgb_1 on test
0.3483 - xgb_1 on test
0.5241 - lr_1 on test
0.6044 - nb_1 on test
0.3851 - knn_1 on test


## Fit second level model and inspect results

In [11]:
lr = LogisticRegression()
S.fit(lr, 'lr_2', level=2)

Metric on test fold:  0.3622
Metric on test fold:  0.3334
Metric on test fold:  0.3491
Metric on test fold:  0.3582
Metric on test fold:  0.3476
Iteration OOF score: 0.3501



In [14]:
S.test_result.head()

Unnamed: 0,lr_2
0,0.943763
1,0.104289
2,0.113615
3,0.10063
4,0.115821


In [15]:
S.get_metrics_final()

0.3501 - lr_2 on train


In [16]:
S.get_metrics_final(ytest)

0.343 - lr_2 on test


### Logloss of second level model is lower than from the best first level model (0.343 vs 0.3483) !