In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, ElasticNet 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression

%pylab inline
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
plt.style.use('seaborn-poster')
sns.set_palette('Set1', 10, desat=0.75)

%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


## Create data

In [2]:
# Synthetic regression problem: 3000 rows, 10 features, 5 of them informative.
# random_state pins the draw so Restart & Run All regenerates the same data.
raw_features, y = make_regression(
    n_samples=3000, n_features=10, n_informative=5, noise=10, random_state=0)
data = pd.DataFrame(
    data=raw_features,
    columns=['f' + str(i + 1) for i in range(raw_features.shape[1])])
# adding some non-linearity: shift the target by |min(y)|, square it, then rescale
y = (y + abs(y.min())) ** 2
y /= 1000

In [3]:
features = list(data.columns)
# A fixed random_state keeps the train/test partition identical between runs.
train, test, ytrain, ytest = train_test_split(
    data[features], y, test_size=0.25, random_state=0)

# The scaler is fitted on the training rows only and then applied to test.
# Rebuilding the frames from the scaled arrays also gives both a fresh
# 0..n-1 index (the shuffled row labels from the split are dropped).
scaler = StandardScaler()
train = pd.DataFrame(data=scaler.fit_transform(train[features]), columns=features)
test = pd.DataFrame(data=scaler.transform(test[features]), columns=features)

## Fit first level models

In [4]:
from stacker import Stacker

# Test labels are deliberately withheld here (ytest=None), just to be sure the
# stacker never sees them while fitting; they are only supplied later when
# test metrics are requested.
S = Stacker(
    train,
    ytrain,
    metric=mean_squared_error,
    test=test,
    ytest=None,
    features=features,
    features_to_encode=[],
    split_by=None,
    stratify_folds=False,
    n_splits=5,
    split_seed=0,
)

In [5]:
# Parameters passed to the stacker for the 'xgb' first-level model.
# NOTE(review): newer XGBoost releases call this objective 'reg:squarederror';
# 'reg:linear' is kept unchanged here — confirm against the installed version.
xgb_params = dict(
    objective='reg:linear',
    eval_metric='rmse',
    booster='gbtree',
    eta=0.05,
)
# Knobs that are currently disabled (candidate values kept for reference):
#   'seed': 0, 'alpha': 1, 'colsample_bylevel': 1, 'colsample_bytree': 0.75,
#   'gamma': 1, 'lambda': 16, 'max_bin': 1000, 'max_depth': 5,
#   'min_child_weight': 750, 'scale_pos_weight': 1

# Parameters passed to the stacker for the 'lgb' first-level model.
lgb_params = dict(
    metric='mse',
    objective='mse',
    learning_rate=0.05,
)
# Knobs that are currently disabled (candidate values kept for reference):
#   'bagging_fraction': 0.75, 'bagging_freq': 5, 'feature_fraction': 1,
#   'lambda_l1': 1, 'lambda_l2': 5, 'min_data_in_leaf': 5,
#   'min_split_gain': 10, 'num_leaves': 8,
#   'scale_pos_weight': 1/data.target.mean()

# First-level scikit-learn estimators.
# `rf` is built here even though its fit call in the next cell is commented out.
lr = ElasticNet(alpha=0.1)
knn = KNeighborsRegressor(n_neighbors=10, n_jobs=-1)
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=32,
    n_jobs=-1,
    random_state=0,
)

In [6]:
# Boosted models are referred to by name and come with their parameter dicts
# plus a 0.25 validation share.
S.fit(model='lgb', colname='lgb_1', model_params=lgb_params, valid_size=0.25)
S.fit('xgb', 'xgb_1', xgb_params, 0.25)

# scikit-learn estimators are passed as objects and use the remaining defaults.
# The random forest is intentionally left out: (rf, 'rf_1') is not fitted.
for estimator, column in ((lr, 'lr_1'), (knn, 'knn_1')):
    S.fit(estimator, column)

Metric on test fold:  133.6803
Metric on test fold:  125.0507
Metric on test fold:  71.2976
Metric on test fold:  107.7617
Metric on test fold:  112.073
Iteration OOF score: 109.9727

Metric on test fold:  93.1471
Metric on test fold:  127.342
Metric on test fold:  77.9722
Metric on test fold:  92.2932
Metric on test fold:  109.4087
Iteration OOF score: 100.0326

Metric on test fold:  175.506
Metric on test fold:  148.0047
Metric on test fold:  122.809
Metric on test fold:  151.0889
Metric on test fold:  137.164
Iteration OOF score: 146.9145

Metric on test fold:  623.5632
Metric on test fold:  520.0906
Metric on test fold:  440.582
Metric on test fold:  553.615
Metric on test fold:  507.111
Iteration OOF score: 528.9923



## First level results

In [7]:
# First rows of the train-side meta-features; the displayed columns are the
# colnames given to S.fit (presumably out-of-fold predictions — confirm in stacker).
S.train_meta.head()

Unnamed: 0,lgb_1,xgb_1,lr_1,knn_1
0,181.940641,183.037766,174.544243,147.858345
1,154.056695,145.415497,153.327952,133.719496
2,136.954266,140.401596,141.925037,125.146756
3,45.079565,42.531502,42.495055,61.598431
4,25.5767,24.869057,16.92263,49.237891


In [8]:
# First rows of the test-side meta-features, one column per fitted first-level model.
S.test_meta.head()

Unnamed: 0,lgb_1,xgb_1,lr_1,knn_1
0,103.754951,103.072655,107.30449,102.996149
1,85.259169,88.248184,93.840025,83.620337
2,101.823485,103.06097,113.424351,104.458501
3,97.114228,96.275476,104.304997,108.374389
4,37.203178,35.261732,38.040479,57.577672


Results on test are a bit better due to blending: for the test data, each model's predictions are blended n_splits times (once per fold):

In [9]:
# Metric (mean_squared_error) of each first-level model on its
# out-of-fold predictions for the training data.
S.get_metrics()

109.9727 - lgb_1 on train
100.0326 - xgb_1 on train
146.9145 - lr_1 on train
528.9923 - knn_1 on train


In [10]:
# Same metric for each first-level model on the held-out test set.
# These are the fold-blended test predictions, not out-of-fold ones;
# ytest is handed to the stacker here for the first time.
S.get_metrics(ytest)

84.2216 - lgb_1 on test
71.1094 - xgb_1 on test
133.8088 - lr_1 on test
510.9368 - knn_1 on test


## Fit second level model and inspect results

In [11]:
# Second-level (meta) learner, fitted with level=2.
# It gets its own name so the first-level `lr` (ElasticNet(alpha=0.1)) is not
# rebound: re-running the first-level fit cell afterwards would otherwise
# silently train a different model under the 'lr_1' column.
lr_meta = ElasticNet()
S.fit(lr_meta, 'lr_2', level=2)

Metric on test fold:  86.1575
Metric on test fold:  98.3113
Metric on test fold:  62.2685
Metric on test fold:  74.8713
Metric on test fold:  80.4463
Iteration OOF score: 80.411



In [12]:
# Re-inspect the test meta-features after the level-2 fit; the output below
# still lists only the four first-level columns.
S.test_meta.head()

Unnamed: 0,lgb_1,xgb_1,lr_1,knn_1
0,103.754951,103.072655,107.30449,102.996149
1,85.259169,88.248184,93.840025,83.620337
2,101.823485,103.06097,113.424351,104.458501
3,97.114228,96.275476,104.304997,108.374389
4,37.203178,35.261732,38.040479,57.577672


In [13]:
# Metric of the second-level model on the training data (the output reports lr_2).
S.get_metrics_final()

80.411 - lr_2 on train


In [14]:
# Metric of the second-level model on the held-out test set, scored against ytest.
S.get_metrics_final(ytest)

61.1626 - lr_2 on test


### The MSE of the second-level model is lower than that of the best first-level model (61 vs 71)