In [61]:
import sys
sys.path.append('../lib')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import xgboost as xgb

import lightgbm as lgb

# from Ensembler import Ensembler
# from CVHelper import CVHelper

In [62]:
from sklearn.model_selection import KFold

In [132]:
munged_train_df = pd.read_csv('../data/offline/train.csv', index_col=0)
munged_test_df = pd.read_csv('../data/offline/test.csv', index_col=0)
y_train_df = pd.read_csv('../data/offline/y_train.csv', index_col=0)

X_train, X_test, y_train, y_test = train_test_split(munged_train_df.values, y_train_df['y'].values, test_size=0.5, random_state=1729)
X_all_train = munged_train_df.values
y_all_train = y_train_df['y'].values
X_all_test = munged_test_df.values

In [64]:
xgb_model = xgb.XGBRegressor(n_estimators=700, max_depth=4, learning_rate=0.005, subsample=0.92)
gbm_model = lgb.LGBMRegressor(max_depth=4, num_leaves=9, learning_rate=0.005, n_estimators=700, subsample=0.95, seed=1729)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_all_train, y_all_train, test_size=0.2, random_state=1729)

In [79]:
gbm_model.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.005,
       max_bin=255, max_depth=4, min_child_samples=10, min_child_weight=5,
       min_split_gain=0, n_estimators=700, nthread=-1, num_leaves=9,
       objective='regression', reg_alpha=0, reg_lambda=0, seed=1729,
       silent=True, subsample=0.95, subsample_for_bin=50000,
       subsample_freq=1)

In [80]:
gbm_model.predict(X_test)

array([ 111.28486402,  103.56439499,   94.3795449 ,   94.72813677,
         93.00123864,   92.87310075,  112.65502729,  110.85753561,
         79.33678918,   94.4903027 ,  111.77597672,   94.94908393,
        112.26045079,  112.20752699,  111.343404  ,  112.96123263,
        114.02633395,  111.75336583,   99.76286141,  104.66584692,
         91.97186382,  112.08194613,  103.31681127,  109.78810989,
        111.74016419,  112.89916386,  111.47168633,  103.56214601,
        111.49795255,  111.53722258,  115.28387314,  104.5838231 ,
        103.60365761,   93.91246323,   94.14753308,   95.71383402,
         96.17601389,  111.41062696,   93.94903229,   94.03706627,
         92.86420262,   93.95423819,   93.56565279,  111.60076835,
        102.79139046,   95.14764573,  103.38616764,  115.76733145,
         92.91749223,   94.637335  ,   94.65487685,  111.47910869,
         94.11316148,  118.71451394,  103.30409453,   93.48515382,
         98.59960094,   94.1004158 ,   94.44317793,   99.36432

In [101]:
class Ensembler(object):
    def __init__(self, n_folds, base_models):
        self.n_folds = n_folds
        self.base_models = base_models
        self.level_one_train = np.zeros((1, 1));
        self.level_one_test = np.zeros((1, 1));
        self.X = None
        self.y = None
        self.T = None


    def fit_predict(self, X, y, T):
        self.X = np.array(X)
        self.y = np.array(y)
        self.T = np.array(T)

        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=2016)

        self.level_one_train = np.zeros((X.shape[0], len(self.base_models)))
        self.level_one_test = np.zeros((T.shape[0], len(self.base_models)))

        print(len(self.base_models))
        for i, model in enumerate(self.base_models):
            print("i=%d"%i)
            S_test_i = np.zeros((T.shape[0], self.n_folds))
            
            for j, (train_idx, test_idx) in enumerate(kf.split(self.X)):
                print("j=%d"%j)
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_test = X[test_idx]

                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                self.level_one_train[test_idx, i] = y_pred
                S_test_i[:, j] = model.predict(T)

            self.level_one_test[:, i] = S_test_i.mean(1)

        return self.level_one_train, self.level_one_test


    def stacking(self, stacker_model):
        stacker_model.fit(self.level_one_train, self.y)
        y_pred = stacker_model.predict(self.level_one_test)[:]
        return y_pred

In [133]:
ensembler = Ensembler(5, [gbm_model, xgb_model])
level_one_train, level_one_test = ensembler.fit_predict(X_all_train, y_all_train, X_all_test)

2
i=0
j=0
j=1
j=2
j=3
j=4
i=1
j=0
j=1
j=2
j=3
j=4


In [134]:
xgb_stacking_model = xgb.XGBRegressor(n_estimators=60, max_depth=2, learning_rate=0.1, subsample=0.7)
y_pred = ensembler.stacking(xgb_stacking_model)

In [135]:
output = pd.DataFrame({'id': munged_test_df.index, 'y': y_pred})
output.to_csv('../data/online/lightbgm-7-10-6-5_6-13-1.csv', index=False)

In [131]:
r2_score(y_test, y_pred)

0.47858484293991843

In [122]:
level_one_train

array([[  93.03326054,   89.82991791],
       [  92.43403738,   89.46440887],
       [ 111.51614073,  108.63038635],
       ..., 
       [ 102.92917421,  100.02684784],
       [  93.67845383,   90.55873108],
       [ 104.04511104,  101.03120422]])

In [119]:
y_train

array([  92.91,   87.94,  106.52, ...,   98.95,   91.97,  111.68])