In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(font_scale = 1.7)
import matplotlib.pyplot as plt
import sklearn
%matplotlib inline

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
# http://ipython.readthedocs.io/en/stable/config/options/terminal.html
import warnings
warnings.filterwarnings("ignore")

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [9]:
train_raw = train.copy()
test_raw = test.copy()

In [10]:
import lightgbm as lgb
import xgboost as xgb

In [11]:
train_x = train.drop('y', axis = 1)
train_y = train['y']
test_x = test.copy()

# Stack then average models 

https://www.kaggle.com/hakeem/stacked-then-averaged-models-0-5697

In [12]:
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.pipeline import make_pipeline, make_union
from sklearn.linear_model import LassoLarsCV, ElasticNetCV 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.utils import check_array
from sklearn.metrics import r2_score

In [13]:
# http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/

class StackingEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, estimator):
        self.estimator = estimator
        
    def fit(self, X, y = None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, "predict_proba"):
            X_transformed = np.hstack((self.estimator.predict_proba(X),X))
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X),(-1,1)),X_transformed))
        
        return X_transformed
    

In [14]:
for col in train.columns:
    if train[col].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[col].values) + list(test[col].values))
        train[col] = lbl.transform(list(train[col].values))
        test[col] = lbl.transform(list(test[col].values))
                

In [15]:
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

In [16]:
#save columns list before adding the decomposition components

usable_columns = list(set(train.columns) - set(['y']))

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]


In [18]:
y_mean = np.mean(train_y)
id_test = test['ID'].values
#finaltrainset and finaltestset are data to be used only the stacked model (does not contain PCA, SVD... arrays) 
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values

In [19]:
finaltrainset.shape

(4209, 377)

In [20]:
train.shape

(4209, 438)

In [21]:
train_raw.shape

(4209, 378)

In [22]:
import xgboost as xgb
import lightgbm as lgb

In [24]:
train_x = train.drop('y', axis = 1)
train_y = train['y']
test_x = test.copy()

In [25]:
xgb_params = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}
# NOTE: Make sure that the class is labeled 'class' in the data file

dtrain = xgb.DMatrix(train_x, train_y)
dtest = xgb.DMatrix(test_x)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

In [26]:

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred
sub.to_csv('Submission/only xgb 3' + '.csv', index = False)

## XGBoost Cross-validation

In [30]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [38]:
kf = KFold(n_splits= 10, shuffle = True)

In [40]:
cv_scores2 = []

In [41]:
for a, b in kf.split(train_x):
    train_index = a
    test_index = b
    break

In [42]:
train_x_cv, test_x_cv = train_x.ix[train_index], train_x.ix[test_index]
train_y_cv, test_y_cv = train_y.ix[train_index], train_y.ix[test_index]
dtrain_cv = xgb.DMatrix(train_x_cv, train_y_cv)
dtest_cv = xgb.DMatrix(test_x_cv)

In [43]:
%%time

xgb_params = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}
num_boost_rounds = 1250


for train_index, test_index in kf.split(train_x):
    train_x_cv, test_x_cv = train_x.ix[train_index], train_x.ix[test_index]
    train_y_cv, test_y_cv = train_y.ix[train_index], train_y.ix[test_index]
    dtrain_cv = xgb.DMatrix(train_x_cv, train_y_cv)
    dtest_cv = xgb.DMatrix(test_x_cv)
    # train model
    model = xgb.train(dict(xgb_params, silent=0), dtrain_cv, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest_cv)
    cv_score = r2_score(test_y_cv, y_pred)
    cv_scores2.append(cv_score)

CPU times: user 10min 24s, sys: 11.3 s, total: 10min 36s
Wall time: 11min 26s


In [44]:
cv_scores2

[0.61008453267941065,
 0.54288438506063597,
 0.61488459834950482,
 0.59471918661746703,
 0.64995145182257263,
 0.43672063076043377,
 0.60846458837541717,
 0.52647105740046563,
 0.54931443277397052,
 0.53761336844127672]

In [58]:
np.mean(cv_scores2)

0.56711082322811557

In [59]:
np.std(cv_scores2)

0.058215911207195631

## XGB cross-validation

GridSearchCV http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [49]:
from sklearn.model_selection import GridSearchCV

In [50]:
xgb_params_ori = {'n_trees':1000,
              'eta' : 0.0045,
            'max_depth' : 5,
              'subsample' : 0.93,
              'eval_metric':'rmse',
            'objective': 'reg:linear',
              'silent' : 1,
               'min_child_weight':1,
               'colsample_bytree':1,
               'gamma':0,
               'lambda':1,
               'reg_alpha':0,
                'base_score' : y_mean,
               'reg_lambda':0
             }


In [56]:
cv_folds = 10
nrounds = 1250

### Tune max_depth and min_child_weight

In [54]:
%%time
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = xgb.XGBRegressor( learning_rate =0.0045, n_estimators=1000, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.93, colsample_bytree=1,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=420), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=10)
gsearch1.fit(train_x, train_y)


CPU times: user 39.3 s, sys: 937 ms, total: 40.2 s
Wall time: 52min 14s


In [55]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.57035, std: 0.09820, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: 0.57208, std: 0.09911, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: 0.57200, std: 0.09972, params: {'min_child_weight': 5, 'max_depth': 3},
  mean: 0.56702, std: 0.09778, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: 0.56738, std: 0.09754, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: 0.56748, std: 0.09938, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: 0.55802, std: 0.09707, params: {'min_child_weight': 1, 'max_depth': 7},
  mean: 0.56083, std: 0.09566, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: 0.56103, std: 0.09756, params: {'min_child_weight': 5, 'max_depth': 7},
  mean: 0.55273, std: 0.09747, params: {'min_child_weight': 1, 'max_depth': 9},
  mean: 0.55510, std: 0.09494, params: {'min_child_weight': 3, 'max_depth': 9},
  mean: 0.55507, std: 0.09645, params: {'min_child_weight': 5, 'max_depth': 9}],
 {'max_depth': 3, 'min_child_weight': 3

In [60]:
%%time
xgb_params_tune1 = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 3,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1,
    'min_child_weight':3
}
# NOTE: Make sure that the class is labeled 'class' in the data file



num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params_tune1, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

CPU times: user 45.6 s, sys: 352 ms, total: 46 s
Wall time: 46.8 s


In [62]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred
sub.to_csv('Submission/gridcv_1' + '.csv', index = False)

In [63]:
%%time
param_test2 = {
 'max_depth':range(2,5,1),
 'min_child_weight':range(1,4,1)
}
gsearch1 = GridSearchCV(estimator = xgb.XGBRegressor( learning_rate =0.0045, n_estimators=1000, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.93, colsample_bytree=1,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=420), 
 param_grid = param_test2, scoring='r2',n_jobs=4,iid=False, cv=10)
gsearch1.fit(train_x, train_y)

CPU times: user 38.2 s, sys: 843 ms, total: 39 s
Wall time: 23min 24s


In [64]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.57174, std: 0.09871, params: {'min_child_weight': 1, 'max_depth': 2},
  mean: 0.57180, std: 0.09884, params: {'min_child_weight': 2, 'max_depth': 2},
  mean: 0.57206, std: 0.09888, params: {'min_child_weight': 3, 'max_depth': 2},
  mean: 0.57035, std: 0.09820, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: 0.57179, std: 0.09899, params: {'min_child_weight': 2, 'max_depth': 3},
  mean: 0.57208, std: 0.09911, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: 0.56877, std: 0.09810, params: {'min_child_weight': 1, 'max_depth': 4},
  mean: 0.56975, std: 0.09831, params: {'min_child_weight': 2, 'max_depth': 4},
  mean: 0.57050, std: 0.09855, params: {'min_child_weight': 3, 'max_depth': 4}],
 {'max_depth': 3, 'min_child_weight': 3},
 0.57207644720386941)

### Summary of Tune 1: max_depth and min_child_weight

- parameters given by GridSearchCV are max_depth:3, min_child_weight:3
- According to LB, the best parameters are max_depth:4, min_child_weight:1
- LB scores are 0.56905 and 0.56376 respectively

### Tune gamma

In [65]:
%%time
param_test3 = {
 'gamma':np.arange(0,0.6,0.1)
}
gsearch3 = GridSearchCV(
    estimator = xgb.XGBRegressor( learning_rate =0.0045, n_estimators=1000, max_depth=4,
                                 min_child_weight=1, gamma=0, subsample=0.93, 
                                 colsample_bytree=1, objective= 'reg:linear', nthread=4, 
                                 scale_pos_weight=1, seed=420), 
    param_grid = param_test3, scoring='r2',n_jobs=4,iid=False, cv=10)


gsearch3.fit(train_x, train_y)

CPU times: user 49.3 s, sys: 801 ms, total: 50.1 s
Wall time: 20min 58s


In [66]:
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.56877, std: 0.09810, params: {'gamma': 0.0},
  mean: 0.56876, std: 0.09808, params: {'gamma': 0.10000000000000001},
  mean: 0.56876, std: 0.09807, params: {'gamma': 0.20000000000000001},
  mean: 0.56886, std: 0.09807, params: {'gamma': 0.30000000000000004},
  mean: 0.56901, std: 0.09816, params: {'gamma': 0.40000000000000002},
  mean: 0.56903, std: 0.09824, params: {'gamma': 0.5}],
 {'gamma': 0.5},
 0.56903428174599813)

In [68]:

xgb_params_tune2 = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1,
    'min_child_weight':1,
    'gamma':0.5
}

In [69]:
%%time
dtrain = xgb.DMatrix(train_x, train_y)
dtest = xgb.DMatrix(test_x)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params_tune2, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

CPU times: user 59.2 s, sys: 554 ms, total: 59.8 s
Wall time: 1min


In [70]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred
sub.to_csv('Submission/gridcv_3' + '.csv', index = False)

In [71]:
%%time
param_test4 = {
 'gamma':np.arange(0.4,0.6,0.025)
}
gsearch4 = GridSearchCV(
    estimator = xgb.XGBRegressor( learning_rate =0.0045, n_estimators=1000, max_depth=4,
                                 min_child_weight=1, gamma=0, subsample=0.93, 
                                 colsample_bytree=1, objective= 'reg:linear', nthread=4, 
                                 scale_pos_weight=1, seed=420), 
    param_grid = param_test4, scoring='r2',n_jobs=4,iid=False, cv=10)


gsearch4.fit(train_x, train_y)

CPU times: user 46.7 s, sys: 554 ms, total: 47.3 s
Wall time: 25min 21s


In [72]:
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.56901, std: 0.09816, params: {'gamma': 0.40000000000000002},
  mean: 0.56900, std: 0.09816, params: {'gamma': 0.42500000000000004},
  mean: 0.56900, std: 0.09816, params: {'gamma': 0.45000000000000007},
  mean: 0.56902, std: 0.09823, params: {'gamma': 0.47500000000000009},
  mean: 0.56903, std: 0.09824, params: {'gamma': 0.50000000000000011},
  mean: 0.56904, std: 0.09825, params: {'gamma': 0.52500000000000013},
  mean: 0.56906, std: 0.09824, params: {'gamma': 0.55000000000000016},
  mean: 0.56903, std: 0.09823, params: {'gamma': 0.57500000000000018}],
 {'gamma': 0.55000000000000016},
 0.5690559169415228)

In [73]:
xgb_params_tune2 = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1,
    'min_child_weight':1,
    'gamma':0.55
}

In [74]:
%%time
dtrain = xgb.DMatrix(train_x, train_y)
dtest = xgb.DMatrix(test_x)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params_tune2, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

CPU times: user 58.2 s, sys: 388 ms, total: 58.6 s
Wall time: 59.1 s


In [75]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred
sub.to_csv('Submission/gridcv_4' + '.csv', index = False)

### Summary of gamma

- Best gamma: 0.55, giving LB score 0.00001 improvement

### Tune subsample and colsample_bytree

In [79]:
%%time
param_test5 = {
     'subsample':[0.8,0.9,1.0],
    'colsample_bytree':[0.8,0.9,1.0]
}
gsearch5 = GridSearchCV(
    estimator = xgb.XGBRegressor( learning_rate =0.0045, n_estimators=1000, max_depth=4,
                                 min_child_weight=1, gamma=0.55, subsample=0.93, 
                                 colsample_bytree=1, objective= 'reg:linear', nthread=4, 
                                 scale_pos_weight=1, seed=420), 
    param_grid = param_test5, scoring='r2',n_jobs=4,iid=False, cv=10)


gsearch5.fit(train_x, train_y)

CPU times: user 39.3 s, sys: 557 ms, total: 39.8 s
Wall time: 27min 34s


In [81]:
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

([mean: 0.56881, std: 0.09866, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.56890, std: 0.09814, params: {'colsample_bytree': 0.8, 'subsample': 0.9},
  mean: 0.57000, std: 0.09826, params: {'colsample_bytree': 0.8, 'subsample': 1.0},
  mean: 0.56946, std: 0.09873, params: {'colsample_bytree': 0.9, 'subsample': 0.8},
  mean: 0.56928, std: 0.09857, params: {'colsample_bytree': 0.9, 'subsample': 0.9},
  mean: 0.56992, std: 0.09923, params: {'colsample_bytree': 0.9, 'subsample': 1.0},
  mean: 0.56864, std: 0.09751, params: {'colsample_bytree': 1.0, 'subsample': 0.8},
  mean: 0.56895, std: 0.09815, params: {'colsample_bytree': 1.0, 'subsample': 0.9},
  mean: 0.56970, std: 0.09906, params: {'colsample_bytree': 1.0, 'subsample': 1.0}],
 {'colsample_bytree': 0.8, 'subsample': 1.0},
 0.56999896193024358)

In [83]:
xgb_params_tune3 = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,

    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1,
    'min_child_weight':1,
    'gamma':0.55,
        'subsample': 1.0,
    'colsample_bytree':0.8
}

In [84]:
%%time
dtrain = xgb.DMatrix(train_x, train_y)
dtest = xgb.DMatrix(test_x)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params_tune3, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

CPU times: user 50 s, sys: 635 ms, total: 50.7 s
Wall time: 52.6 s


In [85]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred
sub.to_csv('Submission/gridcv_5' + '.csv', index = False)

In [86]:
xgb_params_tune3 = {
    'n_trees': 520, 
    'eta': 0.0025,
    'max_depth': 4,

    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1,
    'min_child_weight':1,
    'gamma':0.55,
        'subsample': 1.0,
    'colsample_bytree':0.8
}

In [87]:
%%time
dtrain = xgb.DMatrix(train_x, train_y)
dtest = xgb.DMatrix(test_x)

num_boost_rounds = 2500
# train model
model = xgb.train(dict(xgb_params_tune3, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

CPU times: user 1min 34s, sys: 694 ms, total: 1min 34s
Wall time: 1min 36s


In [88]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred
sub.to_csv('Submission/gridcv_6' + '.csv', index = False)

### Summary of colsample_bytree & subsample

- LB score 0.56735 with colsample_bytree 0.8 and subsample 1.0, not an improvement 

### Tune of Regularization - alpha & lambda

In [94]:
%%time
param_test6 = {
     'reg_alpha':[0,0.1,0.2],
    'reg_lambda':[1.0,1.5,2.0]
}
gsearch6 = GridSearchCV(
    estimator = xgb.XGBRegressor( learning_rate =0.0045, n_estimators=1000, max_depth=4,
                                 min_child_weight=1, gamma=0.55, subsample=0.93, 
                                 colsample_bytree=1, objective= 'reg:linear', nthread=4, 
                                 scale_pos_weight=1, seed=420), 
    param_grid = param_test6, scoring='r2',n_jobs=4,iid=False, cv=10)


gsearch6.fit(train_x, train_y)

CPU times: user 49.6 s, sys: 592 ms, total: 50.2 s
Wall time: 29min 31s


In [95]:
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

([mean: 0.56906, std: 0.09824, params: {'reg_lambda': 1.0, 'reg_alpha': 0},
  mean: 0.57070, std: 0.09908, params: {'reg_lambda': 1.5, 'reg_alpha': 0},
  mean: 0.57219, std: 0.10046, params: {'reg_lambda': 2.0, 'reg_alpha': 0},
  mean: 0.56882, std: 0.09799, params: {'reg_lambda': 1.0, 'reg_alpha': 0.1},
  mean: 0.57088, std: 0.09939, params: {'reg_lambda': 1.5, 'reg_alpha': 0.1},
  mean: 0.57219, std: 0.10050, params: {'reg_lambda': 2.0, 'reg_alpha': 0.1},
  mean: 0.56902, std: 0.09813, params: {'reg_lambda': 1.0, 'reg_alpha': 0.2},
  mean: 0.57081, std: 0.09933, params: {'reg_lambda': 1.5, 'reg_alpha': 0.2},
  mean: 0.57207, std: 0.10038, params: {'reg_lambda': 2.0, 'reg_alpha': 0.2}],
 {'reg_alpha': 0.1, 'reg_lambda': 2.0},
 0.57219456108668998)

In [96]:
xgb_params_tune4 = {
    'n_trees': 520, 
    'eta': 0.0025,
    'max_depth': 4,

    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1,
    'min_child_weight':1,
    'gamma':0.55,
        'subsample': 0.93,
    'reg_alpha':0.1,
    'reg_lambda':2.0
    
}

In [98]:
%%time
dtrain = xgb.DMatrix(train_x, train_y)
dtest = xgb.DMatrix(test_x)

num_boost_rounds =1250
# train model
model = xgb.train(dict(xgb_params_tune4, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

CPU times: user 1min 3s, sys: 692 ms, total: 1min 4s
Wall time: 1min 6s


In [99]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred
sub.to_csv('Submission/gridcv_7' + '.csv', index = False)

## Stacking 1
- Train the stacked models then predict the test data

In [109]:
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
    LassoLarsCV()

)

In [101]:
%%time 
stacked_pipeline.fit(finaltrainset, train_y)
results = stacked_pipeline.predict(finaltestset)

CPU times: user 3.8 s, sys: 243 ms, total: 4.05 s
Wall time: 3.48 s


In [105]:
xgb_params_tune2 = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1,
    'min_child_weight':1,
    'gamma':0.55
}

In [106]:
%%time
dtrain = xgb.DMatrix(train_x, train_y)
dtest = xgb.DMatrix(test_x)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params_tune2, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred_stack = model.predict(dtest)

CPU times: user 1min 1s, sys: 513 ms, total: 1min 2s
Wall time: 1min 3s


In [107]:
'''R2 Score on the entire Train data when averaging'''

print('R2 score on train data:')
print(r2_score(train_y,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145))

R2 score on train data:
0.658462139275


In [110]:
## Only stack
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = results
sub.to_csv('Submission/only_stack.csv', index=False)

In [112]:
## Average the preditionon test data  of both models then save it on a csv file
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred_stack*0.75 + results*0.25
sub.to_csv('Submission/stacked3.csv', index=False)

In [113]:


sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred_stack*0.7145 + results*0.2855
sub.to_csv('Submission/stacked2.csv', index=False)

#### Summary of Stack
- stacked2 gives 0.56872, not an improvement
- try stacked 3 again
- cv to find optimal weight for pred & results

In [114]:
# def runXGB(train_X, train_y, test_X, test_y = None, feature_names = None, seed_val = 0, num_rounds = 5000):
#     param = {}
#     param['objective'] = 'reg:linear'
#     param['eta'] = 0.001
#     param['max_depth'] = 5
#     param['silent'] = 1

#     param['eval_metric'] = "rmse"
#     param['min_child_weight'] = 1
#     param['subsample'] = 0.7
#     param['colsample_bytree'] = 0.7
#     param['seed'] = seed_val
#     num_rounds = num_rounds
    
#     plst = list(param.items())
    
#     xgtrain = xgb.DMatrix(train_X, label=train_y)

#     if test_y is not None:
#         xgtest = xgb.DMatrix(test_X, label=test_y)
#         watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
#         model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
#     else:
#         xgtest = xgb.DMatrix(test_X)
#         model = xgb.train(plst, xgtrain, num_rounds)

#     pred_test_y = model.predict(xgtest)
#     return pred_test_y, model

In [None]:
# %%time
# cv_scores = []
# kf = KFold(n_splits=10, shuffle=True, random_state=2016)
# for dev_index, val_index in kf.split(range(train_x.shape[0])):
#         dev_X, val_X = train_x.ix[dev_index,:], train_x.ix[val_index,:]
#         dev_y, val_y = train_y.ix[dev_index], train_y.ix[val_index]
#         preds, model = runXGB(dev_X, dev_y, val_X, val_y)
#         cv_scores.append(r2_score(val_y, preds))
#         print(cv_scores)
#         break

## Pre-Stacking2 
- Try models besides XGBoost

In [116]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import Lasso, LassoCV, LassoLars, LassoLarsCV, RidgeCV
from sklearn.svm import SVR

#### random forest 
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [126]:

rf_params = {
    'n_estimators':1000,
    'criterion':'mse',
    'max_features':'sqrt',
    'max_depth':5,
    'min_samples_split':2,
    'n_jobs':-1,
    'random_state':420
}

In [146]:
rf = SklearnWrapper(RandomForestRegressor, rf_params)

In [147]:
kf = KFold(n_splits= 10)

In [152]:
rf.fit2(train_x, train_y).predict2(train_x)

AttributeError: 'NoneType' object has no attribute 'predict2'

In [148]:
def get_cv(clf):
    result = []
    for train_index, val_index in kf.split(train_x):
        train_x_cv, train_y_cv = train_x.ix[train_index], train_y.ix[train_index]
        val_x_cv, val_y_cv = train_x.ix[val_index], train_y.ix[val_index] 
        pred_cv = clf.fit2(train_x_cv, train_y_cv).predict2(val_x_cv)
        score = r2_score(val_y_cv, pred_cv)
        result.append(score)
    
    return result
        
    

In [149]:
get_cv(rf)

AttributeError: 'NoneType' object has no attribute 'predict2'

In [134]:
rf2 = RandomForestRegressor(n_estimators= 10)

In [135]:
y_pred_rf = rf2.fit(train_x, train_y).predict(train_x)

In [136]:
r2_score(train_y, y_pred_rf)

0.91176984070286937

## Stacking 2

In [106]:

import xgboost as xgb

In [143]:
class SklearnWrapper(object):
    def __init__(self, clf, params=None): ## remove parameter seed = 0
        # params['random_state'] = seed
        self.clf = clf(**params)

    def fit2(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict2(self, x):
        return self.clf.predict(x)

In [108]:
class XGBWrapper(object):
    def __init__(self, seed = 0, params = None):
        self.params = params
        self.params['seed'] = seed
        self.nrounds = params['nround']
        
    def fit2(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label = y_train)
        self.gbdt = xgb.train(self.params, dtrain, self.nrounds)
        
    def predict2(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [109]:
from sklearn.model_selection import KFold
kf = KFold(5)

In [110]:
def get_oof(clf, train_x, train_y, test_x):
    n = train_x.shape[0]
    oof_train = np.zeros((n,1))
    oof_test = np.zeros((n,1))
    oof_test_temp = np.zeros((n,1))
    
    for train_index, test_index in kf.split(train_x):
        model = clf.fit2(train_x.iloc[train_index], train_y.iloc[train_index])
        oof_train[test_index] = clf.predict2(train_x.iloc[test_index]).reshape(-1,1)
        oof_test_temp = clf.predict2(test_x).reshape(-1,1)
        oof_test = oof_test + oof_test_temp
    
    oof_test = oof_test / 5
    
    return oof_train, oof_test
        
    

### XGBoost

In [245]:
xg_params = {
    'objective' : 'reg:linear',
    'n_trees':1250,
    'eta' : 0.0045,
    'max_depth' : 3,
    'subsample' : 0.93,
    'eval_metric':'rmse',
    'base_score' : y_mean,
    'silent' : 1 ,
    'nround':1000,
    'min_child_weight':1
             }

In [246]:
xg = XGBWrapper(seed = 0, params = xg_params)

In [113]:
%%time
xg_oof_train, xg_oof_test = get_oof(xg, train_x, train_y, test_x)

CPU times: user 3min 54s, sys: 958 ms, total: 3min 55s
Wall time: 3min 58s


### sklearn models

In [129]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor,GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LassoLarsCV, ElasticNetCV , ElasticNet

In [116]:
## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
rf_params = {
    'n_jobs': 16,
    'n_estimators': 1000,
    'max_features': 0.5,
    'max_depth': 10,
    'min_samples_leaf': 2,
}

In [120]:
## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
gbr_params = {'loss':'ls',
              'learning_rate':0.01,
              'n_estimators':1000,
              'max_depth':5,
              'min_sample_leaf':2,
    
}

In [154]:
gbr_params2 = {
    'learning_rate' : 0.001,
    'n_estimators':2000,
    'loss' : 'huber', 
    'max_depth' : 3, 
    'max_features': 0.55,
    'min_samples_leaf' : 18, 
    'min_samples_split' : 14, 
    'subsample': 0.7
}

In [121]:
## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html


et_params = {
    'n_jobs': -1,
    'n_estimators': 1000,
    'max_features': 0.5,
    'max_depth': 10,
    'min_samples_leaf': 2,
}

In [140]:
## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLarsCV.html
ll_params = {
   # 'fit_intercept':True
    'n_jobs':-1
}

In [126]:
## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
en_params = {
    'alpha':0.1,
    'l1_ratio':0.7
    
}

In [122]:
## http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
svr_params = {
    'C' : 1,
    'eplison':0.1,
    'kernel':'rbf',
    'gamma':'auto'
    
}

In [125]:
rf = SklearnWrapper(clf=RandomForestRegressor, params=rf_params)
gbr = SklearnWrapper(clf = GradientBoostingRegressor,  params = gbr_params2)
et = SklearnWrapper(clf = ExtraTreesRegressor,  params = et_params)

en = SklearnWrapper(clf = ElasticNet, params = en_params)


TypeError: __init__() got an unexpected keyword argument 'min_sample_split'

In [142]:
# rf = SklearnWrapper(clf=RandomForestRegressor, seed=2017, params=rf_params)
# gb = SklearnWrapper(clf = GradientBoostingRegressor, seed = 2017, params = gbr_params2)
# et = SklearnWrapper(clf = ExtraTreesRegressor, seed = 2017, params = et_params)
# ll = SklearnWrapper(clf = LassoLarsCV, seed = 2017, params = ll_params)
# en = SklearnWrapper(clf = ElasticNet, seed = 2017, params = en_params)
# svr = SklearnWrapper(clf = SVR, seed = 2017, params = svr_params)

In [143]:
ll =  LassoLarsCV()

In [146]:
svr = SVR()

In [151]:
%%time 
rf_oof_train, rf_oof_test =  get_oof(rf, train_x, train_y, test)

CPU times: user 7min 56s, sys: 3.21 s, total: 7min 59s
Wall time: 2min 21s


In [156]:
%%time
gb_oof_train, gb_oof_test = get_oof(gbr, train_x, train_y, test)

CPU times: user 4min 19s, sys: 782 ms, total: 4min 20s
Wall time: 4min 20s


In [157]:
%%time 
et_oof_train, et_oof_test = get_oof(et, train_x, train_y, test)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x116dc62e8>>
Traceback (most recent call last):
  File "/Users/Aaron/anaconda/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/core.py", line 337, in __del__
    _check_call(_LIB.XGDMatrixFree(self.handle))
AttributeError: 'DMatrix' object has no attribute 'handle'


CPU times: user 4min 5s, sys: 2.26 s, total: 4min 8s
Wall time: 1min 18s


In [163]:
%%time
en_oof_train, en_oof_test = get_oof(en, train_x, train_y, test)

CPU times: user 8.73 s, sys: 123 ms, total: 8.85 s
Wall time: 7.5 s


In [159]:
def get_oof_extra(clf, train_x, train_y, test_x):
    n = train_x.shape[0]
    oof_train = np.zeros((n,1))
    oof_test = np.zeros((n,1))
    oof_test_temp = np.zeros((n,1))
    
    for train_index, test_index in kf.split(train_x):
        model = clf.fit(train_x.iloc[train_index], train_y.iloc[train_index])
        oof_train[test_index] = clf.predict(train_x.iloc[test_index]).reshape(-1,1)
        oof_test_temp = clf.predict(test_x).reshape(-1,1)
        oof_test = oof_test + oof_test_temp
    
    oof_test = oof_test / 5
    
    return oof_train, oof_test
        
    

In [160]:
%%time 
svr_oof_train, svr_oof_test = get_oof_extra(svr, train_x, train_y, test)

CPU times: user 1min 5s, sys: 292 ms, total: 1min 5s
Wall time: 1min 5s


In [161]:
%%time 
ll_oof_train, ll_oof_test = get_oof_extra(ll, train_x, train_y, test)

CPU times: user 2.35 s, sys: 367 ms, total: 2.72 s
Wall time: 1.69 s


In [164]:
### Combining all model 1
x_train_after_stack = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, gb_oof_train
                                     , ll_oof_train, en_oof_train, svr_oof_train), axis=1)
x_test_after_stack = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, gb_oof_test, 
                                     ll_oof_test, en_oof_test, svr_oof_test), axis=1)

In [168]:
%%time
p, model = runXGB(x_train_after_stack, train_y, x_test_after_stack, num_rounds=1000)

CPU times: user 1.3 s, sys: 26.3 ms, total: 1.32 s
Wall time: 1.4 s


In [170]:
x_train_after_stack.shape

(4209, 7)

In [171]:
p[:5]

array([ 54.88514328,  60.32428741,  54.88514328,  54.88514328,  68.5891571 ], dtype=float32)

In [172]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = p
sub.to_csv('stack1.csv', index = False)

In [173]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = rf_oof_test
sub.to_csv('rf.csv', index = False)

In [174]:
rf_oof_test.shape

(4209, 1)

In [176]:
rf_oof_test[:5]

array([[  81.39255755],
       [ 108.00938191],
       [  81.43177215],
       [  82.68408714],
       [ 119.37385729]])

In [178]:
train_x.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,pca_11,ica_11,tsvd_11,grp_11,srp_11,pca_12,ica_12,tsvd_12,grp_12,srp_12
0,0,37,23,20,0,3,27,9,14,0,...,1.360747,-0.016158,0.534548,-508.35648,-3.981069,4.238816,-0.026,1.349757,1218.909313,-14.707015
1,6,37,21,22,4,3,31,11,14,0,...,-2.803449,-0.025003,-0.298973,-515.250317,-2.185324,2.358387,0.001018,-2.785076,1209.804441,-15.233844
2,7,24,24,38,2,3,30,9,23,0,...,3.667395,0.025212,-4.359028,-485.908782,8.223169,-0.968324,-0.025087,4.349495,1188.930647,-52.632749
3,9,24,21,38,5,3,30,11,4,0,...,4.234178,0.021985,-3.81186,-507.90845,-1.033786,-1.581056,-0.018093,4.806347,1179.433141,-22.133079
4,13,24,23,38,5,3,14,3,13,0,...,4.83734,0.013433,-1.678387,-502.634869,2.039579,-1.653438,-0.024047,5.160166,1186.215859,-63.204101


In [183]:
r2_score(rf_oof_train, train_y)

-0.18665155841770664

In [181]:
rf_oof_train.shape

(4209, 1)

In [182]:
test.shape

(4209, 437)

In [185]:
r2_score(rf.fit2(train_x, train_y).predict2(train_x), train_y)

AttributeError: 'NoneType' object has no attribute 'predict2'

In [186]:
r2_score(gb_oof_train, train_y)

-0.11056341949576431

In [187]:
r2_score(ll_oof_train, train_y)

-0.10988915924298714

In [188]:
r2_score(en_oof_train, train_y)

-0.0099200917466468752

In [189]:
r2_score(et_oof_train, train_y)

0.067769074918884065

In [190]:
r2_score(svr_oof_train, train_y)

-8327.3718559387071