In [1]:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques

# Stacking Starter based on Allstate Faron's Script
# https://www.kaggle.com/mmueller/allstate-claims-severity/stacking-starter/run/390867
# Preprocessing from Alexandru Papiu
# https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models

import os.path
import math
import functools
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import skew

import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso



In [2]:
# --- Configuration -----------------------------------------------------------
TARGET = 'SalePrice'   # name of the label column in train.csv
NSPLITS = 5            # number of folds handed to KFold
SEED = 0               # shared random seed
# Optional cap on the number of training rows read (None reads the whole file).
# Only the training file is capped: predictions for every test row are later
# written into the frame loaded from SUBMISSION_FILE, so test.csv is read whole.
NROWS = None
data_dir = functools.partial(os.path.join, 'data/')
SUBMISSION_FILE = data_dir('sample_submission.csv')


## Load the data ##
train = pd.read_csv(data_dir('train.csv'), nrows=NROWS)
test = pd.read_csv(data_dir('test.csv'))

# Split off the label as log(1 + price), then remove it from the feature frame.
# The frame is rebound rather than mutated in place.
y_train = np.log1p(train[TARGET])
train = train.drop([TARGET], axis=1)

ntrain = train.shape[0]
ntest = test.shape[0]

train.shape, test.shape, y_train.shape

((1460, 80), (1459, 80), (1460,))

In [3]:
# Exploratory peek: the distinct LotArea values across train and test together,
# in ascending order.
all_data = pd.concat([train, test])
values = np.sort(all_data['LotArea'].unique())
values

array([  1300,   1470,   1476, ..., 159000, 164660, 215245])

In [4]:
def digitize(all_data, column):
    """Bin a column against its own sorted distinct values.

    The distinct values of ``all_data[column]`` are sorted ascending and used
    as the bin edges for ``np.digitize``, so a value equal to the k-th
    smallest distinct value (counting from 1) is mapped to k.

    Parameters
    ----------
    all_data : frame-like object indexable by ``column``.
    column : label of the column to bin.

    Returns
    -------
    numpy.ndarray of integer bin indices, one per row of the column.
    """
    series = all_data[column]
    edges = np.sort(series.unique())
    return np.digitize(series, bins=edges)

In [5]:
## Preprocessing ##
# Stack train and test so both receive identical transforms and dummy columns.
# ignore_index gives the stacked frame unique row labels (plain concat would
# repeat 0..n-1 for each part); the Id column is not a feature and is removed.
all_data = pd.concat([train, test], ignore_index=True).drop(['Id'], axis=1)

# NOTE: binning MSSubClass / year-like columns with digitize() and treating
# them as categoricals was tried earlier and is currently disabled.

# Split columns by dtype: everything that is not 'object' is treated as numeric.
numeric_feats = all_data.dtypes[all_data.dtypes != 'object'].index
object_feats = all_data.dtypes[all_data.dtypes == 'object'].index

# log1p-transform numeric features whose skewness on the training rows
# (NaNs excluded) is above 0.7.
skewness = train[numeric_feats].apply(lambda col: skew(col.dropna()))
skewed_feats = skewness[skewness > 0.7].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# Impute with per-column medians. numeric_only=True is required because the
# frame still contains object columns at this point; without it recent pandas
# versions raise instead of skipping them.
all_data = all_data.fillna(all_data.median(numeric_only=True))

# One-hot encode the object columns.
all_data = pd.get_dummies(all_data, columns=object_feats)

#creating matrices for sklearn:
# Cast explicitly so the matrix is float64 whatever dtype get_dummies emitted
# (bool dummies mixed with floats would otherwise give an object array).
data = all_data.values.astype(np.float64)
assert data.shape[0] == ntrain + ntest, "ntrain/ntest are stale relative to train/test"
x_train = data[:ntrain]
x_test = data[ntrain:]

kf = KFold(n_splits=NSPLITS, shuffle=True, random_state=SEED)
x_train.shape, x_test.shape

((1460, 288), (1459, 288))

In [6]:
# Show the first five rows of the preprocessed feature frame.
all_data.head(n=5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,4.110874,4.189655,9.04204,7,5,2003,2003,5.283204,6.561031,0.0,...,0,0,0,1,0,0,0,0,1,0
1,3.044522,4.394449,9.169623,6,8,1976,1976,0.0,6.886532,0.0,...,0,0,0,1,0,0,0,0,1,0
2,4.110874,4.234107,9.328212,7,5,2001,2002,5.09375,6.188264,0.0,...,0,0,0,1,0,0,0,0,1,0
3,4.26268,4.110874,9.164401,7,5,1915,1970,0.0,5.379897,0.0,...,0,0,0,1,1,0,0,0,0,0
4,4.110874,4.442651,9.565284,8,5,2000,2000,5.860786,6.486161,0.0,...,0,0,0,1,0,0,0,0,1,0


In [7]:
class SklearnWrapper(object):
    """Thin adapter giving an estimator class a ``train`` / ``predict`` API.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory). It is called with the keyword
        arguments from ``params`` plus ``random_state=seed``.
    seed : int, default 0
        Value passed to the estimator as ``random_state``; it overrides any
        ``random_state`` entry present in ``params``.
    params : dict or None, default None
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is never modified; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a private copy: writing random_state into the caller's dict
        # would leak into every later use of that dict.
        settings = dict(params) if params is not None else {}
        settings['random_state'] = seed
        self.clf = clf(**settings)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)


class XgbWrapper(object):
    """Adapter giving ``xgb.train`` boosters a ``train`` / ``predict`` API.

    Parameters
    ----------
    seed : int, default 0
        Stored under the ``'seed'`` key of the booster parameters, replacing
        any ``'seed'`` entry in ``params``.
    params : dict or None, default None
        Booster parameters. An optional ``'nrounds'`` entry is taken out and
        used as the number of boosting rounds (250 when absent). The dict is
        copied, so the caller's object is left untouched and can be reused to
        build further wrappers with the same ``'nrounds'``.
    """

    def __init__(self, seed=0, params=None):
        # Copy first: popping 'nrounds' from the caller's dict would make a
        # second wrapper built from it silently fall back to 250 rounds.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Predict with the booster produced by the last ``train`` call."""
        return self.gbdt.predict(xgb.DMatrix(x))


def get_oof(clf, X=None, y=None, X_test=None, folds=None):
    """Compute out-of-fold predictions for one first-level model.

    For every split produced by ``folds.split(X)`` the model is trained on the
    fitting rows, its predictions for the held-out rows are stored at those
    rows' positions, and it also predicts the full test matrix. The test
    predictions are averaged over the splits.

    Parameters
    ----------
    clf : object with ``train(x, y)`` and ``predict(x)`` methods.
    X, y, X_test, folds : optional
        Training matrix, training target, test matrix and splitter. Each one
        left as ``None`` falls back to the notebook-level ``x_train``,
        ``y_train``, ``x_test`` and ``kf`` respectively, looked up at call
        time. ``X`` and ``X_test`` must support ``.shape`` and integer-array
        row indexing.

    Returns
    -------
    (oof_train, oof_test) : two column vectors of shape ``(n_train, 1)`` and
        ``(n_test, 1)``.
    """
    X = x_train if X is None else X
    y = y_train if y is None else y
    X_test = x_test if X_test is None else X_test
    folds = kf if folds is None else folds

    # The split indices are row positions. Converting the target to an array
    # makes the indexing positional even when y is a Series whose labels are
    # not 0..n-1.
    y = np.asarray(y)

    # Materialise the splits so the fold count comes from the splitter itself
    # rather than from a separately maintained constant.
    splits = list(folds.split(X))

    oof_train = np.zeros((X.shape[0],))
    oof_test_skf = np.empty((len(splits), X_test.shape[0]))

    for i, (fit_idx, holdout_idx) in enumerate(splits):
        clf.train(X[fit_idx], y[fit_idx])

        oof_train[holdout_idx] = clf.predict(X[holdout_idx])
        oof_test_skf[i, :] = clf.predict(X_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


# Hyper-parameters for the first-level models.

# ExtraTreesRegressor settings.
et_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
)

# RandomForestRegressor settings.
rf_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.2,
    max_depth=12,
    min_samples_leaf=2,
)

# XGBoost booster settings; 'nrounds' is consumed by XgbWrapper as the number
# of boosting rounds rather than being passed to xgboost.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.075,
    objective='reg:linear',
    max_depth=4,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='rmse',
    nrounds=1000,
)

# Ridge regularisation strength.
rd_params = dict(alpha=10)

# Lasso regularisation strength.
ls_params = dict(alpha=0.005)


# Build the five first-level learners, all seeded with SEED.
xg = XgbWrapper(params=xgb_params, seed=SEED)
et, rf, rd, ls = (
    SklearnWrapper(clf=estimator, seed=SEED, params=settings)
    for estimator, settings in (
        (ExtraTreesRegressor, et_params),
        (RandomForestRegressor, rf_params),
        (Ridge, rd_params),
        (Lasso, ls_params),
    )
)

# Out-of-fold train predictions and fold-averaged test predictions per model,
# computed in the order xg, et, rf, rd, ls.
(
    (xg_oof_train, xg_oof_test),
    (et_oof_train, et_oof_test),
    (rf_oof_train, rf_oof_test),
    (rd_oof_train, rd_oof_test),
    (ls_oof_train, ls_oof_test),
) = [get_oof(model) for model in (xg, et, rf, rd, ls)]

# RMSE of each model's out-of-fold predictions against the training target.
print("\n".join(
    template.format(np.sqrt(mean_squared_error(y_train, oof_pred)))
    for template, oof_pred in (
        ("XG-CV: {}", xg_oof_train),
        ("ET-CV: {}", et_oof_train),
        ("RF-CV: {}", rf_oof_train),
        ("RD-CV: {}", rd_oof_train),
        ("LS-CV: {}", ls_oof_train),
    )
))


# Second-level design matrices: one column per first-level model.
# NOTE(review): this rebinds x_train / x_test from the raw feature matrices to
# the stacked predictions, so calling get_oof() again after this point would
# operate on the stacked columns unless the preprocessing cell is re-run first.
x_train = np.hstack([xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train])
x_test = np.hstack([xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test])

print("{},{}".format(x_train.shape, x_test.shape))

# xgboost containers for the meta-model; only the training one carries labels.
dtest = xgb.DMatrix(x_test)
dtrain = xgb.DMatrix(x_train, label=y_train)


# Booster settings for the second-level (meta) model: depth-1 trees with a
# small learning rate. This rebinds xgb_params, replacing the first-level dict.
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}

# 4-fold CV with early stopping to choose the number of boosting rounds.
res = xgb.cv(xgb_params, dtrain, num_boost_round=1500, nfold=4, seed=SEED, stratified=False,
             early_stopping_rounds=25, verbose_eval=50, show_stdv=True)

# NOTE(review): the round count is taken as one less than the number of rows
# in the CV history, as in the starter script this is based on — confirm that
# the "- 1" is intended.
best_nrounds = res.shape[0] - 1

# Read the held-out score by column name: the position of the test columns in
# the frame returned by xgb.cv is not fixed, so iloc[-1, 0] may pick up the
# training score instead.
cv_mean = res['test-rmse-mean'].iloc[-1]
cv_std = res['test-rmse-std'].iloc[-1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))

# Refit the meta-model on all stacked training rows.
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

# Load the submission template and fill in the predicted prices.
submission = pd.read_csv(SUBMISSION_FILE)

# The meta-model predicts log(1 + price); expm1 undoes that transform. The
# predictions are widened to float64 before exponentiating, and the target
# column is addressed through TARGET instead of a hard-coded position / name.
saleprice = pd.Series(
    np.expm1(np.asarray(gbdt.predict(dtest), dtype=np.float64)),
    index=submission.index,
    name=TARGET,
)
submission[TARGET] = saleprice

# index=False is the documented way to omit the row index from the CSV.
submission.to_csv(data_dir('xgstacker_starter_submission2.csv'), index=False)

XG-CV: 0.12257812904190073
ET-CV: 0.14555761485208654
RF-CV: 0.14241118123674062
RD-CV: 0.13182895036747352
LS-CV: 0.1432176643227663
(1460, 5),(1459, 5)
[0]	train-rmse:11.416+0.00727855	test-rmse:11.4159+0.0221774
[50]	train-rmse:6.91812+0.00440813	test-rmse:6.91817+0.0233714
[100]	train-rmse:4.19539+0.00270257	test-rmse:4.19562+0.0211604
[150]	train-rmse:2.54699+0.00185298	test-rmse:2.54766+0.0194404
[200]	train-rmse:1.54987+0.00129759	test-rmse:1.55092+0.0176708
[250]	train-rmse:0.948274+0.00112668	test-rmse:0.949684+0.0164278
[300]	train-rmse:0.587085+0.00118784	test-rmse:0.589152+0.0158442
[350]	train-rmse:0.372809+0.00150623	test-rmse:0.375891+0.0156244
[400]	train-rmse:0.249169+0.00187076	test-rmse:0.253691+0.0156815
[450]	train-rmse:0.181173+0.00229631	test-rmse:0.187516+0.0157262
[500]	train-rmse:0.146157+0.00280842	test-rmse:0.154218+0.0152181
[550]	train-rmse:0.128825+0.0030956	test-rmse:0.13855+0.0143483
[600]	train-rmse:0.12008+0.00320575	test-rmse:0.130772+0.0135573
[650]

In [8]:
# Ensemble-CV: 0.12850675
# Ensemble-CV: 0.1237075 (0.12236)
# Ensemble-CV: 0.12549675 (0.12399)
# Ensemble-CV: 0.125022 (0.12381)
# Ensemble-CV: 0.12068925 (0.12133)