In [1]:
#import kaggle
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split

import pickle
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.preprocessing import LabelEncoder

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
path = Path('data')
path.mkdir(parents=True, exist_ok=True)

In [14]:
with open(path / 'HousePrices_preproc_outliers.pkl', 'rb') as h:
    df_all = pickle.load(h)

In [15]:
# copies of the other notebook
miss_list = ['MSZoning_miss', 'LotFrontage_miss', 'Utilities_miss', 'Exterior1st_miss', 'Exterior2nd_miss', 'MasVnrType_miss',
             'MasVnrArea_miss', 'BsmtQual_miss', 'BsmtCond_miss', 'BsmtExposure_miss', 'BsmtFinType2_miss', 'Electrical_miss',
             'KitchenQual_miss', 'Functional_miss', 'GarageYrBlt_miss', 'GarageFinish_miss', 'GarageCars_miss',
             'GarageArea_miss', 'GarageQual_miss', 'GarageCond_miss', 'PoolQC_miss', 'MiscFeature_miss', 'SaleType_miss']

cat_names = ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour',
             'Utilities','LotConfig','LandSlope','Neighborhood','Condition1',
            'Condition2','BldgType','HouseStyle','OverallQual','OverallCond',
            'RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType',
            'ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond',
            'BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC',
            'CentralAir','Electrical','BsmtFullBath','BsmtHalfBath','FullBath',
            'HalfBath','KitchenQual','TotRmsAbvGrd',
            'Functional','Fireplaces','FireplaceQu','GarageType','GarageYrBlt',
            'GarageFinish','GarageCars','GarageQual','GarageCond','PavedDrive',
            'PoolQC','Fence','MiscFeature','MoSold','YrSold','SaleType',
             'SaleCondition','YearBuilt','YearRemodAdd','BedroomAbvGr', 'KitchenAbvGr']

cat_names.extend(miss_list)

In [16]:
for c in cat_names:
    lbl = LabelEncoder() 
    lbl.fit(list(df_all[c].values))
    df_all[c] = lbl.transform(list(df_all[c].values))

In [17]:
train_df = df_all[df_all['is_test'] == False].drop('is_test', axis = 1)
test_df = df_all[df_all['is_test'] == True].drop('is_test', axis = 1)

test_ids = test_df['Id']
test_df.drop(['Id', 'SalePrice'], axis = 1, inplace = True)

y = np.log(train_df['SalePrice']) # predict the log of the SalePrice
X = train_df.drop(['SalePrice','Id'], axis = 1)

In [18]:
train_X, test_X, train_y, test_y = train_test_split(X.values, y.values, test_size = 0.2)

In [44]:
my_model = XGBRegressor(colsample_bytree=0.4,
                        gamma = 0.,
                        learning_rate = 0.05,
                        max_depth = 5,
                        min_child_weight = 1.7,
                        n_estimators = 1000, 
                        reg_alpha = 0.5,
                        reg_lambda = 0.6,
                        subsample = 0.5,
                        seed = 42,
                        objective = 'reg:squarederror'
                       )
my_model.fit(train_X, train_y, early_stopping_rounds = 5, eval_set = [(test_X,test_y)], verbose = True)

[0]	validation_0-rmse:10.9638
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:10.4171
[2]	validation_0-rmse:9.89818
[3]	validation_0-rmse:9.40466
[4]	validation_0-rmse:8.93608
[5]	validation_0-rmse:8.49156
[6]	validation_0-rmse:8.06946
[7]	validation_0-rmse:7.66891
[8]	validation_0-rmse:7.28691
[9]	validation_0-rmse:6.92415
[10]	validation_0-rmse:6.57987
[11]	validation_0-rmse:6.25199
[12]	validation_0-rmse:5.94165
[13]	validation_0-rmse:5.64524
[14]	validation_0-rmse:5.36474
[15]	validation_0-rmse:5.09767
[16]	validation_0-rmse:4.84481
[17]	validation_0-rmse:4.60384
[18]	validation_0-rmse:4.37514
[19]	validation_0-rmse:4.15861
[20]	validation_0-rmse:3.95237
[21]	validation_0-rmse:3.7572
[22]	validation_0-rmse:3.5711
[23]	validation_0-rmse:3.39468
[24]	validation_0-rmse:3.22651
[25]	validation_0-rmse:3.06704
[26]	validation_0-rmse:2.91549
[27]	validation_0-rmse:2.77093
[28]	validation_0-rmse:2.63372
[29]	validation_0-rmse:2.503
[30]	validation_0-rm

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0.0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=5, min_child_weight=1.7, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0.5, reg_lambda=0.6, scale_pos_weight=1,
             seed=42, silent=None, subsample=0.5, verbosity=1)

In [45]:
my_model = XGBRegressor(colsample_bytree=0.4,
                        gamma = 0.,
                        learning_rate = 0.05,
                        max_depth = 5,
                        min_child_weight = 1.7,
                        n_estimators = 1000, 
                        reg_alpha = 0.5,
                        reg_lambda = 0.6,
                        subsample = 0.5,
                        seed = 42,
                        objective = 'reg:squarederror'
                       )
my_model.fit(X.values, y.values, verbose = False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0.0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=5, min_child_weight=1.7, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0.5, reg_lambda=0.6, scale_pos_weight=1,
             seed=42, silent=None, subsample=0.5, verbosity=1)

In [46]:
# Just out of curiosity
predictions = my_model.predict(X.values)
print("MAE: " + str(np.sqrt(mean_squared_log_error(predictions, y.values))))

MAE: 0.002963367041961072


In [49]:
# Remove .round() when combined with NN
predictions = np.exp(my_model.predict(test_df.values)).round()

In [50]:
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': predictions})
submission.to_csv(path / 'submission_xgb_all.csv', index = False)
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,124867.0
1,1462,163610.0
2,1463,184957.0
3,1464,195666.0
4,1465,186796.0
