In [1]:
import pandas as pd
import numpy as np
import sys
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Directory added to the import search path; the `common*` imports in the next
# cell presumably resolve from it.
MODULES_PATH = '../common'

# Register the directory only once so re-running this cell adds no duplicates.
if sys.path.count(MODULES_PATH) == 0:
    sys.path.append(MODULES_PATH)

In [3]:
from common import enlist, df_not_nan, rmsle
from common_encoders import encode_cat_features, ABSENT_VALUE

### Load dataset

In [4]:
# Read the train and test CSVs (in that order); `Id` becomes the row index of each frame.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('../../input/train.csv', '../../input/test.csv')
)
print('Shape', df_train.shape)
df_train.head()

Shape (1460, 80)


Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


### List features

In [5]:
# Column to predict: it is dropped from the feature frame and used as the label when fitting.
TARGET = 'SalePrice'

In [6]:
# Training frame without the label column; `df_train` itself is left untouched.
df_train_features = df_train.drop(TARGET, axis='columns')

In [7]:
# Integer-valued columns; `encode` passes them through unchanged.
FEATURES_INT = [
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
]

In [8]:
# Numeric columns handled as floats; `encode` fills their NaNs with 0 before casting.
FEATURES_FLOAT = [
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
]

In [9]:
# Every feature column not declared as int or float is treated as categorical.
numeric_features = FEATURES_INT + FEATURES_FLOAT
FEATURES_CAT = [
    column for column in df_train_features.columns
    if column not in numeric_features
]
print(enlist(FEATURES_CAT))

'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'


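### Preprocess numeric NaNs and encode categorical data
NaNs in the float features are replaced with zeros for now; NaNs in the categorical features are replaced with `ABSENT_VALUE` before encoding.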

In [10]:
def encode(df):
    """Build the numeric model matrix for a frame of raw features.

    Args:
        df: DataFrame containing every column listed in FEATURES_INT,
            FEATURES_FLOAT and FEATURES_CAT.

    Returns:
        DataFrame made of, in this column order: the FEATURES_INT columns as
        they are, the FEATURES_FLOAT columns with NaNs replaced by 0 and cast
        to float, and the result of `encode_cat_features` applied to the
        FEATURES_CAT columns (whose NaNs are first replaced by ABSENT_VALUE).
    """
    # Builtin `float` replaces the `np.float` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; both resolve to float64.
    df_features_float = df[FEATURES_FLOAT].fillna(0).astype(float)
    df_features_cat = df[FEATURES_CAT].fillna(ABSENT_VALUE)
    encoded_cat_features = encode_cat_features(df_features_cat, FEATURES_CAT)
    return pd.concat([df[FEATURES_INT], df_features_float, encoded_cat_features], axis=1)

### Split train into train and test

In [11]:
# Encode the training features on their own; this frame feeds the hold-out split below.
df_encoded_train = encode(df_train_features)
print('Shape', df_encoded_train.shape)
df_encoded_train.head()

Shape (1460, 268)


Unnamed: 0_level_0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,SaleType_5,SaleType_6,SaleType_7,SaleType_8,SaleCondition_0,SaleCondition_1,SaleCondition_2,SaleCondition_3,SaleCondition_4,SaleCondition_5
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,8450,7,5,2003,2003,706,0,150,856,...,0,0,0,1,0,0,0,0,1,0
2,20,9600,6,8,1976,1976,978,0,284,1262,...,0,0,0,1,0,0,0,0,1,0
3,60,11250,7,5,2001,2002,486,0,434,920,...,0,0,0,1,0,0,0,0,1,0
4,70,9550,7,5,1915,1970,216,0,540,756,...,0,0,0,1,0,0,0,0,0,0
5,60,14260,8,5,2000,2000,655,0,490,1145,...,0,0,0,1,0,0,0,0,1,0


In [12]:
# Hold out 30% of the labelled rows for evaluation. The fixed random_state makes the
# split, and therefore the reported error, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df_encoded_train, df_train[TARGET], test_size=0.3, random_state=42
)

model = XGBRegressor()
model.fit(X_train, y_train)

# Predictions are floored at 0 before scoring (presumably because `rmsle` takes
# logarithms; confirm in the `common` module).
y_pred = np.maximum(model.predict(X_test), 0)
print('Test error:', rmsle(y_test, y_pred))

Test error: 0.151144366856


### Training on all data

In [13]:
# Encode the test and train features as one frame. Presumably this keeps the
# categorical columns consistent between the two sets; TODO confirm against
# `encode_cat_features`.
df_all = pd.concat([df_test, df_train_features])
df_encoded_all = encode(df_all)

# NOTE(review): `train_len` is not referenced by any visible cell.
train_len = len(df_train)

# Recover each split from the jointly encoded frame by its Id labels.
y_train = df_train[TARGET]
X_train = df_encoded_all.loc[df_train.index]
X_test = df_encoded_all.loc[df_test.index]

model = XGBRegressor()
model.fit(X_train, y_train)

# Error measured on the same rows the model was fitted on, floored at 0 first.
y_train_pred = np.maximum(model.predict(X_train), 0)
print('Train error:', rmsle(y_train, y_train_pred))

Train error: 0.0938998528017


In [14]:
# Predict for the test features with the current `model`; negative values are raised to 0.
y_pred = np.maximum(model.predict(X_test), 0)

# Single TARGET column keyed by the test Ids, written out as the submission file.
df_output = pd.Series(y_pred, index=df_test.index, name=TARGET).to_frame()
df_output.to_csv('../../output/submission.csv')
print('Shape', df_output.shape)
df_output.head()

Shape (1459, 1)


Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,123566.851562
1462,161948.671875
1463,173148.015625
1464,185096.046875
1465,191149.0
