The "Housing Prices" competition is a regression problem: given descriptive features of a house, we predict its sale price.

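Submissions are evaluated on the Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sale price. Note that the validation and cross-validation cells in this notebook report Mean Absolute Error (MAE) in dollars instead, so their scores are not directly comparable to the competition metric.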

In [1]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print("Setup Complete")

Setup Complete


# Step 1: Gather the Data

### Checking the format of the data

In [2]:
# Read both competition files the same way, using the "Id" column as the row index
train_data, test_data = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

# Step 2: Prepare the Data

In [3]:
# The sale price is the prediction target; every other column is a candidate predictor
X_full = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

In [4]:
# Simplest way to deal with missing data: discard every column that contains a NaN
cols_with_na = X_full.columns[X_full.isnull().any()].tolist()
X_reduced = X_full.drop(columns=cols_with_na)

print(cols_with_na)
print("X_reduced.shape = ", X_reduced.shape)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
def has_low_cardinality(df, col, max_unique=10):
    """Tell whether ``df[col]`` is a text column with only a few distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    col : label
        Name of the column to inspect.
    max_unique : int, default 10
        Exclusive upper bound on the number of distinct values; the default
        reproduces the previously hard-coded threshold.

    Returns
    -------
    bool
        True when the column holds object/string data and has fewer than
        ``max_unique`` distinct values, False otherwise.
    """
    column = df[col]
    # Text may be stored as legacy ``object`` or as one of pandas' dedicated
    # string dtypes; comparing against "object" alone would miss the latter.
    is_categorical = (
        pd.api.types.is_object_dtype(column.dtype)
        or pd.api.types.is_string_dtype(column.dtype)
    )
    # nunique() skips missing values by default, so NaN is not counted as a category.
    return bool(is_categorical and column.nunique() < max_unique)

categorical_cols = list(
    filter(lambda name: has_low_cardinality(X_reduced, name), X_reduced.columns)
)
print(categorical_cols)

# Numerical columns are those stored as 64-bit integers or floats
numerical_cols = [
    name
    for name, dtype in X_reduced.dtypes.items()
    if dtype in ('int64', 'float64')
]

# Restrict the feature matrix to the chosen columns (categorical first, then numerical)
my_cols = categorical_cols + numerical_cols
X = X_reduced.loc[:, my_cols].copy()

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
X_reduced.shape =  (1460, 60)
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']


## Defining the Preprocessing Pipeline

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing entries using SimpleImputer's 'constant' strategy
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing entries with the most frequent value,
# then one-hot encode (handle_unknown='ignore' keeps unseen categories from raising)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

## Select the model

In [6]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Candidate 1: random forest with a fixed seed
model_rfr = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)

# Candidate 2: gradient-boosted trees with a generous cap on boosting rounds
model_xgb = XGBRegressor(
    learning_rate=0.05,
    n_estimators=1000,
    n_jobs=4,
)

## Create and Evaluate the Pipeline

In [7]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_xgb),
])

# The eval_set is forwarded to XGBoost as-is and does not pass through the
# pipeline's preprocessing step, so the validation features are transformed
# by hand with a preprocessor fitted on the training rows only.
preprocessor.fit(X_train)
X_valid_transformed = preprocessor.transform(X_valid)

# early_stopping_rounds is an estimator parameter in XGBoost >= 1.6; passing it
# to fit() was deprecated there and removed in XGBoost 2.0.
my_pipeline.set_params(model__early_stopping_rounds=5)

my_pipeline.fit(
    X_train, y_train,
    model__eval_set=[(X_valid_transformed, y_valid)],
    model__verbose=True,
)

predictions = my_pipeline.predict(X_valid)
# NOTE: the same validation set also drives early stopping, so this MAE is an
# optimistic estimate. Arguments follow scikit-learn's (y_true, y_pred) order.
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

[0]	validation_0-rmse:190107.76562
[1]	validation_0-rmse:181148.23438
[2]	validation_0-rmse:172722.95312
[3]	validation_0-rmse:164668.25000
[4]	validation_0-rmse:157119.00000
[5]	validation_0-rmse:150043.76562
[6]	validation_0-rmse:143190.25000
[7]	validation_0-rmse:136573.67188
[8]	validation_0-rmse:130510.42969
[9]	validation_0-rmse:124651.79688
[10]	validation_0-rmse:119300.82031
[11]	validation_0-rmse:113977.19531
[12]	validation_0-rmse:108955.65625
[13]	validation_0-rmse:104182.27344
[14]	validation_0-rmse:99604.54688
[15]	validation_0-rmse:95340.80469
[16]	validation_0-rmse:91263.12500
[17]	validation_0-rmse:87490.56250
[18]	validation_0-rmse:83952.66406
[19]	validation_0-rmse:80471.41406
[20]	validation_0-rmse:77251.25000
[21]	validation_0-rmse:74092.60938
[22]	validation_0-rmse:71221.00781
[23]	validation_0-rmse:68510.76562
[24]	validation_0-rmse:65979.81250
[25]	validation_0-rmse:63577.80469
[26]	validation_0-rmse:61343.36719
[27]	validation_0-rmse:59112.32812
[28]	validation_

In [8]:
from sklearn.model_selection import cross_val_score

# Same preprocessing, but with a fixed number of boosting rounds.
# NOTE(review): 123 presumably comes from the early-stopping run in the previous cell — confirm.
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(learning_rate=0.05, n_estimators=123, n_jobs=4)),
])

# The 'neg_mean_absolute_error' scorer returns negated MAE, so flip the sign back
scores = -cross_val_score(
    final_pipeline,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=4,
    verbose=True,
)

print("MAE scores:\n", scores)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


MAE scores:
 [16662.45977445 16732.63641395 16671.21772119]


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    2.7s finished
