The "Housing Prices" competition is a regression problem: given some info on a house, we determine its selling price.

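Submissions are evaluated on the Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. Note that the experiments below compare preprocessing approaches using the mean absolute error (MAE) on a held-out validation split, not the competition metric.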

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # graph

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# On Kaggle the inputs live under '/kaggle/input'; here the current working
# directory is walked instead.
wdir = os.getcwd()

# Print the full path of every file found beneath the working directory
for folder, _, names in os.walk(wdir):
    for name in names:
        print(os.path.join(folder, name))

D:\Projects\Kaggle-Projects\Competitions\Housing Prices\housing-prices_categorical-variables.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\housing-prices_intermediate-ML.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\housing-prices_missing-values.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\housing-prices_pipelines.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\submission.csv
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\.ipynb_checkpoints\housing-prices_categorical-variables-checkpoint.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\.ipynb_checkpoints\housing-prices_intermediate-ML-checkpoint.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\.ipynb_checkpoints\housing-prices_missing-values-checkpoint.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\.ipynb_checkpoints\housing-prices_pipelines-checkpoint.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\input\data_

# Step 1: Gather the Data

### Checking the format of the data

In [2]:
# Read the training and test CSV files, using the "Id" column as the row index
train_data, test_data = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in ("input/train.csv", "input/test.csv")
)

# train_data.head()
# test_data.head()

# Step 2: Prepare the Data

In [3]:
# Split the training frame into the feature matrix and the prediction target
X_full = train_data.drop(columns=["SalePrice"])
y = train_data["SalePrice"]

Preliminary investigation concerning missing values

In [4]:
# Overall size of the predictor matrix
print("X_full.shape = ", X_full.shape)

# Per-column count of missing entries in the training data
missing_val_count_by_column = X_full.isna().sum()

# Show only the columns that actually contain missing values
print(missing_val_count_by_column.loc[missing_val_count_by_column > 0])

X_full.shape =  (1460, 79)
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [5]:
# Simplest way to handle missing data: discard every column that has any gap
na_mask = X_full.isnull().any()
cols_with_na = na_mask[na_mask].index.tolist()
X_reduced = X_full.drop(columns=cols_with_na)

print(cols_with_na)
print("X_reduced.shape = ", X_reduced.shape)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
def has_low_cardinality(df, col):
    """Report whether `df[col]` is an object-dtype column with fewer than 10 unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    col : hashable
        Label of the column to inspect.

    Returns
    -------
    bool
        True only when the column's dtype is "object" and its `nunique()`
        count is below 10.
    """
    column = df[col]
    # Non-object columns are never treated as categorical here
    if column.dtype != "object":
        return False
    return column.nunique() < 10

# Object-dtype columns that pass the low-cardinality check
low_cardinality_cols = list(
    filter(lambda name: has_low_cardinality(X_reduced, name), X_reduced.columns)
)
print(low_cardinality_cols)

# Columns stored as int64 or float64
numerical_cols = [
    name for name, dtype in X_reduced.dtypes.items() if dtype in ("int64", "float64")
]

# Restrict the feature matrix to the two selected groups (categorical columns first)
my_cols = [*low_cardinality_cols, *numerical_cols]
X = X_reduced.loc[:, my_cols].copy()

# Divide data into training and validation subsets
from sklearn.model_selection import train_test_split
# 80/20 train/validation split with a fixed random_state
split_options = dict(train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
X_reduced.shape =  (1460, 60)
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']


## Function to Evaluate the Approaches

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and score it on the validation split.

    A `RandomForestRegressor` with 100 trees and `random_state=0` is trained
    on (`X_train`, `y_train`); its predictions for `X_valid` are compared
    with `y_valid`.

    Returns
    -------
    The mean absolute error between `y_valid` and the predictions.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

### Approaches to the Preprocessing of Categorical Variables

In [7]:
# Names of the object-dtype (categorical) columns in the training split
object_cols = [name for name, dtype in X_train.dtypes.items() if dtype == "object"]

print("Categorical variables:")
print(object_cols)

Categorical variables:
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']


### Score from Approach 1 (Drop Categorical Variables)

In [8]:
# Approach 1: keep only the non-object columns of each split
drop_X_train, drop_X_valid = (
    split.select_dtypes(exclude=['object']) for split in (X_train, X_valid)
)

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
17837.82570776256


### Score from Approach 2 (Label Encoding)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Work on copies so the original splits stay untouched
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Replace each categorical column with integer codes.
# The encoder is fitted on X (the frame both splits were drawn from) rather
# than on X_train alone, so every category that appears in the validation
# split has a code; LabelEncoder.transform raises on labels it was not fitted on.
# Only fit() is needed here: the encoded values of X itself are never used.
label_encoder = LabelEncoder()
for col in object_cols:
    label_encoder.fit(X[col])
    label_X_train[col] = label_encoder.transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

print("MAE from Approach 2 (Label Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Label Encoding):
17340.801792237442


### Score from Approach 3 (One-Hot Encoding)

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Build an encoder that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it.
try:
    OH_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(sparse=False)

# Fit on X (the frame both splits were drawn from) so that categories which
# occur only in the validation split are known to the encoder.
# Only fit() is needed: the encoded values of X itself are never used.
OH_encoder.fit(X[object_cols])

# Encode each split; the encoder returns a plain array, so re-attach the row index
OH_cols_train = pd.DataFrame(OH_encoder.transform(X_train[object_cols]), index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]), index=X_valid.index)

# The encoded columns are labelled 0..n-1. Convert the labels to strings so the
# combined frame does not mix str and int column names, which recent
# scikit-learn releases reject when fitting on a DataFrame.
OH_cols_train.columns = OH_cols_train.columns.astype(str)
OH_cols_valid.columns = OH_cols_valid.columns.astype(str)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print("MAE from Approach 3 (One-Hot Encoding):")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE from Approach 3 (One-Hot Encoding):
17425.430856164385
