In [2]:
# Categorical data - takes on a limited number of values
# Usually will get errors if you try to directly plug these variables in
# One-hot Encoding is useful for categorical variables that don't take on
# too many values - it createst new (binary) columns, indicating the
# presence of each possible value from the original data

In [16]:
# Read in data
import pandas as pd
train_data = pd.read_csv('/Users/ChesterHuynh/learningml/Kaggle/Tutorials/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('/Users/ChesterHuynh/learningml/Kaggle/Tutorials/house-prices-advanced-regression-techniques/test.csv')

# Drop houses where the target value is missing
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)

target = train_data.SalePrice

# We drop columns with missing values for simplicity
cols_with_missing = [col for col in train_data.columns
                                if train_data[col].isnull().any()]
candidate_train_predictors = train_data.drop(['Id', 'SalePrice'] + cols_with_missing, axis=1)
candidate_test_predictors = train_data.drop(['Id'] + cols_with_missing, axis=1)

# cardinality := number of unique values in a column (useful for categorical values)
# Use it to select categorical columns that take on low amount of values
low_cardinality_cols = [cname for cname in candidate_train_predictors.columns
                               if candidate_train_predictors[cname].nunique() < 10 and
                                   candidate_train_predictors[cname].dtype == "object"]
numeric_cols = [cname for cname in candidate_train_predictors.columns
                        if candidate_train_predictors[cname].dtype in ["int64", "float64"]]

my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [21]:
print(train_predictors.dtypes.sample(10))
print(train_predictors.shape)

RoofStyle       object
MiscVal          int64
KitchenQual     object
2ndFlrSF         int64
TotalBsmtSF      int64
YrSold           int64
BsmtFinSF2       int64
BedroomAbvGr     int64
LotArea          int64
BldgType        object
dtype: object
(1460, 57)


In [22]:
# Object indicates a column has text. Most common to one-hot encode these "object" columns
# Pandas offers a convenient function called get_dummies to get one-hot encodings
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

In [24]:
# Alternatively, we could've just droped the categoricals. Let's compare the two approaches
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

def get_mae(X, y):
    # multiply by -1 to make positive MAE score instead of neg value returned as sklearn convention
    return -1 * cross_val_score(RandomForestRegressor(50), X, y, scoring = 'neg_mean_absolute_error').mean()

# 1) One-hot encoded categoricals as well as numeric predictors
predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

mae_without_categoricals = get_mae(predictors_without_categoricals, target)

mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))

print('Mean Abslute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))

Mean Absolute Error when Dropping Categoricals: 18462
Mean Abslute Error with One-Hot Encoding: 17883


In [25]:
# When comparing multiple files, you need to ensure that the ordering of columns is consistent 
# and aligned across training and test datasets, otherwise, you get nonsense

# align command makes sure the columns show up in the same order in both datasets
# join='left' specifies that if there are ever columns that show up in one dataset and not the other, 
# we will keep exactly the columns from our training data (i.e. left dataset input).
# join='inner' would keep only the columns shared by both datasets

one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)
final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors, join='left', axis=1)