In [20]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Load data
main_file_path = '/Users/ChesterHuynh/learningml/Kaggle/Tutorials/house-prices-advanced-regression-techniques/train.csv'
data = pd.read_csv(main_file_path)

housing_target = data.SalePrice
housing_predictors = data.drop(['SalePrice'], axis=1) 
# Dataframe axis=1, means left to right across columns, axis=0, 
# means up to down across rows

# Only using numerical predictors, dropping any "objects"
housing_num_predictors = housing_predictors.select_dtypes(exclude=['object'])


# Defining a function to validate the model quantitatively
X_train, X_test, y_train, y_test = train_test_split(housing_num_predictors, housing_target, train_size = 0.7, test_size = 0.3, random_state = 0)
def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds) # MAE = (Sum(|y-yhat|))/n

In [23]:
# 1) Simply Dropping Columns with Missing Values
cols_with_missing = [col for col in train_X.columns 
                     if X_train[col].isnull().any()]
# .any() basically gives a bool of whether or not an array has a NaN
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values: ")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

Mean Absolute Error from dropping columns with Missing Values: 
19847.9824201


In [25]:
# 2) Imputation - filling in missing values with some number
from sklearn.preprocessing import Imputer
my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(X_train) 
# imputer.fit_transform does not replace the initial array, it makes a new one
imputed_X_test = my_imputer.fit_transform(X_test)
print("Mean Absolute Error from Imputation: ")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))


Mean Absolute Error from Imputation: 
18743.2084475


In [33]:
# 3) Imputation with a consideration of which values were originally missing

# making copy to avoid changing original data
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

# make new columns indicating what will be imputed
cols_with_missing = (col for col in X_train.columns
                                if X_train[col].isnull().any())
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()
    
# Imputation
my_imputer = Imputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.fit_transform(imputed_X_test_plus)

print("Mean Absolute Error from Imputation while Track What Was Imputed: ")
print(score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))

Mean Absolute Error from Imputation while Track What Was Imputed: 
18879.6273973
