In [42]:
import pandas as pd

train_data = pd.read_csv(".\\Programming\\Kaggle\\houses\\train.csv")
test_data = pd.read_csv('.\\Programming\\Kaggle\\houses\\test.csv')

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

target = train_data.SalePrice

print(test_data.columns)

print(train_data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [48]:
cols_with_missing = [col for col in train_data.columns 
                                 if train_data[col].isnull().any()]                                  
candidate_train_predictors = train_data.drop(['Id', 'SalePrice'] + cols_with_missing, axis=1)
candidate_test_predictors = test_data.drop(['Id'] + cols_with_missing, axis=1)

low_cardinality_cols = [cname for cname in candidate_train_predictors.columns if
                                candidate_train_predictors[cname].nunique()<10 and
                                candidate_train_predictors[cname].dtype == 'object']
numeric_cols = [cname for cname in candidate_train_predictors if
                candidate_train_predictors[cname].dtype in ['int64', 'float64']]

my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [49]:
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

In [52]:
from sklearn.model_selection import cross_val_score

def get_mae(X,y):
    return -1 * cross_val_score(RandomForestRegressor(50), X, y,
                                scoring = 'neg_mean_absolute_error').mean()

predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

mae_without_categoricals = get_mae(predictors_without_categoricals, target)

mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

print('MAE when dropping categoricals:', mae_without_categoricals)
print('MAE when onehot encoding categoricals:', mae_one_hot_encoded)



MAE when dropping categoricals: 18435.216237821212
MAE when onehot encoding categoricals: 18095.09519594505


In [53]:
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)
final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors,
                                                                    join='left', 
                                                                    axis=1)


In [22]:
X_train, X_val, y_train, y_val = train_test_split(iowa_numeric_predictors,
                                                  iowa_target,
                                                  train_size=0.7,
                                                  test_size=0.3,
                                                  random_state=0)

def score_dataset(X_train, X_val, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return mean_absolute_error(y_test,preds)

In [27]:
#Dropping mising
cols_with_missing = [col for col in X_train.columns
                                if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_val = X_val.drop(cols_with_missing, axis=1)
print("MAE for dropping cols with missing values:")
print(score_dataset(reduced_X_train,reduced_X_val,y_train,y_val))

MAE for dropping cols with missing values:
18953.095890410958




In [29]:
#Imputation
from sklearn.preprocessing import Imputer

my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_val = my_imputer.fit_transform(X_val)
print("MAE from imputation:")
print(score_dataset(imputed_X_train, imputed_X_val, y_train, y_val))

MAE from imputation:
19838.903652968038




In [38]:
imputed_X_train_plus = X_train.copy()
imputed_X_val_plus = X_val.copy()

#cols_with_missing = (col for col in X_train.columns
#                                  if X_train[col].isnull().any()

for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_val_plus[col+'_was_missing'] = imputed_X_val_plus[col].isnull()
    
#imp
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_val_plus = my_imputer.fit_transform(imputed_X_val_plus)

print(imputed_X_val_plus)

print("MAE from imputation while tracking whats imputed:")
print(score_dataset(imputed_X_train_plus, imputed_X_val_plus, y_train, y_val))

[[5.30000000e+02 2.00000000e+01 7.09130435e+01 ... 1.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [4.92000000e+02 5.00000000e+01 7.90000000e+01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [4.60000000e+02 5.00000000e+01 7.09130435e+01 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [6.55000000e+02 2.00000000e+01 9.10000000e+01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.28100000e+03 2.00000000e+01 6.70000000e+01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [8.99000000e+02 2.00000000e+01 1.00000000e+02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]
MAE from imputation while tracking whats imputed:
19503.22283105023


