In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Load the competition data, using the 'Id' column as the row index.
X = pd.read_csv('~/Desktop/Kaggle/practice/Intro_Machine_Learning/House_Price_Competition/train.csv', index_col='Id')
X_test = pd.read_csv('~/Desktop/Kaggle/practice/Intro_Machine_Learning/House_Price_Competition/test.csv', index_col='Id')

# Discard rows that have no target value, then split the target off from the
# predictors.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Drop every column that has at least one missing value in the training data.
# The same columns are removed from the test set so both frames keep an
# identical column layout (the test set may still contain other gaps).
cols_with_missing = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

# Hold out 20% of the training rows as a validation set (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [3]:
#One-Hot Encoding Method:

# All categorical columns
object_cols_train = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that will be one-hot encoded
low_cardinality_cols_train = [col for col in object_cols_train if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset
high_cardinality_cols_train = list(set(object_cols_train)-set(low_cardinality_cols_train))

#print result
print('Categorical columns that will be one-hot encoded:', low_cardinality_cols_train)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols_train)

Categorical columns that will be one-hot encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Neighborhood', 'Exterior2nd', 'Exterior1st']


In [17]:
# Fill the missing values that remain in X_test.
# The imputer learns each column's most frequent value from the training split
# and applies it to the test set, so no statistic is fitted on test data.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X_train)
imputed_X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,  # keep the original Id index instead of a fresh 0..n-1 range
)
# The imputer returns a single mixed-type array, which would leave every column
# as dtype object; restore the dtypes the test columns had before imputation.
imputed_X_test = imputed_X_test.astype(X_test.dtypes.to_dict())

# Apply one-hot encoder to each low-cardinality categorical column.
# The dense-output flag is `sparse_output` from scikit-learn 1.2 on; older
# releases only know the previous name `sparse`.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder returns plain arrays, so each frame is given back the index of
# the frame it was computed from. The encoder is fitted on the training split only.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[low_cardinality_cols_train]),
    index=X_train.index,
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[low_cardinality_cols_train]),
    index=X_valid.index,
)
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(imputed_X_test[low_cardinality_cols_train]),
    index=imputed_X_test.index,
)

# Remove all categorical columns (low-cardinality ones are replaced by their
# one-hot encoding, high-cardinality ones are dropped entirely)
num_X_train = X_train.drop(object_cols_train, axis=1)
num_X_valid = X_valid.drop(object_cols_train, axis=1)
num_X_test = imputed_X_test.drop(object_cols_train, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# The one-hot columns are labelled with integers while the numeric columns have
# string names; recent scikit-learn versions require all feature names to be
# strings, so normalise the labels.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

# Fit on the training split and report the mean absolute error on the validation split
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(OH_X_train, y_train)
preds = model.predict(OH_X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 17525.345719178084


In [None]:
# Fit a random forest on the one-hot encoded training split and predict the
# target for the encoded test set.
model = RandomForestRegressor(n_estimators=100, random_state=0)
preds = model.fit(OH_X_train, y_train).predict(OH_X_test)

# Pair each test Id with its predicted SalePrice and write the submission CSV.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds})
output.to_csv('categorical_variables_submission.csv', index=False)