In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
# Load the competition data, indexed by house Id.
# Training rows without a SalePrice cannot be used for supervised learning, so drop them.
X = (
    pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv', index_col='Id')
    .dropna(axis=0, subset=['SalePrice'])
)
X_test = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv', index_col='Id')

# Split the target off from the predictors: pop() returns the column and removes it from X.
y = X.pop('SalePrice')

# Hold out 20% of the training rows as a validation set (seeded for reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [3]:
# Categorical (object-dtype) columns in the training split, in column order.
is_object_col = X_train.dtypes == "object"
object_cols = list(X_train.columns[is_object_col])

# A categorical column is "good" when every value seen in the validation split
# also occurs in the training split; otherwise an encoder fit on the training
# split would meet values it has never seen.
good_label_cols = []
for col in object_cols:
    if set(X_valid[col]) <= set(X_train[col]):
        good_label_cols.append(col)

# The remaining categorical columns are problematic and get dropped everywhere.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

# Remove the problematic columns from the training, validation and test frames
# so all three keep the same set of columns.
X_train_clean, X_valid_clean, X_test_clean = (
    frame.drop(bad_label_cols, axis=1) for frame in (X_train, X_valid, X_test)
)


In [4]:
# Number of distinct (non-missing) values per usable categorical column,
# measured on the training split.
label_cardinality = X_train[good_label_cols].nunique()

# Columns with fewer than 10 distinct values will be one-hot encoded.
low_cardinality_cols = [col for col in good_label_cols if label_cardinality[col] < 10]

# Every other usable categorical column is treated as high-cardinality.
high_cardinality_cols = [col for col in good_label_cols if col not in low_cardinality_cols]

In [5]:
# Preprocessing the categorical data.
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# (the old name was removed in 1.4), so try the new keyword first and fall back
# to the old one on older installs.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Values that were not present when the encoder was fit are encoded as -1
# instead of raising. This lets the test set be transformed with the mapping
# learned from the training split rather than re-fitting on the test data.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# One-hot encode the low-cardinality columns; the encoder is fit on the training split only.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train_clean[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid_clean[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test_clean[low_cardinality_cols]))

# The encoder returns bare numpy arrays, so restore the original row index
# to keep the rows aligned when the frames are concatenated below.
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = X_test.index

# Give the one-hot columns descriptive string names ("<column>_<category>").
# Leaving the default integer labels would mix int and str column names after
# the concat, which recent scikit-learn versions reject when fitting.
OH_feature_names = [
    f"{col}_{category}"
    for col, categories in zip(low_cardinality_cols, OH_encoder.categories_)
    for category in categories
]
OH_cols_train.columns = OH_feature_names
OH_cols_valid.columns = OH_feature_names
OH_cols_test.columns = OH_feature_names

# Ordinal-encode the high-cardinality columns. The mapping is learned from the
# training split and then reused unchanged for validation and test, so a given
# category always gets the same integer code the model was trained on.
X_train_clean[high_cardinality_cols] = ordinal_encoder.fit_transform(X_train[high_cardinality_cols])
X_valid_clean[high_cardinality_cols] = ordinal_encoder.transform(X_valid[high_cardinality_cols])
X_test_clean[high_cardinality_cols] = ordinal_encoder.transform(X_test[high_cardinality_cols])

# Drop the original low-cardinality columns; their one-hot versions replace them.
num_X_train = X_train_clean.drop(low_cardinality_cols, axis=1)
num_X_valid = X_valid_clean.drop(low_cardinality_cols, axis=1)
num_X_test = X_test_clean.drop(low_cardinality_cols, axis=1)

# Append the one-hot columns to the remaining features.
OH_X_train_clean = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid_clean = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test_clean = pd.concat([num_X_test, OH_cols_test], axis=1)

In [6]:
# Deal with missing data: fill each gap with that column's mean (the
# SimpleImputer default). The statistics are learned from the training split
# only and then reused for the validation and test sets.
my_imputer = SimpleImputer()

# The imputer returns bare numpy arrays, so rebuild each DataFrame with the
# original column names and row index (house Id) instead of a fresh RangeIndex.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(OH_X_train_clean),
                               columns=OH_X_train_clean.columns,
                               index=OH_X_train_clean.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(OH_X_valid_clean),
                               columns=OH_X_valid_clean.columns,
                               index=OH_X_valid_clean.index)
imputed_X_test = pd.DataFrame(my_imputer.transform(OH_X_test_clean),
                              columns=OH_X_test_clean.columns,
                              index=OH_X_test_clean.index)



In [7]:
# Train a random forest on the fully preprocessed training split.
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(imputed_X_train, y_train)

# Score the model on the held-out validation split (mean absolute error).
preds_valid = model.predict(imputed_X_valid)
validation_mae = mean_absolute_error(y_valid, preds_valid)
print(validation_mae)

# NOTE(review): an earlier version of this step reportedly failed, apparently
# because the test data has missing values that the training/validation data
# does not. Confirm the preprocessing cells handle those before relying on this.
preds_test = model.predict(imputed_X_test)



17439.449280821915




In [8]:
# Assemble the submission table (one predicted SalePrice per test-set Id)
# and write it to disk without the DataFrame's own row index.
submission_columns = {'Id': X_test.index, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('housing_data_submission.csv', index=False)