In [1]:
import pandas as pd
import numpy as np

# Relative locations of the training and test CSV files.
train_file_path, test_file_path = 'data/train.csv', 'data/test.csv'

# Read both files into DataFrames (training file first, then test file).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

In [2]:
from sklearn.model_selection import train_test_split


In [3]:
# Candidate predictors: every column whose dtype is not `object`,
# minus the 'Id' and 'SalePrice' columns.
non_object_columns = train_data.select_dtypes(exclude='object').columns
house_features = non_object_columns.drop(['Id', 'SalePrice']).tolist()

# Show the selected feature names followed by how many there are.
print(house_features, len(house_features), sep='\n')

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
36


In [4]:
# Split the selected features and the SalePrice target into an 80% training
# part and a 20% validation part; random_state=0 keeps the split repeatable.
X_train, valid_X, y_train, valid_y = train_test_split(
    train_data[house_features],
    train_data['SalePrice'],
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and score it on validation.

    Parameters
    ----------
    X_train, y_train : features and target the forest is fitted on.
    X_valid, y_valid : features and target used only for scoring.

    Returns
    -------
    The mean absolute error between `y_valid` and the forest's predictions
    for `X_valid`.
    """
    # Fixed n_estimators and random_state keep repeated calls comparable.
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    forest.fit(X_train, y_train)
    validation_predictions = forest.predict(X_valid)
    return mean_absolute_error(y_valid, validation_predictions)

# Function for fitting the model and saving its test-set predictions to a CSV file
def fit_and_save(X_train, y_train, test_data, output_file_name):
    """Fit a random forest and write its test-set predictions to a CSV file.

    Parameters
    ----------
    X_train : training features. When this is a DataFrame, its columns
        decide which columns of `test_data` are fed to the model; otherwise
        the notebook-level `house_features` list is used, as before.
    y_train : training target.
    test_data : DataFrame holding an 'Id' column plus the feature columns.
    output_file_name : path of the CSV file to write.

    The output file has two columns, 'Id' and 'SalePrice', and no index.

    Raises
    ------
    KeyError
        If `test_data` lacks one of the feature columns.
    """
    if hasattr(X_train, 'columns'):
        # Use exactly the columns the model is trained on rather than a
        # notebook global that may have drifted from them.
        feature_columns = list(X_train.columns)
        train_means = X_train.mean()
    else:
        # Array-like input carries no names; keep the previous behaviour.
        feature_columns = house_features
        train_means = pd.DataFrame(X_train, columns=feature_columns).mean()

    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)

    # The test features were previously passed to predict() untouched, so any
    # missing value reached the model even though it was fitted on imputed
    # data. Fill gaps with the training column means. If X_train was
    # mean-imputed, these equal the means of its originally observed values,
    # because replacing gaps with the mean leaves the column mean unchanged.
    # NOTE(review): a feature that is entirely missing in training has no
    # mean and stays unfilled - confirm that cannot happen for this data.
    X_test = test_data[feature_columns].fillna(train_means)

    preds_test = model.predict(X_test)
    output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': preds_test})
    output.to_csv(output_file_name, index=False)

In [9]:
from sklearn.impute import SimpleImputer

# Mean imputation: the fill values are learned from the training split only
# and then reused, unchanged, on the validation split.
my_imputer = SimpleImputer(strategy='mean')

# Wrap the imputer output in DataFrames and put the original column names
# back (the fresh DataFrames get a new default row index).
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train), columns=X_train.columns
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(valid_X), columns=valid_X.columns
)

# Train on the imputed training split and write the test predictions.
fit_and_save(imputed_X_train, y_train, test_data, 'submission.csv')