Load the training data set into the environment

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Path to the training CSV, relative to the notebook's working directory.
housing_file_path = 'home-data-for-ml-course/train.csv'
# Read the whole file into a DataFrame; every later cell works from `housing_data`.
housing_data = pd.read_csv(housing_file_path) 


Data Exploration

In [3]:
# Show the dtype of each column; `object` columns are non-numeric (typically strings).
housing_data.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [4]:
# Summary statistics for the numeric columns; the `count` row excludes missing values.
housing_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
# List every column name, to pick candidate feature columns from.
housing_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [6]:
# (number of rows, number of columns) of the raw training data.
housing_data.shape

(1460, 81)

In [7]:
# With axis=0 and the default how='any', dropna removes every row that contains
# at least one missing value in any column.
filtered_housing_data = housing_data.dropna(axis=0)

In [8]:
# Check how many rows are left after dropping rows with missing values.
filtered_housing_data.shape

(0, 81)

Dropping rows with 'na' values across the entire DataFrame is not helpful with this data set: every row has an NA value in at least one column, so housing_data.dropna() drops all of the rows and yields an empty DataFrame

Use the unfiltered 'housing_data' DataFrame from here on

Define the prediction target (y): the 'SalePrice' column

In [9]:
# Prediction target: the sale price column of the raw data.
y = housing_data['SalePrice']

Define feature columns (X)

In [10]:
# Only numeric columns are selected here: scikit-learn's tree estimators expect numeric
# feature arrays, so string-valued columns have to be encoded as numbers
# (e.g. one-hot encoded) before they can be passed to .fit().

features = ['LotArea', 'OverallCond', 'YearBuilt', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea']

# Wider candidate list, kept disabled for reference. It adds Neighborhood, BldgType,
# HouseStyle and SaleCondition, which are presumably non-numeric (SaleCondition is
# dtype object) and would need encoding first:
#features = ['LotArea', 'Neighborhood', 'BldgType', 'HouseStyle', 'OverallCond', 'YearBuilt', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'SaleCondition']

# Feature matrix: the selected columns of the raw data, followed by a quick summary.
X = housing_data[features]
X.describe()

Unnamed: 0,LotArea,OverallCond,YearBuilt,FullBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,GarageArea
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,5.575342,1971.267808,1.565068,2.866438,1.046575,6.517808,472.980137
std,9981.264932,1.112799,30.202904,0.550916,0.815778,0.220338,1.625393,213.804841
min,1300.0,1.0,1872.0,0.0,0.0,0.0,2.0,0.0
25%,7553.5,5.0,1954.0,1.0,2.0,1.0,5.0,334.5
50%,9478.5,5.0,1973.0,2.0,3.0,1.0,6.0,480.0
75%,11601.5,6.0,2000.0,2.0,3.0,1.0,7.0,576.0
max,215245.0,9.0,2010.0,3.0,8.0,3.0,14.0,1418.0


-----DECISION TREE REGRESSOR-----

In [11]:
# Hold out part of the rows for validation; random_state pins the split so the same
# rows land in each set on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

# Seed the tree as well: DecisionTreeRegressor permutes the features while searching
# for splits, so without random_state the fitted tree (and therefore its validation
# MAE) can differ from one run to the next.
model = DecisionTreeRegressor(random_state=1)
model.fit(train_X, train_y)

# Predictions for the held-out rows, scored in the next cell.
predictions = model.predict(val_X)

Validate the model with the mean absolute error (MAE)

In [17]:
# Mean absolute error of the decision tree's predictions on the validation rows.
mean_absolute_error(val_y, predictions)

33595.19726027397

Calculate the MAE for different maximum leaf-node counts to identify the one with the lowest validation error.  The leaf count is passed into DecisionTreeRegressor through its max_leaf_nodes argument

In [13]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A DecisionTreeRegressor (random_state=0) is fitted on ``train_X``/``train_y``,
    used to predict ``val_X``, and the predictions are scored against ``val_y``
    with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [14]:
# compare MAE with differing values of max_leaf_nodes
# Key the results by tree size rather than by MAE: with MAE as the key, two sizes that
# happen to score the same MAE would silently overwrite each other.
leaf_mae = dict()
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    leaf_mae[max_leaf_nodes] = my_mae

# Pick the size with the lowest validation MAE; on a tie min() keeps the first
# candidate, i.e. the smaller (simpler) tree.
best_tree_size = min(leaf_mae, key=leaf_mae.get)
print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(best_tree_size, leaf_mae[best_tree_size]))

Max leaf nodes: 50  		 Mean Absolute Error:  33079


Now that the best maximum number of leaf nodes has been found, fit the model on the full data set using that value

In [15]:
# Refit on all rows (X, y) instead of only the training split, capping the tree at the
# leaf count selected above.
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
final_model.fit(X, y)

-----RANDOM FOREST-----

More accurate predictions can be made by training a Random Forest, which is a collection of decision trees.
This can be achieved in sklearn with RandomForestRegressor.
Random forests generally work reasonably well even without the kind of parameter tuning done above for the single decision tree.

In [16]:
# Fit a random forest on the same training split and score it on the same validation
# split used for the decision tree, so the two MAE values can be compared directly.
forest_model = RandomForestRegressor(random_state=1)  # define the model

forest_model.fit(train_X, train_y)  # fit the model with data
# NOTE(review): the `melb_` prefix looks like a leftover from another data set;
# these are the forest's predictions for val_X.
melb_preds = forest_model.predict(val_X)  # run predictions
print(mean_absolute_error(val_y, melb_preds))  # model validation

24971.913515981734


Tree Regressor MAE:  33079
Forest Regressor MAE:  24971