In [15]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [1]:
import pandas as pd

# Relative location of the Melbourne housing snapshot CSV
melbourne_file_path = 'melbourne-housing-snapshot/melb_data.csv'

# Read the whole CSV into a DataFrame
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)

# Show the available column names (last expression -> cell output)
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

Clean up the data. In this case, drop every row (axis=0) that has a missing value in any column.

In [2]:
# Drop every row that has a missing value in ANY column (not only Price)
filtered_melbourne_data = melbourne_data.dropna(axis=0)

# Compare row/column counts before and after the drop
print(melbourne_data.shape, filtered_melbourne_data.shape, sep='\n')

(13580, 21)
(6196, 21)


In [3]:
# Prediction target: the sale price column of the cleaned data
y = filtered_melbourne_data['Price']
print(y)

1        1035000.0
2        1465000.0
4        1600000.0
6        1876000.0
7        1636000.0
           ...    
12205     601000.0
12206    1050000.0
12207     385000.0
12209     560000.0
12212    2450000.0
Name: Price, Length: 6196, dtype: float64


In [4]:
# Columns used as model inputs (order is kept as-is; spelling follows the CSV headers)
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'BuildingArea',
    'YearBuilt',
    'Longtitude',
]

# Feature matrix: only the chosen columns of the cleaned data
X = filtered_melbourne_data[melbourne_features]

# Summary statistics of the features (last expression -> cell output)
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,BuildingArea,YearBuilt,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,141.568645,1964.081988,144.990201
std,0.971079,0.711362,897.449881,0.07585,90.834824,38.105673,0.099165
min,1.0,1.0,0.0,-38.16492,0.0,1196.0,144.54237
25%,2.0,1.0,152.0,-37.855438,91.0,1940.0,144.926198
50%,3.0,1.0,373.0,-37.80225,124.0,1970.0,144.9958
75%,4.0,2.0,628.0,-37.7582,170.0,2000.0,145.0527
max,8.0,8.0,37000.0,-37.45709,3112.0,2018.0,145.52635


Build models with scikit-learn library

Steps to building a model:

    1. Define: What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.
    2. Fit: Capture patterns from provided data. This is the heart of modeling.
    3. Predict: Just what it sounds like
    4. Evaluate: Determine how accurate the model's predictions are.

Define a decision tree model with scikit-learn and fit it (<>.fit()) with the features and target variable

Using sklearn.tree.DecisionTreeRegressor

ML models allow randomness in model training, so defining a random_state ensures the same result each run (considered good practice).  Choose any number - the model quality won't depend on it

In [5]:
from sklearn.tree import DecisionTreeRegressor

# Build the decision tree; pinning random_state makes every run give the same tree
melbourne_model = DecisionTreeRegressor(random_state=1)

# Train the tree on the full feature matrix X against the target y
melbourne_model.fit(X=X, y=y)

Test the model by predicting prices for the first few rows of the same data it was trained on
Make predictions with the model's predict command using X as the data. Save the results to a variable called predictions

NOTE that we are only running the prediction on the first values in the data set, so we are using the <>.head() method on the parameters being passed in

In [6]:
# Predict prices for the first five rows of the feature matrix
predictions = melbourne_model.predict(X.head())

# Show the input rows followed by the corresponding predictions
print("Making predictions for the following 5 houses:")
print(X.head())
print(f'The predictions are: {predictions}')

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  BuildingArea  YearBuilt  Longtitude
1      2       1.0     156.0   -37.8079          79.0     1900.0    144.9934
2      3       2.0     134.0   -37.8093         150.0     1900.0    144.9944
4      4       1.0     120.0   -37.8072         142.0     2014.0    144.9941
6      3       2.0     245.0   -37.8024         210.0     1910.0    144.9993
7      2       1.0     256.0   -37.8060         107.0     1890.0    144.9954
The predictions are: [1035000. 1465000. 1600000. 1876000. 1636000.]


Model validation can be done with the Mean Absolute Error (MAE), which is the average of the absolute differences between the actual and predicted values

sklearn can calculate this

The example below yields an in-sample score, where the training data doubles as the actual data with which the MAE is calculated

In [7]:
from sklearn.metrics import mean_absolute_error

# In-sample check: predict on the very rows the model was trained on
predicted_home_prices = melbourne_model.predict(X)

# Average absolute gap between actual and predicted prices (cell output)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

434.71594577146544

This means that we have an in-sample MAE of ~$435

The problem with the 'in-sample' score is that the model has already seen every row it is being scored on, so patterns that exist only in the training set (and will not carry over to new data) still look like accurate predictions.  In other words, the in-sample MAE understates the error to expect on unseen homes, because the "actual" values were already used to build the model - the validation is circular.

The better way of doing it is to split the existing data and use one part of it for model training, and the other part as 'actuals' for model validation

This can be done with train_test_split:  randomly divides your input data (e.g., features and labels) into two sets: one for training your model and one for testing it.

In [8]:
from sklearn.model_selection import train_test_split

# Split both features and target into a training part and a validation part.
# The split is driven by a random number generator; supplying a numeric
# random_state guarantees the same split every time this cell runs.
# No test_size is given, so scikit-learn's default applies: 25% of the rows
# are held out for validation and 75% are used for training.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# train_X: feature rows used to fit the model
# val_X:   held-out feature rows the model never sees during fitting
# train_y: the prices corresponding to train_X
# val_y:   the prices corresponding to val_X

# Define model. Seed it as well: an unseeded tree can differ between runs,
# which would make the validation MAE below non-reproducible.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the model on the TRAINING portion only; the validation rows are kept
# out of fitting so they can act as unseen data.
melbourne_model.fit(train_X, train_y)

# Predict prices for the held-out validation rows and score them against
# the true validation prices (out-of-sample MAE).
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

251467.84247901873


This means that we have an MAE of ~$250K

Underfitting and Overfitting can compromise the integrity of a model

    Underfitting:  Shallow decision tree, potentially missing relevant patterns and yielding less accurate predictions
    Overfitting:  Deep decision tree where the model data matches the training data almost perfectly, capturing patterns that may not occur in the future

Solution:  Create a utility function to help compare MAE scores from different values for max_leaf_nodes (parameter in DecisionTreeRegressor)
    NOTE that this will use the data split produced by train_test_split

In [9]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree capped at ``max_leaf_nodes``.

    A ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X`` / ``train_y``, used to predict ``val_X``, and the mean
    absolute error of those predictions against ``val_y`` is returned.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [10]:
# Self-contained reload: read the data, clean it, pick features/target, split.
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw CSV
melbourne_file_path = 'melbourne-housing-snapshot/melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)

# Keep only rows without any missing value
filtered_melbourne_data = melbourne_data.dropna(axis=0)

# Model inputs (column order matters to the tree, so it is kept unchanged)
melbourne_features = [
    'Rooms', 'Bathroom', 'Landsize', 'BuildingArea',
    'YearBuilt', 'Lattitude', 'Longtitude',
]
X = filtered_melbourne_data[melbourne_features]

# Prediction target
y = filtered_melbourne_data.Price

# Seeded split of features and target into training and validation parts
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

In [11]:
# Report the validation MAE for several tree-size caps, from tiny to very large
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(
        "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d"
        % (max_leaf_nodes, my_mae)
    )

Max leaf nodes: 5  		 Mean Absolute Error:  347380
Max leaf nodes: 50  		 Mean Absolute Error:  258171
Max leaf nodes: 500  		 Mean Absolute Error:  243495
Max leaf nodes: 5000  		 Mean Absolute Error:  255575


In [12]:
# Same comparison as the loop above, but the scores are stored so the best
# tree size can be selected programmatically.
candidate_leaf_nodes = [5, 50, 500, 5000]

# Map each candidate size -> its validation MAE. Keying by the size (an int)
# instead of the MAE (a float) means two sizes that happen to score the same
# cannot overwrite each other.
leaf_mae = {
    leaf_nodes: get_mae(leaf_nodes, train_X, val_X, train_y, val_y)
    for leaf_nodes in candidate_leaf_nodes
}

# Size with the lowest MAE; on a tie the first (smallest) candidate wins
best_tree_size = min(leaf_mae, key=leaf_mae.get)
print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(best_tree_size, leaf_mae[best_tree_size]))

Max leaf nodes: 500  		 Mean Absolute Error:  243495


Now that the ideal number of leaves is determined, deploy the model with the full data set and define the identified ideal number of leaf nodes

In [13]:
# Final tree: capped at the best size found above, seeded for reproducibility
final_model = DecisionTreeRegressor(
    max_leaf_nodes=best_tree_size,
    random_state=1,
)

# Fit on all of X and y (no validation hold-out); last expression -> cell output
final_model.fit(X, y)

RANDOM FOREST

More accurate predictions can be made by training a Random Forest, which is a collection of decision trees
This can be achieved in sklearn with RandomForestRegressor
Random forests generally work reasonably well with their default parameters, even without the kind of tuning done above

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Define the seeded random forest and fit it on the training split
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict the held-out validation rows
melb_preds = forest_model.predict(val_X)

# Validation MAE of the forest
print(mean_absolute_error(val_y, melb_preds))

191669.7536453626
