In [1]:
# Importing pandas for data manipulation
import pandas as pd

In [2]:
# Load melb_data.csv into a DataFrame, then show summary statistics of its
# numeric columns as a quick check that the read succeeded.
melb_data = pd.read_csv("melb_data.csv")
melb_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [3]:
# Tip: `melb_data.columns` lists every column available in the data.

# Prediction target: the Price column.
y = melb_data.Price

# Features (the attributes passed into the model), picked by intuition.
# Column names are spelled exactly as they appear in the dataset.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
x = melb_data[melbourne_features]
print(x)
# A notebook cell only renders its *last* expression, so a bare `x.describe()`
# placed before `x.head()` was computed and silently thrown away. It has been
# dropped here; run `x.describe()` as the final line of its own cell to see it.
x.head()


       Rooms  Bathroom  Landsize  Lattitude  Longtitude
0          2       1.0     202.0  -37.79960   144.99840
1          2       1.0     156.0  -37.80790   144.99340
2          3       2.0     134.0  -37.80930   144.99440
3          3       2.0      94.0  -37.79690   144.99690
4          4       1.0     120.0  -37.80720   144.99410
...      ...       ...       ...        ...         ...
13575      4       2.0     652.0  -37.90562   145.16761
13576      3       2.0     333.0  -37.85927   144.87904
13577      3       2.0     436.0  -37.85274   144.88738
13578      4       1.0     866.0  -37.85908   144.89299
13579      4       1.0     362.0  -37.81188   144.88449

[13580 rows x 5 columns]


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.7996,144.9984
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
3,3,2.0,94.0,-37.7969,144.9969
4,4,1.0,120.0,-37.8072,144.9941


In [4]:
# Importing the decision tree regressor from scikit-learn
# A decision tree is one of the simplest machine learning models
from sklearn.tree import DecisionTreeRegressor

In [5]:
# Workflow for building and using a model:
#   0) Clean / preprocess the data
#   1) Define the model type and its parameters
#   2) Fit the model (capture patterns from the provided data)
#   3) Predict
#   4) Evaluate (how accurate are the predictions?)
#   5) Tweak the parameters and repeat

# Step 1: define the model. A fixed random_state keeps the result reproducible.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Step 2: fit on every row of the feature table x against the target y.
melbourne_model.fit(X=x, y=y)

In [6]:
# Show the first five rows of the features next to the prices the fitted
# model predicts for them. One print call, one item per line.
print(
    "Making predictions for the following 5 houses:",
    x.head(),
    "The predictions are",
    melbourne_model.predict(x.head()),
    sep="\n",
)

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
0      2       1.0     202.0   -37.7996    144.9984
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
3      3       2.0      94.0   -37.7969    144.9969
4      4       1.0     120.0   -37.8072    144.9941
The predictions are
[1480000. 1035000. 1465000.  850000. 1600000.]


In [7]:
# Calculating error in the model predictions
from sklearn.metrics import mean_absolute_error

# Predict on x, the same rows the model was fitted on, so the value displayed
# below is the in-sample (training) mean absolute error, not an estimate of
# how the model performs on unseen data.
predicted_home_prices = melbourne_model.predict(X=x)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)


1125.1804614629357

In [8]:
# A train/test split is the process of separating the data into two sets: one for training and one for testing
from sklearn.model_selection import train_test_split

# Split x and y into a training part and a testing part. random_state is
# pinned to a fixed number so that the same split is produced on every run.
train_X, test_X, train_Y, test_Y = train_test_split(x, y, random_state=0)
# For reference:
#   x = melb_data[melbourne_features]
#   y = melb_data.Price

# Refit the existing melbourne_model object, this time on the training part
# only, so the testing part stays unseen for evaluation.
melbourne_model.fit(X=train_X, y=train_Y)

In [9]:
# Predict the held-out rows and display their mean absolute error.
val_predictions = melbourne_model.predict(X=test_X)

mean_absolute_error(y_true=test_Y, y_pred=val_predictions)

250429.6127638684

In [10]:
# We will improve the accuracy by determining the best number of leaf nodes for our decision tree model

def get_mae(i, train_X, train_Y, test_X, test_Y):
    """Return the mean absolute error of a decision tree capped at ``i`` leaf nodes.

    A fresh ``DecisionTreeRegressor(random_state=0, max_leaf_nodes=i)`` is
    fitted on ``train_X`` / ``train_Y``; its predictions for ``test_X`` are
    then scored against ``test_Y`` with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=i).fit(train_X, train_Y)
    return mean_absolute_error(test_Y, tree.predict(test_X))

# Score one tree per candidate max_leaf_nodes value and keep the best one.
candidate_node_values = [5, 50, 500, 5000]
mae_by_leaf_nodes = {
    leaf_nodes: get_mae(leaf_nodes, train_X, train_Y, test_X, test_Y)
    for leaf_nodes in candidate_node_values
}

# Pick the candidate with the lowest MAE (the first one wins on ties).
# Using min() removes the former magic starting value of 10000000000, which
# would have left currbest at 0 -- not a usable max_leaf_nodes -- if every
# candidate had scored worse than that sentinel.
currbest = min(mae_by_leaf_nodes, key=mae_by_leaf_nodes.get)
currmin = mae_by_leaf_nodes[currbest]

best_leaf_value = currbest

# Final model: rebuild the tree with the selected max_leaf_nodes, fit it on
# the training split and print its mean absolute error on the test split.
# NOTE(review): best_leaf_value was selected by scoring on this same test
# split, so the printed figure is probably optimistic -- consider a separate
# validation split for the selection step.
melbourne_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=best_leaf_value)
melbourne_model.fit(X=train_X, y=train_Y)
predictions = melbourne_model.predict(X=test_X)
print(mean_absolute_error(y_true=test_Y, y_pred=predictions))

# The mean absolute error has improved, albeit by a small amount.

231301.17567588817


In [11]:
# Retrying with a random forest model instead of a single decision tree
from sklearn.ensemble import RandomForestRegressor

# Random forest with a fixed seed, fitted on the training split.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X=train_X, y=train_Y)

# Mean absolute error of the forest on the test split.
new_predictions = forest_model.predict(X=test_X)
new_error = mean_absolute_error(y_true=test_Y, y_pred=new_predictions)
print(new_error)
# The forest's error is much lower than the decision tree's.

180860.37877504269
