In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/melbourne-housing-snapshot/melb_data.csv


# Data processing and Choosing training features:


In [2]:
# save filepath to variable for easier access
melbourne_file_path = '/kaggle/input/melbourne-housing-snapshot/melb_data.csv'

# read the data and store data in DataFrame titled melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path)

# print a summary of the data in Melbourne data
melbourne_data.columns
melbourne_data.describe()

# dropna drops missing values (think of na as "not available")
melbourne_data = melbourne_data.dropna(axis=0)

# printing by columns
y = melbourne_data.Price

# Choosing "Features"
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]

# print a summary of the data in X data
X.describe()


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


# Helper functions:

In [3]:
from sklearn.metrics import mean_absolute_error

# Calculate the Mean Average Error
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

# Split data into Training and validation 

In [4]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# Building the Model using "Decision Tree Regressor"

In [5]:
# Building Your Model
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)
melbourne_model

DecisionTreeRegressor(random_state=1)

# Start training and Fitting the model

In [6]:
# Fit model
melbourne_model.fit(train_X, train_y)
predicted_home_prices = melbourne_model.predict(train_X)
val_predictions = melbourne_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)

print("First in-sample predictions:", predicted_home_prices)
print("Actual target values for those homes:", train_y.head().tolist())
print("The MSE: ", mean_absolute_error(train_y, predicted_home_prices))
print("Validation MAE: {:,.0f}".format(val_mae))

First in-sample predictions: [1060000.  410000.  502000. ...  724500. 1000000.  890000.]
Actual target values for those homes: [1060000.0, 390000.0, 502000.0, 1055000.0, 1900000.0]
The MSE:  897.7835162470411
Validation MAE: 273,518


# Compare MAE with differing values of max_leaf_nodes

In [7]:
# compare MAE with differing values of max_leaf_nodes
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500, 1000, 2000, 3000, 4000, 5000]
# Write loop to find the ideal tree size from candidate_max_leaf_nodes
x = val_mae
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    if my_mae < x:
        x = round(my_mae)
        # Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
        best_tree_size = max_leaf_nodes

print("best_tree_size: ", best_tree_size)
# Fit Model Using All Data
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=0)
# fit the final model and uncomment the next two lines
final_model.fit(X, y)

predicted_home_prices = final_model.predict(train_X.head())
val_predictions = final_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)

print("First in-sample predictions:", predicted_home_prices)
print("Actual target values for those homes:", train_y.head().tolist())
print("The MSE: ", mean_absolute_error(train_y.head(), predicted_home_prices))
print("Validation MAE: {:,.0f}".format(val_mae))

best_tree_size:  500
First in-sample predictions: [ 817057.5         635110.39330544  604620.         1004851.85185185
 1923571.42857143]
Actual target values for those homes: [1060000.0, 390000.0, 502000.0, 1055000.0, 1900000.0]
The MSE:  132878.49400500325
Validation MAE: 125,445
