In [15]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """
    Return the validation MAE (Mean Absolute Error) of a decision tree.

    A DecisionTreeRegressor limited to ``max_leaf_nodes`` leaves (with
    random_state=0) is fitted on train_X / train_y; its predictions for
    val_X are then scored against val_y with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(random_state=0,
                                 max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the dataset from CSV into a DataFrame.
melbourne_file_path = '../intro-to-machine-learning/data/melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)

# Drop every row that contains a missing value in any column.
filtered_melbourne_data = melbourne_data.dropna(axis=0)

# Names of the columns used as model features
# (spellings presumably match the CSV headers - verify against the file).
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'BuildingArea',
    'YearBuilt',
    'Lattitude',
    'Longtitude',
]

# X = features, y = target (the Price column).
X = filtered_melbourne_data[melbourne_features]
y = filtered_melbourne_data['Price']

# Split into training and validation sets; the fixed random_state keeps
# the split the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

In [14]:
# Compare the validation MAE obtained with different max_leaf_nodes values.
candidate_max_leaf_nodes = [5, 50, 500, 5000]
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f'Max leaf nodes: {max_leaf_nodes} \t\t MAE: {my_mae:.0f}')

Max leaf nodes: 5 		 MAE: 347380
Max leaf nodes: 50 		 MAE: 258171
Max leaf nodes: 500 		 MAE: 243496
Max leaf nodes: 5000 		 MAE: 255575
