In [None]:
"""
Same code that was used in 'model_validation.ipynb'.
"""

In [2]:
import pandas as pd 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

#location of the CSV file holding the housing data
file_path = '~/Desktop/Kaggle/practice/model_validation_home_train.csv'
#load the CSV into a DataFrame
file_data = pd.read_csv(file_path)

#columns used as predictors of the house price
featured_columns = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = file_data[featured_columns]
#prediction target: the sale price column
y = file_data['SalePrice']

#hold out part of the rows for validation; the fixed seed keeps the split repeatable
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

#baseline decision tree with no limit on the number of leaves
file_model = DecisionTreeRegressor(random_state=1)
#fit the model on the training split only
file_model.fit(train_X, train_y)

#predict prices for the validation split with the model fitted on training data
valid_predict = file_model.predict(val_X)

#mean absolute error between actual and predicted validation prices
mae2 = mean_absolute_error(y_true=val_y, y_pred=valid_predict)
print('MAE:', mae2)

MAE: 29652.931506849316


In [None]:
"""
To improve this model, we need to find the number of tree leaves that gives the lowest MAE. The best MAE can be determined from a model fitted on the training data set; then we can use the validation data set on the trained model to predict the result and determine the MAE. Below is the function that calculates the MAE values.
"""

In [3]:
#helper that scores one candidate tree size
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to max_leaf_nodes leaves.

    Parameters:
        max_leaf_nodes: value passed to DecisionTreeRegressor(max_leaf_nodes=...).
        train_X, train_y: features and target the tree is fitted on.
        val_X, val_y: features and target the fitted tree is scored on.

    Returns:
        The result of mean_absolute_error(val_y, predictions for val_X).
    """
    #a fresh tree is built on every call, always with the same fixed seed
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    #score the predictions for the validation features against the true values
    return mean_absolute_error(val_y, tree.predict(val_X))

In [None]:
"""
Now we have a list of candidate numbers of leaves. We can run a 'for' loop to compare the MAE obtained with each number of leaves. This helps to determine the lowest MAE and the optimal number of leaves.
"""

In [4]:
#candidate limits for the number of leaves
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

#print the validation MAE for each candidate so they can be compared
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        train_X=train_X,
        val_X=val_X,
        train_y=train_y,
        val_y=val_y,
    )
    #%d prints the MAE without its fractional part
    print("Leaf #: %d \t\t MAE: %d" % (max_leaf_nodes, my_mae))


Leaf #: 5 		 MAE: 35044
Leaf #: 25 		 MAE: 29016
Leaf #: 50 		 MAE: 27405
Leaf #: 100 		 MAE: 27282
Leaf #: 250 		 MAE: 27893
Leaf #: 500 		 MAE: 29454


In [None]:
"""
The above approach is good for a small list of leaves. To avoid searching for the result by hand, we can automatically determine the best number of leaves using a dictionary approach.
"""

In [8]:
#dictionary mapping each candidate tree size to its validation MAE
scores = {
    leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y)
    for leaf_size in candidate_max_leaf_nodes
}

#the key with the smallest MAE value is the best tree size
best_tree_size = min(scores, key=lambda size: scores[size])
print('Best tree size:', best_tree_size)

Best tree size: 100


In [None]:
"""
Now that we have the best number of leaves (tree size), we can fit the model using the whole data set.
"""

In [11]:
#final model limited to the best tree size selected from the candidates
final_model = DecisionTreeRegressor(random_state=1, max_leaf_nodes=best_tree_size)

#fit on all rows (training and validation together); kept as the last expression so the cell displays the fitted estimator
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)