## Recap

In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Location of the home-price training data
iowa_file_path = '../input/home-data-for-ml-course/train.csv'

home_data = pd.read_csv(iowa_file_path)

# Prediction target: the SalePrice column
y = home_data.SalePrice

# Predictors: the columns the model is allowed to learn from
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Hold out part of the rows for validation; the fixed random_state makes the split repeatable
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Fit a decision tree with default size settings on the training split only
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)

# Score the tree on the held-out rows.
# mean_absolute_error takes (y_true, y_pred): the true values go first. MAE is
# symmetric, so the number is the same either way, but the conventional order
# keeps this call consistent with get_mae and safe to swap for another metric.
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE: {:,.0f}".format(val_mae))

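The function `get_mae` fits a decision tree with a given `max_leaf_nodes` and returns its mean absolute error on the validation data, so that different tree sizes can be compared.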

In [None]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error for one tree size.

    A DecisionTreeRegressor limited to ``max_leaf_nodes`` leaves (with
    ``random_state=0``) is fitted on ``train_X``/``train_y``; its
    predictions for ``val_X`` are then compared against ``val_y``.

    Args:
        max_leaf_nodes: Leaf-node limit passed to the regressor.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training targets.
        val_y: Validation targets.

    Returns:
        The mean absolute error between ``val_y`` and the predictions.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

## Step 1: Compare Different Tree Sizes
Write a loop that tries each value of *max_leaf_nodes* from a set of candidate values.

Call the *get_mae* function on each value of `max_leaf_nodes`, and store the output in a way that lets us select the value of `max_leaf_nodes` that gives the most accurate model on our data.

In [None]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Score every candidate tree size and remember the one with the lowest MAE.
# The loop iterates over candidate_max_leaf_nodes itself (rather than a second
# copy of the list), so editing the candidates above actually changes the search.
best_mln = candidate_max_leaf_nodes[0]
best_mae = float('inf')
for max_leaf_nodes in candidate_max_leaf_nodes:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # Strict '<' means the earliest candidate wins when two sizes tie
    if mae < best_mae:
        best_mae = mae
        best_mln = max_leaf_nodes

# The winning value of max_leaf_nodes (one of the candidates listed above)
best_tree_size = best_mln
print(best_tree_size)

## Step 2: Fit Model Using All Data
We now know the best tree size. If we were going to deploy this model in practice, we would make it even more accurate by fitting it on all of the data while keeping that tree size.

In [None]:
# Build the final tree using the leaf-node limit chosen above
final_model = DecisionTreeRegressor(
    max_leaf_nodes=best_tree_size,
    random_state=0,
)

# Train on the full dataset (X, y) instead of only the training split
final_model.fit(X, y)