In [3]:
# Construindo o modelo
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Relative path to the training CSV.
iowa_file_path = '../data/train.csv'
home_data = pd.read_csv(iowa_file_path)

# Prediction target: the SalePrice column.
y = home_data["SalePrice"]

# Columns used as model inputs.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = home_data[features]

# Hold out a validation split; random_state pins the shuffle so the split is repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Fit a tree with no explicit size limit on the training split
# (fit returns the estimator itself, so the call can be chained).
iowa_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)

# Score the fitted tree on the held-out split.
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)

print("Validation MAE: {:,.0f}".format(val_mae))


Validation MAE: 29,653


In [4]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A new ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``(train_X, train_y)``; its predictions for ``val_X`` are compared with
    ``val_y`` using ``mean_absolute_error``.

    Args:
        max_leaf_nodes: Passed straight through to ``DecisionTreeRegressor``.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training targets.
        val_y: Validation targets.

    Returns:
        The mean absolute error on the validation data.
    """
    tree = DecisionTreeRegressor(
        random_state=0,
        max_leaf_nodes=max_leaf_nodes,
    ).fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))


In [9]:
# Tree sizes to compare, smallest to largest.
candidate_max_leaf_nodes = [5, 25, 50, 100, 200, 500, 1000]

# Lowest validation MAE seen so far; starts at +inf so any finite score replaces it.
smallest_mae = float('inf')

for max_leaf_nodes in candidate_max_leaf_nodes:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max Leaf Nodes: {:,.0f} \t\t MAE: {:,.0f}".format(max_leaf_nodes, mae))
    # Only a strictly better score replaces the current best, so the first
    # candidate reaching the minimum is the one that is kept.
    if not mae < smallest_mae:
        continue
    smallest_mae, best_tree_size = mae, max_leaf_nodes

print("Best Tree Size: {:.0f}".format(best_tree_size))

Max Leaf Nodes: 5 		 MAE: 35,045
Max Leaf Nodes: 25 		 MAE: 29,016
Max Leaf Nodes: 50 		 MAE: 27,406
Max Leaf Nodes: 100 		 MAE: 27,283
Max Leaf Nodes: 200 		 MAE: 28,136
Max Leaf Nodes: 500 		 MAE: 29,454
Max Leaf Nodes: 1,000 		 MAE: 30,136
Best Tree Size: 100


In [10]:
# Now build the model with the best configuration found, fitting it on X and y
# rather than on the training split only.
final_model = DecisionTreeRegressor(
    random_state=0,
    max_leaf_nodes=best_tree_size,
)
final_model.fit(X=X, y=y)

DecisionTreeRegressor(max_leaf_nodes=100, random_state=0)