# Decision Tree Regressor Model Setup and Validation

## Importing the required modules

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## Importing Data

In [None]:
# Locations of the main dataset and the separate test dataset
data_path = '../path/to/data.csv'
test_data_path = '../path/to/test.csv'

# Read each CSV file into its own DataFrame
the_data = pd.read_csv(data_path)
test_data = pd.read_csv(test_data_path)

## Checking initial column names

In [1]:
# Show the column labels of the loaded DataFrame so feature/target names can be chosen
the_data.columns

## Checking the first few rows of the data

In [None]:
# Preview the first rows of the dataset (head() returns five rows by default)
the_data.head()

## Selecting the target variable

In [None]:
# The "Target" column holds the values the model will be trained to predict
y = the_data["Target"]

## Setting up the list of features used to predict the target

In [None]:
# Names of the columns used as model inputs
feature_names = [
    "Feature_1",
    "Feature_2",
    "Feature_3",
]

# Keep only those columns as the feature matrix
X = the_data[feature_names]

## Reviewing the features

In [None]:
# Print summary statistics for the selected feature columns
print(X.describe())

# Print the first few rows of the feature matrix
print(X.head())

## Splitting the data into training and validation sets

In [None]:
# Hold out part of the data for validation; the fixed seed makes the split repeatable
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=7,
)

## Making the model

In [None]:
# For model reproducibility, set a numeric value for random_state when specifying the model
model = DecisionTreeRegressor(random_state=7)

# Fit the model on the training split only; the validation split is kept for scoring
model.fit(train_X, train_y)

## Making predictions and checking how well the model did

In [None]:
# Predict the target for the held-out validation features
val_predictions = model.predict(val_X)
# Uncomment the next line to inspect the first 20 predictions
#print(val_predictions[0:20])
# Print the mean absolute error between the true and predicted validation targets
print(mean_absolute_error(val_y, val_predictions))

## A function that computes the Mean Absolute Error of a Decision Tree for a given max_leaf_nodes setting

In [None]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y, random_state=0):
    """Return the validation mean absolute error of a size-limited decision tree.

    Fits a ``DecisionTreeRegressor`` limited to ``max_leaf_nodes`` leaves on
    the training split, then scores its predictions on the validation split.

    Args:
        max_leaf_nodes: Upper bound on the number of leaf nodes in the tree.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training target values.
        val_y: Validation target values.
        random_state: Seed passed to the regressor. Defaults to 0, the value
            that was previously hard-coded, so existing callers are unaffected.

    Returns:
        The mean absolute error between ``val_y`` and the model's predictions
        for ``val_X``.
    """
    model = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes,
        random_state=random_state,
    )
    model.fit(train_X, train_y)
    return mean_absolute_error(val_y, model.predict(val_X))

## Using the get_mae() function to find the best max_leaf_nodes value for the Decision Tree Regressor

In [None]:
# Candidate max_leaf_nodes values to compare (these cap the number of leaves, not the tree depth)
depths_to_check = [5, 50, 500, 5000]
for max_leaf_nodes in depths_to_check:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %.2f keeps the fractional part of the error; %d would truncate the float MAE to an integer
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %.2f" % (max_leaf_nodes, my_mae))

# References
#### Intro to Machine Learning on Kaggle by Dan Becker: https://www.kaggle.com/learn/intro-to-machine-learning
#### Scikit's documentation on DecisionTreeRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html