# Decision Trees for You and Me!

## Multi-Class Classification with the Iris Dataset

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

In [None]:
def split(df, stratify_by=None, test_size=.2, validate_size=.3, random_state=123):
    """
    Crude train, validate, test split.

    The test set is carved off first; the validate set is then carved out of
    what remains, so `validate_size` is a fraction of the non-test rows, not
    of the whole dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split.
    stratify_by : str, optional
        Column name to stratify on. When given, both splits are stratified
        on that column; when None (the default), no stratification is used.
    test_size : float, default .2
        Fraction of `df` held out as the test set.
    validate_size : float, default .3
        Fraction of the remaining (non-test) rows held out as the validate set.
    random_state : int, default 123
        Seed passed to both train_test_split calls so the split is repeatable.

    Returns
    -------
    tuple
        (train, validate, test)
    """
    # Identity check, not `== None`: an array-like argument would make `==`
    # elementwise and the truth test ambiguous.
    use_strata = stratify_by is not None

    # stratify=None is train_test_split's default, so a single code path
    # covers both the stratified and the unstratified case.
    train_and_validate, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[stratify_by] if use_strata else None,
    )
    train, validate = train_test_split(
        train_and_validate,
        test_size=validate_size,
        random_state=random_state,
        stratify=train_and_validate[stratify_by] if use_strata else None,
    )

    return train, validate, test

In [None]:
# Acquire + Prep: load iris from pydataset, then lowercase every column name
# and swap dots for underscores so the columns work with attribute access
df = data('iris').rename(columns=lambda name: name.lower().replace('.', '_'))

# Split into train / validate / test, stratifying on the target column
train, validate, test = split(df, stratify_by="species")

# Row/column counts of each split
print(train.shape, validate.shape, test.shape)

train.head()

In [None]:
# For each split: X is every column except the target, y is the target column (species)
X_train, y_train = train.drop(columns='species'), train['species']

X_validate, y_validate = validate.drop(columns='species'), validate['species']

X_test, y_test = test.drop(columns='species'), test['species']

This notebook is skipping the Exploration stage, because we have already explored this data.

Remember that one of the deliverables from our Exploration stage is narrowing down which features we'll use to model. 

For this example, it's pretty direct:
- Our target variable is species
- Our input variables are sepal and petal length and width

On to the modeling!

In [None]:
# Create a fresh, unfitted Decision Tree model.
# Always set max_depth explicitly; random_state is fixed so repeated runs match.
clf = DecisionTreeClassifier(
    random_state=123,
    max_depth=3,
)

In [None]:
# Fit the model on the training data only.
# fit() returns the estimator itself, so no reassignment is needed;
# the trailing semicolon keeps the cell from echoing the estimator.
clf.fit(X_train, y_train);

In [None]:
# Class labels held by the fitted model (passed as class_names when the tree is exported below)
clf.classes_

In [None]:
# Visualize the model so it can explain itself!
# The output class labels are specified programmatically from the fitted model
# (clf.classes_). A manual alternative is dataframe.target_variable.unique(),
# then sorting the array.
# Plain Python lists are passed for both name arguments because some
# scikit-learn releases reject a pandas Index / numpy array here.
dot_data = export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=X_train.columns.tolist(),
    class_names=clf.classes_.tolist(),
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_data)

# Render to PDF; view=True also asks graphviz to open the rendered file
graph.render('iris_decision_tree', view=True, format="pdf")

In [None]:
# Use the trained model to predict a class for every row of the training features
y_pred = clf.predict(X_train)

# Peek at the first five predictions
y_pred[:5]

In [None]:
# Per-class probability estimates for every row of the training features
y_pred_proba = clf.predict_proba(X_train)

# Peek at the first five rows of probabilities
y_pred_proba[:5]

In [None]:
# Let's evaluate the model: score it against the same data it was trained on
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))

In [None]:
# Predict inline instead of reading the shared `y_pred` variable: a later cell
# reassigns `y_pred` to the validate-set predictions, so re-running this cell
# out of order would otherwise compare y_train against the wrong predictions.
print(classification_report(y_train, clf.predict(X_train)))

## Takeaways so far
- 96% accuracy on training data. 
- This specific model is pretty good at predicting setosa on the train data
- But how does this model perform on out-of-sample data?

In [None]:
# Let's evaluate this model on out-of-sample data (the validate split)
validate_accuracy = clf.score(X_validate, y_validate)
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'.format(validate_accuracy))

In [None]:
# Use the classification model trained on train data to make predictions on validate data
# NOTE: this reuses the name `y_pred`, replacing the train-set predictions assigned earlier
y_pred = clf.predict(X_validate)

In [None]:
# Compare the actual validate labels with the predictions made from X_validate
validate_report = classification_report(y_validate, y_pred)
print(validate_report)

Workflow:
- Train a single model and see whether it beats the baseline. If so, keep going. If not, the baseline may be good enough.
- Evaluate that single model on the validate dataset to see how well it performs on out-of-sample data
- We might then make or tweak a few other models (with different features and different hyperparameter arguments)
- We'll evaluate our handful of models on validate, then pick the best performing one.
- Once we've picked our shining model, then we'll evaluate its performance on the TEST dataset

In [None]:
# orange is setosa (presumably the node colour in the rendered tree)
# First three training rows with petal_length at or below 2.5
short_petal = train['petal_length'] <= 2.5
train.loc[short_petal].head(3)

In [None]:
# Training rows with petal_length above 2.5, and how many of each species they contain
long_petal = train['petal_length'] > 2.5
not_setosas = train.loc[long_petal]
not_setosas['species'].value_counts()

In [None]:
# Versicolor is green
# Of the non-setosa training rows, keep those with petal_length at or below 4.75
versicolor = not_setosas.loc[not_setosas['petal_length'] <= 4.75]
versicolor.shape

In [None]:
# purple is virginica
# Of the non-setosa training rows, keep those with petal_length above 4.75
mostly_virginica = not_setosas.loc[not_setosas['petal_length'] > 4.75]
mostly_virginica['species'].value_counts()

In [None]:
# Manually apply the trained model's decision rules to the validate split,
# starting with the rows whose petal_length is at or below 2.5
validate.loc[validate['petal_length'] <= 2.5]

In [None]:
# Validate rows with petal_length above 2.5, and how many of each species they contain
v_or_v = validate.loc[validate['petal_length'] > 2.5]
v_or_v['species'].value_counts()

In [None]:
# Of those validate rows, the ones with petal_length at or below 4.75
v_or_v.loc[v_or_v['petal_length'] <= 4.75]