# Decision Trees for You and Me!

## Binary Classification w/ the Tips dataset!

In [1]:
# Silence every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): this blanket filter also hides deprecation warnings — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

In [2]:
def split(df, stratify_by=None, test_size=.2, validate_size=.3, random_state=123):
    """
    Crude train, validate, test split.

    The frame is first split into train/test, then the remaining train
    portion is split again into train/validate.

    Parameters
    ----------
    df : DataFrame to split.
    stratify_by : column name to stratify on, or None (default) for a
        plain random split.
    test_size : fraction of df held out as the test set (default .2).
    validate_size : fraction of the remaining rows held out as the
        validate set (default .3).
    random_state : seed passed to both splits so results are reproducible
        (default 123).

    Returns
    -------
    (train, validate, test) tuple of DataFrames.
    """
    # Identity check: `== None` is non-idiomatic and raises on array-like input.
    use_stratify = stratify_by is not None

    train, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[stratify_by] if use_stratify else None,
    )
    # The second split must stratify on the *remaining* rows, not on all of df.
    train, validate = train_test_split(
        train,
        test_size=validate_size,
        random_state=random_state,
        stratify=train[stratify_by] if use_stratify else None,
    )

    return train, validate, test

## Planning Stage
- Given diner table information, predict if the table will be a smoking table or not
- Input features are bill, tip, gender, day, time of day, and table size
- Target variable is smoker status

In [7]:
# Acquire: load the "tips" dataset through pydataset's data() helper
df = data("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [8]:
# No scaling of the continuous inputs is needed, since we're working with a decision tree.

# Recode the two-valued string columns as booleans
df["is_female"] = df["sex"].eq("Female")
df["is_dinner"] = df["time"].eq("Dinner")

# day has 4 possibilities (Thursday, Friday, Saturday, Sunday), so one-hot encode it;
# drop_first=True leaves out the first level so the dummy columns are not redundant
day_only = df[["day"]]
dummy_df = pd.get_dummies(day_only, drop_first=True)
dummy_df

Unnamed: 0,day_Sat,day_Sun,day_Thur
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,0,1,0
...,...,...,...
240,1,0,0
241,1,0,0
242,1,0,0
243,1,0,0


In [9]:
# Attach the one-hot day columns, then drop the original string columns
# that the boolean flags and dummy columns now replace
df = (
    pd.concat([df, dummy_df], axis=1)
    .drop(columns=["sex", "time", "day"])
)
df.head()

Unnamed: 0,total_bill,tip,smoker,size,is_female,is_dinner,day_Sat,day_Sun,day_Thur
1,16.99,1.01,No,2,True,True,0,1,0
2,10.34,1.66,No,3,False,True,0,1,0
3,21.01,3.5,No,3,False,True,0,1,0
4,23.68,3.31,No,2,False,True,0,1,0
5,24.59,3.61,No,4,True,True,0,1,0


In [10]:
# Split the data into train / validate / test
# Stratifying on the target keeps the smoker / non-smoker proportions
# representative in each of train, validate, and test
train, validate, test = split(df, stratify_by="smoker")
train.head()

Unnamed: 0,total_bill,tip,smoker,size,is_female,is_dinner,day_Sat,day_Sun,day_Thur
116,17.31,3.5,No,2,True,True,0,1,0
76,10.51,1.25,No,2,False,True,1,0,0
168,31.71,4.5,No,4,False,True,0,1,0
208,38.73,3.0,Yes,4,False,True,1,0,0
182,23.33,5.65,Yes,2,False,True,0,1,0


In [11]:
# Separate the input features (X) from the target (y) for each split.
# Having a labeled target is what makes this a supervised learning problem.
X_train, y_train = train.drop(columns="smoker"), train["smoker"]

X_validate, y_validate = validate.drop(columns="smoker"), validate["smoker"]

X_test, y_test = test.drop(columns="smoker"), test["smoker"]

In [12]:
# Re-inspect train: the target column is still present here; only the X_* frames exclude it
train.head()

Unnamed: 0,total_bill,tip,smoker,size,is_female,is_dinner,day_Sat,day_Sun,day_Thur
116,17.31,3.5,No,2,True,True,0,1,0
76,10.51,1.25,No,2,False,True,1,0,0
168,31.71,4.5,No,4,False,True,0,1,0
208,38.73,3.0,Yes,4,False,True,1,0,0
182,23.33,5.65,Yes,2,False,True,0,1,0


In [14]:
# Let's generate a blank (unfitted) Decision Tree model
# Be sure to set the max_depth argument; random_state pins the seed so reruns match
# Alternative depth to try:
#clf = DecisionTreeClassifier(max_depth=3, random_state=123)

clf = DecisionTreeClassifier(max_depth=2, random_state=123)

In [15]:
# Train the model on the training split only (fitting == training).
# fit() hands back the estimator itself, so no reassignment is needed.
clf.fit(X_train, y_train)
clf

In [16]:
# Visualize the model so it can explain itself!
dot_data = export_graphviz(
    clf,
    out_file=None,  # hand back the DOT source as a string
    feature_names=X_train.columns,
    class_names=clf.classes_,
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_data)

# Render the tree to a PDF file and open it in a viewer (view=True)
graph.render('tips_decision_tree', view=True, format="pdf")

'tips_decision_tree.pdf'

In [17]:
# Predict a class label for every row of the training data with the fitted model
y_pred = clf.predict(X_train)
y_pred[:3]

array(['No', 'No', 'No'], dtype=object)

In [18]:
# Per-class probability estimates for each training row (one column per class)
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:3]

array([[1., 0.],
       [1., 0.],
       [1., 0.]])

In [19]:
# Actual labels for the first three training rows, to compare against the predictions above
y_train.head(3)

116    No
76     No
168    No
Name: smoker, dtype: object

In [20]:
# Baseline: always predict the most common class observed in the training target.
# The class is derived with mode() rather than hard-coded, so the baseline
# stays correct if the data or the split changes.
train["most_frequent"] = train.smoker.mode().iloc[0]
baseline_accuracy = (train.smoker == train.most_frequent).mean()
baseline_accuracy

0.6176470588235294

In [21]:
# Score the fitted model against the data it was trained on
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Decision Tree classifier on training set: 1.00


In [22]:
# Per-class precision / recall / f1 on the training data
# (y_pred still holds the predictions made from X_train at this point)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

          No       1.00      1.00      1.00        84
         Yes       1.00      1.00      1.00        52

    accuracy                           1.00       136
   macro avg       1.00      1.00      1.00       136
weighted avg       1.00      1.00      1.00       136



## Takeaways so far
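- Perfect accuracy (1.00) on the training data, far above the ~0.62 baseline — high enough that the tree may simply be memorizing the training set (overfitting).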
- But how does this model perform on out-of-sample data?

In [23]:
# clf was trained on X_train, y_train
# To evaluate the trained model on new (out-of-sample) data,
# pass the validate split as the arguments to .score()
clf.score(X_validate, y_validate)

0.6610169491525424

In [None]:
# Score the model on out-of-sample (validate) data
validate_accuracy = clf.score(X_validate, y_validate)
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'.format(validate_accuracy))

In [None]:
# Predict on the validate split with the model that was fit on train.
# Note: this rebinds y_pred, which previously held the training predictions.
y_pred = clf.predict(X_validate)
y_pred[:3]

In [None]:
# Actual labels for the first three validate rows, to compare against the predictions above
y_validate.head(3)

In [None]:
# Compare actual y values from validate to the predictions made from X_validate
# (per-class precision / recall / f1 on out-of-sample data)
print(classification_report(y_validate, y_pred))