# Decision Tree Exercise

In [24]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # sklearn is a machine learning library
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import graphviz
from graphviz import Graph

In [25]:
# Load seaborn's "titanic" example dataset and preview the first rows
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [26]:
# Keep only the target and the features chosen during the Explore stage.
# The remaining columns either duplicate these (e.g. class/pclass,
# alive/survived) or didn't have much bearing on survival.
# .copy() makes the selection an independent frame, so the column assignments
# in later cells cannot raise pandas' SettingWithCopyWarning.
df = df[["survived", "pclass", "sex", "age", "fare"]].copy()
df.head()

Unnamed: 0,survived,pclass,sex,age,fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [27]:
def encode_gender(gender):
    """Encode a gender label as an integer.

    Returns 0 when ``gender`` equals "male" and 1 for any other value.
    """
    return 0 if gender == "male" else 1

In [28]:
# Replace the text labels in `sex` with the integer codes from encode_gender
df["sex"] = df["sex"].apply(encode_gender)
df.head()

Unnamed: 0,survived,pclass,sex,age,fare
0,0,3,0,22.0,7.25
1,1,1,1,38.0,71.2833
2,1,3,1,26.0,7.925
3,1,1,1,35.0,53.1
4,0,3,0,35.0,8.05


In [29]:
# Check the dtype and non-null count of each remaining column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    int64  
 3   age       714 non-null    float64
 4   fare      891 non-null    float64
dtypes: float64(2), int64(3)
memory usage: 34.9 KB


In [30]:
# Report the number of missing values per column, one line each.
# Each entry pairs the exact text printed before the count with its column.
null_report = (
    ("Survived nulls: ", "survived"),
    ("Class nulls:  ", "pclass"),
    ("Gender nulls: ", "sex"),
    ("Age nulls: ", "age"),
    ("Fare nulls: ", "fare"),
)
for prefix, column in null_report:
    print(f"{prefix}{df[column].isna().sum()}")

Survived nulls: 0
Class nulls:  0
Gender nulls: 0
Age nulls: 177
Fare nulls: 0


In [31]:
# Same null counts in a single call: every column is clean except age,
# which still has missing values to handle
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
fare          0
dtype: int64

In [32]:
# Median of the ages that are actually recorded (null rows are filtered out first)
known_ages = df.loc[df["age"].notna(), "age"]
median_age = known_ages.median()
median_age

28.0

In [10]:
# Sanity check: the pandas .median method ignores nulls, so calling it on the
# unfiltered column gives the same value as filtering the nulls out first
df.age.median()

28.0

In [33]:
# Impute the missing ages with the median, then confirm that none are left
df["age"] = df["age"].fillna(median_age)
print(f"Age nulls: {df['age'].isna().sum()}")

Age nulls: 0


In [34]:
# Separate the features (X) from the target (y).
# y is kept as a one-column DataFrame (note the double brackets), not a Series.
X = df.drop(columns="survived")
y = df[["survived"]]

In [37]:
# Preview the feature matrix
X.head()

Unnamed: 0,pclass,sex,age,fare
0,3,0,22.0,7.25
1,1,1,38.0,71.2833
2,3,1,26.0,7.925
3,1,1,35.0,53.1
4,3,0,35.0,8.05


In [38]:
# What defines the "positive case" of a prediction?
# It is a matter of convention in the industry.
# Some data will already be coded as 0s and 1s;
# the usual shortcut is to adopt "1" as the positive case.
# Always spell out what that means: what are we trying to measure?
# Here the target is `survived`, so with that shortcut 1 (survived) is the positive case.
y.head()

Unnamed: 0,survived
0,0
1,1
2,1
3,1
4,0


In [43]:
# Two-stage split: first hold out the test set, then carve a validate set out
# of what remains. Both stages stratify on the target and use the same seed.

# Stage 1: 70% train+validate / 30% test
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    random_state=123,
    stratify=y.survived,
)

# Stage 2: split the train+validate portion again, 70% train / 30% validate
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate,
    y_train_validate,
    test_size=0.3,
    random_state=123,
    stratify=y_train_validate,
)


In [44]:
# Depth-limited decision tree. For classification the split criterion can be
# "gini" (the default) or "entropy" (information gain); entropy is used here.
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    random_state=123,
)

In [45]:
# Fit the tree on the training split only
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=123)

In [46]:
# Predict on the same rows the model was trained on (in-sample predictions)
y_pred = clf.predict(X_train)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,

## Model Performance on Training Data

In [60]:
# In-sample accuracy, shown to two decimal places
print(f"Accuracy of Decision Tree classifier on training set: {clf.score(X_train, y_train):.2f}")

Accuracy of Decision Tree classifier on training set: 0.85


In [48]:
# Per-class precision/recall/F1 on the training split.
# first argument  = actual labels
# second argument = predicted labels
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.98      0.89       269
           1       0.95      0.63      0.76       167

    accuracy                           0.85       436
   macro avg       0.88      0.81      0.82       436
weighted avg       0.87      0.85      0.84       436



# Model Performance on Validate Data

In [56]:
# Get the predicted y values from X_validate.
# NOTE: this overwrites y_pred, which held the training-set predictions until now.
y_pred = clf.predict(X_validate)

In [58]:
# .score defaults to accuracy; this is the out-of-sample figure to compare
# against the training accuracy when checking for overfitting
print(f"Accuracy of Decision Tree on Validate data is: {clf.score(X_validate, y_validate)}")

Accuracy of Decision Tree on Validate data is: 0.8181818181818182


In [59]:
# Per-class precision/recall/F1 on the validate split (actual labels first, predictions second)
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       115
           1       0.85      0.64      0.73        72

    accuracy                           0.82       187
   macro avg       0.83      0.78      0.80       187
weighted avg       0.82      0.82      0.81       187



- Positive/Negative is what the model predicted; True/False is whether that prediction matched reality
- Accuracy: (TP + TN) / total observations, i.e. the number of correct predictions out of all observations
- Recall AKA True Positive Rate AKA Sensitivity is TP / (TP + FN) is TP / ALL actual positives
- Precision = TP / (TP + FP) == ratio of TP to all predicted positives
- F1 score is the harmonic mean of precision and recall; it is what you use if FP and FN cost about the same to make
- Smoke alarm has the positive case of detecting a fire
- False Positive: the alarm goes off, but everything is cool (false alarm).
- False Negative: a miss. The alarm stays quiet, but the kitchen is on fire...

In [64]:
# Export the fitted tree as Graphviz DOT source and render it to a file.
# feature_names and class_names are passed as plain lists, the input types
# export_graphviz documents. class_names follows the ascending order of the
# class labels, so 0 -> 'not survived' and 1 -> 'survived'.
dot_data = export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a .dot file
    feature_names=list(X.columns),
    class_names=['not survived', 'survived'],
    rounded=True,
    filled=True,
)

graph = graphviz.Source(dot_data)

# NOTE(review): the output file is still named after iris although this is the
# Titanic model; the name is kept unchanged so existing paths keep working.
# view=True also opens the rendered file in the system's default viewer.
graph.render('iris_decision_tree', view=True)

'iris_decision_tree.pdf'

In [62]:
# Peek at the training features; the non-sequential index shows the split shuffled the rows
X_train.head()

Unnamed: 0,pclass,sex,age,fare
203,3,0,45.5,7.225
72,2,0,21.0,73.5
143,3,0,19.0,6.75
243,3,0,22.0,7.125
664,3,0,20.0,7.925
