In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

import matplotlib.pyplot as plt
import seaborn as sns

import env
import prepare
import acquire


# Titanic Dataset

In [2]:
# import titanic dataset, run our prepare functions, split into train/validate/test, and validate size of df's
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
tit_train, tit_validate, tit_test = prepare.my_split(df, target='survived')
tit_train.shape, tit_validate.shape, tit_test.shape

((426, 15), (143, 15), (143, 15))

In [3]:
tit_train.head(1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
174,0,1,male,56.0,0,0,30.6958,C,Cherbourg,1,0,1,1,0,0


## What is your baseline prediction?  What is your baseline accuracy? 

In [4]:
# find the most common condition for our target to determine baseline.  It's rate of occurance is the accuracy of 
# our baseline
print('The most common value (our baseline) is:',tit_train.survived.value_counts().idxmax())
print('The rate of occurance (our baseline accuracy) is:', len(tit_train[tit_train.survived == 0]) / len(tit_train) * 100)

The most common value (our baseline) is: 0
The rate of occurance (our baseline accuracy) is: 59.624413145539904


# Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [5]:
# split each of our datasets into X and y 

X_train = tit_train.drop(columns=['survived', 'sex', 'embarked', 'embark_town', 'sex_female', 'embark_town_Cherbourg'])
y_train = tit_train.survived

X_validate = tit_validate.drop(columns=['survived', 'sex', 'embarked', 'embark_town', 'sex_female', 'embark_town_Cherbourg'])
y_validate = tit_validate.survived

X_test = tit_test.drop(columns=['survived', 'sex', 'embarked', 'embark_town', 'sex_female', 'embark_town_Cherbourg'])
y_test = tit_test.survived

In [6]:
# create and fit the model
clf = DecisionTreeClassifier()

clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_train)
y_pred_prob = clf.predict_proba(X_train)


## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [7]:
# model score
print(f'training score: {clf.score(X_train, y_train):.2%}')

training score: 99.30%


In [8]:
# confusion matrix
cm = confusion_matrix(y_train, y_pred)
print(pd.DataFrame(cm))

     0    1
0  254    0
1    3  169


In [9]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       254
           1       1.00      0.98      0.99       172

    accuracy                           0.99       426
   macro avg       0.99      0.99      0.99       426
weighted avg       0.99      0.99      0.99       426



## Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [10]:
# accuracy
print('------- Model 1 -----------')
print('Accuracy score is: ', accuracy_score(y_train, y_pred))

------- Model 1 -----------
Accuracy score is:  0.9929577464788732


In [11]:
# true positive rate, false positive rate, true negative rate, false negative rate
tp = cm[0,0]
tn = cm[1,1]
fp = cm[0,1]
fn = cm[1,0]
print('------- Model 1 -----------')
print('True positive rate is:', tp/(tp+tn+fp+fn))
print('False positive rate is', fp/(tp+tn+fp+fn))
print('True negative rate is', tn/(tp+tn+fp+fn))
print('False negative rate is', fn/(tp+tn+fp+fn))

------- Model 1 -----------
True positive rate is: 0.596244131455399
False positive rate is 0.0
True negative rate is 0.3967136150234742
False negative rate is 0.007042253521126761


In [12]:
# precision, recall, f1-score and support
print('------- Model 1 -----------')
print('Precision is: ', precision_score(y_train, y_pred))
print('Recall is:', recall_score(y_train, y_pred))
print('f1 score is:', f1_score(y_train, y_pred))
print('Support is 0:', tp+fp)
print('           1:', tn+fn)

------- Model 1 -----------
Precision is:  1.0
Recall is: 0.9825581395348837
f1 score is: 0.9912023460410557
Support is 0: 254
           1: 172


## Run through steps 2-4 using a different max_depth value.

In [13]:
# changing max depth to 5
clf2 = DecisionTreeClassifier(max_depth=5)

clf2 = clf2.fit(X_train, y_train)

y_pred2 = clf2.predict(X_train)
y_pred_prob2 = clf2.predict_proba(X_train)

In [14]:
# model score
print('------- Model 2 -----------')
print(f'training score for model2: {clf2.score(X_train, y_train):.2%}')

------- Model 2 -----------
training score for model2: 85.92%


In [15]:
# confusion matrix
cm2 = confusion_matrix(y_train, y_pred2)
print(pd.DataFrame(cm2))

     0    1
0  234   20
1   40  132


In [16]:
# classification report
print('------- Model 2 -----------')
print(classification_report(y_train, y_pred2))

------- Model 2 -----------
              precision    recall  f1-score   support

           0       0.85      0.92      0.89       254
           1       0.87      0.77      0.81       172

    accuracy                           0.86       426
   macro avg       0.86      0.84      0.85       426
weighted avg       0.86      0.86      0.86       426



In [17]:
# accuracy
print('------- Model 2 -----------')
print('Accuracy score for model 2 is: ', accuracy_score(y_train, y_pred2))

------- Model 2 -----------
Accuracy score for model 2 is:  0.8591549295774648


In [18]:
# true positive rate, false positive rate, true negative rate, false negative rate
tp2 = cm2[0,0]
tn2 = cm2[1,1]
fp2 = cm2[0,1]
fn2 = cm2[1,0]

print('------- Model 2 -----------')
print('True positive rate is:', tp2/(tp2+tn2+fp2+fn2))
print('False positive rate is', fp2/(tp2+tn2+fp2+fn2))
print('True negative rate is', tn2/(tp2+tn2+fp2+fn2))
print('False negative rate is', fn2/(tp2+tn2+fp2+fn2))

------- Model 2 -----------
True positive rate is: 0.5492957746478874
False positive rate is 0.046948356807511735
True negative rate is 0.30985915492957744
False negative rate is 0.09389671361502347


In [19]:
# precision, recall, f1-score and support
print('------- Model 2 -----------')
print('Precision is: ', precision_score(y_train, y_pred2))
print('Recall is:', recall_score(y_train, y_pred2))
print('f1 score is:', f1_score(y_train, y_pred2))
print('Support is 0:', tp2+fp2)
print('           1:', tn2+fn2)

------- Model 2 -----------
Precision is:  0.868421052631579
Recall is: 0.7674418604651163
f1 score is: 0.8148148148148148
Support is 0: 254
           1: 172


## Which model performs better on your in-sample data?

Model 1 performs better when evaluating performance against the in-sample (training) data

## Which model performs best on your out-of-sample data, the validate set?

In [22]:
print('------- Model 1 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))
print('')
print('------- Model 2 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf2.score(X_validate, y_validate)))


------- Model 1 -----------
Accuracy of Decision Tree classifier on validate set: 0.77

------- Model 2 -----------
Accuracy of Decision Tree classifier on validate set: 0.79


In [23]:
# classification report
y_pred = clf.predict(X_validate)
y_pred2 = clf2.predict(X_validate)

print('------- Model 1 -----------')
print(classification_report(y_validate, y_pred))

print('------- Model 2 -----------')
print(classification_report(y_validate, y_pred2))

------- Model 1 -----------
              precision    recall  f1-score   support

           0       0.82      0.78      0.80        85
           1       0.70      0.76      0.73        58

    accuracy                           0.77       143
   macro avg       0.76      0.77      0.76       143
weighted avg       0.77      0.77      0.77       143

------- Model 2 -----------
              precision    recall  f1-score   support

           0       0.83      0.81      0.82        85
           1       0.73      0.76      0.75        58

    accuracy                           0.79       143
   macro avg       0.78      0.79      0.78       143
weighted avg       0.79      0.79      0.79       143



Model 2 performs slightly better with a max depth of 5