In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from acquire import get_titanic_data
# This is the version of prepare included in the florence classification exercises repo:
from prepare import prep_titanic
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import export_graphviz
import graphviz

In [2]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [3]:
# acquire the data
# (the recorded run below reports that a cached csv was used)
df = get_titanic_data()
# prepare the data
# prep_titanic's result is unpacked into the three splits used for the rest of
# the notebook: train, validate and test.
train, validate, test = prep_titanic(df)

Using cached csv


In [4]:
# Preview the first rows of the training split returned by prep_titanic.
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [5]:
# Columns to remove before modelling: the non-numeric / un-encoded originals
# still present in the data set, plus passenger_id.
drops = [
    'sex',
    'class',
    'embarked',
    'embark_town',
    'passenger_id',
]

In [6]:
# Remove the columns listed in `drops` from every split, in place.
# A plain loop is used instead of a list comprehension: the drop is a pure side
# effect, so there is no reason to build (and echo) a list of None values.
for dataset in (train, validate, test):
    dataset.drop(columns=drops, inplace=True)

[None, None, None]

In [7]:
# Re-check the training split now that the columns listed in `drops` are gone.
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S
583,0,1,36.0,0,0,40.125,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0
50,0,3,7.0,4,1,39.6875,0,0,1
218,1,1,32.0,0,0,76.2917,1,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0


In [8]:
# Goal: build a Decision Tree classifier that predicts survival on the Titanic
# and performs better than the baseline.

1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [9]:
# Count each class of the target; the most frequent one is our mode,
# i.e. the baseline prediction.
train['survived'].value_counts()

0    307
1    190
Name: survived, dtype: int64

In [10]:
# Baseline model: predict the most common class for every passenger.
# The counts above show 0 (did not survive) is the mode: 307 rows vs 190.
train['baseline_assumption_death'] = 0

In [11]:
# Baseline accuracy = share of training rows where the constant prediction
# equals the actual `survived` label (mean of the boolean comparison).
print(f'Our baseline accuracy for nonsurvival in all cases on the Titanic Dataset is {(train.baseline_assumption_death == train.survived).mean():.3}')

Our baseline accuracy for nonsurvival in all cases on the Titanic Dataset is 0.618


2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [12]:
# create the model
# No max_depth is given, so the tree is free to keep splitting.
# random_state is pinned so that Restart & Run All rebuilds the same tree:
# scikit-learn shuffles candidate features at each split, so an unseeded tree
# (and every metric derived from it) can differ from run to run.
clf1 = DecisionTreeClassifier(random_state=123)

In [13]:
# remove baseline assumption from the train
# The helper column must go before the feature matrix is built: X_train is
# later defined as "every column except survived", so leaving it in would
# turn the baseline column into a model feature.
train.drop(columns='baseline_assumption_death', inplace=True)

In [14]:
# split our X and y
# Features: every remaining column except the target.
X_train = train.drop(columns='survived')
# Target: the double brackets keep `survived` as a one-column DataFrame rather
# than a Series; a later cell relies on that when it calls `y.survived`.
# NOTE(review): scikit-learn presumably expects a 1-d target here — confirm
# whether a shape warning is being hidden by warnings.filterwarnings('ignore').
y_train = train[['survived']]

In [15]:
# fit the model
# Trains clf1 on the training features and target defined in the previous cell.
clf1.fit(X_train, y_train)

DecisionTreeClassifier()

In [16]:
# Alias the training split under the generic names X / y that the
# evaluation cells below read from.
X, y = X_train, y_train

In [17]:
# Evaluate clf1 on X / y: accuracy, per-class report and the four
# confusion-matrix rates.
accuracy = clf1.score(X, y)
y_pred = clf1.predict(X)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
# Build the confusion matrix once. Rows are the actual class and columns the
# predicted class, so for the 0/1 labels the flattened order is tn, fp, fn, tp.
conf = confusion_matrix(y, y_pred)
tn, fp, fn, tp = conf.ravel()
tpr = tp / (tp + fn)  # share of actual survivors predicted as survivors
fpr = fp / (tn + fp)  # share of actual deaths predicted as survivors
tnr = tn / (tn + fp)  # share of actual deaths predicted as deaths
fnr = fn / (tp + fn)  # share of actual survivors predicted as deaths
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.9738
The True Positive Rate is 0.968, The False Positive Rate is 0.0228,
The True Negative Rate is 0.977, and the False Negative Rate is 0.0316



Unnamed: 0,precision,recall,f1-score,support
0,0.980392,0.977199,0.978793,307.0
1,0.963351,0.968421,0.965879,190.0
accuracy,0.973843,0.973843,0.973843,0.973843
macro avg,0.971871,0.97281,0.972336,497.0
weighted avg,0.973877,0.973843,0.973856,497.0


3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [18]:
# use the model to predict
# In-sample predictions: clf1 is asked to label the same rows it was fitted on.
y_pred = clf1.predict(X_train)

In [19]:
# check out the values in the predictions
# Wrapping the prediction array in a Series gives access to value_counts(),
# i.e. how many rows were predicted as each class.
pd.Series(y_pred).value_counts()

0    306
1    191
dtype: int64

In [20]:
# model score: accuracy

In [21]:
# In-sample score of clf1 on the training split; the notebook treats this as accuracy.
accuracy = clf1.score(X_train, y_train)

In [22]:
# Display the in-sample accuracy stored in the previous cell.
accuracy

0.9738430583501007

In [23]:
# confusion matrix
# Actual labels are passed first and predictions second.
conf = confusion_matrix(y_train, y_pred)

In [24]:
# Display the raw confusion-matrix array.
conf

array([[300,   7],
       [  6, 184]])

In [25]:
# get the classification report
# output_dict=True returns the report as a nested dict (shown in the next
# cell) instead of a formatted string, so it can be loaded into a DataFrame.
class_report = classification_report(y_train, y_pred, output_dict=True)

In [26]:
# Display the raw report dictionary.
class_report

{'0': {'precision': 0.9803921568627451,
  'recall': 0.9771986970684039,
  'f1-score': 0.9787928221859705,
  'support': 307},
 '1': {'precision': 0.9633507853403142,
  'recall': 0.968421052631579,
  'f1-score': 0.9658792650918635,
  'support': 190},
 'accuracy': 0.9738430583501007,
 'macro avg': {'precision': 0.9718714711015296,
  'recall': 0.9728098748499914,
  'f1-score': 0.972336043638917,
  'support': 497},
 'weighted avg': {'precision': 0.9738773468239887,
  'recall': 0.9738430583501007,
  'f1-score': 0.9738560498562314,
  'support': 497}}

In [27]:
# Tabulate the report with one row per class / average, swapping the raw
# '0' / '1' class keys for readable labels.
(
    pd.DataFrame(class_report)
    .T
    .rename(index={'0': 'deceased', '1': 'survived'})
)

Unnamed: 0,precision,recall,f1-score,support
deceased,0.980392,0.977199,0.978793,307.0
survived,0.963351,0.968421,0.965879,190.0
accuracy,0.973843,0.973843,0.973843,0.973843
macro avg,0.971871,0.97281,0.972336,497.0
weighted avg,0.973877,0.973843,0.973856,497.0


4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [28]:
# Display the confusion matrix again as the starting point for this step.
conf

array([[300,   7],
       [  6, 184]])

In [29]:
# Wrap the confusion matrix in a labelled DataFrame so it is readable by
# humans: rows are the actual outcome, columns the predicted outcome.
conf_df = pd.DataFrame(
    conf,
    index=['actual_death', 'actual_survive'],
    columns=['predict_death', 'predict_survive'],
)

In [30]:
# Display the labelled confusion matrix.
conf_df

Unnamed: 0,predict_death,predict_survive
actual_death,300,7
actual_survive,6,184


In [31]:
# Reference key with the same row/column labels as conf_df, naming which
# outcome each cell of the confusion matrix represents.
rubric_df = pd.DataFrame(
    {
        'predict_death': ['true negative', 'false negative'],
        'predict_survive': ['false positive', 'true positive'],
    },
    index=['actual_death', 'actual_survive'],
)

In [32]:
# Display the reference key.
rubric_df

Unnamed: 0,predict_death,predict_survive
actual_death,true negative,false positive
actual_survive,false negative,true positive


In [33]:
# Put the counts and the rubric side by side (axis=1 concatenates column-wise).
# NOTE(review): `joined` is not referenced by any later cell visible here —
# confirm it is still needed.
joined = pd.concat([conf_df, rubric_df], axis=1)

In [34]:
# Label every count with its outcome name ("true negative: 300", ...).
# The counts are taken as a bare array of strings (.values.astype(str)), so
# they are appended to the rubric text cell by cell, by position.
rubric_df + ': ' + conf_df.values.astype(str)

Unnamed: 0,predict_death,predict_survive
actual_death,true negative: 300,false positive: 7
actual_survive,false negative: 6,true positive: 184


5. Run through steps 2-4 using a different max_depth value.

In [35]:
# clf2
# Second model: the tree is limited to a depth of 3.
# random_state is pinned so that Restart & Run All rebuilds the same tree:
# scikit-learn shuffles candidate features at each split, so an unseeded tree
# (and every metric derived from it) can differ from run to run.
clf2 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [36]:
# fit the model

In [37]:
# Train the depth-limited model on the same training features and target as clf1.
clf2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3)

In [38]:
# In-sample predictions from the depth-limited model.
# NOTE(review): y_pred is reassigned from clf1 in the next code cell before
# this value is read anywhere.
y_pred = clf2.predict(X_train)

6. Which model performs better on your in-sample data?

In [39]:
# Model #1:
# Evaluate clf1 on X / y: accuracy, per-class report and the four
# confusion-matrix rates.
accuracy = clf1.score(X, y)
y_pred = clf1.predict(X)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
# Build the confusion matrix once. Rows are the actual class and columns the
# predicted class, so for the 0/1 labels the flattened order is tn, fp, fn, tp.
conf = confusion_matrix(y, y_pred)
tn, fp, fn, tp = conf.ravel()
tpr = tp / (tp + fn)  # share of actual survivors predicted as survivors
fpr = fp / (tn + fp)  # share of actual deaths predicted as survivors
tnr = tn / (tn + fp)  # share of actual deaths predicted as deaths
fnr = fn / (tp + fn)  # share of actual survivors predicted as deaths
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.9738
The True Positive Rate is 0.968, The False Positive Rate is 0.0228,
The True Negative Rate is 0.977, and the False Negative Rate is 0.0316



Unnamed: 0,precision,recall,f1-score,support
0,0.980392,0.977199,0.978793,307.0
1,0.963351,0.968421,0.965879,190.0
accuracy,0.973843,0.973843,0.973843,0.973843
macro avg,0.971871,0.97281,0.972336,497.0
weighted avg,0.973877,0.973843,0.973856,497.0


In [40]:
# Display Model #1's confusion matrix from the previous cell.
conf

array([[300,   7],
       [  6, 184]])

In [41]:
# Actual class counts in the target, for comparison with the confusion-matrix rows.
y['survived'].value_counts()

0    307
1    190
Name: survived, dtype: int64

In [42]:
# Model #2:
# Evaluate clf2 on X / y: accuracy, per-class report and the four
# confusion-matrix rates.
accuracy = clf2.score(X, y)
y_pred = clf2.predict(X)
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
# Build the confusion matrix once. Rows are the actual class and columns the
# predicted class, so for the 0/1 labels the flattened order is tn, fp, fn, tp.
conf = confusion_matrix(y, y_pred)
tn, fp, fn, tp = conf.ravel()
tpr = tp / (tp + fn)  # share of actual survivors predicted as survivors
fpr = fp / (tn + fp)  # share of actual deaths predicted as survivors
tnr = tn / (tn + fp)  # share of actual deaths predicted as deaths
fnr = fn / (tp + fn)  # share of actual survivors predicted as deaths
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report


The accuracy for our model is 0.7123
The True Positive Rate is 0.342, The False Positive Rate is 0.0586,
The True Negative Rate is 0.941, and the False Negative Rate is 0.658



Unnamed: 0,precision,recall,f1-score,support
0,0.698068,0.941368,0.801664,307.0
1,0.783133,0.342105,0.47619,190.0
accuracy,0.712274,0.712274,0.712274,0.712274
macro avg,0.7406,0.641737,0.638927,497.0
weighted avg,0.730587,0.712274,0.677238,497.0


7. Which model performs best on your out-of-sample data, the validate set?

In [43]:
# get predictions for our validation sets
# Build the validation feature matrix once (everything except the target)
# and reuse it for both models instead of re-dropping the column per call.
X_validate = validate.drop(columns='survived')
y_val_pred_1 = clf1.predict(X_validate)
y_val_pred_2 = clf2.predict(X_validate)

In [44]:
# get validation accuracy
# Build the validation feature matrix once (everything except the target)
# and score both models against the same out-of-sample rows.
X_validate = validate.drop(columns='survived')
accuracy_v_1 = clf1.score(X_validate, validate.survived)
accuracy_v_2 = clf2.score(X_validate, validate.survived)

In [45]:
# Validation accuracy of model 1 (no max_depth).
accuracy_v_1

0.677570093457944

In [46]:
# Validation accuracy of model 2 (max_depth=3).
accuracy_v_2

0.7102803738317757

In [47]:
# 
# 
# We see here a very significant drop-off on model 1,
# where we did not specify a maximum depth, 
# suggesting it was very overfit to the training set
# 
# Model 2 did not perform as well on training data, 
# but performed better on out-of-sample data at a rate
# better than our baseline, so it is the most useful model 
# we have so far!
# 
# 

In [48]:
# Export the depth-limited tree (clf2) as DOT text, labelling the splits with
# the training feature names, then wrap it for rendering with graphviz.
dot_data = export_graphviz(
    clf2,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

In [49]:
# Render the model 2 tree to disk (the recorded output shows
# 'titanic_model_2_tree.pdf'); view=True also asks graphviz to open the result.
# NOTE(review): view=True presumably needs a desktop viewer — confirm it is
# wanted for an unattended Restart & Run All.
graph.render('titanic_model_2_tree', view=True)

'titanic_model_2_tree.pdf'

In [50]:
# Export the unrestricted tree (clf1) as DOT text, labelling the splits with
# the training feature names, then wrap it for rendering with graphviz.
# This reuses (and overwrites) the dot_data / graph names from the model 2 export.
dot_data = export_graphviz(
    clf1,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

In [51]:
# Render the model 1 tree to disk (the recorded output shows
# 'titanic_model_1_tree.pdf'); view=True also asks graphviz to open the result.
# NOTE(review): view=True presumably needs a desktop viewer — confirm it is
# wanted for an unattended Restart & Run All.
graph.render('titanic_model_1_tree', view=True)

'titanic_model_1_tree.pdf'