In [1]:
# numpy, pandas, usual suspects
import numpy as np
import pandas as pd
# Decision Tree Classifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier
# custom acquire scripting:
from acquire import get_titanic_data
# This is the version of prepare included in the florence classification exercises repo:
from prepare import prep_titanic
# filter out any noisy warning flags
import warnings
warnings.filterwarnings('ignore')
# graphviz for decision tree node visualization
from sklearn.tree import export_graphviz
import graphviz
# metric imports from sklearn
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [2]:
# acquire the data (get_titanic_data is imported from the local acquire module)
df = get_titanic_data()
# prepare the data: prep_titanic hands back three objects, unpacked here
# as the train, validate and test splits
train, validate, test = prep_titanic(df)

Using cached csv


In [3]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [4]:
# drop out non-numerical columns or non-encoded version remaining in this data set
drops = ['sex', 'class','embarked', 'embark_town', 'passenger_id']

In [5]:
# procedure:
# drop the columns in the list drops for every data set in train, validate test

In [6]:
# for dset in [train, validate, test]:
#     dsest = dset.drop(columns=drops)

In [7]:
train['dumbcol'] = 'hamsandwich'

In [8]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S,dumbcol
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0,hamsandwich
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0,hamsandwich
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1,hamsandwich
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0,hamsandwich
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0,hamsandwich


In [9]:
# 
# Reminder:
# using inplace=True kwarg ==> returns None, changes the dataframe itself
# 
# not using inplace=True:
# returns an altered dataframe, does not change the original

In [10]:
# deliberate demonstration: capture whatever drop(..., inplace=True) returns
# so its type can be inspected in the next cell
dumbvar = train.drop(columns=['dumbcol'], inplace=True)

In [11]:
type(dumbvar)

NoneType

In [12]:
train

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.000000,0,0,40.1250,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.000000,0,0,134.5000,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.000000,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.000000,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,313,0,3,male,28.000000,0,0,7.8958,S,Third,Southampton,1,0,1
636,636,0,3,male,32.000000,0,0,7.9250,S,Third,Southampton,1,0,1
222,222,0,3,male,51.000000,0,0,8.0500,S,Third,Southampton,1,0,1
485,485,0,3,female,29.916875,3,1,25.4667,S,Third,Southampton,0,0,1


In [13]:
# Drop the columns listed in `drops` from every split (train, validate, test).
# Reassignment is used instead of inplace=True so the comprehension produces
# the new frames rather than a throwaway list of None values, and
# errors='ignore' keeps the cell safe to re-run once the columns are gone.
train, validate, test = [
    dataset.drop(columns=drops, errors='ignore')
    for dataset in (train, validate, test)
]

In [14]:
# making changes to dataframes:

In [15]:
# looking at our training dataset:
train.head(2)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S
583,0,1,36.0,0,0,40.125,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0


In [16]:
# i = i + 1

# Modularity in scripting changes to dataframes: 
# reassignment offers some more flexibility in variable
# names rather than calling inplace=True, such as:
# train = train.drop(columns=['age','sibsp'])
# train.drop(columns=somecols, inplace=True)
# 
# Just like when we are iterating through a list-like array of columns,
# train[somecol] in an iterative list of somecols
# is a little easier to change via bracket notation rather than
# train.somecol, because dot notation needs the literal column name and cannot take a variable
# 
# train = train.drop(columns=['age','sibsp'])

In [17]:
train.head(2)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S
583,0,1,36.0,0,0,40.125,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0


In [18]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S
583,0,1,36.0,0,0,40.125,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0
50,0,3,7.0,4,1,39.6875,0,0,1
218,1,1,32.0,0,0,76.2917,1,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0


In [19]:
# We will be attempting to make a Decision Tree Classifier Model that will predict survival on the 
# Titanic that performs better than the baseline

1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [20]:
# obtain our mode
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [21]:
train['baseline_assumption_death'] = 0

In [22]:
# share of training rows where the constant "did not survive" guess matches the truth
baseline_accuracy = (train.baseline_assumption_death == train.survived).mean()
print(f'Our baseline accuracy for nonsurvival in all cases on the Titanic Dataset is {baseline_accuracy:.3}')

Our baseline accuracy for nonsurvival in all cases on the Titanic Dataset is 0.618


2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [23]:
# create the model
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at each split, so an unseeded tree can differ from run to run
clf1 = DecisionTreeClassifier(random_state=123)

In [24]:
# Options that we have here:
# 1. drop out the baseline assumption Series after we make the calculation
# 2. Not actually assign the baseline Series into the dataframe
# 3. We could choose to explicitly feed in a set of features that does not include
# baseline_assumption_death in our X

In [25]:
# remove the baseline assumption column from train so it cannot end up among
# the features when X_train is built from everything except `survived`;
# reassignment plus errors='ignore' keeps this cell safe to re-run
train = train.drop(columns='baseline_assumption_death', errors='ignore')

In [26]:
# separate the features from the target
target_col = 'survived'
X_train = train.drop(columns=[target_col])
# double brackets: y_train stays a one-column DataFrame rather than a Series
y_train = train[[target_col]]

In [27]:
# sanity check
# do i have any data leakage or contamination?
# contamination: any data in my X that indicates the target
# contamination: any data that could lead to an improper assumption of the target:
# passenger id could falsely point to survived or not survived depending on how our data is structured
X_train.head(2)

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,Q,S
583,1,36.0,0,0,40.125,1,0,0
337,1,41.0,0,0,134.5,1,0,0


In [28]:
# fit the model
# clf1.fit(X_train[some_list_of_columns], y_train)

In [29]:
# fit the model
clf1.fit(X_train, y_train)

DecisionTreeClassifier()

In [30]:
# designate our X and y
# this is not a needed step, just placed here for clarity
X = X_train
y = y_train

In [31]:
# `conf` has to exist before the inspection cells that follow index into it,
# so build the confusion matrix here from the fitted clf1 and the X / y aliases
# (rows are the actual classes, columns are the predicted classes)
conf = confusion_matrix(y, clf1.predict(X))
conf

In [None]:
conf[1]

In [None]:
conf[1][1]

In [None]:
# true positive rate: of all the rows that are actually positive,
# the fraction the model also predicted as positive
actual_positive_row = conf[1]
tpr = actual_positive_row[1] / actual_positive_row.sum()

In [None]:
# lower row: all the things that are actually class 1
# (only the last expression of a cell is displayed, so this line shows nothing)
conf[1]
# lower right corner: all the things we predicted 1 that are actually 1
conf[1][1]

In [None]:
# call my accuracy score on the model object
accuracy = clf1.score(X, y)
# gather my model's predictions
y_pred = clf1.predict(X)
# calculate the confusion matrix once (rows: actual class, columns: predicted class)
conf = confusion_matrix(y, y_pred)
# get a classification report, transposed so each class / average is a row
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
# compute some metrics on my own using the confusion matrix;
# flattening the 2x2 matrix row by row yields [tn, fp, fn, tp]
tn, fp, fn, tp = conf.ravel()
tpr = tp / (tp + fn)
fpr = fp / (tn + fp)
tnr = tn / (tn + fp)
fnr = fn / (tp + fn)
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
# use the model to predict
y_pred = clf1.predict(X_train)

In [None]:
# check out the values in the predictions
pd.Series(y_pred).value_counts()

In [None]:
# model score: accuracy

In [None]:
accuracy = clf1.score(X_train, y_train)

In [None]:
accuracy

In [None]:
# confusion matrix
conf = confusion_matrix(y_train, y_pred)

In [None]:
conf

In [None]:
# get the classification report
class_report = classification_report(y_train, y_pred, output_dict=True)

In [None]:
class_report

In [None]:
pd.DataFrame(class_report).rename(columns={'0': 'deceased', '1': 'survived'}).T

4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
conf

In [None]:
# label the raw confusion matrix so it can be read without memorising its layout
conf_df = pd.DataFrame(
    conf,
    index=['actual_death', 'actual_survive'],
    columns=['predict_death', 'predict_survive'],
)

In [None]:
conf_df

In [None]:
# reference key: names the outcome that each confusion-matrix cell represents,
# using the same row / column labels as the labelled confusion matrix
rubric_df = pd.DataFrame(
    {
        'predict_death': ['true negative', 'false negative'],
        'predict_survive': ['false positive', 'true positive'],
    },
    index=['actual_death', 'actual_survive'],
)

In [None]:
rubric_df

In [None]:
joined = pd.concat([conf_df, rubric_df], axis=1)

In [None]:
rubric_df + ': ' + conf_df.values.astype(str)

5. Run through steps 2-4 using a different max_depth value.

In [None]:
# clf2
# change your variable names on new models for comparison
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at each split, so an unseeded tree can differ from run to run
clf2 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# fit the model

In [None]:
clf2.fit(X_train, y_train)

In [None]:
y_pred = clf2.predict(X_train)

6. Which model performs better on your in-sample data?

In [None]:
# Model #1:
# accuracy as reported by the estimator, plus its predictions on X
accuracy = clf1.score(X, y)
y_pred = clf1.predict(X)
# confusion matrix computed once (rows: actual class, columns: predicted class)
conf = confusion_matrix(y, y_pred)
# classification report, transposed so each class / average is a row
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
# flattening the 2x2 matrix row by row yields [tn, fp, fn, tp]
tn, fp, fn, tp = conf.ravel()
tpr = tp / (tp + fn)
fpr = fp / (tn + fp)
tnr = tn / (tn + fp)
fnr = fn / (tp + fn)
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

In [None]:
conf

In [None]:
y.survived.value_counts()

In [None]:
# Model #2:
# accuracy as reported by the estimator, plus its predictions on X
accuracy = clf2.score(X, y)
y_pred = clf2.predict(X)
# confusion matrix computed once (rows: actual class, columns: predicted class)
conf = confusion_matrix(y, y_pred)
# classification report, transposed so each class / average is a row
class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
# flattening the 2x2 matrix row by row yields [tn, fp, fn, tp]
tn, fp, fn, tp = conf.ravel()
tpr = tp / (tp + fn)
fpr = fp / (tn + fp)
tnr = tn / (tn + fp)
fnr = fn / (tp + fn)
print(f'''
The accuracy for our model is {accuracy:.4}
The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
''')
class_report

7. Which model performs best on your out-of-sample data, the validate set?

In [None]:
# 
# reminder again: Do NOT retrain your model on validate or test, just predict!
# 

In [None]:
# predict on the validate split with both already-fitted models (no refitting)
# X_validate ==> everything from validate that isn't survived
X_validate = validate.drop(columns='survived')
# first model's predictions on validate
y_val_pred_1 = clf1.predict(X_validate)
# second model's predictions on validate
y_val_pred_2 = clf2.predict(X_validate)

In [None]:
# validation accuracy: score both models on the same features and target
validate_features = validate.drop(columns='survived')
validate_target = validate.survived
accuracy_v_1 = clf1.score(validate_features, validate_target)
accuracy_v_2 = clf2.score(validate_features, validate_target)

In [None]:
accuracy_v_1

In [None]:
accuracy_v_2

In [None]:
# 
# 
# We see here a very significant drop-off on model 1,
# where we did not specify a maximum depth, 
# suggesting it was very overfit to the training set
# 
# Model 2 did not perform as well on training data, 
# but performed better on out-of-sample data at a rate
# better than our baseline, so it is the most useful model 
# we have so far!
# 
# 

In [None]:
# export the depth-limited tree (clf2) as DOT source text, then wrap it in a
# graphviz Source object so it can be rendered
dot_data = export_graphviz(
    clf2,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

In [None]:
graph.render('titanic_model_2_tree', view=True)

In [None]:
# export the unconstrained tree (clf1) as DOT source text, then wrap it in a
# graphviz Source object so it can be rendered
dot_data = export_graphviz(
    clf1,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

In [None]:
graph.render('titanic_model_1_tree', view=True)