In [None]:
# numpy, pandas, usual suspects
import numpy as np
import pandas as pd
# custom acquire scripting:
from acquire import get_titanic_data
# This is the version of prepare included in the florence classification exercises repo:
from prepare import prep_titanic
# filter out any noisy warning flags
import warnings
warnings.filterwarnings('ignore')
# decision tree classifier, plus sklearn's plot_tree / export_text helpers for visualizing the fitted tree
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
# metric imports from sklearn
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [14]:
# acquire the data (get_titanic_data comes from the project's acquire module)
df = get_titanic_data()
# prepare the data; prep_titanic hands back three objects, unpacked here as the
# train / validate / test splits that every later cell works from
train, validate, test = prep_titanic(df)

In [15]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,1,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,0,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,1,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,0,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,0,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 583 to 553
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  497 non-null    int64  
 1   survived      497 non-null    int64  
 2   pclass        497 non-null    int64  
 3   sex           497 non-null    int64  
 4   age           497 non-null    float64
 5   sibsp         497 non-null    int64  
 6   parch         497 non-null    int64  
 7   fare          497 non-null    float64
 8   embarked      497 non-null    object 
 9   class         497 non-null    object 
 10  embark_town   497 non-null    object 
 11  alone         497 non-null    int64  
 12  Q             497 non-null    uint8  
 13  S             497 non-null    uint8  
dtypes: float64(2), int64(7), object(3), uint8(2)
memory usage: 51.4+ KB


In [17]:
train.describe()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,alone,Q,S
count,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0
mean,442.885312,0.382294,2.285714,0.663984,29.916875,0.515091,0.366197,33.434799,0.607646,0.076459,0.730382
std,259.437654,0.486437,0.839341,0.47282,13.265639,1.139589,0.755827,54.568008,0.488767,0.265998,0.444208
min,0.0,0.0,1.0,0.0,0.67,0.0,0.0,0.0,0.0,0.0,0.0
25%,215.0,0.0,2.0,0.0,23.0,0.0,0.0,7.925,0.0,0.0,0.0
50%,449.0,0.0,3.0,1.0,29.916875,0.0,0.0,14.4542,1.0,0.0,1.0
75%,667.0,1.0,3.0,1.0,35.0,1.0,0.0,31.0,1.0,0.0,1.0
max,890.0,1.0,3.0,1.0,80.0,8.0,5.0,512.3292,1.0,1.0,1.0


In [20]:
# Columns to remove before modeling: the object-dtype columns ('class', 'embarked',
# 'embark_town') and the 'passenger_id' row identifier.
# NOTE(review): 'sex' is also listed, but train.info() reports it as int64 (it is
# already encoded as 0/1) and no other sex-derived column exists in the frame, so
# dropping it removes that feature from the model entirely -- confirm this is intended.
drops = ['sex', 'class','embarked', 'embark_town', 'passenger_id']

In [19]:
# train.columns, validate.columns, test.columns

In [22]:
# Remove the columns listed in `drops` from every split.
# The names are rebound to the trimmed frames rather than running a list
# comprehension purely for an `inplace=True` side effect (which only displayed a
# throwaway [None, None, None]). errors='ignore' makes the cell safe to re-run
# after the columns are already gone instead of raising a KeyError.
train, validate, test = [
    dataset.drop(columns=drops, errors='ignore')
    for dataset in (train, validate, test)
]

In [24]:
train.columns, validate.columns

(Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone', 'Q',
        'S'],
       dtype='object'),
 Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone', 'Q',
        'S'],
       dtype='object'))

In [None]:
# We will be attempting to make a Decision Tree Classifier Model that will predict survival on the 
# Titanic that performs better than the baseline

1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [25]:
# What is our target? survived
# In order to determine the mode, let's pull a value count
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [26]:
# Be explicit about the modeling inputs: y is the 'survived' target column,
# X is every remaining column of the training split.
y_train = train['survived']
X_train = train.drop(columns=['survived'])

In [27]:
(y_train == 0).mean()

0.6177062374245473

In [28]:
print(f'Our baseline accuracy for nonsurvival in all cases on the Titanic Dataset is {(y_train == 0).mean():.3}')

Our baseline accuracy for nonsurvival in all cases on the Titanic Dataset is 0.618


2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [29]:
# 1. Make the thing
# 2. Fit the thing
# 3. Use the thing

In [30]:
# create the model
# Every hyperparameter is left at its default, so no max_depth limit is imposed here.
# NOTE(review): no random_state is passed; per the scikit-learn docs the chosen split
# can vary between runs when several splits tie, so the scores shown below may not
# reproduce exactly on a fresh kernel -- confirm whether a fixed seed is wanted.
clf = DecisionTreeClassifier()

In [31]:
# fit the thing
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [32]:
# predict, or use the thing
# if I want the basic accuracy score:
clf.score(X_train, y_train)

0.9738430583501007

In [33]:
clf.predict(X_train)

array([0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

In [34]:
# Collect the actual target, the constant baseline guess (0 = did not survive) and
# the first model's in-sample predictions side by side, indexed like y_train.
y_preds = (
    y_train.rename('y_act')
    .to_frame()
    .assign(baseline=0, model_1=clf.predict(X_train))
)

In [35]:
y_preds

Unnamed: 0,y_act,baseline,model_1
583,0,0,0
337,1,0,1
50,0,0,0
218,1,0,1
31,1,0,1
...,...,...,...
313,0,0,0
636,0,0,0
222,0,0,0
485,0,0,0


In [36]:
train.head(2)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S
583,0,1,36.0,0,0,40.125,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0


In [None]:
# X_train, y_train
# clf.fit(train.drop(columns='survived'), train['survived'])
# clf.fit(train[['pclass', 'Q']], train['survived'])
# useful to do something like selected_cols = ['pclass', 'Q']
############
# ensure any predict calls utilize the same feature space.
# a model fit on pclass and Q will anticipate input of that same shape

In [None]:
# Options that we have here:
# 1. drop out the baseline assumption Series after we make the calculation
# 2. Not actually assign the baseline Series into the dataframe
# 3. We could choose to explicitly feed in a set of features that does not include
# baseline_assumption_death in our X

In [None]:
# sanity check
# do i have any data leakage or contamination?
# contamination: any data in my X that indicates the target
# contamination: any data that could lead to an improper assumption of the target:
# passenger id could falsely point to survived or not survived depending on how our data is structured


3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [37]:
# we have our model clf, let's get those metrics from our 
# informational output
# insample: training set
acc = clf.score(X_train, y_train)
accuracy_score(y_preds.y_act, y_preds.model_1)

0.9738430583501007

In [38]:
acc

0.9738430583501007

In [40]:
pd.DataFrame(classification_report(y_preds.y_act, y_preds.model_1, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.980392,0.963351,0.973843,0.971871,0.973877
recall,0.977199,0.968421,0.973843,0.97281,0.973843
f1-score,0.978793,0.965879,0.973843,0.972336,0.973856
support,307.0,190.0,0.973843,497.0,497.0


In [41]:
conf = confusion_matrix(y_preds.y_act, y_preds.model_1)

In [42]:
conf

array([[300,   7],
       [  6, 184]])

In [44]:
# Label grid laid out exactly like sklearn's confusion matrix:
# rows are the actual class, columns are the predicted class.
rubric_df = pd.DataFrame(
    [
        ['True Negative', 'False Positive'],
        ['False Negative', 'True Positive'],
    ],
    index=['actual_death', 'actual_survive'],
    columns=['pred_death', 'pred_survive'],
)

In [45]:
rubric_df

Unnamed: 0,pred_death,pred_survive
actual_death,True Negative,False Positive
actual_survive,False Negative,True Positive


In [46]:
rubric_df + ': ' + conf.astype(str)

Unnamed: 0,pred_death,pred_survive
actual_death,True Negative: 300,False Positive: 7
actual_survive,False Negative: 6,True Positive: 184


In [49]:
# Flatten the 2x2 confusion matrix row by row: the first row holds the actual
# negatives (TN, FP), the second row the actual positives (FN, TP).
TN, FP, FN, TP = conf.ravel()

In [50]:
TN, FP, FN, TP

(300, 7, 6, 184)

4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [51]:
# Hand-compute the classification metrics from the four confusion-matrix counts.

# support: number of actual positives / actual negatives
support_pos = TP + FN
support_neg = FP + TN
ALL = support_pos + support_neg

# accuracy: share of all predictions that were correct
acc = (TN + TP) / ALL

# rates over the actual positives (recall is the true positive rate)
recall = TP / support_pos
TPR = recall
FNR = FN / support_pos

# rates over the actual negatives
TNR = TN / support_neg
FPR = FP / support_neg

# precision: share of positive predictions that were correct
precision = TP / (FP + TP)

# f1: harmonic mean of precision and recall
f1_score = (2 * (precision * recall)) / (precision + recall)

In [52]:
# consider putting these calculations into a function
# def my_metrics(y_act, y_pred):
#     calculate the confusion matrix here
# calculate your metrics here
# output those values, probably into a dictionary or something clean :)

5. Run through steps 2-4 using a different max_depth value.

In [53]:
# clf2
# change your variable names on new models for comparison
clf2 = DecisionTreeClassifier(max_depth=4)

In [54]:
# fit the model
clf2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4)

6. Which model performs better on your in-sample data?

In [55]:
# make a new column in y_preds holding the max_depth=4 model's in-sample predictions
# NOTE(review): the first model's column is named 'model_1' (with an underscore) while
# this one is 'model2' -- consider aligning the names once all later uses are checked.
y_preds['model2'] = clf2.predict(X_train)

In [56]:
clf2.score(X_train, y_train)

0.7323943661971831

7. Which model performs best on your out-of-sample data, the validate set?

In [57]:
clf.score(X_train, y_train)

0.9738430583501007

In [58]:
# Split the validate set the same way as train: features in X_val, target in y_val.
X_val = validate.drop(columns='survived')
y_val = validate.survived

In [None]:
# get predictions for our validation sets
# X_validate ==> everything from validate that isn't survived
# get predictions on my first model on my validate set
# get predictions on my second model on my validate set


In [None]:
# get validation accuracy


In [None]:
# 
# reminder again: Do NOT retrain your model on validate or test, just predict!
# 

In [59]:
clf.score(X_val, y_val)

0.6869158878504673

In [60]:
# model 2:
clf2.score(X_train, y_train)

0.7323943661971831

In [61]:
clf2.score(X_val, y_val)

0.7149532710280374

In [None]:
# maybe do this a few more times....
# i know that im going to be creating,fitting,predicting repeatedly...

In [62]:
# loop it!
# Fit one tree per max_depth in 2..7 and keep every fitted model for comparison.
# Accuracy is recorded on BOTH splits: in the runs above the in-sample score rewarded
# the deeper, overfit tree (0.97 train vs 0.69 validate), so the validate score is
# the one to use when picking a depth.
models = []
model_scores = []  # in-sample (train) accuracy, one entry per model
val_scores = []    # out-of-sample (validate) accuracy, same order as `models`
for i in range(2,8):
    model = DecisionTreeClassifier(max_depth=i)
    # fit on train only -- validate is used for scoring, never for training
    model.fit(X_train, y_train)
    models.append(model)
    model_scores.append(model.score(X_train, y_train))
    val_scores.append(model.score(X_val, y_val))

In [63]:
len(models)

6