**Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:**



In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

In [2]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test
# generic split code found in prepare.py

In [3]:
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


1. **What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.**



In [4]:
df["did_survive"] = df.survived == 1
df["is_female"] = df.sex == "female"
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,did_survive,is_female
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False,False
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True,True
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True,True
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True,True
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False,False


In [5]:
dummy_df = pd.get_dummies(df[["class"]], drop_first=True)
dummy_df = pd.get_dummies(df[["embark_town"]], drop_first=True)
dummy_df = pd.get_dummies(df[["embarked"]], drop_first=True)
dummy_df

Unnamed: 0,embarked_Q,embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


In [6]:
df = pd.concat([df, dummy_df], axis=1)

# drop the old columns
df = df.drop(columns=["survived", 'class', 'embark_town', 'sex', 'deck', 'embarked'])
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,age,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
0,0,0,3,22.0,1,0,7.25,0,False,False,0,1
1,1,1,1,38.0,1,0,71.2833,0,True,True,0,0
2,2,2,3,26.0,0,0,7.925,1,True,True,0,1
3,3,3,1,35.0,1,0,53.1,0,True,True,0,1
4,4,4,3,35.0,0,0,8.05,1,False,False,0,1


In [7]:
df = df.dropna()

In [8]:
train, validate, test = split(df, stratify_by = "did_survive")
train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,age,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
652,652,652,3,21.0,0,0,8.4333,1,False,False,0,1
813,813,813,3,6.0,4,2,31.275,0,False,True,0,1
194,194,194,1,44.0,0,0,27.7208,1,True,True,0,0
417,417,417,2,18.0,0,2,13.0,0,True,True,0,1
460,460,460,1,48.0,0,0,26.55,1,True,False,0,1


In [9]:
x_train = train.drop(columns = ['did_survive'])
y_train = train.did_survive

x_validate = validate.drop(columns=['did_survive'])
y_validate = validate.did_survive

x_test = test.drop(columns=['did_survive'])
y_test = test.did_survive

#x is feature
# y is traget variable

In [10]:
train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,age,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
652,652,652,3,21.0,0,0,8.4333,1,False,False,0,1
813,813,813,3,6.0,4,2,31.275,0,False,True,0,1
194,194,194,1,44.0,0,0,27.7208,1,True,True,0,0
417,417,417,2,18.0,0,2,13.0,0,True,True,0,1
460,460,460,1,48.0,0,0,26.55,1,True,False,0,1


In [11]:
y_train.head()

652    False
813    False
194     True
417     True
460     True
Name: did_survive, dtype: bool

In [12]:
clf = DecisionTreeClassifier(max_depth = 3, random_state = 123)

In [13]:
df = df.dropna()

In [14]:
clf = clf.fit(x_train, y_train)

In [15]:
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('survival_decision_tree', view=True, format="pdf")

'survival_decision_tree.pdf'

In [16]:
y_pred = clf.predict(x_train)
y_pred[0:5]

array([False, False,  True,  True, False])

In [17]:
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.87634409, 0.12365591],
       [0.69444444, 0.30555556],
       [0.16666667, 0.83333333],
       [0.03896104, 0.96103896],
       [0.64814815, 0.35185185]])

In [18]:
y_train.head(3)

652    False
813    False
194     True
Name: did_survive, dtype: bool

In [19]:
clf.score(x_train, y_train)

0.8421052631578947

In [20]:
train.did_survive.value_counts()

False    237
True     162
Name: did_survive, dtype: int64

In [21]:
train['most_frequent'] = False
baseline_accuracy = (train.did_survive == train.most_frequent).mean()
baseline_accuracy.round(3)

0.594

2. **Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)**



In [22]:
clf = clf.fit(x_train, y_train)

In [23]:
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('survival_decision_tree', view=True, format="pdf")

'survival_decision_tree.pdf'

In [24]:
y_pred = clf.predict(x_train)
y_pred[0:5]

array([False, False,  True,  True, False])

In [25]:
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.87634409, 0.12365591],
       [0.69444444, 0.30555556],
       [0.16666667, 0.83333333],
       [0.03896104, 0.96103896],
       [0.64814815, 0.35185185]])

In [26]:
y_train.head(3)

652    False
813    False
194     True
Name: did_survive, dtype: bool

In [27]:
clf.score(x_train, y_train)

0.8421052631578947

In [28]:
train.did_survive.value_counts()

False    237
True     162
Name: did_survive, dtype: int64

In [29]:
train['most_frequent'] = False
baseline_accuracy = (train.did_survive == train.most_frequent).mean()
baseline_accuracy.round(3)

0.594

3. **Evaluate your in-sample results using the model score, confusion matrix, and classification report.**



In [34]:
# model score
clf.score(x_validate, y_validate)


0.7732558139534884

In [30]:
# confusion matrix
confusion_matrix(y_train, y_pred)

array([[227,  10],
       [ 53, 109]])

In [31]:
# confusion matrix
y_train.value_counts()

False    237
True     162
Name: did_survive, dtype: int64

In [32]:
# confusion matrix
labels = sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,False,True
False,227,10
True,53,109


In [33]:
# Classification Report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.81      0.96      0.88       237
        True       0.92      0.67      0.78       162

    accuracy                           0.84       399
   macro avg       0.86      0.82      0.83       399
weighted avg       0.85      0.84      0.84       399



4. **Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.**



In [40]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
tn, fp, fn, tp

(227, 10, 53, 109)

In [41]:
sorted(('didnt_survive', 'survived'))

['didnt_survive', 'survived']

In [42]:
print("True Positives", tp)
print("False Positives", fp)
print("False Negatives", fn)
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print("Accuracy is", accuracy)
print("Recall is", recall)
print("Precision is", precision)

True Positives 109
False Positives 10
False Negatives 53
True Negatives 227
-------------
Accuracy is 0.8421052631578947
Recall is 0.6728395061728395
Precision is 0.9159663865546218


5. **Run through steps 2-4 using a different max_depth value.**



In [43]:
clf = DecisionTreeClassifier(max_depth = 5, random_state = 123)

In [44]:
df = df.dropna()

In [45]:
clf = clf.fit(x_train, y_train)

In [46]:
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('survival_decision_tree', view=True, format="pdf")

'survival_decision_tree.pdf'

In [47]:
y_pred = clf.predict(x_train)
y_pred[0:5]

array([False, False,  True,  True, False])

In [48]:
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.88888889, 0.11111111],
       [0.82142857, 0.17857143],
       [0.28571429, 0.71428571],
       [0.02941176, 0.97058824],
       [0.70833333, 0.29166667]])

In [49]:
clf.score(x_train, y_train)

0.8872180451127819

In [50]:
train.did_survive.value_counts()

False    237
True     162
Name: did_survive, dtype: int64

In [51]:
train['most_frequent'] = False
baseline_accuracy = (train.did_survive == train.most_frequent).mean()
baseline_accuracy.round(3)

0.594

In [52]:
# model score
clf.score(x_validate, y_validate)

0.7848837209302325

In [53]:
# confusion matrix
confusion_matrix(y_train, y_pred)

array([[232,   5],
       [ 40, 122]])

In [54]:
# confusion matrix
y_train.value_counts()

False    237
True     162
Name: did_survive, dtype: int64

In [55]:
# confusion matrix
labels = sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,False,True
False,232,5
True,40,122


In [56]:
# Classification Report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.85      0.98      0.91       237
        True       0.96      0.75      0.84       162

    accuracy                           0.89       399
   macro avg       0.91      0.87      0.88       399
weighted avg       0.90      0.89      0.88       399



In [57]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
tn, fp, fn, tp

(232, 5, 40, 122)

In [58]:
sorted(('didnt_survive', 'survived'))

['didnt_survive', 'survived']

In [59]:
print("True Positives", tp)
print("False Positives", fp)
print("False Negatives", fn)
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print("Accuracy is", accuracy)
print("Recall is", recall)
print("Precision is", precision)

True Positives 122
False Positives 5
False Negatives 40
True Negatives 232
-------------
Accuracy is 0.8872180451127819
Recall is 0.7530864197530864
Precision is 0.9606299212598425


6. **Which model performs better on your in-sample data?**



Question 5

7. **Which model performs best on your out-of-sample data, the validate set?**

One used in questions 1-4