In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

## Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

- What is your baseline prediction? <b> Predict that every passenger did not survive ("no"), the most common class in the training set. </b>

- What is your baseline accuracy? <b> 61.6%</b> Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

- Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample) <b> complete</b>

- Evaluate your in-sample results using the model score, confusion matrix, and classification report. <b> complete</b>

- Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support. <b>complete, see classification report</b>

- Run through steps 2-4 using a different max_depth value. <b> complete, depth value of 3 works best</b> 

- Which model performs better on your in-sample data? <b> complete</b>

- Which model performs best on your out-of-sample data, the validate set? <b> complete</b>

In [2]:
# Build the CSV location from the current user's home directory instead of
# hard-coding one machine's absolute path; the directory layout below the
# home folder is unchanged.
from pathlib import Path

titanic_csv = Path.home() / 'codeup-data-science' / 'classification-exercises' / 'titanic_df.csv'
df = pd.read_csv(titanic_csv)

def handle_missing_values(df):
    """Return a new frame with missing embarkation values filled in.

    Nulls in ``embark_town`` become ``'Other'`` and nulls in ``embarked``
    become ``'O'``.  The result comes from ``assign``, so the input frame
    is not modified.
    """
    town_filled = df.embark_town.fillna('Other')
    port_filled = df.embarked.fillna('O')
    return df.assign(embark_town=town_filled, embarked=port_filled)

def remove_columns(df):
    """Return ``df`` without the ``deck`` column; the input frame is left untouched."""
    unwanted = ['deck']
    return df.drop(columns=unwanted)

def encode_embarked(df):
    """Add an ``embarked_encode`` column with the label-encoded ``embarked`` values.

    A fresh ``LabelEncoder`` is fitted on the ``embarked`` column and then used
    to transform that same column.  The encoded values are attached with
    ``assign``, so a new frame is returned.
    """
    embarked_encoder = LabelEncoder().fit(df.embarked)
    codes = embarked_encoder.transform(df.embarked)
    return df.assign(embarked_encode=codes)

def prep_titanic_data(df):
    """Run the preparation steps in order and return the resulting frame.

    Steps: fill missing embarkation values, drop unwanted columns, then
    label-encode ``embarked``.
    """
    steps = (handle_missing_values, remove_columns, encode_embarked)
    for step in steps:
        df = df.pipe(step)
    return df

def train_validate_test_split(df, seed=123):
    """Split ``df`` into train, validate and test frames, stratified on ``survived``.

    20% of the rows are held out as the test set; 30% of the remaining rows
    become the validate set and the rest is train.  ``seed`` is passed as
    ``random_state`` to both splits.

    Returns a ``(train, validate, test)`` tuple.
    """
    # First cut: hold out the test set.
    remainder, test = train_test_split(
        df,
        test_size=0.2,
        random_state=seed,
        stratify=df.survived,
    )
    # Second cut: divide what is left into train and validate.
    train, validate = train_test_split(
        remainder,
        test_size=0.3,
        random_state=seed,
        stratify=remainder.survived,
    )
    return train, validate, test





In [3]:
# check out data columns first before cleaning. Get rid of 'Unnamed', 'passenger_id', 'pclass', 'age', 'sibsp', 'parch'
# rename survived column to 'yes' 'no' and use as target variable

df.head()


Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
# clean data followed by creating train/validate/test function

def clean_titanic(titanic_df=None):
    '''
    clean_titanic takes the acquired titanic dataframe and returns a cleaned copy:
    drops duplicate rows,
    drops columns that are duplicates, unused, or have too many nulls,
    fills the nulls in embark_town with 'Southampton',
    and appends dummy-encoded columns for embark_town, sex and class
    (first level of each dropped).

    titanic_df: dataframe to clean. When omitted, the notebook-level `df` is
    used, so the existing `clean_titanic()` call keeps working.

    The input dataframe is NOT modified, so the cell that calls this function
    can be re-run without failing on columns that were already dropped.

    return: single cleaned dataframe
    '''
    if titanic_df is None:
        # Fall back to the frame loaded earlier in the notebook.
        titanic_df = df

    dropcols = ['age', 'deck', 'embarked', 'Unnamed: 0', 'passenger_id', 'pclass', 'sibsp', 'parch']

    # Same order as before (de-duplicate, then drop columns), but every step
    # returns a new frame instead of mutating the caller's data in place.
    cleaned = titanic_df.drop_duplicates().drop(columns=dropcols)
    cleaned = cleaned.assign(embark_town=cleaned['embark_town'].fillna('Southampton'))

    dummies = pd.get_dummies(cleaned[['embark_town', 'sex', 'class']], drop_first=True)
    return pd.concat([cleaned, dummies], axis=1)


def train_validate_test_split(df, seed=123, target='survived', test_size=0.2, validate_size=0.3):
    '''
    train_validate_test_split splits df into train, validate and test
    dataframes, stratifying both splits on the target column.

    NOTE: an earlier cell of this notebook defines a function with the same
    name; because this cell runs later, this is the definition in effect.

    df: dataframe to split; must contain the `target` column
    seed: random_state handed to both train_test_split calls
    target: name of the column to stratify on (default 'survived')
    test_size: share of df held out as the test set (default 0.2)
    validate_size: share of the remaining rows used as validate (default 0.3)

    return: train, validate, test dataframes
    '''
    # First split: hold out the test set.
    train_and_validate, test = train_test_split(
        df, test_size=test_size, random_state=seed, stratify=df[target]
    )
    # Second split: divide the remainder into train and validate.
    train, validate = train_test_split(
        train_and_validate,
        test_size=validate_size,
        random_state=seed,
        stratify=train_and_validate[target],
    )
    return train, validate, test



In [5]:
# clean data using function from above

df = clean_titanic()

In [6]:
# Recode the survived column from 0/1 to the labels 'no'/'yes'

survival_labels = {0: 'no', 1: 'yes'}
df['survived'] = df['survived'].replace(survival_labels)

In [7]:
# check columns to make sure changes have been made

df.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third
0,no,male,7.25,Third,Southampton,0,0,1,1,0,1
1,yes,female,71.2833,First,Cherbourg,0,0,0,0,0,0
2,yes,female,7.925,Third,Southampton,1,0,1,0,0,1
3,yes,female,53.1,First,Southampton,0,0,1,0,0,0
4,no,male,8.05,Third,Southampton,1,0,1,1,0,1


In [8]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = train_validate_test_split(df, seed=123)
train.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third
583,no,male,40.125,First,Cherbourg,1,0,0,1,0,0
165,yes,male,20.525,Third,Southampton,0,0,1,1,0,1
50,no,male,39.6875,Third,Southampton,0,0,1,1,0,1
259,yes,female,26.0,Second,Southampton,0,0,1,0,1,0
306,yes,female,110.8833,First,Cherbourg,1,0,0,0,0,0


In [9]:
# Setup our X inputs and y target variable for each split.
# Columns that are never model inputs: the target plus the raw text columns
# that already have dummy-encoded counterparts.
non_feature_cols = ['survived', 'sex', 'embark_town', 'class']
# Helper column that a later cell adds to `train`. Dropping it when present
# keeps this cell safe to re-run after that cell has executed (otherwise the
# constant string column would leak into X_train).
helper_cols = ['most_frequent']

X_train = train.drop(columns=non_feature_cols).drop(columns=helper_cols, errors='ignore')
y_train = train.survived  # labeled data == supervised algorithm

X_validate = validate.drop(columns=non_feature_cols).drop(columns=helper_cols, errors='ignore')
y_validate = validate.survived

X_test = test.drop(columns=non_feature_cols).drop(columns=helper_cols, errors='ignore')
y_test = test.survived

In [10]:
# check training data

train.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third
583,no,male,40.125,First,Cherbourg,1,0,0,1,0,0
165,yes,male,20.525,Third,Southampton,0,0,1,1,0,1
50,no,male,39.6875,Third,Southampton,0,0,1,1,0,1
259,yes,female,26.0,Second,Southampton,0,0,1,0,1,0
306,yes,female,110.8833,First,Cherbourg,1,0,0,0,0,0


In [11]:
df.groupby('survived').sex.value_counts()

survived  sex   
no        male      468
          female     81
yes       female    233
          male      109
Name: sex, dtype: int64

In [12]:
train.groupby('survived').sex.value_counts()

survived  sex   
no        male      265
          female     42
yes       female    133
          male       58
Name: sex, dtype: int64

In [13]:
# Create a new, unfitted Decision Tree model.
# max_depth is set explicitly; random_state fixes the seed.

clf = DecisionTreeClassifier(random_state=123, max_depth=3)


In [14]:
# Train (fit) the model on the training data.
# fit() hands back the estimator itself, so ending the cell with the call
# both trains clf and displays it.
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=123)

In [15]:
train.survived.unique()

array(['no', 'yes'], dtype=object)

In [16]:
clf.classes_

array(['no', 'yes'], dtype=object)

In [17]:
# Baseline: label every training row with the most common survival class.
# The mode is computed from the data rather than hard-coded, and assign()
# returns a new frame so the slice produced by the split is not written to
# in place.
train = train.assign(most_frequent=train.survived.mode()[0])

In [38]:
# Visualize the model so it can explain itself!

dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=X_train.columns,
    class_names=clf.classes_,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

# Render to a PDF; view=True also asks graphviz to open the result.
graph.render('titanic_decision_tree', format="pdf", view=True)

'titanic_decision_tree.pdf'

In [19]:
# Predict the survival class for every row of the training set,
# then peek at the first three predictions.
y_pred = clf.predict(X_train)
y_pred[:3]

array(['no', 'no', 'no'], dtype=object)

In [20]:
# Estimate the probabilities for each class
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:3]

array([[0.69827586, 0.30172414],
       [0.69827586, 0.30172414],
       [0.69827586, 0.30172414]])

In [21]:
y_train.head(3)

583     no
165    yes
50      no
Name: survived, dtype: object

In [22]:

# Share of training rows whose actual label matches the constant baseline prediction.
baseline_hits = train.survived == train.most_frequent
baseline_accuracy = baseline_hits.mean()
baseline_accuracy

0.6164658634538153

In [23]:
# Let's evaluate the model: mean accuracy on the data it was trained on
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.82


In [24]:
# Precision / recall / f1 / support for the in-sample (training) predictions.
# The predictions are recomputed here instead of read from the shared `y_pred`
# name, which a later cell rebinds to the validate-set predictions. That made
# this cell fail with a length mismatch when re-run out of order.
print(classification_report(y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

          no       0.83      0.90      0.86       307
         yes       0.81      0.70      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



In [25]:
# clf was trained on X_train, y_train
# To evaluate the model on new (out-of-sample) data, pass the validate split into .score()
clf.score(X_validate, y_validate)

0.7850467289719626

In [26]:
# Let's evaluate this model on out-of-sample data
validate_accuracy = clf.score(X_validate, y_validate)
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'.format(validate_accuracy))

Accuracy of Decision Tree classifier on validate set: 0.79


In [27]:
# Use the classification model trained on train data to make predictions on validate data.
# Note that this rebinds y_pred, which until now held the training-set predictions.
y_pred = clf.predict(X_validate)
y_pred[:3]

array(['no', 'no', 'no'], dtype=object)

In [28]:
y_validate.head(3)

610    no
424    no
568    no
Name: survived, dtype: object

In [29]:
# Compare actual y values from validate to predictions based on X_validate.
# Predicting inside the call keeps this report tied to the validate set no
# matter what the shared `y_pred` name currently holds (an earlier cell
# assigns it the training-set predictions).
print(classification_report(y_validate, clf.predict(X_validate)))

              precision    recall  f1-score   support

          no       0.80      0.87      0.83       132
         yes       0.76      0.65      0.70        82

    accuracy                           0.79       214
   macro avg       0.78      0.76      0.77       214
weighted avg       0.78      0.79      0.78       214



In [30]:
# NOTE: despite the name, this keeps the rows where sex_male is above 0.5
# (sex_male == 1), i.e. the MALE passengers. The name is left unchanged
# because the following cells reference it.
not_male = train[train.sex_male > 0.5]

In [31]:
# Of the rows selected above, keep those with fare at or below 18.275
# (presumably a split threshold read off the rendered tree — TODO confirm).
fare = not_male[not_male.fare <= 18.275]

In [32]:
fare.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third,most_frequent
883,no,male,10.5,Second,Southampton,1,0,1,1,1,0,no
459,no,male,7.75,Third,Queenstown,1,1,0,1,0,1,no
338,yes,male,8.05,Third,Southampton,1,0,1,1,0,1,no
77,no,male,8.05,Third,Southampton,1,0,1,1,0,1,no
722,no,male,13.0,Second,Southampton,1,0,1,1,1,0,no


In [33]:
# Keeps the rows whose `alone` flag is at or below 0.5 (alone == 0).
# NOTE: the variable name is misleading; presumably these are the passengers
# who were NOT travelling alone. Confirm the flag's meaning against the data source.
alone = fare[fare.alone <=.5]

In [34]:
alone.survived.value_counts()

no     16
yes     4
Name: survived, dtype: int64

In [35]:
# Visualize the model so it can explain itself!
# (Same export and render calls as the earlier visualization cell; the output
# is written to the same file name.)

tree_export_options = dict(
    feature_names=X_train.columns,
    rounded=True,
    filled=True,
    out_file=None,
    class_names=clf.classes_,
)
dot_data = export_graphviz(clf, **tree_export_options)
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'