In [2]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

## Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

- What is your baseline prediction? <b> Predict that every passenger did not survive (survived = "no"), the most frequent class in the training data. </b>

- What is your baseline accuracy? <b> 61.6%</b> remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

- Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample) <b> complete</b>

- Evaluate your in-sample results using the model score, confusion matrix, and classification report. <b> complete</b>

- Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support. <b>complete, see classification report</b>

- Run through steps 2-4 using a different max_depth value. <b> complete, depth value of 3 works best</b> 

- Which model performs better on your in-sample data? <b> model with depth of 3</b>

- Which model performs best on your out-of-sample data, the validate set? <b> The model with a depth of 3</b>

In [4]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where that exact
# file exists; consider a path relative to the notebook or a configurable data directory.
df = pd.read_csv('/Users/davidberchelmann/codeup-data-science/classification-exercises/titanic_df.csv')

def handle_missing_values(df):
    """Return a copy of ``df`` with the embarkation nulls filled.

    Missing ``embark_town`` values become 'Other' and missing ``embarked``
    values become 'O'. The input frame is left untouched.
    """
    town_filled = df.embark_town.fillna('Other')
    port_filled = df.embarked.fillna('O')
    return df.assign(embark_town=town_filled, embarked=port_filled)

def remove_columns(df):
    """Return a copy of ``df`` without the ``deck`` column."""
    unwanted = ['deck']
    return df.drop(labels=unwanted, axis=1)

def encode_embarked(df):
    """Return a copy of ``df`` with an added ``embarked_encode`` column.

    The new column holds the output of a LabelEncoder that is fitted on, and
    then applied to, the ``embarked`` column.
    """
    port_codes = LabelEncoder().fit(df.embarked).transform(df.embarked)
    return df.assign(embarked_encode=port_codes)

def prep_titanic_data(df):
    """Run the preparation steps on ``df`` in order and return the result.

    Steps: fill missing embarkation values, drop the ``deck`` column, then
    add the label-encoded ``embarked_encode`` column.
    """
    steps = (handle_missing_values, remove_columns, encode_embarked)
    for step in steps:
        df = step(df)
    return df

def train_validate_test_split(df, seed=123):
    """Split ``df`` into train, validate and test frames, stratified on ``survived``.

    20% of the rows are held out as ``test``; 30% of the remaining rows become
    ``validate`` and the rest is ``train``. ``seed`` is passed as
    ``random_state`` to both splits.
    """
    remainder, test = train_test_split(
        df, test_size=0.2, random_state=seed, stratify=df.survived
    )
    train, validate = train_test_split(
        remainder, test_size=0.3, random_state=seed, stratify=remainder.survived
    )
    return train, validate, test





In [5]:
# check out data columns first before cleaning. Get rid of 'Unnamed', 'passenger_id', 'pclass', 'age', 'sibsp', 'parch'
# rename survived column to 'yes' 'no' and use as target variable

df.head()


Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [6]:
# clean data followed by creating train/validate/test function

def clean_titanic(titanic_df=None):
    '''
    clean_titanic builds a cleaned copy of the raw titanic dataframe.

    titanic_df: dataframe to clean. When omitted, the notebook-level ``df`` is
        used, so existing ``clean_titanic()`` calls keep working.

    Steps:
    - drop duplicate rows
    - drop the columns listed in dropcols
    - fill missing embark_town values with 'Southampton'
    - append dummy columns (drop_first=True) for embark_town, sex and class

    The input dataframe is not modified; all work happens on a copy.

    return: single cleaned dataframe
    '''
    source = df if titanic_df is None else titanic_df
    dropcols = ['age', 'deck', 'embarked', 'Unnamed: 0', 'passenger_id', 'pclass', 'sibsp', 'parch']
    # Duplicates are removed before the id-like columns are dropped, matching the original order.
    cleaned = source.drop_duplicates().drop(columns=dropcols)
    cleaned = cleaned.assign(embark_town=cleaned['embark_town'].fillna('Southampton'))
    dummies = pd.get_dummies(cleaned[['embark_town', 'sex', 'class']], drop_first=True)
    return pd.concat([cleaned, dummies], axis=1)


def train_validate_test_split(df, seed=123, test_size=0.2, validate_size=0.3, target='survived'):
    '''
    train_validate_test_split splits df into three dataframes, stratifying
    both splits on the target column.

    df: dataframe to split; must contain the target column
    seed: random_state passed to both splits
    test_size: share of df held out as the test split (default 0.2)
    validate_size: share of the remaining rows used as the validate split (default 0.3)
    target: name of the column to stratify on (default 'survived')

    return: train, validate, test dataframes
    '''
    train_and_validate, test = train_test_split(
        df, test_size=test_size, random_state=seed, stratify=df[target]
    )
    train, validate = train_test_split(
        train_and_validate,
        test_size=validate_size,
        random_state=seed,
        stratify=train_and_validate[target],
    )
    return train, validate, test



In [7]:
# clean data using function from above

df = clean_titanic()

In [8]:
# recode the survived column as 'no'/'yes' labels in place of 0/1

df['survived'] = df['survived'].replace([0,1],['no', 'yes'])

In [9]:
# check columns to make sure changes have been made

df.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third
0,no,male,7.25,Third,Southampton,0,0,1,1,0,1
1,yes,female,71.2833,First,Cherbourg,0,0,0,0,0,0
2,yes,female,7.925,Third,Southampton,1,0,1,0,0,1
3,yes,female,53.1,First,Southampton,0,0,1,0,0,0
4,no,male,8.05,Third,Southampton,1,0,1,1,0,1


In [10]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = train_validate_test_split(df, seed=123)
train.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third
583,no,male,40.125,First,Cherbourg,1,0,0,1,0,0
165,yes,male,20.525,Third,Southampton,0,0,1,1,0,1
50,no,male,39.6875,Third,Southampton,0,0,1,1,0,1
259,yes,female,26.0,Second,Southampton,0,0,1,0,1,0
306,yes,female,110.8833,First,Cherbourg,1,0,0,0,0,0


In [11]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived', 'sex', 'embark_town', 'class'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived', 'sex', 'embark_town', 'class'])
y_validate = validate.survived

X_test = test.drop(columns=['survived', 'sex', 'embark_town', 'class'])
y_test = test.survived

In [12]:
# check training data

train.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third
583,no,male,40.125,First,Cherbourg,1,0,0,1,0,0
165,yes,male,20.525,Third,Southampton,0,0,1,1,0,1
50,no,male,39.6875,Third,Southampton,0,0,1,1,0,1
259,yes,female,26.0,Second,Southampton,0,0,1,0,1,0
306,yes,female,110.8833,First,Cherbourg,1,0,0,0,0,0


In [13]:
df.groupby('survived').sex.value_counts()

survived  sex   
no        male      468
          female     81
yes       female    233
          male      109
Name: sex, dtype: int64

In [14]:
train.groupby('survived').sex.value_counts()

survived  sex   
no        male      265
          female     42
yes       female    133
          male       58
Name: sex, dtype: int64

In [15]:
# Let's generate a blank, new Decision Tree model
# Be sure to set the max_depth argument
# clf = DecisionTreeClassifier(max_depth=3, random_state=123)

clf = DecisionTreeClassifier(max_depth=3, random_state=123)


In [16]:
# Now let's train our model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=3, random_state=123)

In [17]:
train.survived.unique()

array(['no', 'yes'], dtype=object)

In [18]:
clf.classes_

array(['no', 'yes'], dtype=object)

In [19]:
# Baseline prediction: the most frequent class in the training target,
# computed from the data instead of being hard-coded.
train["most_frequent"] = train.survived.mode()[0]

In [20]:
train['most_frequent'].value_counts()

no    498
Name: most_frequent, dtype: int64

In [21]:
# Visualize the model so it can explain itself!

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None, class_names=clf.classes_)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [22]:
# Now we'll make a set of predictions using this trained model
y_pred = clf.predict(X_train)
y_pred[0:3]

array(['no', 'no', 'no'], dtype=object)

In [23]:
# Estimate the probabilities for each class
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:3]

array([[0.69827586, 0.30172414],
       [0.69827586, 0.30172414],
       [0.69827586, 0.30172414]])

In [24]:
y_train.head(3)

583     no
165    yes
50      no
Name: survived, dtype: object

In [25]:

baseline_accuracy = (train.survived == train.most_frequent).mean()
baseline_accuracy 

0.6164658634538153

In [26]:
# Let's evaluate the model
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [27]:
accuracy = clf.score(X_train, y_train)
accuracy

0.8232931726907631

In [28]:
conf = confusion_matrix(y_train, y_pred)
conf

array([[276,  31],
       [ 57, 134]])

In [29]:
class_report = classification_report(y_train, y_pred, output_dict=True)

In [30]:
pd.DataFrame(class_report).T

Unnamed: 0,precision,recall,f1-score,support
no,0.828829,0.899023,0.8625,307.0
yes,0.812121,0.701571,0.752809,191.0
accuracy,0.823293,0.823293,0.823293,0.823293
macro avg,0.820475,0.800297,0.807654,498.0
weighted avg,0.822421,0.823293,0.82043,498.0


In [31]:
conf_df = pd.DataFrame(conf, columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])

conf_df

Unnamed: 0,predict_death,predict_survive
actual_death,276,31
actual_survive,57,134


In [32]:
# clf was trained on X_train, y_train
# To evaluate it on data it has not seen, pass the validate split to .score()
clf.score(X_validate, y_validate)

0.7850467289719626

In [33]:
# Let's evaluate this model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.79


In [34]:
# Use the classification model trained on train data to make predictions on validate data
y_pred = clf.predict(X_validate)
y_pred[0:3]

array(['no', 'no', 'no'], dtype=object)

In [35]:
y_validate.head(3)

610    no
424    no
568    no
Name: survived, dtype: object

In [36]:
# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

          no       0.80      0.87      0.83       132
         yes       0.76      0.65      0.70        82

    accuracy                           0.79       214
   macro avg       0.78      0.76      0.77       214
weighted avg       0.78      0.79      0.78       214



In [37]:
# NOTE: despite the variable name, `sex_male > 0.5` keeps the rows where sex_male is 1,
# i.e. the male passengers.
not_male = train[train.sex_male > 0.5]

In [38]:
fare = not_male[not_male.fare <= 18.275]

In [39]:
fare.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third,most_frequent
883,no,male,10.5,Second,Southampton,1,0,1,1,1,0,no
459,no,male,7.75,Third,Queenstown,1,1,0,1,0,1,no
338,yes,male,8.05,Third,Southampton,1,0,1,1,0,1,no
77,no,male,8.05,Third,Southampton,1,0,1,1,0,1,no
722,no,male,13.0,Second,Southampton,1,0,1,1,1,0,no


In [40]:
alone = fare[fare.alone <=.5]

In [41]:
alone.survived.value_counts()

no     16
yes     4
Name: survived, dtype: int64

In [42]:
# Visualize the model so it can explain itself!

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None, class_names=clf.classes_)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [43]:
# run new model with depth of 2
clf1 = DecisionTreeClassifier(max_depth=2, random_state=123)


In [44]:
# Now let's train our model on the training data
# fitting == training the model
clf1 = clf1.fit(X_train, y_train)
clf1

DecisionTreeClassifier(max_depth=2, random_state=123)

In [45]:
y_pred = clf1.predict(X_train)
y_pred[0:3]

array(['no', 'no', 'no'], dtype=object)

In [46]:
# Estimate the probabilities for each class
y_pred_proba = clf1.predict_proba(X_train)
y_pred_proba[0:3]

array([[0.68644068, 0.31355932],
       [0.68644068, 0.31355932],
       [0.68644068, 0.31355932]])

In [47]:
# clf1 was trained on X_train, y_train
# To evaluate it on data it has not seen, pass the validate split to .score()
clf1.score(X_validate, y_validate)

0.7616822429906542

In [48]:
# Let's evaluate this model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf1.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.76


In [49]:
#----------------------------------------------------------------------

from sklearn.ensemble import RandomForestClassifier

### Random Forest Exercises

- Continue working in your model file with titanic data to do the following:

- Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

- Evaluate your results using the model score, confusion matrix, and classification report.

- Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

- Run through steps increasing your min_samples_leaf and decreasing your max_depth.

- What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

- After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [69]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [70]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [71]:
print(rf.feature_importances_)

[0.48108242 0.03828706 0.01592272 0.02759923 0.33183925 0.02041746
 0.08485187]


In [72]:
y_pred = rf.predict(X_train)

In [73]:
y_pred_proba = rf.predict_proba(X_train)

In [74]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.94


In [75]:
print(confusion_matrix(y_train, y_pred))

[[302   5]
 [ 27 164]]


In [76]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

          no       0.92      0.98      0.95       307
         yes       0.97      0.86      0.91       191

    accuracy                           0.94       498
   macro avg       0.94      0.92      0.93       498
weighted avg       0.94      0.94      0.93       498



In [77]:
# The score below is computed on the validate split, so the message says so.
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.77


In [78]:
# create a function to calculate these metrics
def get_metrics_binary(rf, X=None, y=None):
    '''
    get_metrics_binary takes in a fitted binary classifier (rf), prints its
    accuracy and its true/false positive/negative rates, and returns its
    classification report.

    rf: fitted classifier exposing .predict() and .score()
    X: feature dataframe to evaluate on; defaults to the notebook-level X_train
    y: true labels for X; defaults to the notebook-level y_train

    Predictions are generated from rf inside the function, so the metrics
    always describe the model that was passed in rather than whatever the
    notebook-level y_pred currently holds.

    Index 1 of the confusion matrix is treated as the positive class
    (confusion_matrix orders labels sorted, so 'yes' for the 'no'/'yes'
    target used in this notebook).

    return: a classification report as a transposed DataFrame
    '''
    features = X_train if X is None else X
    labels = y_train if y is None else y
    predictions = rf.predict(features)
    accuracy = rf.score(features, labels)
    class_report = pd.DataFrame(classification_report(labels, predictions, output_dict=True)).T
    conf = confusion_matrix(labels, predictions)
    # Rows are actual classes, columns are predicted classes.
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report
    

In [79]:
get_metrics_binary(rf)


    The accuracy for our model is 0.9357
    The True Positive Rate is 0.859, The False Positive Rate is 0.0163,
    The True Negative Rate is 0.984, and the False Negative Rate is 0.141
    


Unnamed: 0,precision,recall,f1-score,support
no,0.917933,0.983713,0.949686,307.0
yes,0.970414,0.858639,0.911111,191.0
accuracy,0.935743,0.935743,0.935743,0.935743
macro avg,0.944174,0.921176,0.930398,498.0
weighted avg,0.938061,0.935743,0.934891,498.0


In [80]:
#-------------------------------------

In [81]:
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=123)

In [82]:
rf2.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=3, random_state=123)

In [83]:
print(rf2.feature_importances_)

[0.26997225 0.04453345 0.01518327 0.03525965 0.4832382  0.02733744
 0.12447574]


In [84]:
y_pred = rf2.predict(X_train)

In [85]:
y_pred_proba = rf2.predict_proba(X_train)

In [86]:
print(confusion_matrix(y_train, y_pred))

[[299   8]
 [ 70 121]]


In [88]:
# The score below is computed on the validate split, so the message says so.
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf2.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.78


In [87]:
get_metrics_binary(rf2)


    The accuracy for our model is 0.8434
    The True Positive Rate is 0.634, The False Positive Rate is 0.0261,
    The True Negative Rate is 0.974, and the False Negative Rate is 0.366
    


Unnamed: 0,precision,recall,f1-score,support
no,0.810298,0.973941,0.884615,307.0
yes,0.937984,0.633508,0.75625,191.0
accuracy,0.843373,0.843373,0.843373,0.843373
macro avg,0.874141,0.803725,0.820433,498.0
weighted avg,0.85927,0.843373,0.835383,498.0


In [90]:
# get predictions for our validation sets
y_val_pred_1 = rf.predict(validate.drop(columns=['survived', 'sex', 'embark_town', 'class']))
y_val_pred_2 = rf2.predict(validate.drop(columns=['survived', 'sex', 'embark_town', 'class']))

In [94]:
# get validation accuracy
accuracy_v_1 = rf.score(validate.drop(columns=['survived', 'sex', 'embark_town', 'class']), validate.survived)
accuracy_v_2 = rf2.score(validate.drop(columns=['survived', 'sex', 'embark_town', 'class']), validate.survived)

In [95]:
accuracy_v_1

0.7710280373831776

In [96]:
accuracy_v_2

0.780373831775701

- What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

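    <b>The first model (max_depth=10, min_samples_leaf=1) performed best on the in-sample data: 0.94 train accuracy versus 0.84 for the second model (max_depth=5, min_samples_leaf=3). Deeper trees with smaller leaves can fit the training data more closely, which is why its train score is higher.</b>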
    
- After making a few models, which one has the best performance (or closest metrics) on both train and validate?

    <b>The second model performed slightly better on validate (1 percentage point) while the first model had a higher performance on train data.</b>