1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from acquire import get_titanic_data
# This is the version of prepare included in the florence classification exercises repo:
from prepare import prep_titanic
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [3]:
# acquire the data
df = get_titanic_data()
# prepare the data
train, validate, test = prep_titanic(df)

Using cached csv


In [4]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [5]:
# drop out non-numerical columns or non-encoded version remaining in this data set
drops = ['sex', 'class','embarked', 'embark_town', 'passenger_id']

In [6]:
for dataset in [train, validate, test]:
    dataset.drop(columns=drops, inplace=True)

In [7]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Q,S
583,0,1,36.0,0,0,40.125,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0
50,0,3,7.0,4,1,39.6875,0,0,1
218,1,1,32.0,0,0,76.2917,1,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0


In [8]:
X_train, y_train = train.drop(columns='survived'), train[['survived']]

In [9]:
# Establish our baseline.  The rate at which the assumption of the majority class matches the real values.  If a model does not perform better than this, it would not be wise to deploy.
baseline = (y_train.value_counts().idxmax() == y_train).mean()
baseline

survived    0.617706
dtype: float64

In [10]:
# assuming everyone perished is roughly 62% accurate

In [11]:
# create the Random Forest Model
clf = RandomForestClassifier(min_samples_leaf=1, max_depth=10, random_state=1349)

In [12]:
# fit the thing
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=1349)

In [13]:
# use the thing
y_pred = clf.predict(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [14]:
clf_score = clf.score(X_train, y_train)
conf = confusion_matrix(y_train, y_pred)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
    The accuracy for our model is {clf_score:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))



    The accuracy for our model is 0.9276
    The True Positive Rate is 0.868, The False Positive Rate is 0.0358,
    The True Negative Rate is 0.964, and the False Negative Rate is 0.132
    


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.922118,0.9375,0.927565,0.929809,0.927999
recall,0.964169,0.868421,0.927565,0.916295,0.927565
f1-score,0.942675,0.901639,0.927565,0.922157,0.926987
support,307.0,190.0,0.927565,497.0,497.0


In [15]:
# target is survival, with a binary 0 representing a passenger that did not surivive the titanic wreck
# and 1 representing a survivor

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [16]:
# create the Random Forest Model
clf1 = RandomForestClassifier(min_samples_leaf=3, max_depth=3, random_state=1349)

In [17]:
# fit the model
clf1.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=1349)

In [18]:
y_pred1 = clf1.predict(X_train)
clf_score = clf1.score(X_train, y_train)
conf = confusion_matrix(y_train, y_pred1)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
    The accuracy for our model is {clf_score:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))


    The accuracy for our model is 0.7525
    The True Positive Rate is 0.537, The False Positive Rate is 0.114,
    The True Negative Rate is 0.886, and the False Negative Rate is 0.463
    


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.922118,0.9375,0.927565,0.929809,0.927999
recall,0.964169,0.868421,0.927565,0.916295,0.927565
f1-score,0.942675,0.901639,0.927565,0.922157,0.926987
support,307.0,190.0,0.927565,497.0,497.0


In [19]:
X_val, y_val = validate.drop(columns='survived'), validate.survived

In [20]:
print('Model #1: min samples 1, max depth 10: ON VALIDATE SET')
clf_score = clf.score(X_val, y_val)
y_pred_val = clf.predict(X_val)
conf = confusion_matrix(y_val, y_pred_val)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
    The accuracy for our model is {clf_score:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
print('-------------------------------------------\n Model #2: min samples 3, max_depth 3 : ON VALIDATE SET\n')
clf_score = clf1.score(X_val, y_val)
y_pred_val1 = clf1.predict(X_val)
conf = confusion_matrix(y_val, y_pred_val1)
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print(f'''
    The accuracy for our model is {clf_score:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')

Model #1: min samples 1, max depth 10: ON VALIDATE SET

    The accuracy for our model is 0.7617
    The True Positive Rate is 0.646, The False Positive Rate is 0.167,
    The True Negative Rate is 0.833, and the False Negative Rate is 0.354
    
-------------------------------------------
 Model #2: min samples 3, max_depth 3 : ON VALIDATE SET


    The accuracy for our model is 0.743
    The True Positive Rate is 0.5, The False Positive Rate is 0.106,
    The True Negative Rate is 0.894, and the False Negative Rate is 0.5
    


5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [21]:
conf = confusion_matrix(y_train, y_pred)

In [22]:
conf

array([[296,  11],
       [ 25, 165]])

In [23]:
# make a key for reference
rubric_df = pd.DataFrame([['true negative', 'false positive'],['false negative', 'true positive']], columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])

In [24]:
rubric_df

Unnamed: 0,predict_death,predict_survive
actual_death,true negative,false positive
actual_survive,false negative,true positive


5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [25]:
# accuracy:
# accuracy = (true positives + true negatives) / (true positives + true negatives + false positives + false negatives)

# True Positive Rate: Sensitivity
# RECALL for the positive class --> out of those that actually survived, how many did we predict would survive?
# TPR = true positives / (true positives + false negatives)
#  If we wanted to calculate PRECISION, it would be true positives / (true positives + false postives)
# Recall being true positives over the sum of the row, precision being the true positive over the sum of the column
# i.e, out of the values we predicted survived, how many were actual survivors?

# False Positive Rate: 
# FPR = false positives / (false positive + true negatives)

# True Negative Rate: Specificity
# Recall for the negative class --> out of those that perished, how many did we predict would not make it?
# TNR = true negatives / (true negatives + false positives)

# False negative rate:
# FNR = false negatives / (false negatives + true positives)

In [26]:
clf.predict_proba(X_train)

array([[0.70104785, 0.29895215],
       [0.06711869, 0.93288131],
       [0.9758    , 0.0242    ],
       [0.10064358, 0.89935642],
       [0.0443181 , 0.9556819 ],
       [0.68404055, 0.31595945],
       [0.47006918, 0.52993082],
       [0.67405366, 0.32594634],
       [0.45560234, 0.54439766],
       [1.        , 0.        ],
       [0.82134381, 0.17865619],
       [0.89806093, 0.10193907],
       [0.13108025, 0.86891975],
       [0.59453012, 0.40546988],
       [0.60759589, 0.39240411],
       [0.38816308, 0.61183692],
       [0.93794664, 0.06205336],
       [0.18171429, 0.81828571],
       [0.5230589 , 0.4769411 ],
       [0.78048702, 0.21951298],
       [0.13383776, 0.86616224],
       [0.82246706, 0.17753294],
       [0.2079232 , 0.7920768 ],
       [0.19833333, 0.80166667],
       [0.38900423, 0.61099577],
       [0.93237155, 0.06762845],
       [0.12313969, 0.87686031],
       [0.77758204, 0.22241796],
       [0.58227628, 0.41772372],
       [0.67053517, 0.32946483],
       [0.

In [27]:
my_preds = clf.predict_proba(X_train)[:,1]

In [28]:
my_preds < .7

array([ True, False,  True, False, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True, False,
        True,  True, False,  True, False, False,  True,  True, False,
        True,  True,  True,  True,  True,  True, False, False, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False, False,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False, False,  True,
        True,  True, False, False,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True, False,  True, False,
        True,  True, False, False,  True,  True,  True, False,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,