In [1]:
import acquire
import prepare

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
df = acquire.get_titanic_data()
df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [6]:
train, val, test = prepare.prep_titanic_data(df)

In [8]:
X_train = train.drop(columns=['survived', 'sex', 'embark_town'])
y_train = train[['survived']]

X_val = val.drop(columns=['survived', 'sex', 'embark_town'])
y_val = val[['survived']]

X_test = test.drop(columns=['survived', 'sex', 'embark_town'])
y_test = test[['survived']]

In [9]:
X_train

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,1,36.000000,0,0,40.1250,1,1,0,0
165,3,9.000000,0,2,20.5250,0,1,0,1
50,3,7.000000,4,1,39.6875,0,1,0,1
259,2,50.000000,0,1,26.0000,0,0,0,1
306,1,29.678105,0,0,110.8833,1,0,0,0
...,...,...,...,...,...,...,...,...,...
313,3,28.000000,0,0,7.8958,1,1,0,1
636,3,32.000000,0,0,7.9250,1,1,0,1
222,3,51.000000,0,0,8.0500,1,1,0,1
485,3,29.678105,3,1,25.4667,0,0,0,1


In [10]:
y_train

Unnamed: 0,survived
583,0
165,1
50,0
259,1
306,1
...,...
313,0
636,0
222,0
485,0


## Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [11]:
rf = RandomForestClassifier(
    min_samples_leaf=1, 
    max_depth=10, 
    random_state=13
)

In [13]:
rf.fit(X_train, y_train)

  rf.fit(X_train, y_train)


RandomForestClassifier(max_depth=10, random_state=13)

In [15]:
y_preds = rf.predict(X_train)

## Evaluate your results using the model score, confusion matrix, and classification report.

In [16]:
rf.score(X_train, y_train)

0.963855421686747

In [17]:
confusion_matrix(y_train, y_preds)

array([[307,   0],
       [ 18, 173]])

In [19]:
print(classification_report(y_train, y_preds))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       307
           1       1.00      0.91      0.95       191

    accuracy                           0.96       498
   macro avg       0.97      0.95      0.96       498
weighted avg       0.97      0.96      0.96       498



### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [22]:
TN, FP, FN, TP = confusion_matrix(y_train,y_preds).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN, ALL

(173, 307, 0, 18, 498)

In [23]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

Accuracy: 0.963855421686747
True Positive Rate: 0.9057591623036649
False Positive Rate: 0.0
True Negative Rate: 1.0
False Negative Rate: 0.09424083769633508
Precision: 1.0
Recall: 0.9057591623036649
F1 Score: 0.9505494505494505
Support (0): 191
Support (1): 307


## Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [24]:
model_prediction = y_train.copy()

for i in range(1, 6):
    for j in range(10, 5, -1):
        rf = RandomForestClassifier(
            min_samples_leaf=i, 
            max_depth=j, 
            random_state=13
        )
        
        rf.fit(X_train, y_train)
        
        curr_preds = rf.predict(X_train)
        
        model_prediction[f'msl_{i}_md_{j}'] = curr_preds

  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)


In [30]:
model_prediction

Unnamed: 0,survived,msl_1_md_10,msl_1_md_9,msl_1_md_8,msl_1_md_7,msl_1_md_6,msl_2_md_10,msl_2_md_9,msl_2_md_8,msl_2_md_7,...,msl_4_md_10,msl_4_md_9,msl_4_md_8,msl_4_md_7,msl_4_md_6,msl_5_md_10,msl_5_md_9,msl_5_md_8,msl_5_md_7,msl_5_md_6
583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
165,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
50,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
259,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
306,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
636,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
222,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [34]:
actuals = model_prediction.survived
preds = model_prediction.drop(columns = 'survived')

for column in preds.columns:
    
    accuracy = (actuals == preds[column]).mean()
    
    print(f'{column} accuracy: {accuracy}')

msl_1_md_10 accuracy: 0.963855421686747
msl_1_md_9 accuracy: 0.9578313253012049
msl_1_md_8 accuracy: 0.9457831325301205
msl_1_md_7 accuracy: 0.9196787148594378
msl_1_md_6 accuracy: 0.9016064257028112
msl_2_md_10 accuracy: 0.9257028112449799
msl_2_md_9 accuracy: 0.9196787148594378
msl_2_md_8 accuracy: 0.9156626506024096
msl_2_md_7 accuracy: 0.9096385542168675
msl_2_md_6 accuracy: 0.8835341365461847
msl_3_md_10 accuracy: 0.9036144578313253
msl_3_md_9 accuracy: 0.9036144578313253
msl_3_md_8 accuracy: 0.8955823293172691
msl_3_md_7 accuracy: 0.8955823293172691
msl_3_md_6 accuracy: 0.8755020080321285
msl_4_md_10 accuracy: 0.8955823293172691
msl_4_md_9 accuracy: 0.891566265060241
msl_4_md_8 accuracy: 0.8875502008032129
msl_4_md_7 accuracy: 0.8815261044176707
msl_4_md_6 accuracy: 0.8714859437751004
msl_5_md_10 accuracy: 0.8815261044176707
msl_5_md_9 accuracy: 0.8815261044176707
msl_5_md_8 accuracy: 0.8815261044176707
msl_5_md_7 accuracy: 0.8775100401606426
msl_5_md_6 accuracy: 0.87148594377510