In [1]:
from pydataset import data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from acquire import get_titanic_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import \
accuracy_score,\
recall_score,\
precision_score,\
confusion_matrix,\
classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = get_titanic_data('titanic_db')
df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [3]:
def train_val_test(df, strat, seed = 123):
    train, val_test = train_test_split(df, train_size = 0.8, random_state = seed, stratify = df[strat])
    val, test = train_test_split(val_test, train_size = 0.5, random_state = seed, stratify = val_test[strat])
    return train, val, test

In [4]:
train, validate, test = train_val_test(df, 'survived')

In [5]:
train = train.drop(columns=['embark_town', 'class', 'deck', 'age'])
validate = validate.drop(columns=['embark_town', 'class', 'deck', 'age'])
test = test.drop(columns=['embark_town', 'class', 'deck', 'age'])


In [6]:
train.survived.value_counts()

survived
0    439
1    273
Name: count, dtype: int64

In [7]:
train['baseline_prediction'] = 0
baseline_accuracy = (train.baseline_prediction == train.survived).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')


baseline accuracy: 61.66%


In [8]:
validate['baseline_prediction'] = 0
baseline_accuracy = (validate.baseline_prediction == validate.survived).mean()

test['baseline_prediction'] = 0
baseline_accuracy = (test.baseline_prediction == test.survived).mean()

In [9]:
target = 'survived'

X_train = train.drop(columns=['survived', 'sex', 'embarked', 'passenger_id'])
y_train = train[target]

X_validate = validate.drop(columns=['survived', 'sex', 'embarked', 'passenger_id'])
X_test = test.drop(columns=['survived', 'sex', 'embarked', 'passenger_id'])


y_validate = validate[target]
y_test = test[target]

In [10]:
RandomForestClassifier?

[0;31mInit signature:[0m
[0mRandomForestClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'gini'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;34m'sqrt'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbootstrap[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[

In [11]:
# Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) 
# setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.
rf = RandomForestClassifier(min_samples_leaf = 1, max_depth = 10)
rf.fit(X_train, y_train)

In [12]:
# Evaluate your results using the model score, confusion matrix, and classification report.
rf.score(X_train, y_train)


0.8370786516853933

In [13]:
model_1_preds = rf.predict(X_train)
confusion_matrix(y_train, model_1_preds)

array([[408,  31],
       [ 85, 188]])

In [14]:
print(classification_report(y_train, model_1_preds))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       439
           1       0.86      0.69      0.76       273

    accuracy                           0.84       712
   macro avg       0.84      0.81      0.82       712
weighted avg       0.84      0.84      0.83       712



In [15]:
rf.score(X_validate, y_validate)

0.6292134831460674

In [16]:
model_1_val_preds = rf.predict(X_validate)
confusion_matrix(y_validate, model_1_val_preds)

array([[39, 16],
       [17, 17]])

In [17]:
print(classification_report(y_validate, model_1_val_preds))

              precision    recall  f1-score   support

           0       0.70      0.71      0.70        55
           1       0.52      0.50      0.51        34

    accuracy                           0.63        89
   macro avg       0.61      0.60      0.61        89
weighted avg       0.63      0.63      0.63        89



In [18]:
rf.score(X_test, y_test)

0.7

In [19]:
model_1_test_preds = rf.predict(X_test)
confusion_matrix(y_test, model_1_test_preds)

array([[42, 13],
       [14, 21]])

In [20]:
print(classification_report(y_test, model_1_test_preds))

              precision    recall  f1-score   support

           0       0.75      0.76      0.76        55
           1       0.62      0.60      0.61        35

    accuracy                           0.70        90
   macro avg       0.68      0.68      0.68        90
weighted avg       0.70      0.70      0.70        90



In [21]:
# Print and clearly label the following: Accuracy, true positive rate, false positive rate,
#  true negative rate, false negative rate, precision, recall, f1-score, and support.
def compute_metrics(TN,FP,FN,TP):
    all_ = (TP + TN + FP + FN)

    accuracy = (TP + TN) / all_

    TPR = recall = TP / (TP + FN)
    FPR = FP / (FP + TN)

    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)

    precision =  TP / (TP + FP)
    f1 =  2 * ((precision * recall) / ( precision + recall))

    support_pos = TP + FN
    support_neg = FP + TN

    print(f"Accuracy: {accuracy}\n")
    print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
    print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
    print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
    print(f"False Negative Rate/Miss Rate: {FNR}\n")
    print(f"Precision/PPV: {precision}")
    print(f"F1 Score: {f1}\n")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")


In [22]:
confu = confusion_matrix(y_train, model_1_preds)
TN, FP, FN, TP = confu.ravel()
TN, FP, FN, TP

(408, 31, 85, 188)

In [23]:
compute_metrics(TN, FP, FN, TP)

Accuracy: 0.8370786516853933

True Positive Rate/Sensitivity/Recall/Power: 0.6886446886446886
False Positive Rate/False Alarm Ratio/Fall-out: 0.07061503416856492
True Negative Rate/Specificity/Selectivity: 0.929384965831435
False Negative Rate/Miss Rate: 0.31135531135531136

Precision/PPV: 0.8584474885844748
F1 Score: 0.7642276422764228

Support (0): 273
Support (1): 439


In [24]:
# Run through steps increasing your min_samples_leaf and decreasing your max_depth.
rf1 = RandomForestClassifier(min_samples_leaf = 3, max_depth = 5)
rf1.fit(X_train, y_train)


In [25]:
rf1.score(X_train, y_train)

0.7570224719101124

In [26]:
model_1_preds = rf1.predict(X_train)
confusion_matrix(y_train, model_1_preds)

array([[398,  41],
       [132, 141]])

In [27]:
print(classification_report(y_train, model_1_preds))

              precision    recall  f1-score   support

           0       0.75      0.91      0.82       439
           1       0.77      0.52      0.62       273

    accuracy                           0.76       712
   macro avg       0.76      0.71      0.72       712
weighted avg       0.76      0.76      0.74       712



In [28]:
rf1.score(X_validate, y_validate)

0.651685393258427

In [29]:
model_1_val_preds = rf1.predict(X_validate)
confusion_matrix(y_validate, model_1_val_preds)

array([[41, 14],
       [17, 17]])

In [30]:
print(classification_report(y_validate, model_1_val_preds))

              precision    recall  f1-score   support

           0       0.71      0.75      0.73        55
           1       0.55      0.50      0.52        34

    accuracy                           0.65        89
   macro avg       0.63      0.62      0.62        89
weighted avg       0.65      0.65      0.65        89



In [31]:
rf1.score(X_test, y_test)

0.7333333333333333

In [32]:
model_1_test_preds = rf1.predict(X_test)
confusion_matrix(y_test, model_1_test_preds)

array([[48,  7],
       [17, 18]])

In [35]:
print(classification_report(y_test, model_1_test_preds))

              precision    recall  f1-score   support

           0       0.74      0.87      0.80        55
           1       0.72      0.51      0.60        35

    accuracy                           0.73        90
   macro avg       0.73      0.69      0.70        90
weighted avg       0.73      0.73      0.72        90



In [36]:
confu = confusion_matrix(y_train, model_1_preds)
TN, FP, FN, TP = confu.ravel()
TN, FP, FN, TP

(398, 41, 132, 141)

In [37]:
compute_metrics(TN, FP, FN, TP)

Accuracy: 0.7570224719101124

True Positive Rate/Sensitivity/Recall/Power: 0.5164835164835165
False Positive Rate/False Alarm Ratio/Fall-out: 0.09339407744874716
True Negative Rate/Specificity/Selectivity: 0.9066059225512528
False Negative Rate/Miss Rate: 0.4835164835164835

Precision/PPV: 0.7747252747252747
F1 Score: 0.6197802197802198

Support (0): 273
Support (1): 439


In [None]:
# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

# the second random forest has a lower accuracy, lower recall, higher false-positive rate, lower true-negative rate, 
# higher false-negative rate, lower precision, and lower f1

# the second one has less overfitting due to less max depth, leading to less data poisoning and lower accuracy

In [None]:
# After making a few models, which one has the best performance (or closest metrics) on both train and validate?

# the second random forest had less disparity between test and validate than the first, so it performs better 
