In [1]:
import numpy as np
import pandas as pd
from pydataset import data
import env as env
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix
import acquire as acq
import prepare as prep
import matplotlib.pyplot as plt
import seaborn as sns
from termcolor import colored
import os
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier


from io import StringIO

Load Dataset

In [2]:
titanic = acq.get_titanic_data()
titanic = acq.prep_titanic(titanic)

csv file found and loaded


Prep dataset

In [3]:
X = titanic.drop(columns= 'survived')
Y = titanic.survived

X = pd.DataFrame(X)
Y = pd.DataFrame(Y)

X_train, X_validate, X_test, y_train, y_validate, y_test = acq.train_validate_test_split(X, Y)

In [4]:
X_train = pd.DataFrame(X_train)
X_validate = pd.DataFrame(X_validate)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)
y_test = pd.DataFrame(y_test)

In [5]:
X_train.head()

Unnamed: 0,pclass,sibsp,parch,fare,alone
455,3,0,0,7.8958,1
380,1,0,0,227.525,1
492,1,0,0,30.5,1
55,1,0,0,35.5,1
243,3,0,0,7.125,1


In [6]:
y_train.head()

Unnamed: 0,survived
455,1
380,1
492,0
55,1
243,0


In [7]:
y_train.value_counts()

survived
0           329
1           205
dtype: int64

In [8]:
baseline_accuracy = 329/534
baseline_accuracy

0.6161048689138576

In [9]:
rf = RandomForestClassifier()
rf

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [18]:
rf = RandomForestClassifier(random_state = 123, min_samples_leaf = 1, max_depth = 10)
rf

In [23]:
rf.fit(X_train, y_train)

  rf.fit(X_train, y_train)


In [24]:
rf.score(X_train, y_train)

0.8632958801498127

Evaluate your results using the model score, confusion matrix, and classification report.


In [25]:
rf.score(X_train, y_train)

0.8632958801498127

In [27]:
y_pred = rf.predict(X_train)
y_pred[:5]

array([0, 1, 1, 1, 0])

In [28]:
confusion_matrix(y_train, y_pred)

array([[317,  12],
       [ 61, 144]])

In [30]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.96      0.90       329
           1       0.92      0.70      0.80       205

    accuracy                           0.86       534
   macro avg       0.88      0.83      0.85       534
weighted avg       0.87      0.86      0.86       534



Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [32]:
labels = sorted(y_train.survived.unique())
labels

[0, 1]

In [33]:
TN, FP, FN, TP = conf.ravel()
TN, FP, FN, TP

(317, 12, 65, 140)

In [34]:
all_ = (TP + TN + FP + FN)

accuracy = (TP + TN) / all_

TPR = recall = TP / (TP + FN)
FPR = FP / (FP + TN)

TNR = TN / (FP + TN)
FNR = FN / (FN + TP)

precision = TP / (TP + FP)
f1 = 2 * ((precision * recall) / (precision + recall))

support_pos = TP + FN
support_neg = FP + TN

In [35]:
print(f"Accuracy: {accuracy}\n")
print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
print(f"False Negative Rate/Miss Rate: {FNR}\n")
print(f"Precision/PPV: {precision}")
print(f"F1 Score: {f1}\n")
print(f"Support (0): {support_pos}")
print(f"Support (1): {support_neg}")

Accuracy: 0.8558052434456929

True Positive Rate/Sensitivity/Recall/Power: 0.6829268292682927
False Positive Rate/False Alarm Ratio/Fall-out: 0.0364741641337386
True Negative Rate/Specificity/Selectivity: 0.9635258358662614
False Negative Rate/Miss Rate: 0.3170731707317073

Precision/PPV: 0.9210526315789473
F1 Score: 0.7843137254901961

Support (0): 205
Support (1): 329


Run through steps increasing your min_samples_leaf and decreasing your max_depth.


In [36]:
rf = RandomForestClassifier(random_state = 123, min_samples_leaf = 3, max_depth = 7)
rf

In [37]:
rf.fit(X_train, y_train)

  rf.fit(X_train, y_train)


In [38]:
y_pred = rf.predict(X_train)
y_pred[:5]

array([0, 1, 1, 0, 0])

In [39]:
confusion_matrix(y_train, y_pred)

array([[301,  28],
       [ 90, 115]])

In [40]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.91      0.84       329
           1       0.80      0.56      0.66       205

    accuracy                           0.78       534
   macro avg       0.79      0.74      0.75       534
weighted avg       0.78      0.78      0.77       534



In [41]:
conf = confusion_matrix(y_train, y_pred)
conf

array([[301,  28],
       [ 90, 115]])

In [42]:
TN, FP, FN, TP = conf.ravel()
TN, FP, FN, TP

(301, 28, 90, 115)

In [43]:
all_ = (TP + TN + FP + FN)

accuracy = (TP + TN) / all_

TPR = recall = TP / (TP + FN)
FPR = FP / (FP + TN)

TNR = TN / (FP + TN)
FNR = FN / (FN + TP)

precision = TP / (TP + FP)
f1 = 2 * ((precision * recall) / (precision + recall))

support_pos = TP + FN
support_neg = FP + TN

In [44]:
print(f"Accuracy: {accuracy}\n")
print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
print(f"False Negative Rate/Miss Rate: {FNR}\n")
print(f"Precision/PPV: {precision}")
print(f"F1 Score: {f1}\n")
print(f"Support (0): {support_pos}")
print(f"Support (1): {support_neg}")

Accuracy: 0.7790262172284644

True Positive Rate/Sensitivity/Recall/Power: 0.5609756097560976
False Positive Rate/False Alarm Ratio/Fall-out: 0.0851063829787234
True Negative Rate/Specificity/Selectivity: 0.9148936170212766
False Negative Rate/Miss Rate: 0.43902439024390244

Precision/PPV: 0.8041958041958042
F1 Score: 0.6609195402298851

Support (0): 205
Support (1): 329


What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

