In [2]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd


from pydataset import data

import acquire
import prepare

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [2]:
titanic = prepare.prep_titanic()
titanic

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,1,0,7.2500,0,1,0,1
1,1,1,1,1,0,71.2833,0,0,0,0
2,2,1,3,0,0,7.9250,1,0,0,1
3,3,1,1,1,0,53.1000,0,0,0,1
4,4,0,3,0,0,8.0500,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,0,0,13.0000,1,1,0,1
887,887,1,1,0,0,30.0000,1,0,0,1
888,888,0,3,1,2,23.4500,0,0,0,1
889,889,1,1,0,0,30.0000,1,1,0,0


In [3]:
baseline = 549 / (549 + 342)
baseline

0.6161616161616161

In [4]:
train, validate, test = prepare.split_data(titanic, 'survived')

In [5]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [55]:
rf = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=10,
                            max_depth=10,
                            random_state=823)

In [56]:
rf.fit(X_train, y_train)

In [57]:
print(rf.feature_importances_)

[0.25334803 0.07793831 0.0572877  0.05443736 0.24297186 0.013274
 0.26503044 0.00749547 0.02821682]


In [58]:
y_pred = rf.predict(X_train)
y_pred[0:10]

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0])

In [59]:
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba

array([[0.45878788, 0.54121212],
       [0.112     , 0.888     ],
       [0.012     , 0.988     ],
       ...,
       [0.1       , 0.9       ],
       [0.9       , 0.1       ],
       [0.02142857, 0.97857143]])

Evaluate your results using the model score, confusion matrix, and classification report.

In [60]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [61]:
print(confusion_matrix(y_train, y_pred))

[[323   6]
 [  9 196]]


In [62]:
labels = sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,323,6
1,9,196


In [63]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       329
           1       0.97      0.96      0.96       205

    accuracy                           0.97       534
   macro avg       0.97      0.97      0.97       534
weighted avg       0.97      0.97      0.97       534



Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

Accuracy = $\frac{TP+TN}{TP+TN+FP+FN}$ = $\frac{515}{534}$ = 0.9644194756554307

True Positive Rate = $\frac{TP}{TP+TN+FP+FN}$ = $\frac{328}{534}$ = 0.6142322097378277

True Negative Rate = $\frac{TN}{TP+TN+FP+FN}$ = $\frac{187}{534}$ = 0.350187265917603

False Positive Rate = $\frac{FP}{TP+TN+FP+FN}$ = $\frac{18}{534}$ = 0.033707865168539325

False Negative Rate = $\frac{FN}{TP+TN+FP+FN}$ = $\frac{1}{534}$ = 0.0018726591760299626

Precision = $\frac{TP}{Predicted Positive}$ = $\frac{TP}{TP+FP}$ = $\frac{328}{346}$ = 0.9479768786127167

Recall/Sensitivity = $\frac{TP}{Actual Positive}$ = $\frac{TP}{TP+FN}$ = $\frac{328}{329}$ = 0.9969604863221885

F1-Score =  $\frac{1}{2*(\frac{1}{Precision}+\frac{1}{Recall})}$ = $\frac{2 * Precision * Recall}{Precision + Recall}$ = $\frac{2* \frac{296}{368} * \frac{296}{329}}{\frac{296}{368} + \frac{296}{329}}$  = 0.9718518518518519

Support = 329 for pos 205 for negative 534 for the test.

Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [74]:
rf2 = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=10,
                            max_depth=2,
                            random_state=823)

In [75]:
rf2.fit(X_train, y_train)

In [76]:
print(rf2.feature_importances_)
y_pred = rf2.predict(X_train)
y_pred[0:10]

[0.         0.08816828 0.12458945 0.07085037 0.24414779 0.02273863
 0.44713566 0.         0.00236981]


array([0, 1, 1, 0, 1, 0, 1, 1, 0, 0])

In [77]:
y_pred_proba = rf2.predict_proba(X_train)
y_pred_proba

array([[0.82833001, 0.17166999],
       [0.34887232, 0.65112768],
       [0.29148988, 0.70851012],
       ...,
       [0.50489954, 0.49510046],
       [0.40200401, 0.59799599],
       [0.34887232, 0.65112768]])

In [78]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.80


In [79]:
print(confusion_matrix(y_train, y_pred))

[[305  24]
 [ 85 120]]


In [80]:
labels = sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,305,24
1,85,120


In [81]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       329
           1       0.83      0.59      0.69       205

    accuracy                           0.80       534
   macro avg       0.81      0.76      0.77       534
weighted avg       0.80      0.80      0.79       534



What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

The low leaf count and high max depth seem to perform much better because seemingly the higher the max depth the more each tree tends to hone in on the proper predictions while the low min leaves count allows the model to remain simple for each tree.

In [None]:
After making a few models, which one has the best performance (or closest metrics) on both train and validate?