In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,\
f1_score, precision_recall_fscore_support
import prepare

## 1.
Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [2]:
train, validate, test = prepare.wrangle_data('titanic')

In [4]:
X_cols = train.drop(columns=['sex','embark_town','survived']).columns.to_list()
y_cols = 'survived'

In [6]:
X_train = train[X_cols]
y_train = train[y_cols]

In [8]:
# make the random forest classifier object
# setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.
rf1 = RandomForestClassifier(max_depth=10, min_samples_leaf=1, random_state=123)
# fit the rf object
rf1.fit(X_train, y_train)
# use the rf object
model_1_preds = rf1.predict(X_train)

## 2.
Evaluate your results using the model score, confusion matrix, and classification report.

In [9]:
# model 1 score
print(f'model 1 accuracy is: {rf1.score(X_train, y_train):.2%}\n')

# confusion matrix for model 1
print(f'confusion matrix for model 1: \n{confusion_matrix(y_train, model_1_preds)}\n')

# classification report for model 1
print(f'classification report for model 1:\n\n \
{classification_report(y_train, model_1_preds)}')

model 1 accuracy is: 96.59%

confusion matrix for model 1: 
[[306   1]
 [ 16 175]]

classification report for model 1:

               precision    recall  f1-score   support

           0       0.95      1.00      0.97       307
           1       0.99      0.92      0.95       191

    accuracy                           0.97       498
   macro avg       0.97      0.96      0.96       498
weighted avg       0.97      0.97      0.97       498



## 3.
Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [10]:
# Accuracy
accuracy_model_1 = accuracy_score(y_train, model_1_preds)

# true positive rate
tp_model_1 = ((y_train == 1) & (model_1_preds == 1)).sum()

# false positive rate
fp_model_1 = ((y_train == 0) & (model_1_preds == 1)).sum()

# true negative rate
tn_model_1 = ((y_train == 0) & (model_1_preds == 0)).sum()

# false negative rate
fn_model_1 = ((y_train == 1) & (model_1_preds == 0)).sum()

# f1-score
f1_model_1 = f1_score(y_train, model_1_preds)

# precision, recall, and support.
precision_model_1, recall_model_1, _, support_model_1 = precision_recall_fscore_support(
    y_train, model_1_preds)

In [11]:
print(f'accuracy for model 1: \t{accuracy_model_1}')
print(f'true positives for model 1: \t{tp_model_1}')
print(f'false positives for model 1: \t{fp_model_1}')
print(f'true negatives for model 1: \t{tn_model_1}')
print(f'false negatives for model 1: \t{fn_model_1}')
print(f'f1_score for model 1: \t{f1_model_1}')
print(f'precision for model 1: \t{precision_model_1}')
print(f'recall for model 1: \t{recall_model_1}')
print(f'support for model 1: \t{support_model_1}')

accuracy for model 1: 	0.9658634538152611
true positives for model 1: 	175
false positives for model 1: 	1
true negatives for model 1: 	306
false negatives for model 1: 	16
f1_score for model 1: 	0.9536784741144414
precision for model 1: 	[0.95031056 0.99431818]
recall for model 1: 	[0.99674267 0.91623037]
support for model 1: 	[307 191]


## 4.
Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [39]:
[pair for pair in zip(range(1,10), range(10,1,-1))]

[(1, 10), (2, 9), (3, 8), (4, 7), (5, 6), (6, 5), (7, 4), (8, 3), (9, 2)]

In [40]:
rf_model_dict = {}
for i in [pair for pair in zip(range(1,10), range(10,1,-1))]:
    rf = RandomForestClassifier(min_samples_leaf=i[0],
                               max_depth=i[1])
    rf.fit(X_train, y_train)
    rf_model_dict[f'rf_{i[0]}'] = {}
    rf_model_dict[f'rf_{i[0]}']['model'] = rf
    rf_model_dict[f'rf_{i[0]}']['train_score'] = rf.score(X_train, y_train)
    rf_model_dict[f'rf_{i[0]}']['val_score'] = rf.score(X_val, y_val)
    rf_model_dict[f'rf_{i[0]}']['val_diff'] = \
    rf.score(X_train, y_train) - rf.score(X_val, y_val)

In [41]:
rf_model_dict

{'rf_1': {'model': RandomForestClassifier(max_depth=10),
  'train_score': 0.963855421686747,
  'val_score': 0.8037383177570093,
  'val_diff': 0.16011710392973766},
 'rf_2': {'model': RandomForestClassifier(max_depth=9, min_samples_leaf=2),
  'train_score': 0.9196787148594378,
  'val_score': 0.8084112149532711,
  'val_diff': 0.11126749990616669},
 'rf_3': {'model': RandomForestClassifier(max_depth=8, min_samples_leaf=3),
  'train_score': 0.8955823293172691,
  'val_score': 0.8037383177570093,
  'val_diff': 0.09184401156025979},
 'rf_4': {'model': RandomForestClassifier(max_depth=7, min_samples_leaf=4),
  'train_score': 0.8755020080321285,
  'val_score': 0.794392523364486,
  'val_diff': 0.08110948466764256},
 'rf_5': {'model': RandomForestClassifier(max_depth=6, min_samples_leaf=5),
  'train_score': 0.8775100401606426,
  'val_score': 0.7990654205607477,
  'val_diff': 0.0784446195998949},
 'rf_6': {'model': RandomForestClassifier(max_depth=5, min_samples_leaf=6),
  'train_score': 0.8654618

In [42]:
max_score = 0
for model in rf_model_dict:
    if rf_model_dict[model]['val_score'] > max_score:
        max_score = rf_model_dict[model]['val_score']
        model_name = model
model_name

'rf_6'

In [25]:
# make the random forest classifier object
# increasing your min_samples_leaf and decreasing your max_depth.
rf2 = RandomForestClassifier(max_depth=6, min_samples_leaf=4, random_state=123)
# fit the rf object
rf2.fit(X_train, y_train)
# use the rf object
model_2_preds = rf2.predict(X_train)

In [26]:
# model 2 score
print(f'model 2 accuracy is: {rf2.score(X_train, y_train):.2%}\n')

# confusion matrix for model 2
print(f'confusion matrix for model 2: \n{confusion_matrix(y_train, model_2_preds)}\n')

# classification report for model 2
print(f'classification report for model 2:\n\n \
{classification_report(y_train, model_2_preds)}')

model 2 accuracy is: 87.35%

confusion matrix for model 2: 
[[296  11]
 [ 52 139]]

classification report for model 2:

               precision    recall  f1-score   support

           0       0.85      0.96      0.90       307
           1       0.93      0.73      0.82       191

    accuracy                           0.87       498
   macro avg       0.89      0.85      0.86       498
weighted avg       0.88      0.87      0.87       498



In [27]:
# Accuracy
accuracy_model_2 = accuracy_score(y_train, model_2_preds)

# true positive rate
tp_model_2 = ((y_train == 1) & (model_2_preds == 1)).sum()

# false positive rate
fp_model_2 = ((y_train == 0) & (model_2_preds == 1)).sum()

# true negative rate
tn_model_2 = ((y_train == 0) & (model_2_preds == 0)).sum()

# false negative rate
fn_model_2 = ((y_train == 1) & (model_2_preds == 0)).sum()

# f1-score
f1_model_2 = f1_score(y_train, model_2_preds)

# precision, recall, and support.
precision_model_2, recall_model_2, _, support_model_2 = precision_recall_fscore_support(
    y_train, model_2_preds)

In [28]:
print(f'accuracy for model 2: \t{accuracy_model_2}')
print(f'true positives for model 2: \t{tp_model_2}')
print(f'false positives for model 2: \t{fp_model_2}')
print(f'true negatives for model 2: \t{tn_model_2}')
print(f'false negatives for model 2: \t{fn_model_2}')
print(f'f1_score for model 2: \t{f1_model_2}')
print(f'precision for model 2: \t{precision_model_2}')
print(f'recall for model 2: \t{recall_model_2}')
print(f'support for model 2: \t{support_model_2}')

accuracy for model 2: 	0.8734939759036144
true positives for model 2: 	139
false positives for model 2: 	11
true negatives for model 2: 	296
false negatives for model 2: 	52
f1_score for model 2: 	0.81524926686217
precision for model 2: 	[0.85057471 0.92666667]
recall for model 2: 	[0.96416938 0.72774869]
support for model 2: 	[307 191]


In [29]:
# make the random forest classifier object
# increasing your min_samples_leaf and decreasing your max_depth.
rf3 = RandomForestClassifier(max_depth=5, min_samples_leaf=5, random_state=123)
# fit the rf object
rf3.fit(X_train, y_train)
# use the rf object
model_3_preds = rf3.predict(X_train)

In [30]:
# model 3 score
print(f'model 3 accuracy is: {rf3.score(X_train, y_train):.2%}\n')

# confusion matrix for model 3
print(f'confusion matrix for model 3: \n{confusion_matrix(y_train, model_3_preds)}\n')

# classification report for model 3
print(f'classification report for model 3:\n\n \
{classification_report(y_train, model_3_preds)}')

model 3 accuracy is: 86.35%

confusion matrix for model 3: 
[[290  17]
 [ 51 140]]

classification report for model 3:

               precision    recall  f1-score   support

           0       0.85      0.94      0.90       307
           1       0.89      0.73      0.80       191

    accuracy                           0.86       498
   macro avg       0.87      0.84      0.85       498
weighted avg       0.87      0.86      0.86       498



In [31]:
# Accuracy
accuracy_model_3 = accuracy_score(y_train, model_3_preds)

# true positive rate
tp_model_3 = ((y_train == 1) & (model_3_preds == 1)).sum()

# false positive rate
fp_model_3 = ((y_train == 0) & (model_3_preds == 1)).sum()

# true negative rate
tn_model_3 = ((y_train == 0) & (model_3_preds == 0)).sum()

# false negative rate
fn_model_3 = ((y_train == 1) & (model_3_preds == 0)).sum()

# f1-score
f1_model_3 = f1_score(y_train, model_3_preds)

# precision, recall, and support.
precision_model_3, recall_model_3, _, support_model_3 = precision_recall_fscore_support(
    y_train, model_3_preds)

In [32]:
print(f'accuracy for model 3: \t{accuracy_model_3}')
print(f'true positives for model 3: \t{tp_model_3}')
print(f'false positives for model 3: \t{fp_model_3}')
print(f'true negatives for model 3: \t{tn_model_3}')
print(f'false negatives for model 3: \t{fn_model_3}')
print(f'f1_score for model 3: \t{f1_model_3}')
print(f'precision for model 3: \t{precision_model_3}')
print(f'recall for model 3: \t{recall_model_3}')
print(f'support for model 3: \t{support_model_3}')

accuracy for model 3: 	0.8634538152610441
true positives for model 3: 	140
false positives for model 3: 	17
true negatives for model 3: 	290
false negatives for model 3: 	51
f1_score for model 3: 	0.8045977011494254
precision for model 3: 	[0.85043988 0.89171975]
recall for model 3: 	[0.94462541 0.73298429]
support for model 3: 	[307 191]


## 5.
What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [33]:
# model 1 score
print(f'model 1 accuracy is: {rf1.score(X_train, y_train):.2%}\n')

# confusion matrix for model 1
print(f'confusion matrix for model 1: \n{confusion_matrix(y_train, model_1_preds)}\n')

# classification report for model 1
print(f'classification report for model 1:\n\n \
{classification_report(y_train, model_1_preds)}')

# model 2 score
print(f'model 2 accuracy is: {rf2.score(X_train, y_train):.2%}\n')

# confusion matrix for model 2
print(f'confusion matrix for model 2: \n{confusion_matrix(y_train, model_2_preds)}\n')

# classification report for model 2
print(f'classification report for model 2:\n\n \
{classification_report(y_train, model_2_preds)}')

# model 3 score
print(f'model 3 accuracy is: {rf3.score(X_train, y_train):.2%}\n')

# confusion matrix for model 3
print(f'confusion matrix for model 3: \n{confusion_matrix(y_train, model_3_preds)}\n')

# classification report for model 3
print(f'classification report for model 3:\n\n \
{classification_report(y_train, model_3_preds)}')

model 1 accuracy is: 96.59%

confusion matrix for model 1: 
[[306   1]
 [ 16 175]]

classification report for model 1:

               precision    recall  f1-score   support

           0       0.95      1.00      0.97       307
           1       0.99      0.92      0.95       191

    accuracy                           0.97       498
   macro avg       0.97      0.96      0.96       498
weighted avg       0.97      0.97      0.97       498

model 2 accuracy is: 87.35%

confusion matrix for model 2: 
[[296  11]
 [ 52 139]]

classification report for model 2:

               precision    recall  f1-score   support

           0       0.85      0.96      0.90       307
           1       0.93      0.73      0.82       191

    accuracy                           0.87       498
   macro avg       0.89      0.85      0.86       498
weighted avg       0.88      0.87      0.87       498

model 3 accuracy is: 86.35%

confusion matrix for model 3: 
[[290  17]
 [ 51 140]]

classification repor

The first model with the highest depth and lowest min_sample_leaf appears to be the best performing model based on the train data

## 6. 
After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [34]:
X_val = validate[X_cols]
y_val = validate[y_cols]

In [35]:
val_1_preds = rf1.predict(X_val)
val_2_preds = rf2.predict(X_val)
val_3_preds = rf3.predict(X_val)

In [36]:
# model 1 score
print(f'model 1 accuracy using the validation data is: {rf1.score(X_val, y_val):.2%}\n')

# confusion matrix for model 1
print(f'confusion matrix for model 1 using the validation data: \n{confusion_matrix(y_val, val_1_preds)}\n')

# classification report for model 1
print(f'classification report for model 1 using the validation data:\n\n \
{classification_report(y_val, val_1_preds)}')

# model 2 score
print(f'model 2 accuracy using the validation data is: {rf2.score(X_val, y_val):.2%}\n')

# confusion matrix for model 2
print(f'confusion matrix for model 2 using the validation data: \n{confusion_matrix(y_val, val_2_preds)}\n')

# classification report for model 2
print(f'classification report for model 2 using the validation data:\n\n \
{classification_report(y_val, val_2_preds)}')

# model 3 score
print(f'model 3 accuracy using the validation data is: {rf3.score(X_val, y_val):.2%}\n')

# confusion matrix for model 3
print(f'confusion matrix for model 3 using the validation data: \n{confusion_matrix(y_val, val_3_preds)}\n')

# classification report for model 3
print(f'classification report for model 3 using the validation data:\n\n \
{classification_report(y_val, val_3_preds)}')

model 1 accuracy using the validation data is: 79.44%

confusion matrix for model 1 using the validation data: 
[[110  22]
 [ 22  60]]

classification report for model 1 using the validation data:

               precision    recall  f1-score   support

           0       0.83      0.83      0.83       132
           1       0.73      0.73      0.73        82

    accuracy                           0.79       214
   macro avg       0.78      0.78      0.78       214
weighted avg       0.79      0.79      0.79       214

model 2 accuracy using the validation data is: 80.37%

confusion matrix for model 2 using the validation data: 
[[118  14]
 [ 28  54]]

classification report for model 2 using the validation data:

               precision    recall  f1-score   support

           0       0.81      0.89      0.85       132
           1       0.79      0.66      0.72        82

    accuracy                           0.80       214
   macro avg       0.80      0.78      0.78       214
wei

In [37]:
87.35 - 80.37

6.97999999999999

In [38]:
86.35 - 79.91

6.439999999999998

Model 3 with max_depth=5, min_samples_leaf=5 had the lowest drop in accuracy from the training data and the validation data.