# Random Forest Exercises

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

## Acquire

In [2]:
# Acquire Step
df = acquire.get_titanic_data()
df.head(2)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0


## Prepare

In [3]:
# prepare the data
train, validate, test = prepare.prep_titanic_data(df)
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,0,1,male,36.0,0,0,40.125,Cherbourg,1,1,0,0
165,1,3,male,9.0,0,2,20.525,Southampton,0,1,0,1
50,0,3,male,7.0,4,1,39.6875,Southampton,0,1,0,1
259,1,2,female,50.0,0,1,26.0,Southampton,0,0,0,1
306,1,1,female,29.678105,0,0,110.8833,Cherbourg,1,0,0,0


In [4]:
X_train = train.drop(columns=['survived','sex','embark_town'])
y_train = train.survived

X_validate = validate.drop(columns=['survived','sex', 'embark_town'])
y_validate = validate.survived

X_test = test.drop(columns=['survived','sex', 'embark_town'])
y_test = test.survived

In [5]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,1,36.0,0,0,40.125,1,1,0,0
165,3,9.0,0,2,20.525,0,1,0,1
50,3,7.0,4,1,39.6875,0,1,0,1
259,2,50.0,0,1,26.0,0,0,0,1
306,1,29.678105,0,0,110.8833,1,0,0,0


In [6]:
# check the shape
X_train.shape, X_validate.shape, X_test.shape

((498, 9), (214, 9), (179, 9))

## Explore
* Think back to your Explore lesson and ask questions you want to know about the data
* Make decisions on which varaibles to keep and which to discard
* Accomplish feature engineering to provide the model with new variables

## Model

0. Baseline (mode of the target for classification problems)
1. Create Algorithm Object
2. Create Model by fitting algorithm to X_train, y_train
3. Run Model by predicting using X_train
4. Evaluate Model by comparing y_train_pred with y_train (actual values)
5. Run Model by predicting using X_validate
6. Evaluate Model by comparing y_validate_pred with y_validate (actual values)

In [8]:
#  establish the value we will predict for all observations
baseline_prediction = y_train.mode()

# create a series of predictions with that value, 
# the same length as our training set
y_train_pred = pd.Series([0]*len(y_train))

# compute accuracy of baseline
cm = confusion_matrix(y_train, y_train_pred)
tn, fp, fn, tp = cm.ravel()

accuracy = (tp+tn)/(tn+fp+fn+tp)
accuracy

0.6164658634538153

### Question 1

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [9]:
# Create the Algorithm Object
rf = RandomForestClassifier(min_samples_leaf=1, 
                            max_depth=10, 
                            random_state=123)

# Create the model by fitting the algorithm to X_train and y_train
rf.fit(X_train, y_train)

# Run the model on X_train to make predictions (y_train_pred)
y_train_pred = rf.predict(X_train)

### Question 2

Evaluate your results using the model score, confusion matrix, and classification report.

In [10]:
# model score (accuracy)
rf.score(X_train, y_train)

0.9698795180722891

In [11]:
# confusion matrix (y_train_pred, y_train)
cm = confusion_matrix(y_train, y_train_pred)
cm

array([[307,   0],
       [ 15, 176]])

In [13]:
# classification report
report = pd.DataFrame(classification_report(y_train, 
                                            y_train_pred, 
                                            output_dict=True))
report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.953416,1.0,0.96988,0.976708,0.971283
recall,1.0,0.921466,0.96988,0.960733,0.96988
f1-score,0.976153,0.959128,0.96988,0.96764,0.969623
support,307.0,191.0,0.96988,498.0,498.0


### Question 3

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [14]:
# extract tp, tn, fp, fn from the confusion matrix to use in calculations
tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn)/(tn + fp + fn + tp)
print(f"Accuracy: {accuracy}")

true_positive_rate = tp/(tp + fn)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = fp/(fp + tn)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = tn/(tn + fp)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = fn/(fn + tp)
print(f"False Negative Rate: {false_negative_rate}")

precision = tp/(tp + fp)
print(f"Precision: {precision}")

recall = tp/(tp + fn)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = tp + fn
print(f"Support (0): {support_pos}")

support_neg = fp + tn
print(f"Support (1): {support_neg}")

Accuracy: 0.9698795180722891
True Positive Rate: 0.9214659685863874
False Positive Rate: 0.0
True Negative Rate: 1.0
False Negative Rate: 0.07853403141361257
Precision: 1.0
Recall: 0.9214659685863874
F1 Score: 0.9591280653950953
Support (0): 191
Support (1): 307


### Question 4

Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [None]:
# create an empty list to append results to
metrics = []

for j in range (1, 10):
    for i in range(2, 10):
        rf = RandomForestClassifier(max_depth=i, 
                                    min_samples_leaf=j, 
                                    random_state=123)

        # Fit the model (on train and only train)
        rf = rf.fit(X_train, y_train)

        # We'll evaluate the model's performance on train, first
        in_sample_accuracy = rf.score(X_train, y_train)
    
        out_of_sample_accuracy = rf.score(X_validate, y_validate)

        output = {
            "min_samples_per_leaf": j,
            "max_depth": i,
            "train_accuracy": in_sample_accuracy,
            "validate_accuracy": out_of_sample_accuracy
        }
    
        metrics.append(output)

In [29]:
# create a df from metrics
df = pd.DataFrame(metrics)

# compute difference in accuracy between train and validate
df["difference"] = df.train_accuracy - df.validate_accuracy

# sort the df by validate_accuracy (descending) and take top 10
df.sort_values(by=['validate_accuracy'], ascending=False).head(10)

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
22,3,8,0.903614,0.827103,0.076512
15,2,9,0.915663,0.827103,0.08856
14,2,8,0.917671,0.817757,0.099914
5,1,7,0.933735,0.817757,0.115978
12,2,6,0.883534,0.813084,0.07045
31,4,9,0.891566,0.813084,0.078482
21,3,7,0.89759,0.808411,0.089179
38,5,8,0.885542,0.808411,0.077131
39,5,9,0.883534,0.808411,0.075123
23,3,9,0.901606,0.808411,0.093195


A min_samples_per_leaf of 3 and a max_depth of 8 performs best on out-of-sample dataset and has a reasonable difference between train and validate, and that indicates it is generalizable. 