## Random Forest Exercises

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
from pydataset import data

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

from acquire import get_iris_data
from acquire import get_titanic_data
from acquire import get_telco_data
from prepare import split_data
import os
import acquire
from env import get_db_url

Create a new notebook, random_forests, and work with titanic data to do the following: 

In [10]:
titanic_df = get_titanic_data()
titanic_df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [11]:
def clean_titanic(df):

    df = df.drop(columns =['embark_town','class','deck'])

    df.embarked = df.embarked.fillna(value='S')

    dummy_df = pd.get_dummies(df[['sex','embarked']], drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    return df

In [13]:
titanic_df = clean_titanic(titanic_df)

KeyError: "['embark_town', 'class', 'deck'] not found in axis"

In [14]:
train, validate, test = split_data(titanic_df, col_to_stratify='survived')
train.shape, validate.shape, test.shape

((534, 13), (178, 13), (179, 13))

In [15]:
X_train = train.drop(columns=['survived', 'passenger_id', 'sex', 'embarked', 'age'])
y_train = train.survived

X_validate = validate.drop(columns=['survived', 'passenger_id', 'sex', 'embarked','age'])
y_validate = validate.survived

X_test = test.drop(columns=['survived', 'passenger_id', 'sex', 'embarked','age'])
y_test = test.survived

In [16]:
def establish_baseline(y_train):

    baseline_prediction = y_train.mode()

    y_train_pred = pd.Series((baseline_prediction[0]), range(len(y_train)))

    cm = confusion_matrix(y_train, y_train_pred)
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp+tn)/(tn+fp+fn+tp)
    return accuracy

In [17]:
establish_baseline(y_train)

0.6161048689138576

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [18]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [19]:
rf.fit(X_train, y_train)

In [20]:
print(rf.feature_importances_)

[0.11894513 0.05902933 0.06353089 0.38805589 0.02276681 0.30707546
 0.01661057 0.02398593]


In [21]:
y_pred = rf.predict(X_train)

In [22]:
y_pred_proba = rf.predict_proba(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [23]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))


Accuracy of random forest classifier on training set: 0.92


In [24]:
print(confusion_matrix(y_train, y_pred))

[[320   9]
 [ 35 170]]


In [25]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.90      0.97      0.94       329
           1       0.95      0.83      0.89       205

    accuracy                           0.92       534
   macro avg       0.93      0.90      0.91       534
weighted avg       0.92      0.92      0.92       534



3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [26]:
cm = confusion_matrix(y_train, y_pred)
cm

array([[320,   9],
       [ 35, 170]])

In [27]:
tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn)/(tn + fp + fn + tp)

true_positive_rate = tp/(tp + fn)
false_positive_rate = fp/(fp + tn)
true_negative_rate = tn/(tn + fp)
false_negative_rate = fn/(fn + tp)

precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1_score = 2*(precision*recall)/(precision+recall)

support_pos = tp + fn
support_neg = fp + tn

dict = {
    'metric' : ['accuracy'
                ,'true_positive_rate'
                ,'false_positive_rate'
                ,'true_negative_rate'
                ,'false_negative_rate'
                ,'precision'
                ,'recall'
                ,'f1_score'
                ,'support_pos'
                ,'support_neg']
    ,'score' : [accuracy
                ,true_positive_rate
                ,false_positive_rate
                ,true_negative_rate
                ,false_negative_rate
                ,precision
                ,recall
                ,f1_score
                ,support_pos
                ,support_neg]
}

pd.DataFrame(dict)


Unnamed: 0,metric,score
0,accuracy,0.917603
1,true_positive_rate,0.829268
2,false_positive_rate,0.027356
3,true_negative_rate,0.972644
4,false_negative_rate,0.170732
5,precision,0.949721
6,recall,0.829268
7,f1_score,0.885417
8,support_pos,205.0
9,support_neg,329.0


4. Run through steps increasing your min_samples_leaf and decreasing your max_depth. 

In [28]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=6, 
                            random_state=123)

In [29]:
rf.fit(X_train, y_train)

In [30]:
y_pred = rf.predict(X_train)

In [31]:
y_pred_proba = rf.predict_proba(X_train)

In [32]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))


Accuracy of random forest classifier on training set: 0.84


In [33]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.95      0.88       329
           1       0.89      0.68      0.77       205

    accuracy                           0.84       534
   macro avg       0.86      0.81      0.82       534
weighted avg       0.85      0.84      0.84       534



In [34]:
cm = confusion_matrix(y_train, y_pred)
cm

array([[311,  18],
       [ 66, 139]])

In [35]:
tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn)/(tn + fp + fn + tp)

true_positive_rate = tp/(tp + fn)
false_positive_rate = fp/(fp + tn)
true_negative_rate = tn/(tn + fp)
false_negative_rate = fn/(fn + tp)

precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1_score = 2*(precision*recall)/(precision+recall)

support_pos = tp + fn
support_neg = fp + tn

dict = {
    'metric' : ['accuracy'
                ,'true_positive_rate'
                ,'false_positive_rate'
                ,'true_negative_rate'
                ,'false_negative_rate'
                ,'precision'
                ,'recall'
                ,'f1_score'
                ,'support_pos'
                ,'support_neg']
    ,'score' : [accuracy
                ,true_positive_rate
                ,false_positive_rate
                ,true_negative_rate
                ,false_negative_rate
                ,precision
                ,recall
                ,f1_score
                ,support_pos
                ,support_neg]
}

pd.DataFrame(dict)


Unnamed: 0,metric,score
0,accuracy,0.842697
1,true_positive_rate,0.678049
2,false_positive_rate,0.054711
3,true_negative_rate,0.945289
4,false_negative_rate,0.321951
5,precision,0.88535
6,recall,0.678049
7,f1_score,0.767956
8,support_pos,205.0
9,support_neg,329.0


5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

The second model seems worse on every metric