In [10]:
#importing standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#importing libraries for modeling
from sklearn.model_selection import train_test_split
from prepare import tts
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from env import get_connection
import acquire

In [2]:
# function to get the titanic data from my acquire file

def prep_titanic():
    titan=acquire.get_titanic()
    titan.drop(columns=['passenger_id', 'embarked', 'deck', 'age', 'class'], inplace=True)
    dummy_var=pd.get_dummies(titan[['embark_town', 'sex']], drop_first=True)
    titan=pd.concat([titan, dummy_var], axis=1)
    return titan

In [3]:
#assigning the titanic data to a dataframe

df=prep_titanic()

df.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,male,1,0,7.25,Southampton,0,0,1,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,1,0
3,1,1,female,1,0,53.1,Southampton,0,0,1,0
4,0,3,male,0,0,8.05,Southampton,1,0,1,1


In [4]:
df.dtypes

survived                     int64
pclass                       int64
sex                         object
sibsp                        int64
parch                        int64
fare                       float64
embark_town                 object
alone                        int64
embark_town_Queenstown       uint8
embark_town_Southampton      uint8
sex_male                     uint8
dtype: object

In [5]:
#dropping columns that have been split into dummy variables

df = df.drop(columns=['embark_town', 'sex'])
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,1,0,7.25,0,0,1,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,1,0
3,1,1,1,0,53.1,0,0,1,0
4,0,3,0,0,8.05,1,0,1,1


In [6]:
#Adding another dummy variable for pclass and dropping the original column

df=pd.get_dummies(df, columns=['pclass'], drop_first=True)

df.head()

Unnamed: 0,survived,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,pclass_2,pclass_3
0,0,1,0,7.25,0,0,1,1,0,1
1,1,1,0,71.2833,0,0,0,0,0,0
2,1,0,0,7.925,1,0,1,0,0,1
3,1,1,0,53.1,0,0,1,0,0,0
4,0,0,0,8.05,1,0,1,1,0,1


In [7]:
#using the tts function from my prepare file to split into train, validate, test

t_train, t_val, t_test = tts(df)

stratify=survived


In [21]:
#assigning the X and y train variables for modeling

X_train = t_train.drop(columns=['survived'])
y_train = t_train['survived']

X_val = t_val.drop(columns=['survived'])
y_val = t_val['survived']

X_test = t_test.drop(columns=['survived'])
y_test = t_test['survived']

# Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.



In [22]:
#make the model

rf=RandomForestClassifier(min_samples_leaf=1,
                            max_depth=10, 
                            random_state=8675309)

In [23]:
#fit the model

rf.fit(X_train, y_train)

In [24]:
#make predictions

y_pred=rf.predict(X_train)

# Evaluate your results using the model score, confusion matrix, and classification report.



In [30]:
#model score

rf.score(X_train, y_train)

0.9317269076305221

In [42]:
#printing the classification report into a dataframe
#of our trained model against the predictions

traindf= pd.DataFrame(classification_report(y_train, y_pred, output_dict=True, 
                                   target_names=['died', 'survived']))
traindf.round(2)

Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.92,0.95,0.93,0.94,0.93
recall,0.97,0.86,0.93,0.92,0.93
f1-score,0.95,0.91,0.93,0.93,0.93
support,307.0,191.0,0.93,498.0,498.0


In [38]:
#dataframe of the confusion matrix (pos=survived, neg=died)

cmt= pd.DataFrame(data=confusion_matrix(y_train, y_pred), 
             columns= ['pred died', 'pred survived'], 
             index= ['actually died', 'actually survived'])
cmt

Unnamed: 0,pred died,pred survived
actually died,299,8
actually survived,26,165


# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [79]:
#assigning data results to variables

accuracy= traindf.iloc[1:]['accuracy'][0]
precision= traindf.iloc[0:2]['survived'][0]
recall= traindf.iloc[1:2]['survived'][0]
f1= traindf.iloc[2:3]['survived'][0]
support= traindf.iloc[3:4]['survived'][0]
tn, fp, fn, tp= confusion_matrix(y_train, y_pred).ravel()
neg= tn+fp
pos= fn+tp

In [88]:
#printing results using vairables

print(f'Accuracy is {accuracy.round(4)*100}%')
print(f'Precision is {precision.round(4)*100}%')
print(f'Recall is {recall.round(4)*100}%')
print(f'F1-score is {f1.round(4)*100}%')
print(f'Support is {support}')
print(f'True positive rate is {(tp/pos).round(4)*100}%')
print(f'False positive rate is {(fp/pos).round(4)*100}%')
print(f'True negative rate is {(tn/neg).round(4)*100}%')
print(f'False negative rate is {(fn/neg).round(3)*100}%')

Accuracy is 93.17%
Precision is 95.38%
Recall is 86.39%
F1-score is 90.66%
Support is 191.0
True positive rate is 86.39%
False positive rate is 4.19%
True negative rate is 97.39%
False negative rate is 8.5%


# Run through steps increasing your min_samples_leaf and decreasing your max_depth.



In [14]:
#for loop to run through max depth increase and print results

for i in range(1, 21):
    forest=RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=i, 
                            random_state=8675309)
    forest.fit(X_train, y_train)
    y_preds=forest.predict(X_train)
    report=classification_report(y_train, y_preds, output_dict=True)
    print(f'Max depth of {i}')
    print(pd.DataFrame(report))
    print()

Max depth of 1
                    0           1  accuracy   macro avg  weighted avg
precision    0.763959    0.942308  0.801205    0.853134      0.832362
recall       0.980456    0.513089  0.801205    0.746773      0.801205
f1-score     0.858773    0.664407  0.801205    0.761590      0.784227
support    307.000000  191.000000  0.801205  498.000000    498.000000

Max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.763959    0.942308  0.801205    0.853134      0.832362
recall       0.980456    0.513089  0.801205    0.746773      0.801205
f1-score     0.858773    0.664407  0.801205    0.761590      0.784227
support    307.000000  191.000000  0.801205  498.000000    498.000000

Max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.796247    0.920000  0.827309    0.858123      0.843710
recall       0.967427    0.602094  0.827309    0.784760      0.827309
f1-score     0.873529    0.727848  0.827309

In [90]:
#For loop to increase min samples leaf with constant depth

for i in range(1, 21):
    forest=RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=i,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=8675309)
    forest.fit(X_train, y_train)
    y_preds=forest.predict(X_train)
    report=classification_report(y_train, y_preds, output_dict=True)
    print(f'min samples leaf of {i}')
    print(pd.DataFrame(report))
    print()

min samples leaf of 1
                    0           1  accuracy   macro avg  weighted avg
precision    0.920000    0.953757  0.931727    0.936879      0.932947
recall       0.973941    0.863874  0.931727    0.918908      0.931727
f1-score     0.946203    0.906593  0.931727    0.926398      0.931011
support    307.000000  191.000000  0.931727  498.000000    498.000000

min samples leaf of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.880952    0.932099   0.89759    0.906526      0.900569
recall       0.964169    0.790576   0.89759    0.877373      0.897590
f1-score     0.920684    0.855524   0.89759    0.888104      0.895693
support    307.000000  191.000000   0.89759  498.000000    498.000000

min samples leaf of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.855882    0.898734  0.869478    0.877308      0.872317
recall       0.947883    0.743455  0.869478    0.845669      0.869478
f1-score     0.899536 

# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



In [94]:
# accuracy seems to increase and plateau as max depth increases
# accuracy slowly decreases as min sample leaves increases
# with max depth performing best at 10 and leaves at 1, it makes sense that they
# would perform best together at 5

In [93]:

forest=RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=8675309)
forest.fit(X_train, y_train)
y_preds=forest.predict(X_train)
report=classification_report(y_train, y_preds, output_dict=True)
print('min samples leaf of 5 and max depth 5')
print(pd.DataFrame(report))
print()

min samples leaf of 5 and max depth 5
                    0           1  accuracy   macro avg  weighted avg
precision    0.816156    0.899281  0.839357    0.857718      0.848037
recall       0.954397    0.654450  0.839357    0.804424      0.839357
f1-score     0.879880    0.757576  0.839357    0.818728      0.832972
support    307.000000  191.000000  0.839357  498.000000    498.000000

