In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

import matplotlib.pyplot as plt
import seaborn as sns

import env
import prepare
import acquire


# Titanic Dataset

In [3]:
# import titanic dataset, run our prepare functions, split into train/validate/test, and validate size of df's
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
tit_train, tit_validate, tit_test = prepare.my_split(df, target='survived')
tit_train.shape, tit_validate.shape, tit_test.shape

((426, 15), (143, 15), (143, 15))

In [4]:
tit_train.head(1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
318,1,1,female,31.0,0,2,164.8667,S,Southampton,0,1,0,0,0,1


## What is your baseline prediction?  What is your baseline accuracy? 

In [5]:
# find the most common condition for our target to determine baseline.  It's rate of occurance is the accuracy of 
# our baseline
print('The most common value (our baseline) is:',tit_train.survived.value_counts().idxmax())
print('The rate of occurance (our baseline accuracy) is:', len(tit_train[tit_train.survived == 0]) / len(tit_train) * 100)

The most common value (our baseline) is: 0
The rate of occurance (our baseline accuracy) is: 59.624413145539904


# Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [6]:
# split each of our datasets into X and y 

X_train = tit_train.drop(columns=['survived', 'sex', 'embarked', 'embark_town', 'sex_female', 'embark_town_Cherbourg'])
y_train = tit_train.survived

X_validate = tit_validate.drop(columns=['survived', 'sex', 'embarked', 'embark_town', 'sex_female', 'embark_town_Cherbourg'])
y_validate = tit_validate.survived

X_test = tit_test.drop(columns=['survived', 'sex', 'embarked', 'embark_town', 'sex_female', 'embark_town_Cherbourg'])
y_test = tit_test.survived

In [9]:
# create and fit the model
clf = DecisionTreeClassifier()

clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_train)
y_pred_prob = clf.predict_proba(X_train)

## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [10]:
# model score
print(f'training score: {clf.score(X_train, y_train):.2%}')

training score: 98.59%


In [12]:
# confusion matrix
cm = confusion_matrix(y_train, y_pred)
print(pd.DataFrame(cm))

     0    1
0  254    0
1    6  166


In [13]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       254
           1       1.00      0.97      0.98       172

    accuracy                           0.99       426
   macro avg       0.99      0.98      0.99       426
weighted avg       0.99      0.99      0.99       426



## Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [14]:
# accuracy
print('------- Model 1 -----------')
print('Accuracy score is: ', accuracy_score(y_train, y_pred))

------- Model 1 -----------
Accuracy score is:  0.9859154929577465


In [15]:
# true positive rate, false positive rate, true negative rate, false negative rate
tp = cm[0,0]
tn = cm[1,1]
fp = cm[0,1]
fn = cm[1,0]
print('------- Model 1 -----------')
print('True positive rate is:', tp/(tp+tn+fp+fn))
print('False positive rate is', fp/(tp+tn+fp+fn))
print('True negative rate is', tn/(tp+tn+fp+fn))
print('False negative rate is', fn/(tp+tn+fp+fn))

------- Model 1 -----------
True positive rate is: 0.596244131455399
False positive rate is 0.0
True negative rate is 0.38967136150234744
False negative rate is 0.014084507042253521


In [16]:
# precision, recall, f1-score and support
print('------- Model 1 -----------')
print('Precision is: ', precision_score(y_train, y_pred))
print('Recall is:', recall_score(y_train, y_pred))
print('f1 score is:', f1_score(y_train, y_pred))
print('Support is 0:', tp+fp)
print('           1:', tn+fn)

------- Model 1 -----------
Precision is:  1.0
Recall is: 0.9651162790697675
f1 score is: 0.9822485207100593
Support is 0: 254
           1: 172


## Run through steps 2-4 using a different max_depth value.

In [17]:
# changing max depth to 5
clf2 = DecisionTreeClassifier(max_depth=5)

clf2 = clf2.fit(X_train, y_train)

y_pred2 = clf2.predict(X_train)
y_pred_prob2 = clf2.predict_proba(X_train)

In [18]:
# model score
print('------- Model 2 -----------')
print(f'training score for model2: {clf2.score(X_train, y_train):.2%}')

------- Model 2 -----------
training score for model2: 84.74%


In [19]:
# confusion matrix
cm2 = confusion_matrix(y_train, y_pred2)
print(pd.DataFrame(cm2))

     0    1
0  248    6
1   59  113


In [20]:
# classification report
print('------- Model 2 -----------')
print(classification_report(y_train, y_pred2))

------- Model 2 -----------
              precision    recall  f1-score   support

           0       0.81      0.98      0.88       254
           1       0.95      0.66      0.78       172

    accuracy                           0.85       426
   macro avg       0.88      0.82      0.83       426
weighted avg       0.87      0.85      0.84       426



In [21]:
# accuracy
print('------- Model 2 -----------')
print('Accuracy score for model 2 is: ', accuracy_score(y_train, y_pred2))

------- Model 2 -----------
Accuracy score for model 2 is:  0.8474178403755869


In [22]:
# true positive rate, false positive rate, true negative rate, false negative rate
tp2 = cm2[0,0]
tn2 = cm2[1,1]
fp2 = cm2[0,1]
fn2 = cm2[1,0]

print('------- Model 2 -----------')
print('True positive rate is:', tp2/(tp2+tn2+fp2+fn2))
print('False positive rate is', fp2/(tp2+tn2+fp2+fn2))
print('True negative rate is', tn2/(tp2+tn2+fp2+fn2))
print('False negative rate is', fn2/(tp2+tn2+fp2+fn2))

------- Model 2 -----------
True positive rate is: 0.5821596244131455
False positive rate is 0.014084507042253521
True negative rate is 0.2652582159624413
False negative rate is 0.13849765258215962


In [23]:
# precision, recall, f1-score and support
print('------- Model 2 -----------')
print('Precision is: ', precision_score(y_train, y_pred2))
print('Recall is:', recall_score(y_train, y_pred2))
print('f1 score is:', f1_score(y_train, y_pred2))
print('Support is 0:', tp2+fp2)
print('           1:', tn2+fn2)

------- Model 2 -----------
Precision is:  0.9495798319327731
Recall is: 0.6569767441860465
f1 score is: 0.7766323024054982
Support is 0: 254
           1: 172


## Which model performs better on your in-sample data?

Model 1 performs better when evaluating performance against the in-sample (training) data

## Which model performs best on your out-of-sample data, the validate set?

In [20]:
print('------- Model 1 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))
print('')
print('------- Model 2 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf2.score(X_validate, y_validate)))


------- Model 1 -----------
Accuracy of Decision Tree classifier on validate set: 0.73

------- Model 2 -----------
Accuracy of Decision Tree classifier on validate set: 0.73


In [21]:
# classification report
y_pred = clf.predict(X_validate)
y_pred2 = clf2.predict(X_validate)

print('------- Model 1 -----------')
print(classification_report(y_validate, y_pred))

print('------- Model 2 -----------')
print(classification_report(y_validate, y_pred2))

------- Model 1 -----------
              precision    recall  f1-score   support

           0       0.78      0.78      0.78        85
           1       0.67      0.67      0.67        58

    accuracy                           0.73       143
   macro avg       0.72      0.72      0.72       143
weighted avg       0.73      0.73      0.73       143

------- Model 2 -----------
              precision    recall  f1-score   support

           0       0.79      0.75      0.77        85
           1       0.66      0.71      0.68        58

    accuracy                           0.73       143
   macro avg       0.73      0.73      0.73       143
weighted avg       0.74      0.73      0.74       143



Model 2 performs slightly better with a max depth of 5

# Work through these same exercises using the iris dataset.
    ## building two models simultaneously

In [24]:
# get and prepare iris data
iris = acquire.get_iris_data()
iris = prepare.prep_iris(iris)
i_train, i_validate, i_test = prepare.my_split(iris, target='species')
i_train.shape, i_validate.shape, i_test.shape

((90, 5), (30, 5), (30, 5))

In [43]:
# find the baseline and the baseline accuracy
baseline = i_train.species.value_counts().idxmax()
print('The most common value (our baseline) is:', baseline)
print('The rate of occurance (our baseline accuracy) is:', len(i_train[i_train.species == baseline]) / len(i_train) * 100)

The most common value (our baseline) is: versicolor
The rate of occurance (our baseline accuracy) is: 33.33333333333333


In [44]:
# split each of our datasets into X and y 

X_train = i_train.drop(columns=('species'))
y_train = i_train.species

X_validate = i_validate.drop(columns=('species'))
y_validate = i_validate.species

X_test = i_test.drop(columns='species')
y_test = i_test.species

In [45]:
# create and fit the models
clf1 = DecisionTreeClassifier(max_depth=8)
clf2 = DecisionTreeClassifier(max_depth=2)

clf1 = clf1.fit(X_train, y_train)
clf2 = clf2.fit(X_train, y_train)

y_pred1 = clf1.predict(X_train)
y_pred2 = clf2.predict(X_train)

y_pred_prob1 = clf1.predict_proba(X_train)
y_pred_prob2 = clf2.predict_proba(X_train)

In [46]:
# model score
print(f'Model 1 training score: {clf1.score(X_train, y_train):.2%}')
print(f'Model 2 training score: {clf2.score(X_train, y_train):.2%}')

Model 1 training score: 100.00%
Model 2 training score: 97.78%


In [47]:
# confusion matrix
cm1 = confusion_matrix(y_train, y_pred1)
cm2 = confusion_matrix(y_train, y_pred2)
print('----Model 1----')
print(pd.DataFrame(cm1))
print('')
print('----Model 2----')
print(pd.DataFrame(cm2))

----Model 1----
    0   1   2
0  30   0   0
1   0  30   0
2   0   0  30

----Model 2----
    0   1   2
0  30   0   0
1   0  28   2
2   0   0  30


In [48]:
# classification report
print(classification_report(y_train, y_pred1))
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        30
  versicolor       1.00      1.00      1.00        30
   virginica       1.00      1.00      1.00        30

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        30
  versicolor       1.00      0.93      0.97        30
   virginica       0.94      1.00      0.97        30

    accuracy                           0.98        90
   macro avg       0.98      0.98      0.98        90
weighted avg       0.98      0.98      0.98        90



## ...and comparing with validation data

In [49]:
print('------- Model 1 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf1.score(X_validate, y_validate)))
print('')
print('------- Model 2 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf2.score(X_validate, y_validate)))

------- Model 1 -----------
Accuracy of Decision Tree classifier on validate set: 0.93

------- Model 2 -----------
Accuracy of Decision Tree classifier on validate set: 0.90


In [50]:
# classification report
y_pred1 = clf1.predict(X_validate)
y_pred2 = clf2.predict(X_validate)

print('------- Model 1 -----------')
print(classification_report(y_validate, y_pred1))

print('------- Model 2 -----------')
print(classification_report(y_validate, y_pred2))

------- Model 1 -----------
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.80      0.89        10
   virginica       0.83      1.00      0.91        10

    accuracy                           0.93        30
   macro avg       0.94      0.93      0.93        30
weighted avg       0.94      0.93      0.93        30

------- Model 2 -----------
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.70      0.82        10
   virginica       0.77      1.00      0.87        10

    accuracy                           0.90        30
   macro avg       0.92      0.90      0.90        30
weighted avg       0.92      0.90      0.90        30



The first model performs better with max depth of 8 versus max depth of 2 for the second model

# Experiment with this model on other datasets with a higher number of output classes.


In [53]:
# The glass dataset contains 9 columns quantifying the contents of 9 different elements in a glass sample.  
# The 10th column is the class of the glass, and integer from 1-7.

glass = pd.read_csv('glass.csv')
glass.head()

Unnamed: 0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00.1,1
0,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
1,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
2,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
3,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1
4,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.0,0.26,1


In [54]:
glass.columns =['ri', 'na', 'mg', 'al', 'si', 'k','ca','ba','fe', 'class']

In [55]:
# split into train, validate, test
g_train, g_validate, g_test = prepare.my_split(glass, target='class')
g_train.shape, g_validate.shape, g_test.shape

((127, 10), (43, 10), (43, 10))

In [56]:
# find the baseline and the baseline accuracy
baseline = g_train['class'].value_counts().idxmax()
print('The most common value (our baseline) is:', baseline)
print('The rate of occurance (our baseline accuracy) is:', len(g_train[g_train['class'] == baseline]) / len(g_train) * 100)

The most common value (our baseline) is: 2
The rate of occurance (our baseline accuracy) is: 36.22047244094488


In [57]:
# split each of our datasets into X and y 

X_train = g_train.drop(columns=('class'))
y_train = g_train['class']

X_validate = g_validate.drop(columns=('class'))
y_validate = g_validate['class']

X_test = g_test.drop(columns='class')
y_test = g_test['class']

In [58]:
# create and fit the models
clf1 = DecisionTreeClassifier(max_depth=8)
clf2 = DecisionTreeClassifier(max_depth=4)

clf1 = clf1.fit(X_train, y_train)
clf2 = clf2.fit(X_train, y_train)

y_pred1 = clf1.predict(X_train)
y_pred2 = clf2.predict(X_train)

y_pred_prob1 = clf1.predict_proba(X_train)
y_pred_prob2 = clf2.predict_proba(X_train)

In [59]:
# model score
print(f'Model 1 training score: {clf1.score(X_train, y_train):.2%}')
print(f'Model 2 training score: {clf2.score(X_train, y_train):.2%}')

Model 1 training score: 96.06%
Model 2 training score: 78.74%


In [60]:
# confusion matrix
cm1 = confusion_matrix(y_train, y_pred1)
cm2 = confusion_matrix(y_train, y_pred2)
print('----Model 1----')
print(pd.DataFrame(cm1))
print('')
print('----Model 2----')
print(pd.DataFrame(cm2))

----Model 1----
    0   1  2  3  4   5
0  41   0  0  0  0   0
1   3  43  0  0  0   0
2   2   0  8  0  0   0
3   0   0  0  8  0   0
4   0   0  0  0  5   0
5   0   0  0  0  0  17

----Model 2----
    0   1  2  3  4   5
0  37   3  1  0  0   0
1  10  30  3  3  0   0
2   5   0  5  0  0   0
3   0   0  0  8  0   0
4   1   0  0  0  4   0
5   0   0  1  0  0  16


In [61]:
# classification report
print(classification_report(y_train, y_pred1))
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           1       0.89      1.00      0.94        41
           2       1.00      0.93      0.97        46
           3       1.00      0.80      0.89        10
           5       1.00      1.00      1.00         8
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00        17

    accuracy                           0.96       127
   macro avg       0.98      0.96      0.97       127
weighted avg       0.96      0.96      0.96       127

              precision    recall  f1-score   support

           1       0.70      0.90      0.79        41
           2       0.91      0.65      0.76        46
           3       0.50      0.50      0.50        10
           5       0.73      1.00      0.84         8
           6       1.00      0.80      0.89         5
           7       1.00      0.94      0.97        17

    accuracy                           0.79       127
   macro avg       0.81

In [62]:
print('------- Model 1 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf1.score(X_validate, y_validate)))
print('')
print('------- Model 2 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf2.score(X_validate, y_validate)))

------- Model 1 -----------
Accuracy of Decision Tree classifier on validate set: 0.74

------- Model 2 -----------
Accuracy of Decision Tree classifier on validate set: 0.70


In [63]:
# classification reports
y_pred1 = clf1.predict(X_validate)
y_pred2 = clf2.predict(X_validate)

print('------- Model 1 -----------')
print(classification_report(y_validate, y_pred1))

print('------- Model 2 -----------')
print(classification_report(y_validate, y_pred2))

------- Model 1 -----------
              precision    recall  f1-score   support

           1       0.80      0.86      0.83        14
           2       0.74      0.93      0.82        15
           3       0.00      0.00      0.00         4
           5       1.00      1.00      1.00         2
           6       0.00      0.00      0.00         2
           7       0.80      0.67      0.73         6

    accuracy                           0.74        43
   macro avg       0.56      0.58      0.56        43
weighted avg       0.68      0.74      0.70        43

------- Model 2 -----------
              precision    recall  f1-score   support

           1       0.73      0.79      0.76        14
           2       0.72      0.87      0.79        15
           3       0.00      0.00      0.00         4
           5       0.67      1.00      0.80         2
           6       0.00      0.00      0.00         2
           7       1.00      0.67      0.80         6

    accuracy         

The first model (max depth 8) performed better on the training set, but the second model (max depth 4)
performed better on the validation set

# Random Forest Exercises

### Continue working in your model file with titanic data to do the following:

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
train, validate, test = prepare.my_split(df, target='survived')
train.shape, validate.shape, test.shape

((426, 15), (143, 15), (143, 15))

In [4]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
543,1,2,male,32.0,1,0,26.0,S,Southampton,0,0,1,0,0,1
469,1,3,female,0.75,2,1,19.2583,C,Cherbourg,0,1,0,1,0,0
323,1,2,female,22.0,1,1,29.0,S,Southampton,0,1,0,0,0,1
228,0,2,male,18.0,0,0,13.0,S,Southampton,1,0,1,0,0,1
510,1,3,male,29.0,0,0,7.75,Q,Queenstown,1,0,1,0,1,0


In [11]:
# id columns for features in the model
features = ['pclass', 'age','alone','fare','sex_male']

In [12]:
# create X and y versions of our train/validate/test datasets
X_train = train[features]
y_train = train.survived

X_validate = validate[features]
y_validate = validate.survived

X_test = test[features]
y_test = test.survived

In [13]:
# create random forest
rf = RandomForestClassifier(min_samples_leaf=1,
                            max_depth=10, 
                            random_state=123)

In [14]:
# train random forest model
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [15]:
# make predictions
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)

### Evaluate your results using the model score, confusion matrix, and classification report.



In [16]:
# check accuracy
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.96


In [17]:
cm = confusion_matrix(y_train, y_pred)
print(cm)

[[254   0]
 [ 15 157]]


In [18]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       254
           1       1.00      0.91      0.95       172

    accuracy                           0.96       426
   macro avg       0.97      0.96      0.96       426
weighted avg       0.97      0.96      0.96       426



### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [26]:
# create function to output requested scoring of model

def model_scores(cm, min_leaf, max_depth):
    '''
    Function to get all model scores necessary for codeup exercises
    Accepts a confusion matrix, and prints a report with the following:
        Accuracy
        True positive rate
        False positive rate
        True negative rate
        False negative rate 
        Precision
        Recall
        f1-score
        positive support
        negative support
    '''
    
    TN = cm[0,0]
    FP = cm[0,1]
    FN = cm[1,0]
    TP = cm[1,1]
    ALL = TP + FP + FN + TN
    
    print('Model stats for Random Forest with:')
    print("")
    print('     min_samples_leaf =',min_leaf)
    print('        and max_depth =',max_depth)
    print("")

    # accuracy
    acc = (TP + TN) / ALL
    print('Accuracy: {:.2f}'.format(acc))
#     # true positive rate, also recall
#     TPR = recall = TP/ (TP + FN)
#     print('True Positive Rate: {:.2f}'.format(TPR))
#     # false positive rate
#     FPR = FP / (FP + TN)
#     print('False Positive Rate: {:.2f}'.format(FPR))
#     # true negative rate
#     TNR = TN / (TN + FP)
#     print('True Negative Rate: {:.2f}'.format(TNR))
#     # false negative rate
#     FNR = FN / (FN + TP)
#     print('Flase Negative Rate: {:.2f}'.format(FNR))
#     # precision
#     precision = TP / (TP + FP)
#     print('Precision: {:.2f}'.format(precision))
#     # recall
#     print('Recall: {:.2f}'.format(recall))
#     # f1
#     f1_score = 2 * (precision*recall) / (precision+recall)
#     print('f1 score: {:.2f}'.format(f1_score))
#     # support
#     support_pos = TP + FN
#     print('Positive support:',support_pos)
#     support_neg = FP + TN
#     print('Negative support:',support_neg)
#     print('-----------------------------------------')
    
model_scores(cm, 1, 10)

Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 10

Accuracy: 0.83


## Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [20]:
best_i = 0
best_j = 0
best_acc = 0

for i in range(1,11):
    for j in range(1,11):
        rf = RandomForestClassifier(min_samples_leaf=i,
                            max_depth=j, 
                            random_state=123)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_train)

        cm = confusion_matrix(y_train, y_pred)
        model_scores(cm, i, j)
        
        if rf.score(X_train, y_train) > best_acc:
            best_acc = rf.score(X_train, y_train)
            best_i = i
            best_j = j
            
print('best model had ', best_i,' for min_samples_leaf and', best_j, 'for max_depth')


Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 1

Accuracy: 0.78
True Positive Rate: 0.49
False Positive Rate: 0.02
True Negative Rate: 0.98
Flase Negative Rate: 0.51
Precision: 0.93
Recall: 0.49
f1 score: 0.65
Positive support: 172
Negative support: 254
-----------------------------------------
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 2

Accuracy: 0.79
True Positive Rate: 0.52
False Positive Rate: 0.02
True Negative Rate: 0.98
Flase Negative Rate: 0.48
Precision: 0.94
Recall: 0.52
f1 score: 0.67
Positive support: 172
Negative support: 254
-----------------------------------------
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 3

Accuracy: 0.83
True Positive Rate: 0.64
False Positive Rate: 0.04
True Negative Rate: 0.96
Flase Negative Rate: 0.36
Precision: 0.92
Recall: 0.64
f1 score: 0.76
Positive support: 172
Negative support: 254
-------------------------------

Model stats for Random Forest with:

     min_samples_leaf = 3
        and max_depth = 9

Accuracy: 0.88
True Positive Rate: 0.79
False Positive Rate: 0.06
True Negative Rate: 0.94
Flase Negative Rate: 0.21
Precision: 0.91
Recall: 0.79
f1 score: 0.84
Positive support: 172
Negative support: 254
-----------------------------------------
Model stats for Random Forest with:

     min_samples_leaf = 3
        and max_depth = 10

Accuracy: 0.88
True Positive Rate: 0.79
False Positive Rate: 0.06
True Negative Rate: 0.94
Flase Negative Rate: 0.21
Precision: 0.90
Recall: 0.79
f1 score: 0.84
Positive support: 172
Negative support: 254
-----------------------------------------
Model stats for Random Forest with:

     min_samples_leaf = 4
        and max_depth = 1

Accuracy: 0.78
True Positive Rate: 0.49
False Positive Rate: 0.02
True Negative Rate: 0.98
Flase Negative Rate: 0.51
Precision: 0.93
Recall: 0.49
f1 score: 0.65
Positive support: 172
Negative support: 254
------------------------------

Model stats for Random Forest with:

     min_samples_leaf = 6
        and max_depth = 7

Accuracy: 0.86
True Positive Rate: 0.76
False Positive Rate: 0.07
True Negative Rate: 0.93
Flase Negative Rate: 0.24
Precision: 0.88
Recall: 0.76
f1 score: 0.81
Positive support: 172
Negative support: 254
-----------------------------------------
Model stats for Random Forest with:

     min_samples_leaf = 6
        and max_depth = 8

Accuracy: 0.85
True Positive Rate: 0.73
False Positive Rate: 0.07
True Negative Rate: 0.93
Flase Negative Rate: 0.27
Precision: 0.88
Recall: 0.73
f1 score: 0.80
Positive support: 172
Negative support: 254
-----------------------------------------
Model stats for Random Forest with:

     min_samples_leaf = 6
        and max_depth = 9

Accuracy: 0.86
True Positive Rate: 0.75
False Positive Rate: 0.07
True Negative Rate: 0.93
Flase Negative Rate: 0.25
Precision: 0.88
Recall: 0.75
f1 score: 0.81
Positive support: 172
Negative support: 254
-------------------------------

Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 5

Accuracy: 0.83
True Positive Rate: 0.68
False Positive Rate: 0.07
True Negative Rate: 0.93
Flase Negative Rate: 0.32
Precision: 0.86
Recall: 0.68
f1 score: 0.76
Positive support: 172
Negative support: 254
-----------------------------------------
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 6

Accuracy: 0.84
True Positive Rate: 0.72
False Positive Rate: 0.07
True Negative Rate: 0.93
Flase Negative Rate: 0.28
Precision: 0.87
Recall: 0.72
f1 score: 0.78
Positive support: 172
Negative support: 254
-----------------------------------------
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 7

Accuracy: 0.84
True Positive Rate: 0.72
False Positive Rate: 0.07
True Negative Rate: 0.93
Flase Negative Rate: 0.28
Precision: 0.87
Recall: 0.72
f1 score: 0.79
Positive support: 172
Negative support: 254
-------------------------------

In [39]:
# model performs best on training data when minimum leaf samples is lower and max depth is higher

In [29]:
best_i = 0
best_j = 0
best_acc = 0

for i in range(1,11):
    for j in range(1,11):
        # create and fit the classifier using train
        rf = RandomForestClassifier(min_samples_leaf=i,
                            max_depth=j, 
                            random_state=123)
        rf.fit(X_train, y_train)
        
        # create predictions and scores using validate
        y_pred = rf.predict(X_validate)

        cm = confusion_matrix(y_validate, y_pred)
        model_scores(cm, i, j)
        
        if rf.score(X_validate, y_validate) >= best_acc:
            best_acc = rf.score(X_validate, y_validate)
            best_i = i
            best_j = j
            
print('best model had ', best_i,' for min_samples_leaf and', best_j, 'for max_depth')
print('And an accuracy of', best_acc)

Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 1

Accuracy: 0.82
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 2

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 3

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 4

Accuracy: 0.84
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 5

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 6

Accuracy: 0.86
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 7

Accuracy: 0.86
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 8

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 9

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_l

Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 1

Accuracy: 0.82
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 2

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 3

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 4

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 5

Accuracy: 0.82
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 6

Accuracy: 0.84
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 7

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 8

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 9

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_l

In [24]:
# when using validation data the best model was the one with 4 min samples per leaf, and max depth of 3

In [33]:
# comparing the two

best_i = 0
best_j = 0
best_dif = 100
best_acc = 0

for i in range(1,11):
    for j in range(1,11):
        # create and fit the classifier using train
        rf = RandomForestClassifier(min_samples_leaf=i,
                            max_depth=j, 
                            random_state=123)
        rf.fit(X_train, y_train)
        
        y_train_pred = rf.predict(X_train)
        y_val_pred = rf.predict(X_validate)

        cm_train = confusion_matrix(y_train, y_train_pred)
        cm_validate = confusion_matrix(y_validate, y_val_pred)
        
        
        
        if abs((rf.score(X_train, y_train)) - (rf.score(X_validate, y_validate))) < best_dif:
            best_dif = abs((rf.score(X_train, y_train)) - (rf.score(X_validate, y_validate)))
            best_i = i
            best_j = j
            
            best_acc = rf.score(X_validate, y_validate)
            
print('best model had ', best_i,' for min_samples_leaf and', best_j, 'for max_depth')
print('With a validate accuracy of', best_acc)
print('With an accuracy difference of', best_dif)
            

best model had  8  for min_samples_leaf and 4 for max_depth
With a validate accuracy of 0.8181818181818182
With an accuracy difference of 0.0010670081092615247
