In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

import matplotlib.pyplot as plt
import seaborn as sns

import env
import prepare
import acquire


# Titanic Dataset

In [2]:
# import titanic dataset, run our prepare functions, split into train/validate/test, and validate size of df's
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
tit_train, tit_validate, tit_test = prepare.my_split(df, target='survived')
tit_train.shape, tit_validate.shape, tit_test.shape

((426, 15), (143, 15), (143, 15))

In [3]:
tit_train.head(1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
266,0,3,male,16.0,4,1,39.6875,S,Southampton,0,0,1,0,0,1


## What is your baseline prediction?  What is your baseline accuracy? 

In [4]:
# find the most common condition for our target to determine baseline.  It's rate of occurance is the accuracy of 
# our baseline
print('The most common value (our baseline) is:',tit_train.survived.value_counts().idxmax())
print('The rate of occurance (our baseline accuracy) is:', len(tit_train[tit_train.survived == 0]) / len(tit_train) * 100)

The most common value (our baseline) is: 0
The rate of occurance (our baseline accuracy) is: 59.624413145539904


# Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [5]:
# split each of our datasets into X and y 

X_train = tit_train.drop(columns=['survived', 'sex', 'embarked', 'embark_town', 'sex_female', 'embark_town_Cherbourg'])
y_train = tit_train.survived

X_validate = tit_validate.drop(columns=['survived', 'sex', 'embarked', 'embark_town', 'sex_female', 'embark_town_Cherbourg'])
y_validate = tit_validate.survived

X_test = tit_test.drop(columns=['survived', 'sex', 'embarked', 'embark_town', 'sex_female', 'embark_town_Cherbourg'])
y_test = tit_test.survived

In [6]:
# create and fit the model
clf = DecisionTreeClassifier()

clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_train)
y_pred_prob = clf.predict_proba(X_train)

## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [7]:
# model score
print(f'training score: {clf.score(X_train, y_train):.2%}')

training score: 99.30%


In [8]:
# confusion matrix
cm = confusion_matrix(y_train, y_pred)
print(pd.DataFrame(cm))

     0    1
0  254    0
1    3  169


In [9]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       254
           1       1.00      0.98      0.99       172

    accuracy                           0.99       426
   macro avg       0.99      0.99      0.99       426
weighted avg       0.99      0.99      0.99       426



## Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [10]:
# accuracy
print('------- Model 1 -----------')
print('Accuracy score is: ', accuracy_score(y_train, y_pred))

------- Model 1 -----------
Accuracy score is:  0.9929577464788732


In [11]:
# true positive rate, false positive rate, true negative rate, false negative rate
tp = cm[0,0]
tn = cm[1,1]
fp = cm[0,1]
fn = cm[1,0]
print('------- Model 1 -----------')
print('True positive rate is:', tp/(tp+tn+fp+fn))
print('False positive rate is', fp/(tp+tn+fp+fn))
print('True negative rate is', tn/(tp+tn+fp+fn))
print('False negative rate is', fn/(tp+tn+fp+fn))

------- Model 1 -----------
True positive rate is: 0.596244131455399
False positive rate is 0.0
True negative rate is 0.3967136150234742
False negative rate is 0.007042253521126761


In [12]:
# precision, recall, f1-score and support
print('------- Model 1 -----------')
print('Precision is: ', precision_score(y_train, y_pred))
print('Recall is:', recall_score(y_train, y_pred))
print('f1 score is:', f1_score(y_train, y_pred))
print('Support is 0:', tp+fp)
print('           1:', tn+fn)

------- Model 1 -----------
Precision is:  1.0
Recall is: 0.9825581395348837
f1 score is: 0.9912023460410557
Support is 0: 254
           1: 172


## Run through steps 2-4 using a different max_depth value.

In [13]:
# changing max depth to 5
clf2 = DecisionTreeClassifier(max_depth=5)

clf2 = clf2.fit(X_train, y_train)

y_pred2 = clf2.predict(X_train)
y_pred_prob2 = clf2.predict_proba(X_train)

In [14]:
# model score
print('------- Model 2 -----------')
print(f'training score for model2: {clf2.score(X_train, y_train):.2%}')

------- Model 2 -----------
training score for model2: 87.32%


In [15]:
# confusion matrix
cm2 = confusion_matrix(y_train, y_pred2)
print(pd.DataFrame(cm2))

     0    1
0  248    6
1   48  124


In [16]:
# classification report
print('------- Model 2 -----------')
print(classification_report(y_train, y_pred2))

------- Model 2 -----------
              precision    recall  f1-score   support

           0       0.84      0.98      0.90       254
           1       0.95      0.72      0.82       172

    accuracy                           0.87       426
   macro avg       0.90      0.85      0.86       426
weighted avg       0.88      0.87      0.87       426



In [17]:
# accuracy
print('------- Model 2 -----------')
print('Accuracy score for model 2 is: ', accuracy_score(y_train, y_pred2))

------- Model 2 -----------
Accuracy score for model 2 is:  0.8732394366197183


In [18]:
# true positive rate, false positive rate, true negative rate, false negative rate
tp2 = cm2[0,0]
tn2 = cm2[1,1]
fp2 = cm2[0,1]
fn2 = cm2[1,0]

print('------- Model 2 -----------')
print('True positive rate is:', tp2/(tp2+tn2+fp2+fn2))
print('False positive rate is', fp2/(tp2+tn2+fp2+fn2))
print('True negative rate is', tn2/(tp2+tn2+fp2+fn2))
print('False negative rate is', fn2/(tp2+tn2+fp2+fn2))

------- Model 2 -----------
True positive rate is: 0.5821596244131455
False positive rate is 0.014084507042253521
True negative rate is 0.29107981220657275
False negative rate is 0.11267605633802817


In [19]:
# precision, recall, f1-score and support
print('------- Model 2 -----------')
print('Precision is: ', precision_score(y_train, y_pred2))
print('Recall is:', recall_score(y_train, y_pred2))
print('f1 score is:', f1_score(y_train, y_pred2))
print('Support is 0:', tp2+fp2)
print('           1:', tn2+fn2)

------- Model 2 -----------
Precision is:  0.9538461538461539
Recall is: 0.7209302325581395
f1 score is: 0.8211920529801324
Support is 0: 254
           1: 172


## Which model performs better on your in-sample data?

Model 1 performs better when evaluating performance against the in-sample (training) data

## Which model performs best on your out-of-sample data, the validate set?

In [20]:
print('------- Model 1 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))
print('')
print('------- Model 2 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf2.score(X_validate, y_validate)))


------- Model 1 -----------
Accuracy of Decision Tree classifier on validate set: 0.82

------- Model 2 -----------
Accuracy of Decision Tree classifier on validate set: 0.87


In [21]:
# classification report
y_pred = clf.predict(X_validate)
y_pred2 = clf2.predict(X_validate)

print('------- Model 1 -----------')
print(classification_report(y_validate, y_pred))

print('------- Model 2 -----------')
print(classification_report(y_validate, y_pred2))

------- Model 1 -----------
              precision    recall  f1-score   support

           0       0.87      0.81      0.84        85
           1       0.75      0.83      0.79        58

    accuracy                           0.82       143
   macro avg       0.81      0.82      0.81       143
weighted avg       0.82      0.82      0.82       143

------- Model 2 -----------
              precision    recall  f1-score   support

           0       0.84      0.96      0.90        85
           1       0.93      0.72      0.82        58

    accuracy                           0.87       143
   macro avg       0.89      0.84      0.86       143
weighted avg       0.88      0.87      0.86       143



Model 2 performs slightly better with a max depth of 5

# Work through these same exercises using the iris dataset.
    ## building two models simultaneously

In [22]:
# get and prepare iris data
iris = acquire.get_iris_data()
iris = prepare.prep_iris(iris)
i_train, i_validate, i_test = prepare.my_split(iris, target='species')
i_train.shape, i_validate.shape, i_test.shape

((90, 5), (30, 5), (30, 5))

In [23]:
# find the baseline and the baseline accuracy
baseline = i_train.species.value_counts().idxmax()
print('The most common value (our baseline) is:', baseline)
print('The rate of occurance (our baseline accuracy) is:', len(i_train[i_train.species == baseline]) / len(i_train) * 100)

The most common value (our baseline) is: setosa
The rate of occurance (our baseline accuracy) is: 33.33333333333333


In [24]:
# split each of our datasets into X and y 

X_train = i_train.drop(columns=('species'))
y_train = i_train.species

X_validate = i_validate.drop(columns=('species'))
y_validate = i_validate.species

X_test = i_test.drop(columns='species')
y_test = i_test.species

In [25]:
# create and fit the models
clf1 = DecisionTreeClassifier(max_depth=8)
clf2 = DecisionTreeClassifier(max_depth=2)

clf1 = clf1.fit(X_train, y_train)
clf2 = clf2.fit(X_train, y_train)

y_pred1 = clf1.predict(X_train)
y_pred2 = clf2.predict(X_train)

y_pred_prob1 = clf1.predict_proba(X_train)
y_pred_prob2 = clf2.predict_proba(X_train)

In [26]:
# model score
print(f'Model 1 training score: {clf1.score(X_train, y_train):.2%}')
print(f'Model 2 training score: {clf2.score(X_train, y_train):.2%}')

Model 1 training score: 100.00%
Model 2 training score: 95.56%


In [27]:
# confusion matrix
cm1 = confusion_matrix(y_train, y_pred1)
cm2 = confusion_matrix(y_train, y_pred2)
print('----Model 1----')
print(pd.DataFrame(cm1))
print('')
print('----Model 2----')
print(pd.DataFrame(cm2))

----Model 1----
    0   1   2
0  30   0   0
1   0  30   0
2   0   0  30

----Model 2----
    0   1   2
0  30   0   0
1   0  26   4
2   0   0  30


In [28]:
# classification report
print(classification_report(y_train, y_pred1))
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        30
  versicolor       1.00      1.00      1.00        30
   virginica       1.00      1.00      1.00        30

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        30
  versicolor       1.00      0.87      0.93        30
   virginica       0.88      1.00      0.94        30

    accuracy                           0.96        90
   macro avg       0.96      0.96      0.96        90
weighted avg       0.96      0.96      0.96        90



## ...and comparing with validation data

In [29]:
print('------- Model 1 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf1.score(X_validate, y_validate)))
print('')
print('------- Model 2 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf2.score(X_validate, y_validate)))

------- Model 1 -----------
Accuracy of Decision Tree classifier on validate set: 0.93

------- Model 2 -----------
Accuracy of Decision Tree classifier on validate set: 0.93


In [30]:
# classification report
y_pred1 = clf1.predict(X_validate)
y_pred2 = clf2.predict(X_validate)

print('------- Model 1 -----------')
print(classification_report(y_validate, y_pred1))

print('------- Model 2 -----------')
print(classification_report(y_validate, y_pred2))

------- Model 1 -----------
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.90      0.90      0.90        10
   virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30

------- Model 2 -----------
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.90      0.90      0.90        10
   virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30



The first model performs better with max depth of 8 versus max depth of 2 for the second model

# Experiment with this model on other datasets with a higher number of output classes.


In [31]:
# The glass dataset contains 9 columns quantifying the contents of 9 different elements in a glass sample.  
# The 10th column is the class of the glass, and integer from 1-7.

glass = pd.read_csv('glass.csv')
glass.head()

Unnamed: 0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00.1,1
0,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
1,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
2,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
3,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1
4,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.0,0.26,1


In [32]:
glass.columns =['ri', 'na', 'mg', 'al', 'si', 'k','ca','ba','fe', 'class']

In [33]:
# split into train, validate, test
g_train, g_validate, g_test = prepare.my_split(glass, target='class')
g_train.shape, g_validate.shape, g_test.shape

((127, 10), (43, 10), (43, 10))

In [34]:
# find the baseline and the baseline accuracy
baseline = g_train['class'].value_counts().idxmax()
print('The most common value (our baseline) is:', baseline)
print('The rate of occurance (our baseline accuracy) is:', len(g_train[g_train['class'] == baseline]) / len(g_train) * 100)

The most common value (our baseline) is: 2
The rate of occurance (our baseline accuracy) is: 36.22047244094488


In [35]:
# split each of our datasets into X and y 

X_train = g_train.drop(columns=('class'))
y_train = g_train['class']

X_validate = g_validate.drop(columns=('class'))
y_validate = g_validate['class']

X_test = g_test.drop(columns='class')
y_test = g_test['class']

In [36]:
# create and fit the models
clf1 = DecisionTreeClassifier(max_depth=8)
clf2 = DecisionTreeClassifier(max_depth=4)

clf1 = clf1.fit(X_train, y_train)
clf2 = clf2.fit(X_train, y_train)

y_pred1 = clf1.predict(X_train)
y_pred2 = clf2.predict(X_train)

y_pred_prob1 = clf1.predict_proba(X_train)
y_pred_prob2 = clf2.predict_proba(X_train)

In [37]:
# model score
print(f'Model 1 training score: {clf1.score(X_train, y_train):.2%}')
print(f'Model 2 training score: {clf2.score(X_train, y_train):.2%}')

Model 1 training score: 98.43%
Model 2 training score: 82.68%


In [38]:
# confusion matrix
cm1 = confusion_matrix(y_train, y_pred1)
cm2 = confusion_matrix(y_train, y_pred2)
print('----Model 1----')
print(pd.DataFrame(cm1))
print('')
print('----Model 2----')
print(pd.DataFrame(cm2))

----Model 1----
    0   1  2  3  4   5
0  41   0  0  0  0   0
1   1  45  0  0  0   0
2   1   0  9  0  0   0
3   0   0  0  8  0   0
4   0   0  0  0  5   0
5   0   0  0  0  0  17

----Model 2----
    0   1  2  3  4   5
0  37   3  1  0  0   0
1   8  34  2  2  0   0
2   3   1  6  0  0   0
3   0   0  0  8  0   0
4   1   0  0  0  4   0
5   0   0  1  0  0  16


In [39]:
# classification report
print(classification_report(y_train, y_pred1))
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           1       0.95      1.00      0.98        41
           2       1.00      0.98      0.99        46
           3       1.00      0.90      0.95        10
           5       1.00      1.00      1.00         8
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00        17

    accuracy                           0.98       127
   macro avg       0.99      0.98      0.99       127
weighted avg       0.98      0.98      0.98       127

              precision    recall  f1-score   support

           1       0.76      0.90      0.82        41
           2       0.89      0.74      0.81        46
           3       0.60      0.60      0.60        10
           5       0.80      1.00      0.89         8
           6       1.00      0.80      0.89         5
           7       1.00      0.94      0.97        17

    accuracy                           0.83       127
   macro avg       0.84

In [40]:
print('------- Model 1 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf1.score(X_validate, y_validate)))
print('')
print('------- Model 2 -----------')
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf2.score(X_validate, y_validate)))

------- Model 1 -----------
Accuracy of Decision Tree classifier on validate set: 0.65

------- Model 2 -----------
Accuracy of Decision Tree classifier on validate set: 0.63


In [41]:
# classification reports
y_pred1 = clf1.predict(X_validate)
y_pred2 = clf2.predict(X_validate)

print('------- Model 1 -----------')
print(classification_report(y_validate, y_pred1))

print('------- Model 2 -----------')
print(classification_report(y_validate, y_pred2))

------- Model 1 -----------
              precision    recall  f1-score   support

           1       0.62      0.93      0.74        14
           2       0.62      0.53      0.57        15
           3       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         2
           6       0.67      1.00      0.80         2
           7       1.00      0.83      0.91         6

    accuracy                           0.65        43
   macro avg       0.48      0.55      0.50        43
weighted avg       0.59      0.65      0.61        43

------- Model 2 -----------
              precision    recall  f1-score   support

           1       0.61      0.79      0.69        14
           2       0.67      0.53      0.59        15
           3       0.33      0.25      0.29         4
           5       0.25      0.50      0.33         2
           6       1.00      1.00      1.00         2
           7       1.00      0.67      0.80         6

    accuracy         

The first model (max depth 8) performed better on the training set, but the second model (max depth 4)
performed better on the validation set

# Random Forest Exercises

### Continue working in your model file with titanic data to do the following:

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
train, validate, test = prepare.my_split(df, target='survived')
train.shape, validate.shape, test.shape

((426, 15), (143, 15), (143, 15))

In [44]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
130,0,3,male,33.0,0,0,7.8958,C,Cherbourg,1,0,1,1,0,0
391,1,3,male,21.0,0,0,7.7958,S,Southampton,1,0,1,0,0,1
785,0,3,male,25.0,0,0,7.25,S,Southampton,1,0,1,0,0,1
162,0,3,male,26.0,0,0,7.775,S,Southampton,1,0,1,0,0,1
16,0,3,male,2.0,4,1,29.125,Q,Queenstown,0,0,1,0,1,0


In [45]:
# id columns for features in the model
features = ['pclass', 'age','alone','fare','sex_male']

In [46]:
# create X and y versions of our train/validate/test datasets
X_train = train[features]
y_train = train.survived

X_validate = validate[features]
y_validate = validate.survived

X_test = test[features]
y_test = test.survived

In [47]:
# create random forest
rf = RandomForestClassifier(min_samples_leaf=1,
                            max_depth=10, 
                            random_state=123)

In [48]:
# train random forest model
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [49]:
# make predictions
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)

### Evaluate your results using the model score, confusion matrix, and classification report.



In [50]:
# check accuracy
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.98


In [51]:
cm = confusion_matrix(y_train, y_pred)
print(cm)

[[254   0]
 [  7 165]]


In [52]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       254
           1       1.00      0.96      0.98       172

    accuracy                           0.98       426
   macro avg       0.99      0.98      0.98       426
weighted avg       0.98      0.98      0.98       426



### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [53]:
# create function to output requested scoring of model

def model_scores(cm, min_leaf, max_depth):
    '''
    Function to get all model scores necessary for codeup exercises
    Accepts a confusion matrix, and prints a report with the following:
        Accuracy
        True positive rate
        False positive rate
        True negative rate
        False negative rate 
        Precision
        Recall
        f1-score
        positive support
        negative support
    '''
    
    TN = cm[0,0]
    FP = cm[0,1]
    FN = cm[1,0]
    TP = cm[1,1]
    ALL = TP + FP + FN + TN
    
    print('Model stats for Random Forest with:')
    print("")
    print('     min_samples_leaf =',min_leaf)
    print('        and max_depth =',max_depth)
    print("")

    # accuracy
    acc = (TP + TN) / ALL
    print('Accuracy: {:.2f}'.format(acc))
#     # true positive rate, also recall
#     TPR = recall = TP/ (TP + FN)
#     print('True Positive Rate: {:.2f}'.format(TPR))
#     # false positive rate
#     FPR = FP / (FP + TN)
#     print('False Positive Rate: {:.2f}'.format(FPR))
#     # true negative rate
#     TNR = TN / (TN + FP)
#     print('True Negative Rate: {:.2f}'.format(TNR))
#     # false negative rate
#     FNR = FN / (FN + TP)
#     print('Flase Negative Rate: {:.2f}'.format(FNR))
#     # precision
#     precision = TP / (TP + FP)
#     print('Precision: {:.2f}'.format(precision))
#     # recall
#     print('Recall: {:.2f}'.format(recall))
#     # f1
#     f1_score = 2 * (precision*recall) / (precision+recall)
#     print('f1 score: {:.2f}'.format(f1_score))
#     # support
#     support_pos = TP + FN
#     print('Positive support:',support_pos)
#     support_neg = FP + TN
#     print('Negative support:',support_neg)
#     print('-----------------------------------------')
    
model_scores(cm, 1, 10)

Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 10

Accuracy: 0.98


## Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [54]:
best_i = 0
best_j = 0
best_acc = 0

for i in range(1,11):
    for j in range(1,11):
        rf = RandomForestClassifier(min_samples_leaf=i,
                            max_depth=j, 
                            random_state=123)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_train)

        cm = confusion_matrix(y_train, y_pred)
        model_scores(cm, i, j)
        
        if rf.score(X_train, y_train) > best_acc:
            best_acc = rf.score(X_train, y_train)
            best_i = i
            best_j = j
            
print('best model had ', best_i,' for min_samples_leaf and', best_j, 'for max_depth')


Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 1

Accuracy: 0.80
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 2

Accuracy: 0.80
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 3

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 4

Accuracy: 0.86
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 5

Accuracy: 0.89
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 6

Accuracy: 0.92
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 7

Accuracy: 0.94
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 8

Accuracy: 0.96
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 9

Accuracy: 0.97
Model stats for Random Forest with:

     min_samples_l

Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 1

Accuracy: 0.80
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 2

Accuracy: 0.80
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 3

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 4

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 5

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 6

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 7

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 8

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 9

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_l

In [55]:
# model performs best on training data when minimum leaf samples is lower and max depth is higher

In [56]:
best_i = 0
best_j = 0
best_acc = 0

for i in range(1,11):
    for j in range(1,11):
        # create and fit the classifier using train
        rf = RandomForestClassifier(min_samples_leaf=i,
                            max_depth=j, 
                            random_state=123)
        rf.fit(X_train, y_train)
        
        # create predictions and scores using validate
        y_pred = rf.predict(X_validate)

        cm = confusion_matrix(y_validate, y_pred)
        model_scores(cm, i, j)
        
        if rf.score(X_validate, y_validate) >= best_acc:
            best_acc = rf.score(X_validate, y_validate)
            best_i = i
            best_j = j
            
print('best model had ', best_i,' for min_samples_leaf and', best_j, 'for max_depth')
print('And an accuracy of', best_acc)

Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 1

Accuracy: 0.79
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 2

Accuracy: 0.79
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 3

Accuracy: 0.84
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 4

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 5

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 6

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 7

Accuracy: 0.86
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 8

Accuracy: 0.86
Model stats for Random Forest with:

     min_samples_leaf = 1
        and max_depth = 9

Accuracy: 0.84
Model stats for Random Forest with:

     min_samples_l

Model stats for Random Forest with:

     min_samples_leaf = 8
        and max_depth = 10

Accuracy: 0.85
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 1

Accuracy: 0.79
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 2

Accuracy: 0.79
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 3

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 4

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 5

Accuracy: 0.84
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 6

Accuracy: 0.83
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 7

Accuracy: 0.84
Model stats for Random Forest with:

     min_samples_leaf = 9
        and max_depth = 8

Accuracy: 0.84
Model stats for Random Forest with:

     min_samples_

In [57]:
# when using validation data the best model was the one with 4 min samples per leaf, and max depth of 3

In [58]:
# comparing models with combinations of min_samples and max_depths

metrics = []

for i in range(1,11):
    for j in range(1,11):
        # create and fit the classifier using train
        rf = RandomForestClassifier(min_samples_leaf=i,
                            max_depth=j, 
                            random_state=123)
        rf.fit(X_train, y_train)
        
        y_train_pred = rf.predict(X_train)
        y_val_pred = rf.predict(X_validate)

        cm_train = confusion_matrix(y_train, y_train_pred)
        cm_validate = confusion_matrix(y_validate, y_val_pred)
        
        acc_train = rf.score(X_train, y_train)
        acc_val = rf.score(X_validate, y_validate)
        
        output = {
            "min_samples_per_leaf": i,
            "max_depth": j,
            "train_accuracy": round(acc_train,4),
            "validate_accuracy": round(acc_val,4)
            }
        metrics.append(output)
        
metrics = pd.DataFrame(metrics)
metrics["difference"] = abs(metrics.train_accuracy - metrics.validate_accuracy)




In [59]:
metrics.sort_values(['difference', 'validate_accuracy'], ascending=True)

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
77,8,8,0.8592,0.8601,0.0009
72,8,3,0.8263,0.8252,0.0011
97,10,8,0.8380,0.8392,0.0012
98,10,9,0.8380,0.8392,0.0012
99,10,10,0.8380,0.8392,0.0012
...,...,...,...,...,...
5,1,6,0.9249,0.8531,0.0718
6,1,7,0.9366,0.8601,0.0765
7,1,8,0.9601,0.8601,0.1000
9,1,10,0.9836,0.8601,0.1235


# K Nearest Neighbors

In [60]:
from sklearn.neighbors import KNeighborsClassifier

Continue working in your model file with the titanic dataset.



#### Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)


In [61]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
130,0,3,male,33.0,0,0,7.8958,C,Cherbourg,1,0,1,1,0,0
391,1,3,male,21.0,0,0,7.7958,S,Southampton,1,0,1,0,0,1
785,0,3,male,25.0,0,0,7.25,S,Southampton,1,0,1,0,0,1
162,0,3,male,26.0,0,0,7.775,S,Southampton,1,0,1,0,0,1
16,0,3,male,2.0,4,1,29.125,Q,Queenstown,0,0,1,0,1,0


In [62]:
# create X and y version of train

X_train = train[features]
y_train = train.survived

X_validate = validate[features]
y_validate = validate.survived

X_test = test[features]
y_test = test.survived

In [63]:
# create and fit model
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [64]:
# make predictions and estimate probability
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

### Evaluate your results using the model score, confusion matrix, and classification report.

In [65]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.77


In [66]:
cm = confusion_matrix(y_train, y_pred)
print(pd.DataFrame(cm))


     0    1
0  215   39
1   60  112


In [67]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.85      0.81       254
           1       0.74      0.65      0.69       172

    accuracy                           0.77       426
   macro avg       0.76      0.75      0.75       426
weighted avg       0.77      0.77      0.76       426



### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [68]:
def knn_scores(cm, n):
    TN = cm[0,0]
    FP = cm[0,1]
    FN = cm[1,0]
    TP = cm[1,1]
    ALL = TP + FP + FN + TN
    
    print('Model stats for KNN with:')
    print("")
    print('        # of neighbors =',n)
    print("")

    # accuracy
    acc = (TP + TN) / ALL
    print('Accuracy: {:.2f}'.format(acc))
    # true positive rate, also recall
    TPR = recall = TP/ (TP + FN)
    print('True Positive Rate: {:.2f}'.format(TPR))
    # false positive rate
    FPR = FP / (FP + TN)
    print('False Positive Rate: {:.2f}'.format(FPR))
    # true negative rate
    TNR = TN / (TN + FP)
    print('True Negative Rate: {:.2f}'.format(TNR))
    # false negative rate
    FNR = FN / (FN + TP)
    print('Flase Negative Rate: {:.2f}'.format(FNR))
    # precision
    precision = TP / (TP + FP)
    print('Precision: {:.2f}'.format(precision))
    # recall
    print('Recall: {:.2f}'.format(recall))
    # f1
    f1_score = 2 * (precision*recall) / (precision+recall)
    print('f1 score: {:.2f}'.format(f1_score))
    # support
    support_pos = TP + FN
    print('Positive support:',support_pos)
    support_neg = FP + TN
    print('Negative support:',support_neg)
    print('-----------------------------------------')

In [69]:
knn_scores(cm, 5)

Model stats for KNN with:

        # of neighbors = 5

Accuracy: 0.77
True Positive Rate: 0.65
False Positive Rate: 0.15
True Negative Rate: 0.85
Flase Negative Rate: 0.35
Precision: 0.74
Recall: 0.65
f1 score: 0.69
Positive support: 172
Negative support: 254
-----------------------------------------


## Run through steps 2-4 setting k to 10

In [70]:
# create and fit model
knn10 = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn10.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [71]:
# make predictions and estimate probability
y_pred10 = knn10.predict(X_train)
y_pred_proba10 = knn10.predict_proba(X_train)

In [72]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn10.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.73


In [73]:
cm10 = confusion_matrix(y_train, y_pred10)
print(pd.DataFrame(cm10))

     0   1
0  219  35
1   78  94


In [74]:
print(classification_report(y_train, y_pred10))


              precision    recall  f1-score   support

           0       0.74      0.86      0.79       254
           1       0.73      0.55      0.62       172

    accuracy                           0.73       426
   macro avg       0.73      0.70      0.71       426
weighted avg       0.73      0.73      0.73       426



In [75]:
knn_scores(cm10, 10)

Model stats for KNN with:

        # of neighbors = 10

Accuracy: 0.73
True Positive Rate: 0.55
False Positive Rate: 0.14
True Negative Rate: 0.86
Flase Negative Rate: 0.45
Precision: 0.73
Recall: 0.55
f1 score: 0.62
Positive support: 172
Negative support: 254
-----------------------------------------


## Run through setps 2-4 setting k to 20



In [76]:
# create and fit model
knn20 = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn20.fit(X_train, y_train)

# make predictions and estimate probability
y_pred20 = knn20.predict(X_train)
y_pred_proba20 = knn20.predict_proba(X_train)

print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn20.score(X_train, y_train)))

cm20 = confusion_matrix(y_train, y_pred20)
print(pd.DataFrame(cm20))
print(classification_report(y_train, y_pred20))

knn_scores(cm20, 20)


Accuracy of KNN classifier on training set: 0.73
     0   1
0  216  38
1   77  95
              precision    recall  f1-score   support

           0       0.74      0.85      0.79       254
           1       0.71      0.55      0.62       172

    accuracy                           0.73       426
   macro avg       0.73      0.70      0.71       426
weighted avg       0.73      0.73      0.72       426

Model stats for KNN with:

        # of neighbors = 20

Accuracy: 0.73
True Positive Rate: 0.55
False Positive Rate: 0.15
True Negative Rate: 0.85
Flase Negative Rate: 0.45
Precision: 0.71
Recall: 0.55
f1 score: 0.62
Positive support: 172
Negative support: 254
-----------------------------------------


## What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

#### The model with fewer nearest neighbors performs better on the training data

## Which model performs best on our out-of-sample data from validate?

In [77]:
y_pred = knn.predict(X_validate)
y_pred10 = knn10.predict(X_validate)
y_pred20 = knn20.predict(X_validate)

print('Accuracy of KNN classifier on validation set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))
print('Accuracy of KNN10 classifier on validation set: {:.2f}'
     .format(knn10.score(X_validate, y_validate)))
print('Accuracy of KNN20 classifier on validation set: {:.2f}'
     .format(knn20.score(X_validate, y_validate)))



Accuracy of KNN classifier on validation set: 0.68
Accuracy of KNN10 classifier on validation set: 0.73
Accuracy of KNN20 classifier on validation set: 0.66


In [78]:
cm = confusion_matrix(y_validate, y_pred)
cm10 = confusion_matrix(y_validate, y_pred10)
cm20 = confusion_matrix(y_validate, y_pred20)

knn_scores(cm, 5)
knn_scores(cm10, 10)
knn_scores(cm20, 20)


Model stats for KNN with:

        # of neighbors = 5

Accuracy: 0.68
True Positive Rate: 0.53
False Positive Rate: 0.22
True Negative Rate: 0.78
Flase Negative Rate: 0.47
Precision: 0.62
Recall: 0.53
f1 score: 0.57
Positive support: 58
Negative support: 85
-----------------------------------------
Model stats for KNN with:

        # of neighbors = 10

Accuracy: 0.73
True Positive Rate: 0.53
False Positive Rate: 0.13
True Negative Rate: 0.87
Flase Negative Rate: 0.47
Precision: 0.74
Recall: 0.53
f1 score: 0.62
Positive support: 58
Negative support: 85
-----------------------------------------
Model stats for KNN with:

        # of neighbors = 20

Accuracy: 0.66
True Positive Rate: 0.52
False Positive Rate: 0.24
True Negative Rate: 0.76
Flase Negative Rate: 0.48
Precision: 0.60
Recall: 0.52
f1 score: 0.56
Positive support: 58
Negative support: 85
-----------------------------------------


The third model (20 neighbors) performs best on the validate data set.  

# Logistic Regression

#### Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [79]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.25,S,Southampton,0,0,1,0,0,1
1,1,1,female,38.0,1,0,71.2833,C,Cherbourg,0,1,0,1,0,0
2,1,3,female,26.0,0,0,7.925,S,Southampton,1,1,0,0,0,1
3,1,1,female,35.0,1,0,53.1,S,Southampton,0,1,0,0,0,1
4,0,3,male,35.0,0,0,8.05,S,Southampton,1,0,1,0,0,1


In [80]:
from sklearn.linear_model import LogisticRegression

#### Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [81]:
# use specified features
features = ['age', 'fare', 'pclass']

# create X and y version of train, validate, and test
X_train = train[features]
y_train = train.survived

X_validate = validate[features]
y_validate = validate.survived

X_test = test[features]
y_test = test.survived

In [82]:
# make the model
logit = LogisticRegression(random_state=123)

#fit the model
logit.fit(X_train, y_train)


LogisticRegression(random_state=123)

In [83]:
# evaluate the intercept of the model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.04416486  0.00428582 -1.21730482]]
Intercept: 
 [3.45642644]


In [84]:
# make predictions
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [85]:
# evaluate accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.73


In [86]:
# evaluate accuracy against validation set
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'
     .format(logit.score(X_validate, y_validate)))

Accuracy of Logistic Regression classifier on validation set: 0.69


##### The model beat baseline with age, fare, and p-class as features

### Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [89]:
# use specified features
features = ['age', 'fare', 'pclass', 'sex_male']

# create X and y version of train, validate, and test
X_train = train[features]
y_train = train.survived

X_validate = validate[features]
y_validate = validate.survived

X_test = test[features]
y_test = test.survived

In [90]:
# make the model
logit = LogisticRegression(random_state=123)

#fit the model
logit.fit(X_train, y_train)

LogisticRegression(random_state=123)

In [91]:
# evaluate the intercept of the model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-4.08516719e-02  2.25422643e-03 -1.29705245e+00 -2.47395007e+00]]
Intercept: 
 [5.09994635]


In [92]:
# make predictions
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [93]:
# evaluate accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))
# evaluate accuracy against validation set
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'
     .format(logit.score(X_validate, y_validate)))

Accuracy of Logistic Regression classifier on training set: 0.82
Accuracy of Logistic Regression classifier on validation set: 0.84


### Try out other combinations of features and models.



In [94]:
# make the model
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')

#fit the model
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=123)

In [95]:
# evaluate the intercept of the model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-3.47419999e-02  1.71550522e-03 -1.06720823e+00 -2.34704264e+00]]
Intercept: 
 [8.9079236]


In [96]:
# make predictions
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [97]:
# evaluate accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))
# evaluate accuracy against validation set
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'
     .format(logit.score(X_validate, y_validate)))

Accuracy of Logistic Regression classifier on training set: 0.40
Accuracy of Logistic Regression classifier on validation set: 0.41


### try adjusting features on original model by adding alone

In [103]:
#  add alone
features = features = ['age', 'pclass', 'fare', 'alone', 'sex_male']

# create X and y version of train, validate, and test
X_train = train[features]
y_train = train.survived

X_validate = validate[features]
y_validate = validate.survived

X_test = test[features]
y_test = test.survived

In [104]:
# make the model
logit = LogisticRegression(random_state=123)

#fit the model
logit.fit(X_train, y_train)

LogisticRegression(random_state=123)

In [105]:
# evaluate the intercept of the model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-3.94120980e-02 -1.29906355e+00  1.65458829e-03 -2.33152902e-01
  -2.43491042e+00]]
Intercept: 
 [5.1852801]


In [106]:
# evaluate accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))
# evaluate accuracy against validation set
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'
     .format(logit.score(X_validate, y_validate)))

Accuracy of Logistic Regression classifier on training set: 0.83
Accuracy of Logistic Regression classifier on validation set: 0.84


### Use your best 3 models to predict and evaluate on the validate sample

##### Already done on previous steps

### Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [108]:
# evaluate accuracy
print('Accuracy of Logistic Regression classifier on test set: {:.2f}'
     .format(logit.score(X_test, y_test)))

Accuracy of Logistic Regression classifier on test set: 0.70


In [None]:
p