In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic

# Logistic Regression

1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [3]:
# prepped titanic data: train, validate, test samples

train, validate, test = prep_titanic()
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [4]:
# baseline model: not survived is most common
survived = train.survived.value_counts()
print(survived, '\n')

# died/total passengers (accuracy)
baseline_accuracy = survived[0]/survived.sum()
print('Baseline Acccuracy:', baseline_accuracy)

0    307
1    190
Name: survived, dtype: int64 

Baseline Acccuracy: 0.6177062374245473


In [5]:
# MODEL 1: includes age, pclass, fare

# A. create model object
logit = LogisticRegression(C=1, class_weight=None, random_state=123)

# B. split each sample into an X DataFrame and a Y Series
X_train = train[['age', 'fare', 'pclass']]
Y_train = train.survived

X_validate = validate[['age', 'fare', 'pclass']]
Y_validate = validate.survived

X_test = test[['age', 'fare', 'pclass']]
Y_test = test.survived

# C. Fit to X and Y train
logit = logit.fit(X_train, Y_train)

print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.03051881  0.00266519 -0.97983178]]
Intercept: 
 [2.52970125]


In [6]:
#D. Predict Values on X_train
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

#accuracy
print('Accuracy: ', logit.score(X_train, Y_train), '\n')

#confusion matrix
print(confusion_matrix(Y_train, y_pred))

#classification report
print(classification_report(Y_train, y_pred))

Accuracy:  0.716297786720322 

[[265  42]
 [ 99  91]]
              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.68      0.48      0.56       190

    accuracy                           0.72       497
   macro avg       0.71      0.67      0.68       497
weighted avg       0.71      0.72      0.70       497



2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [7]:
# create dummy variable for sex as is_male
train_dummies = pd.get_dummies(train[['sex']], drop_first=True)
train = pd.concat([train, train_dummies], axis=1)

validate_dummies = pd.get_dummies(validate[['sex']], drop_first=True)
validate = pd.concat([validate, validate_dummies], axis=1)

test_dummies = pd.get_dummies(test[['sex']], drop_first=True)
test = pd.concat([test, test_dummies], axis=1)

train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S,sex_male
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0,1
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0,0


In [8]:
# MODEL 2: includes age, pclass, fare, sex

# A. create model object
logit2 = LogisticRegression(C=1, class_weight=None, random_state=123)

# B. split each sample into an X DataFrame and a Y Series
X_train2 = train[['age', 'fare', 'pclass', 'sex_male']]
Y_train2 = train.survived

X_validate2 = validate[['age', 'fare', 'pclass', 'sex_male']]
Y_validate2 = validate.survived

X_test2 = test[['age', 'fare', 'pclass', 'sex_male']]
Y_test2 = test.survived

# C. Fit to X and Y train
logit2 = logit2.fit(X_train2, Y_train2)

print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[-2.66594879e-02  9.02716903e-04 -1.11402368e+00 -2.45878213e+00]]
Intercept: 
 [4.30664987]


In [9]:
#D. Predict Values on X_train
y_pred2 = logit2.predict(X_train2)
y_pred_proba2 = logit2.predict_proba(X_train2)

#accuracy
print('Accuracy: ', logit2.score(X_train2, Y_train2), '\n')

#confusion matrix
print(confusion_matrix(Y_train2, y_pred2))

#classification report
print(classification_report(Y_train2, y_pred2))

Accuracy:  0.7987927565392354 

[[263  44]
 [ 56 134]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



3. Try out other combinations of features and models.

In [10]:
# MODEL 3: includes age, pclass, fare, sex, and alone

# A. create model object
logit3 = LogisticRegression(C=1, class_weight=None, random_state=123)

# B. split each sample into an X DataFrame and a Y Series
X_train3 = train[['age', 'fare', 'pclass', 'sex_male', 'alone']]
Y_train3 = train.survived

X_validate3 = validate[['age', 'fare', 'pclass', 'sex_male', 'alone']]
Y_validate3 = validate.survived

X_test3 = test[['age', 'fare', 'pclass', 'sex_male', 'alone']]
Y_test3 = test.survived

# C. Fit to X and Y train
logit3 = logit3.fit(X_train3, Y_train3)

print('Coefficient: \n', logit3.coef_)
print('Intercept: \n', logit3.intercept_)

Coefficient: 
 [[-2.55633270e-02  5.53491876e-04 -1.10859199e+00 -2.41546062e+00
  -1.58214588e-01]]
Intercept: 
 [4.33705386]


In [11]:
#D. Predict Values on X_train
y_pred3 = logit3.predict(X_train3)
y_pred_proba3 = logit3.predict_proba(X_train3)

#accuracy
print('Accuracy: ', logit3.score(X_train3, Y_train3), '\n')

#confusion matrix
print(confusion_matrix(Y_train3, y_pred3))

#classification report
print(classification_report(Y_train3, y_pred3))

Accuracy:  0.7967806841046278 

[[263  44]
 [ 57 133]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.70      0.72       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.79      0.80      0.80       497



4. Use you best 3 models to predict and evaluate on your validate sample.

In [12]:
# Validate Accuracy
y_pred = logit.predict(X_validate)
y_pred2 = logit2.predict(X_validate2)
y_pred3 = logit3.predict(X_validate3)

print("model 1\n", logit.score(X_validate, Y_validate))
print("model 2\n", logit2.score(X_validate2, Y_validate2))
print("model 3\n", logit3.score(X_validate3, Y_validate3))

model 1
 0.7289719626168224
model 2
 0.7850467289719626
model 3
 0.7850467289719626


In [13]:
# Validate Confusion Matrix
print("model 1\n", confusion_matrix(Y_validate, y_pred))
print("model 2\n", confusion_matrix(Y_validate2, y_pred2))
print("model 3\n", confusion_matrix(Y_validate3, y_pred3))

model 1
 [[116  16]
 [ 42  40]]
model 2
 [[111  21]
 [ 25  57]]
model 3
 [[110  22]
 [ 24  58]]


In [14]:
# Validate Classification Report
print("model 1\n", classification_report(Y_validate, y_pred))
print("model 2\n", classification_report(Y_validate2, y_pred2))
print("model 3\n", classification_report(Y_validate3, y_pred3))

model 1
               precision    recall  f1-score   support

           0       0.73      0.88      0.80       132
           1       0.71      0.49      0.58        82

    accuracy                           0.73       214
   macro avg       0.72      0.68      0.69       214
weighted avg       0.73      0.73      0.72       214

model 2
               precision    recall  f1-score   support

           0       0.82      0.84      0.83       132
           1       0.73      0.70      0.71        82

    accuracy                           0.79       214
   macro avg       0.77      0.77      0.77       214
weighted avg       0.78      0.79      0.78       214

model 3
               precision    recall  f1-score   support

           0       0.82      0.83      0.83       132
           1       0.72      0.71      0.72        82

    accuracy                           0.79       214
   macro avg       0.77      0.77      0.77       214
weighted avg       0.78      0.79      0.78    

5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [15]:
y_pred3 = logit3.predict(X_test3)
y_pred_proba3 = logit3.predict_proba(X_test3)

print("Model 3: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit3.score(X_test3, Y_test3)))

print(confusion_matrix(Y_test3, y_pred3))

print(classification_report(Y_test, y_pred3))

Model 3: solver = lbfgs, c = 1
Accuracy: 0.81
[[92 18]
 [16 52]]
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       110
           1       0.74      0.76      0.75        68

    accuracy                           0.81       178
   macro avg       0.80      0.80      0.80       178
weighted avg       0.81      0.81      0.81       178



# Decision Trees

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [17]:
X_train = train.drop(columns=['passenger_id', 'sex', 'embark_town', 'embarked', 'class', 'survived'])
y_train = train[['survived']]

In [18]:
# Decision Tree Object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

# Fit model to training data
clf = clf.fit(X_train, y_train)

#Estimate Survived
y_pred = clf.predict(X_train)

# Estimate probability of survived
y_pred_proba = clf.predict_proba(X_train)

clf, y_pred[:5], y_pred_proba[:5]

(DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=3, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=123, splitter='best'),
 array([0, 1, 0, 1, 1]),
 array([[0.51666667, 0.48333333],
        [0.0326087 , 0.9673913 ],
        [0.88      , 0.12      ],
        [0.0326087 , 0.9673913 ],
        [0.0326087 , 0.9673913 ]]))

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [19]:
# Compute Accuracy
acc = round(clf.score(X_train, y_train),2)

# Create Confustion Matrix
labels = sorted(y_train.survived.unique())

cm = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [20]:
TP = cm[0][0]
FN = cm[0][1]
FP = cm[1][0]
TN = cm[1][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

# Create Classification Report
cr = classification_report(y_train, y_pred)

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [21]:
print('Accuracy: ', acc, '\n')
print('True Positive Rate: ', TPR,  '\n')
print('False Positive Rate: ', FPR, '\n')
print('True Negative Rate: ', TNR, '\n')
print('False Negative Rate: ', FNR, '\n')
print('Classification Report: \n', cr)
print('Confustion Matrix: \n', cm)

Accuracy:  0.82 

True Positive Rate:  0.8181818181818182 

False Positive Rate:  0.1794871794871795 

True Negative Rate:  0.8205128205128205 

False Negative Rate:  0.18181818181818182 

Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497

Confustion Matrix: 
      0    1
0  279   28
1   62  128


4. Run through steps 2-4 using a different max_depth value.

In [22]:
# Decision Tree Object
clf2 = DecisionTreeClassifier(max_depth=5, random_state=123)

# Fit model to training data
clf2 = clf2.fit(X_train, y_train)

#Estimate Survived
y_pred2 = clf2.predict(X_train)

# Estimate probability of survived
y_pred_proba2 = clf2.predict_proba(X_train)

clf2, y_pred2[:5], y_pred_proba2[:5]

(DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=5, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=123, splitter='best'),
 array([0, 1, 0, 1, 1]),
 array([[0.60416667, 0.39583333],
        [0.        , 1.        ],
        [0.87777778, 0.12222222],
        [0.        , 1.        ],
        [0.        , 1.        ]]))

In [23]:
# Compute Accuracy
acc2 = round(clf2.score(X_train, y_train),2)

# Create Confustion Matrix
labels2 = sorted(y_train.survived.unique())

cm2 = pd.DataFrame(confusion_matrix(y_train, y_pred2), index=labels, columns=labels)
cm2

Unnamed: 0,0,1
0,285,22
1,49,141


In [24]:
TP2 = cm2[0][0]
FN2 = cm2[0][1]
FP2 = cm2[1][0]
TN2 = cm2[1][1]

TPR2 = TP2/(TP2+FN2)
FPR2 = FP2/(FP2+TN2)
TNR2 = TN2/(TN2+FP2)
FNR2 = FN2/(FN2+TP2)

# Create Classification Report
cr2 = classification_report(y_train, y_pred2)

In [25]:
print('Accuracy 2: ', acc2, '\n'),
print('True Positive Rate 2: ', TPR2,  '\n')
print('False Positive Rate 2: ', FPR2, '\n')
print('True Negative Rate 2: ', TNR2, '\n')
print('False Negative Rate 2: ', FNR2, '\n')
print('Classification Report 2: \n', cr2),
print('Confustion Matrix 2: \n', cm2)

Accuracy 2:  0.86 

True Positive Rate 2:  0.8532934131736527 

False Positive Rate 2:  0.13496932515337423 

True Negative Rate 2:  0.8650306748466258 

False Negative Rate 2:  0.1467065868263473 

Classification Report 2: 
               precision    recall  f1-score   support

           0       0.85      0.93      0.89       307
           1       0.87      0.74      0.80       190

    accuracy                           0.86       497
   macro avg       0.86      0.84      0.84       497
weighted avg       0.86      0.86      0.85       497

Confustion Matrix 2: 
      0    1
0  285   22
1   49  141


5. Which performs better on your in-sample data?

In [26]:
d = {'Results': ['Accuracy', 
                 'True Postive Rate', 
                 'False Positive Rate', 
                 'True Negative Rate', 
                 'False Negative Rate'], 
     'Model 1': [acc, TPR, FPR, TNR, FNR], 
     'Model 2':[acc2, TPR2, FPR2, TNR2, FNR2]
    }

best_model = pd.DataFrame(data=d)
best_model

Unnamed: 0,Results,Model 1,Model 2
0,Accuracy,0.82,0.86
1,True Postive Rate,0.818182,0.853293
2,False Positive Rate,0.179487,0.134969
3,True Negative Rate,0.820513,0.865031
4,False Negative Rate,0.181818,0.146707


In [27]:
print('Model 1: \n', cr, '\n', 'Model 2: \n', cr2)

Model 1: 
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497
 
 Model 2: 
               precision    recall  f1-score   support

           0       0.85      0.93      0.89       307
           1       0.87      0.74      0.80       190

    accuracy                           0.86       497
   macro avg       0.86      0.84      0.84       497
weighted avg       0.86      0.86      0.85       497



In [28]:
import graphviz

from graphviz import Graph

dot_data = export_graphviz(clf,                        
                           feature_names= X_train.columns,                      
                           class_names= {0:'not survived', 1:'survived'},                         
                           rounded=True,   
                           filled=True,                         
                           out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

# Random Forests

In [63]:
from sklearn.ensemble import RandomForestClassifier

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [64]:
# create random forest object
rf = RandomForestClassifier(max_depth=20, 
                            min_samples_leaf=2, 
                            random_state=123)
#fit to the train data
rf = rf.fit(X_train, y_train)

#predict survival based on rf model
y_pred = rf.predict(X_train)

#predict probability survival based on rf model
y_pred_proba = rf.predict_proba(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [65]:
acc = rf.score(X_train, y_train)

cm = confusion_matrix(y_train, y_pred)

cr = classification_report(y_train, y_pred)

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [66]:
TP = cm[0][0]
FN = cm[0][1]
FP = cm[1][0]
TN = cm[1][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

In [67]:
print('RANDOM FOREST: MODEL 1 \n')
print('Accuracy: ', acc, '\n')
print('True Positive Rate: ', TPR,  '\n')
print('False Positive Rate: ', FPR, '\n')
print('True Negative Rate: ', TNR, '\n')
print('False Negative Rate: ', FNR, '\n')
print('Classification Report: \n', cr)
print('Confusion Matrix: \n', cm)

RANDOM FOREST: MODEL 1 

Accuracy:  0.9356136820925554 

True Positive Rate:  0.9771986970684039 

False Positive Rate:  0.13157894736842105 

True Negative Rate:  0.868421052631579 

False Negative Rate:  0.02280130293159609 

Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.98      0.95       307
           1       0.96      0.87      0.91       190

    accuracy                           0.94       497
   macro avg       0.94      0.92      0.93       497
weighted avg       0.94      0.94      0.93       497

Confusion Matrix: 
 [[300   7]
 [ 25 165]]


4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [68]:
# create random forest object
rf2 = RandomForestClassifier(max_depth=3, 
                            min_samples_leaf=5, 
                            random_state=123)
#fit to the train data
rf2 = rf2.fit(X_train, y_train)

#predict survival based on rf model
y_pred2 = rf2.predict(X_train)

In [69]:
acc2 = rf2.score(X_train, y_train)

cm2 = confusion_matrix(y_train, y_pred2)

cr2 = classification_report(y_train, y_pred2)

In [70]:
TP2 = cm2[0][0]
FN2 = cm2[0][1]
FP2 = cm2[1][0]
TN2 = cm2[1][1]

TPR2 = TP2/(TP2+FN2)
FPR2 = FP2/(FP2+TN2)
TNR2 = TN2/(TN2+FP2)
FNR2 = FN2/(FN2+TP2)

In [71]:
print('RANDOM FOREST: MODEL 2 \n')
print('Accuracy: ', acc2, '\n')
print('True Positive Rate: ', TPR2,  '\n')
print('False Positive Rate: ', FPR2, '\n')
print('True Negative Rate: ', TNR2, '\n')
print('False Negative Rate: ', FNR2, '\n')
print('Classification Report: \n', cr2)
print('Confusion Matrix: \n', cm2)

RANDOM FOREST: MODEL 2 

Accuracy:  0.8309859154929577 

True Positive Rate:  0.9381107491856677 

False Positive Rate:  0.34210526315789475 

True Negative Rate:  0.6578947368421053 

False Negative Rate:  0.06188925081433225 

Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.94      0.87       307
           1       0.87      0.66      0.75       190

    accuracy                           0.83       497
   macro avg       0.84      0.80      0.81       497
weighted avg       0.84      0.83      0.83       497

Confusion Matrix: 
 [[288  19]
 [ 65 125]]


5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [72]:
d = {'Results': ['Accuracy', 
                 'True Postive Rate', 
                 'False Positive Rate', 
                 'True Negative Rate', 
                 'False Negative Rate'], 
     'Model 1': [acc, TPR, FPR, TNR, FNR], 
     'Model 2':[acc2, TPR2, FPR2, TNR2, FNR2]
    }

best_model = pd.DataFrame(data=d)
best_model

Unnamed: 0,Results,Model 1,Model 2
0,Accuracy,0.935614,0.830986
1,True Postive Rate,0.977199,0.938111
2,False Positive Rate,0.131579,0.342105
3,True Negative Rate,0.868421,0.657895
4,False Negative Rate,0.022801,0.061889


In [73]:
print('Model 1 (min_samples_leaf = 1, max_depth = 20): \n', cr, '\n', 'Model 2 (min_samples_leaf = 5, max_depth = 3): \n', cr2)

Model 1 (min_samples_leaf = 1, max_depth = 20): 
               precision    recall  f1-score   support

           0       0.92      0.98      0.95       307
           1       0.96      0.87      0.91       190

    accuracy                           0.94       497
   macro avg       0.94      0.92      0.93       497
weighted avg       0.94      0.94      0.93       497
 
 Model 2 (min_samples_leaf = 5, max_depth = 3): 
               precision    recall  f1-score   support

           0       0.82      0.94      0.87       307
           1       0.87      0.66      0.75       190

    accuracy                           0.83       497
   macro avg       0.84      0.80      0.81       497
weighted avg       0.84      0.83      0.83       497



In [None]:
#model 1 has better evaluation metrics because it has more depth

# K-Nearest Neighbors

In [74]:
from sklearn.neighbors import KNeighborsClassifier

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [75]:
# create KNN object
knn = KNeighborsClassifier()

knn = knn.fit(X_train, y_train)

y_pred = knn.predict(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [76]:
acc = knn.score(X_train, y_train)

cm = confusion_matrix(y_train, y_pred)

cr = classification_report(y_train, y_pred)

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [77]:
TP = cm[0][0]
FN = cm[0][1]
FP = cm[1][0]
TN = cm[1][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

In [78]:
print('KNN: MODEL 1 \n')
print('Accuracy: ', acc, '\n')
print('True Positive Rate: ', TPR,  '\n')
print('False Positive Rate: ', FPR, '\n')
print('True Negative Rate: ', TNR, '\n')
print('False Negative Rate: ', FNR, '\n')
print('Classification Report: \n', cr)
print('Confusion Matrix: \n', cm)

KNN: MODEL 1 

Accuracy:  0.7746478873239436 

True Positive Rate:  0.8338762214983714 

False Positive Rate:  0.32105263157894737 

True Negative Rate:  0.6789473684210526 

False Negative Rate:  0.16612377850162866 

Classification Report: 
               precision    recall  f1-score   support

           0       0.81      0.83      0.82       307
           1       0.72      0.68      0.70       190

    accuracy                           0.77       497
   macro avg       0.76      0.76      0.76       497
weighted avg       0.77      0.77      0.77       497

Confusion Matrix: 
 [[256  51]
 [ 61 129]]


4. Run through steps 2-4 setting k to 10

In [79]:
knn2 = KNeighborsClassifier(n_neighbors=10)

knn2 = knn2.fit(X_train, y_train)

y_pred2 = knn2.predict(X_train)

In [80]:
acc2 = knn2.score(X_train, y_train)

cm2 = confusion_matrix(y_train, y_pred2)

cr2 = classification_report(y_train, y_pred2)

In [81]:
TP2 = cm2[0][0]
FN2 = cm2[0][1]
FP2 = cm2[1][0]
TN2 = cm2[1][1]

TPR2 = TP2/(TP2+FN2)
FPR2 = FP2/(FP2+TN2)
TNR2 = TN2/(TN2+FP2)
FNR2 = FN2/(FN2+TP2)

In [82]:
print('KNN: MODEL 2 \n')
print('Accuracy: ', acc2, '\n')
print('True Positive Rate: ', TPR2,  '\n')
print('False Positive Rate: ', FPR2, '\n')
print('True Negative Rate: ', TNR2, '\n')
print('False Negative Rate: ', FNR2, '\n')
print('Classification Report: \n', cr2)
print('Confusion Matrix: \n', cm2)

KNN: MODEL 2 

Accuracy:  0.7605633802816901 

True Positive Rate:  0.9185667752442996 

False Positive Rate:  0.49473684210526314 

True Negative Rate:  0.5052631578947369 

False Negative Rate:  0.08143322475570032 

Classification Report: 
               precision    recall  f1-score   support

           0       0.75      0.92      0.83       307
           1       0.79      0.51      0.62       190

    accuracy                           0.76       497
   macro avg       0.77      0.71      0.72       497
weighted avg       0.77      0.76      0.75       497

Confusion Matrix: 
 [[282  25]
 [ 94  96]]


5. Run through setps 2-4 setting k to 20

In [83]:
knn3 = KNeighborsClassifier(n_neighbors=20)

knn3 = knn3.fit(X_train, y_train)

y_pred3 = knn3.predict(X_train)

In [84]:
acc3 = knn3.score(X_train, y_train)

cm3 = confusion_matrix(y_train, y_pred3)

cr3 = classification_report(y_train, y_pred3)

In [85]:
TP3 = cm2[0][0]
FN3 = cm2[0][1]
FP3 = cm2[1][0]
TN3 = cm2[1][1]

TPR3 = TP3/(TP3+FN3)
FPR3 = FP3/(FP3+TN3)
TNR3 = TN3/(TN3+FP3)
FNR3 = FN3/(FN3+TP3)

In [86]:
print('KNN: MODEL 3 \n')
print('Accuracy: ', acc3, '\n')
print('True Positive Rate: ', TPR3,  '\n')
print('False Positive Rate: ', FPR3, '\n')
print('True Negative Rate: ', TNR3, '\n')
print('False Negative Rate: ', FNR3, '\n')
print('Classification Report: \n', cr3)
print('Confusion Matrix: \n', cm3)

KNN: MODEL 3 

Accuracy:  0.7183098591549296 

True Positive Rate:  0.9185667752442996 

False Positive Rate:  0.49473684210526314 

True Negative Rate:  0.5052631578947369 

False Negative Rate:  0.08143322475570032 

Classification Report: 
               precision    recall  f1-score   support

           0       0.72      0.90      0.80       307
           1       0.72      0.43      0.54       190

    accuracy                           0.72       497
   macro avg       0.72      0.66      0.67       497
weighted avg       0.72      0.72      0.70       497

Confusion Matrix: 
 [[276  31]
 [109  81]]


6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [93]:
d = {'Results': ['Accuracy', 
                 'True Postive Rate', 
                 'False Positive Rate', 
                 'True Negative Rate', 
                 'False Negative Rate'], 
     'Model 1': [acc, TPR, FPR, TNR, FNR], 
     'Model 2':[acc2, TPR2, FPR2, TNR2, FNR2],
     'Model 3': [acc3, TPR3, FPR3, TNR3, FNR3]
    }

best_model = pd.DataFrame(data=d)
best_model

Unnamed: 0,Results,Model 1,Model 2,Model 3
0,Accuracy,0.774648,0.760563,0.71831
1,True Postive Rate,0.833876,0.918567,0.918567
2,False Positive Rate,0.321053,0.494737,0.494737
3,True Negative Rate,0.678947,0.505263,0.505263
4,False Negative Rate,0.166124,0.081433,0.081433


In [88]:
print('Model 1 (k=5): \n', cr, '\n', 
      'Model 2 (k=10): \n', cr2, '\n',
      'Model 3 (k=20): \n', cr3)

Model 1 (k=5): 
               precision    recall  f1-score   support

           0       0.81      0.83      0.82       307
           1       0.72      0.68      0.70       190

    accuracy                           0.77       497
   macro avg       0.76      0.76      0.76       497
weighted avg       0.77      0.77      0.77       497
 
 Model 2 (k=10): 
               precision    recall  f1-score   support

           0       0.75      0.92      0.83       307
           1       0.79      0.51      0.62       190

    accuracy                           0.76       497
   macro avg       0.77      0.71      0.72       497
weighted avg       0.77      0.76      0.75       497
 
 Model 3 (k=20): 
               precision    recall  f1-score   support

           0       0.72      0.90      0.80       307
           1       0.72      0.43      0.54       190

    accuracy                           0.72       497
   macro avg       0.72      0.66      0.67       497
weighted avg     

In [None]:
# model 1 takes the lead, they look very similar 
# with k=5 as a smaller source of data points 

**Test**

For both the iris and the titanic data,

1. Determine which model (with hyperparameters) performs the best (try reducing the number of features to the top 4 features in terms of information gained for each feature individually).

2. Create a new dataframe with top 4 features.

3. Use the top performing algorithm with the metaparameters used in that model. Create the object, fit, transform on in-sample data, and evaluate the results with the training data. Compare your evaluation metrics with those from the original model (with all the features).

4. Run your final model on your out-of-sample dataframe (test_df). Evaluate the results.

**Feature Engineering**

- Titanic Data
    - Create a feature named who, this should be either man, woman, or child. How does including this feature affect your model's performance?
    - Create a feature named adult_male that is either a 1 or a 0. How does this affect your model's predictions?

- Iris Data
    - Create features named petal_area and sepal_area.