In [9]:
from pydataset import data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from acquire import get_titanic_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import \
accuracy_score,\
recall_score,\
precision_score,\
confusion_matrix,\
classification_report
from sklearn.neighbors import KNeighborsClassifier

In [3]:
df = get_titanic_data('titanic_db')
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
def train_val_test(df, strat, seed = 123):
    train, val_test = train_test_split(df, train_size = 0.8, random_state = seed, stratify = df[strat])
    val, test = train_test_split(val_test, train_size = 0.5, random_state = seed, stratify = val_test[strat])
    return train, val, test

In [5]:
train, validate, test = train_val_test(df, 'survived')

In [6]:
train = train.drop(columns=['embark_town', 'class', 'deck', 'age'])
validate = validate.drop(columns=['embark_town', 'class', 'deck', 'age'])
test = test.drop(columns=['embark_town', 'class', 'deck', 'age'])

In [7]:

target = 'survived'

X_train = train.drop(columns=['survived', 'sex', 'embarked', 'passenger_id'])
y_train = train[target]

X_validate = validate.drop(columns=['survived', 'sex', 'embarked', 'passenger_id'])
X_test = test.drop(columns=['survived', 'sex', 'embarked', 'passenger_id'])


y_validate = validate[target]
y_test = test[target]

In [11]:
# Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn

In [12]:
knn.fit(X_train, y_train)


In [13]:
y_pred = knn.predict(X_train)


In [14]:
# Evaluate your results using the model score, confusion matrix, and classification report.
print('Accuracy of KNN classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('~~~~~')
print(confusion_matrix(y_train, y_pred))
print('~~~~~')
print(classification_report(y_train, y_pred))



Accuracy of KNN classifier on training set: 0.76
~~~~~
[[369  70]
 [ 99 174]]
~~~~~
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       439
           1       0.71      0.64      0.67       273

    accuracy                           0.76       712
   macro avg       0.75      0.74      0.74       712
weighted avg       0.76      0.76      0.76       712



In [17]:
# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, 
# recall, f1-score, and support.
def compute_metrics(TN,FP,FN,TP):
    all_ = (TP + TN + FP + FN)

    accuracy = (TP + TN) / all_

    TPR = recall = TP / (TP + FN)
    FPR = FP / (FP + TN)

    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)

    precision =  TP / (TP + FP)
    f1 =  2 * ((precision * recall) / ( precision + recall))

    support_pos = TP + FN
    support_neg = FP + TN

    print(f"Accuracy: {accuracy}\n")
    print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
    print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
    print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
    print(f"False Negative Rate/Miss Rate: {FNR}\n")
    print(f"Precision/PPV: {precision}")
    print(f"F1 Score: {f1}\n")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")
confu = confusion_matrix(y_train, y_pred)
TN, FP, FN, TP = confu.ravel()
compute_metrics(TN, FP, FN, TP)

Accuracy: 0.7626404494382022

True Positive Rate/Sensitivity/Recall/Power: 0.6373626373626373
False Positive Rate/False Alarm Ratio/Fall-out: 0.15945330296127563
True Negative Rate/Specificity/Selectivity: 0.8405466970387244
False Negative Rate/Miss Rate: 0.3626373626373626

Precision/PPV: 0.7131147540983607
F1 Score: 0.6731141199226305

Support (0): 273
Support (1): 439


In [18]:
# Run through steps 1-3 setting k to 10
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
confu = confusion_matrix(y_train, y_pred)
TN, FP, FN, TP = confu.ravel()
compute_metrics(TN, FP, FN, TP)


Accuracy: 0.7289325842696629

True Positive Rate/Sensitivity/Recall/Power: 0.4945054945054945
False Positive Rate/False Alarm Ratio/Fall-out: 0.1252847380410023
True Negative Rate/Specificity/Selectivity: 0.8747152619589977
False Negative Rate/Miss Rate: 0.5054945054945055

Precision/PPV: 0.7105263157894737
F1 Score: 0.5831533477321814

Support (0): 273
Support (1): 439


In [19]:
# Run through steps 1-3 setting k to 20
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
confu = confusion_matrix(y_train, y_pred)
TN, FP, FN, TP = confu.ravel()
compute_metrics(TN, FP, FN, TP)


Accuracy: 0.7064606741573034

True Positive Rate/Sensitivity/Recall/Power: 0.4468864468864469
False Positive Rate/False Alarm Ratio/Fall-out: 0.13211845102505695
True Negative Rate/Specificity/Selectivity: 0.8678815489749431
False Negative Rate/Miss Rate: 0.5531135531135531

Precision/PPV: 0.6777777777777778
F1 Score: 0.5386313465783664

Support (0): 273
Support (1): 439


In [None]:
# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

# it got slightly less accurate with each larger k value. k of 5 worked best on my train sample data because it was not overfitting and still gave a better accuracy


In [20]:
# Which model performs best on our out-of-sample data from validate?
# still k of 5 with the highest accuracy, and it is almost identical to the train sample's accuracy
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X_validate, y_validate)
y_pred = knn.predict(X_validate)
confu = confusion_matrix(y_validate, y_pred)
TN, FP, FN, TP = confu.ravel()
compute_metrics(TN, FP, FN, TP)


Accuracy: 0.7640449438202247

True Positive Rate/Sensitivity/Recall/Power: 0.6764705882352942
False Positive Rate/False Alarm Ratio/Fall-out: 0.18181818181818182
True Negative Rate/Specificity/Selectivity: 0.8181818181818182
False Negative Rate/Miss Rate: 0.3235294117647059

Precision/PPV: 0.696969696969697
F1 Score: 0.6865671641791046

Support (0): 34
Support (1): 55


In [21]:
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn.fit(X_validate, y_validate)
y_pred = knn.predict(X_validate)
confu = confusion_matrix(y_validate, y_pred)
TN, FP, FN, TP = confu.ravel()
compute_metrics(TN, FP, FN, TP)

Accuracy: 0.7528089887640449

True Positive Rate/Sensitivity/Recall/Power: 0.6176470588235294
False Positive Rate/False Alarm Ratio/Fall-out: 0.16363636363636364
True Negative Rate/Specificity/Selectivity: 0.8363636363636363
False Negative Rate/Miss Rate: 0.38235294117647056

Precision/PPV: 0.7
F1 Score: 0.65625

Support (0): 34
Support (1): 55


In [22]:
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn.fit(X_validate, y_validate)
y_pred = knn.predict(X_validate)
confu = confusion_matrix(y_validate, y_pred)
TN, FP, FN, TP = confu.ravel()
compute_metrics(TN, FP, FN, TP)

Accuracy: 0.6629213483146067

True Positive Rate/Sensitivity/Recall/Power: 0.20588235294117646
False Positive Rate/False Alarm Ratio/Fall-out: 0.05454545454545454
True Negative Rate/Specificity/Selectivity: 0.9454545454545454
False Negative Rate/Miss Rate: 0.7941176470588235

Precision/PPV: 0.7
F1 Score: 0.3181818181818182

Support (0): 34
Support (1): 55
