In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

## Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

- What is your baseline prediction? <b> predicting the most prevalent class in the training data: did not survive (survived = 0) </b>

- What is your baseline accuracy? <b> 61.6%</b> Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

- Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample) <b> complete</b>

- Evaluate your in-sample results using the model score, confusion matrix, and classification report. <b> complete</b>

- Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support. <b>complete, see classification report</b>

- Run through steps 2-4 using a different max_depth value. <b> complete, depth value of 3 works best</b> 

- Which model performs better on your in-sample data? <b> model with depth of 3</b>

- Which model performs best on your out-of-sample data, the validate set? <b> The model with a depth of 3</b>

In [2]:
# Read the CSV through a relative path so the notebook is not tied to one machine's home directory.
# NOTE(review): assumes titanic_df.csv sits next to this notebook in the classification-exercises
# repository and that the notebook is run from that folder -- confirm before relying on it.
df = pd.read_csv('titanic_df.csv')

def handle_missing_values(df):
    '''
    Return a new dataframe in which missing embark_town values are replaced
    with 'Other' and missing embarked values are replaced with 'O'.
    The dataframe passed in is left untouched.
    '''
    filled_columns = {
        'embark_town': df.embark_town.fillna('Other'),
        'embarked': df.embarked.fillna('O'),
    }
    return df.assign(**filled_columns)

def remove_columns(df):
    '''
    Return a new dataframe without the 'deck' column.
    The dataframe passed in is left untouched.
    '''
    unwanted = ['deck']
    return df.drop(unwanted, axis=1)

def encode_embarked(df):
    '''
    Return a new dataframe with an added embarked_encode column holding the
    labels a LabelEncoder fitted on the embarked column assigns to each row.
    '''
    embarked_codes = LabelEncoder().fit_transform(df.embarked)
    return df.assign(embarked_encode=embarked_codes)

def prep_titanic_data(df):
    '''
    Run the three preparation steps in order -- handle_missing_values,
    remove_columns, encode_embarked -- and return the resulting dataframe.
    '''
    filled = handle_missing_values(df)
    trimmed = remove_columns(filled)
    return encode_embarked(trimmed)

def train_validate_test_split(df, seed=123):
    '''
    Split df into train, validate and test dataframes, stratifying both
    splits on the survived column.

    test is 20% of df; validate is 30% of what remains; train is the rest.
    seed is used as random_state for both splits.

    return: train, validate, test
    '''
    remainder, test = train_test_split(
        df,
        test_size=0.2,
        random_state=seed,
        stratify=df.survived,
    )
    train, validate = train_test_split(
        remainder, test_size=0.3, random_state=seed, stratify=remainder.survived
    )
    return train, validate, test





In [3]:
# check out data columns first before cleaning. Get rid of 'Unnamed', 'passenger_id', 'pclass', 'age', 'sibsp', 'parch'
# rename survived column to 'yes' 'no' and use as target variable

df.head()


Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
# clean data followed by creating train/validate/test function

def clean_titanic(titanic_df=None):
    '''
    clean_titanic returns a cleaned copy of the titanic dataframe and leaves
    the dataframe it was given unmodified.

    titanic_df: dataframe to clean. When omitted, the notebook-level variable
    df is used, so existing clean_titanic() calls keep working.

    Steps:
    drop duplicate rows,
    drop the columns listed in dropcols,
    fill missing embark_town values with 'Southampton',
    append dummy columns (first level dropped) for embark_town, sex and class

    return: single cleaned dataframe
    '''
    if titanic_df is None:
        # fall back to the notebook-level dataframe
        titanic_df = df
    dropcols = ['age', 'deck', 'embarked', 'Unnamed: 0', 'passenger_id', 'pclass', 'sibsp', 'parch']
    # drop_duplicates / drop without inplace=True build new frames, so the
    # caller's dataframe keeps all of its rows and columns
    cleaned = titanic_df.drop_duplicates().drop(columns=dropcols)
    cleaned = cleaned.assign(embark_town=cleaned['embark_town'].fillna('Southampton'))
    dummies = pd.get_dummies(cleaned[['embark_town', 'sex', 'class']], drop_first=True)
    return pd.concat([cleaned, dummies], axis=1)


def train_validate_test_split(df, seed=123):
    '''
    Split df into train, validate and test dataframes, stratified on survived.

    This repeats the definition given in the cell that loads the data; once
    this cell runs, this later definition is the one in effect.

    test is 20% of df; validate is 30% of the remaining rows; train is the rest.
    seed is used as random_state for both splits.

    return: train, validate, test
    '''
    shared_options = {'random_state': seed}
    train_and_validate, test = train_test_split(
        df, test_size=0.2, stratify=df.survived, **shared_options
    )
    train, validate = train_test_split(
        train_and_validate,
        test_size=0.3,
        stratify=train_and_validate.survived,
        **shared_options,
    )
    return train, validate, test



In [5]:
# clean data using function from above

df = clean_titanic()

In [6]:
# rename the survived column's values to no/yes in place of 0/1 (currently commented out)

#df['survived'] = df['survived'].replace([0,1],['no', 'yes'])

In [7]:
# check columns to make sure changes have been made

df.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third
0,0,male,7.25,Third,Southampton,0,0,1,1,0,1
1,1,female,71.2833,First,Cherbourg,0,0,0,0,0,0
2,1,female,7.925,Third,Southampton,1,0,1,0,0,1
3,1,female,53.1,First,Southampton,0,0,1,0,0,0
4,0,male,8.05,Third,Southampton,1,0,1,1,0,1


In [8]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = train_validate_test_split(df, seed=123)
train.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third
583,0,male,40.125,First,Cherbourg,1,0,0,1,0,0
165,1,male,20.525,Third,Southampton,0,0,1,1,0,1
50,0,male,39.6875,Third,Southampton,0,0,1,1,0,1
259,1,female,26.0,Second,Southampton,0,0,1,0,1,0
306,1,female,110.8833,First,Cherbourg,1,0,0,0,0,0


In [9]:
# Setup our X inputs and y target variable for each split
# The target and the un-encoded text columns are left out of the model inputs
non_feature_cols = ['survived', 'sex', 'embark_town', 'class']

X_train = train.drop(columns=non_feature_cols)
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=non_feature_cols)
y_validate = validate.survived

X_test = test.drop(columns=non_feature_cols)
y_test = test.survived

In [10]:
# check training data

train.head()

Unnamed: 0,survived,sex,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male,class_Second,class_Third
583,0,male,40.125,First,Cherbourg,1,0,0,1,0,0
165,1,male,20.525,Third,Southampton,0,0,1,1,0,1
50,0,male,39.6875,Third,Southampton,0,0,1,1,0,1
259,1,female,26.0,Second,Southampton,0,0,1,0,1,0
306,1,female,110.8833,First,Cherbourg,1,0,0,0,0,0


In [11]:
df.groupby('survived').sex.value_counts()

survived  sex   
0         male      468
          female     81
1         female    233
          male      109
Name: sex, dtype: int64

In [12]:
train.groupby('survived').sex.value_counts()

survived  sex   
0         male      265
          female     42
1         female    133
          male       58
Name: sex, dtype: int64

In [13]:
# Let's generate a blank, new Decision Tree model
# Be sure to set the max_depth argument
# clf = DecisionTreeClassifier(max_depth=3, random_state=123)

clf = DecisionTreeClassifier(max_depth=3, random_state=123)


In [14]:
# Now let's train our model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=3, random_state=123)

In [15]:
train.survived.unique()

array([0, 1])

In [16]:
clf.classes_

array([0, 1])

In [17]:
# Baseline prediction: every row gets the most common survived value in train.
# Computed from the data instead of typed in; for this split it is 0.
train["most_frequent"] = train.survived.mode()[0]

In [18]:
train['most_frequent'].value_counts()

0    498
Name: most_frequent, dtype: int64

In [19]:
# Visualize the model so it can explain itself!

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [20]:
# Now we'll make a set of predictions using this trained model
y_pred = clf.predict(X_train)
y_pred[0:3]

array([0, 0, 0])

In [None]:
# Estimate the probabilities for each class
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:3]

In [None]:
y_train.head(3)

In [None]:

baseline_accuracy = (train.survived == train.most_frequent).mean()
baseline_accuracy 

In [None]:
# Let's evaluate the model
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

In [None]:
accuracy = clf.score(X_train, y_train)
accuracy

In [None]:
conf = confusion_matrix(y_train, y_pred)
conf

In [None]:
class_report = classification_report(y_train, y_pred, output_dict=True)

In [None]:
pd.DataFrame(class_report).T

In [None]:
conf_df = pd.DataFrame(conf, columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])

conf_df

In [None]:
# clf was trained on X_train, y_train
# To evaluate the trained model on new data, pass the out-of-sample inputs and
# labels (here the validate split) as the arguments to .score()
clf.score(X_validate, y_validate)

In [None]:
# Let's evaluate this model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

In [None]:
# Use the classification model trained on train data to make predictions on validate data
y_pred = clf.predict(X_validate)
y_pred[0:3]

In [None]:
y_validate.head(3)

In [None]:
# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

In [None]:
# NOTE(review): despite the name, this mask (sex_male > 0.5) keeps the rows where
# sex_male is 1, i.e. the male passengers -- confirm whether the name or the
# comparison is what was intended.
not_male = train[train.sex_male > 0.5]

In [None]:
fare = not_male[not_male.fare <= 18.275]

In [None]:
fare.head()

In [None]:
alone = fare[fare.alone <=.5]

In [None]:
alone.survived.value_counts()

In [None]:
# Visualize the model so it can explain itself!

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

In [None]:
# run new model with depth of 2
clf1 = DecisionTreeClassifier(max_depth=2, random_state=123)


In [None]:
# Now let's train our model on the training data
# fitting == training the model
clf1 = clf1.fit(X_train, y_train)
clf1

In [None]:
y_pred = clf1.predict(X_train)
y_pred[0:3]

In [None]:
# Estimate the probabilities for each class
y_pred_proba = clf1.predict_proba(X_train)
y_pred_proba[0:3]

In [None]:
# clf1 (max_depth=2) was trained on X_train, y_train
# Score it on the validate split to see how it does on out-of-sample data
clf1.score(X_validate, y_validate)

In [None]:
# Let's evaluate this model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf1.score(X_validate, y_validate)))

In [45]:
#----------------------------------------------------------------------

from sklearn.ensemble import RandomForestClassifier

### Random Forest Exercises

- Continue working in your model file with titanic data to do the following:

- Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

- Evaluate your results using the model score, confusion matrix, and classification report.

- Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

- Run through steps increasing your min_samples_leaf and decreasing your max_depth.

- What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

- After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [46]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [47]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [48]:
print(rf.feature_importances_)

[0.48108242 0.03828706 0.01592272 0.02759923 0.33183925 0.02041746
 0.08485187]


In [49]:
y_pred = rf.predict(X_train)

In [50]:
y_pred_proba = rf.predict_proba(X_train)

In [51]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.94


In [52]:
print(confusion_matrix(y_train, y_pred))

[[302   5]
 [ 27 164]]


In [53]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95       307
           1       0.97      0.86      0.91       191

    accuracy                           0.94       498
   macro avg       0.94      0.92      0.93       498
weighted avg       0.94      0.94      0.93       498



In [54]:
# The score is computed on the validate split, so the label names that split
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.77


In [55]:
# create a function to calculate these metrics
def get_metrics_binary(rf, X=None, y=None):
    '''
    get_metrics_binary takes in a fitted binary classifier, makes predictions
    with it, and prints its accuracy and its true/false positive and negative rates.

    rf: fitted classifier that provides .predict and .score
    X, y: inputs and labels to evaluate on; when both are omitted the
          notebook-level X_train and y_train are used. Pass both or neither.

    The predictions are made inside the function from the model passed in, so
    the result does not depend on what the notebook-level y_pred currently holds.

    return: a classification report as a transposed DataFrame
    raises: ValueError when only one of X and y is given
    '''
    if (X is None) != (y is None):
        raise ValueError('pass both X and y, or neither')
    if X is None:
        X, y = X_train, y_train
    predictions = rf.predict(X)
    accuracy = rf.score(X, y)
    class_report = pd.DataFrame(classification_report(y, predictions, output_dict=True)).T
    # confusion_matrix rows are the actual classes, columns the predicted classes
    conf = confusion_matrix(y, predictions)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report
    

In [56]:
class_report1 = get_metrics_binary(rf)
class_report1


    The accuracy for our model is 0.9357
    The True Positive Rate is 0.859, The False Positive Rate is 0.0163,
    The True Negative Rate is 0.984, and the False Negative Rate is 0.141
    


Unnamed: 0,precision,recall,f1-score,support
0,0.917933,0.983713,0.949686,307.0
1,0.970414,0.858639,0.911111,191.0
accuracy,0.935743,0.935743,0.935743,0.935743
macro avg,0.944174,0.921176,0.930398,498.0
weighted avg,0.938061,0.935743,0.934891,498.0


In [57]:
#-------------------------------------

In [None]:
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=123)

In [None]:
rf2.fit(X_train, y_train)

In [None]:
print(rf2.feature_importances_)

In [None]:
y_pred = rf2.predict(X_train)

In [None]:
y_pred_proba = rf2.predict_proba(X_train)

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

In [None]:
# The score is computed on the validate split, so the label names that split
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf2.score(X_validate, y_validate)))

In [None]:
class_report2 = get_metrics_binary(rf2)

class_report2

In [None]:
# get predictions for our validation sets
# (the model inputs are built once and shared by both forests)
validate_inputs = validate.drop(columns=['survived', 'sex', 'embark_town', 'class'])
y_val_pred_1 = rf.predict(validate_inputs)
y_val_pred_2 = rf2.predict(validate_inputs)

In [None]:
# get validation accuracy
# NOTE(review): rf3 is created and fitted in cells further down this notebook, so on a
# fresh top-to-bottom run the accuracy_v_3 line raises NameError -- this cell needs to
# run (or be moved) after the rf3 cells.
validate_inputs = validate.drop(columns=['survived', 'sex', 'embark_town', 'class'])
validate_labels = validate.survived
accuracy_v_1 = rf.score(validate_inputs, validate_labels)
accuracy_v_2 = rf2.score(validate_inputs, validate_labels)
accuracy_v_3 = rf3.score(validate_inputs, validate_labels)

In [None]:
accuracy_v_1

In [None]:
accuracy_v_2

In [None]:
accuracy_v_3

- What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

    <b>The first model performed best on the in sample data which had a depth of 10 and min samples of 1.</b>
    
- After making a few models, which one has the best performance (or closest metrics) on both train and validate?

    <b>The second model performed slightly better on validate (1 percentage point) while the first model had a higher performance on train data.</b>

In [None]:
rf3 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [None]:
rf3.fit(X_train, y_train)

In [None]:
print(rf3.feature_importances_)

In [None]:
y_pred = rf3.predict(X_train)

In [None]:
y_pred_proba = rf3.predict_proba(X_train)

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf3.score(X_train, y_train)))

In [None]:
# The score is computed on the validate split, so the label names that split
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf3.score(X_validate, y_validate)))

In [None]:
class_report3 = get_metrics_binary(rf3)

class_report3

In [None]:
print('------------------------\n Model #1')

print(class_report1) 

print('------------------------\n Model #2')

print(class_report2)

print('------------------------\n Model #3')

print(class_report3)

# KNN Exercises

- Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

- Evaluate your results using the model score, confusion matrix, and classification report.

- Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

- Run through steps 2-4 setting k to 10

- Run through steps 2-4 setting k to 20

- What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

- Which model performs best on our out-of-sample data from validate?

In [21]:
from sklearn.neighbors import KNeighborsClassifier

##### Note to self: when setting up your inputs and target variables, you can't leave 'most_frequent' as a column in X_train, or else scoring on validate and test (which don't have that column) won't work properly.

In [22]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived', 'sex', 'embark_town', 'class', 'most_frequent'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived', 'sex', 'embark_town', 'class'])
y_validate = validate.survived

X_test = test.drop(columns=['survived', 'sex', 'embark_town', 'class'])
y_test = test.survived

In [23]:
knn = KNeighborsClassifier()

In [24]:
# Now let's train the model

knn.fit(X_train, y_train)

KNeighborsClassifier()

In [25]:
# Let's check the accuracy
accuracy = knn.score(X_train, y_train)
print(f"accuracy is {accuracy:.3}")

accuracy is 0.819


In [36]:
# Evaluate the model
y_pred = knn.predict(X_train)

In [37]:
# Let's see how well this model performs on out of sample data!
# The score is computed on the validate split, so the label names that split
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on test set: 0.75


In [38]:
# Obtain the predictions from the model
y_pred1 = knn.predict(X_validate)

In [39]:
def get_metrics_binary(knn, X=None, y=None):
    '''
    get_metrics_binary takes in a fitted binary classifier, makes predictions
    with it, and prints its accuracy and its true/false positive and negative rates.

    This re-defines the get_metrics_binary written in the random forest
    section; once this cell runs, this definition is the one in effect.

    knn: fitted classifier that provides .predict and .score
    X, y: inputs and labels to evaluate on; when both are omitted the
          notebook-level X_train and y_train are used. Pass both or neither.

    The predictions are made inside the function from the model passed in, so
    the result does not depend on what the notebook-level y_pred currently holds.

    return: a classification report as a transposed DataFrame
    raises: ValueError when only one of X and y is given
    '''
    if (X is None) != (y is None):
        raise ValueError('pass both X and y, or neither')
    if X is None:
        X, y = X_train, y_train
    predictions = knn.predict(X)
    accuracy = knn.score(X, y)
    class_report = pd.DataFrame(classification_report(y, predictions, output_dict=True)).T
    # confusion_matrix rows are the actual classes, columns the predicted classes
    conf = confusion_matrix(y, predictions)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [40]:
class_report_knn = get_metrics_binary(knn)

class_report_knn


    The accuracy for our model is 0.8193
    The True Positive Rate is 0.723, The False Positive Rate is 0.121,
    The True Negative Rate is 0.879, and the False Negative Rate is 0.277
    


Unnamed: 0,precision,recall,f1-score,support
0,0.835913,0.879479,0.857143,307.0
1,0.788571,0.722513,0.754098,191.0
accuracy,0.819277,0.819277,0.819277,0.819277
macro avg,0.812242,0.800996,0.805621,498.0
weighted avg,0.817756,0.819277,0.817622,498.0


In [65]:
# Let's check our other classification metrics
print(classification_report(y_validate, y_pred1))

              precision    recall  f1-score   support

           0       0.74      0.85      0.79       132
           1       0.68      0.52      0.59        82

    accuracy                           0.72       214
   macro avg       0.71      0.69      0.69       214
weighted avg       0.72      0.72      0.72       214



In [58]:
knn = KNeighborsClassifier(10)

In [59]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [60]:
# Let's check the accuracy
accuracy = knn.score(X_train, y_train)
print(f"accuracy is {accuracy:.3}")

accuracy is 0.789


In [61]:
# Evaluate the model
y_pred = knn.predict(X_train)

In [62]:
# Let's see how well this model performs on out of sample data!
# The score is computed on the validate split, so the label names that split
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on test set: 0.72


In [67]:
# Obtain the predictions from the model
y_pred1 = knn.predict(X_validate)

In [68]:
def get_metrics_binary(knn, X=None, y=None):
    '''
    get_metrics_binary takes in a fitted binary classifier, makes predictions
    with it, and prints its accuracy and its true/false positive and negative rates.

    This is a repeat definition of get_metrics_binary; once this cell runs,
    this definition is the one in effect.

    knn: fitted classifier that provides .predict and .score
    X, y: inputs and labels to evaluate on; when both are omitted the
          notebook-level X_train and y_train are used. Pass both or neither.

    The predictions are made inside the function from the model passed in, so
    the result does not depend on what the notebook-level y_pred currently holds.

    return: a classification report as a transposed DataFrame
    raises: ValueError when only one of X and y is given
    '''
    if (X is None) != (y is None):
        raise ValueError('pass both X and y, or neither')
    if X is None:
        X, y = X_train, y_train
    predictions = knn.predict(X)
    accuracy = knn.score(X, y)
    class_report = pd.DataFrame(classification_report(y, predictions, output_dict=True)).T
    # confusion_matrix rows are the actual classes, columns the predicted classes
    conf = confusion_matrix(y, predictions)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [69]:
class_report_knn_10 = get_metrics_binary(knn)

class_report_knn_10


    The accuracy for our model is 0.7892
    The True Positive Rate is 0.628, The False Positive Rate is 0.111,
    The True Negative Rate is 0.889, and the False Negative Rate is 0.372
    


Unnamed: 0,precision,recall,f1-score,support
0,0.793605,0.889251,0.83871,307.0
1,0.779221,0.628272,0.695652,191.0
accuracy,0.789157,0.789157,0.789157,0.789157
macro avg,0.786413,0.758762,0.767181,498.0
weighted avg,0.788088,0.789157,0.783842,498.0


In [70]:
# Let's check our other classification metrics
print(classification_report(y_validate, y_pred1))

              precision    recall  f1-score   support

           0       0.74      0.85      0.79       132
           1       0.68      0.52      0.59        82

    accuracy                           0.72       214
   macro avg       0.71      0.69      0.69       214
weighted avg       0.72      0.72      0.72       214



In [71]:
knn = KNeighborsClassifier(20)

In [72]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [73]:
# Let's check the accuracy
accuracy = knn.score(X_train, y_train)
print(f"accuracy is {accuracy:.3}")

accuracy is 0.743


In [74]:
# Evaluate the model
y_pred = knn.predict(X_train)

In [75]:
# Let's see how well this model performs on out of sample data!
# The score is computed on the validate split, so the label names that split
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on test set: 0.70


In [76]:
# Obtain the predictions from the model
y_pred1 = knn.predict(X_validate)

In [77]:
def get_metrics_binary(knn, X=None, y=None):
    '''
    get_metrics_binary takes in a fitted binary classifier, makes predictions
    with it, and prints its accuracy and its true/false positive and negative rates.

    This is a repeat definition of get_metrics_binary; once this cell runs,
    this definition is the one in effect.

    knn: fitted classifier that provides .predict and .score
    X, y: inputs and labels to evaluate on; when both are omitted the
          notebook-level X_train and y_train are used. Pass both or neither.

    The predictions are made inside the function from the model passed in, so
    the result does not depend on what the notebook-level y_pred currently holds.

    return: a classification report as a transposed DataFrame
    raises: ValueError when only one of X and y is given
    '''
    if (X is None) != (y is None):
        raise ValueError('pass both X and y, or neither')
    if X is None:
        X, y = X_train, y_train
    predictions = knn.predict(X)
    accuracy = knn.score(X, y)
    class_report = pd.DataFrame(classification_report(y, predictions, output_dict=True)).T
    # confusion_matrix rows are the actual classes, columns the predicted classes
    conf = confusion_matrix(y, predictions)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [78]:
class_report_knn_20 = get_metrics_binary(knn)

class_report_knn_20


    The accuracy for our model is 0.743
    The True Positive Rate is 0.55, The False Positive Rate is 0.137,
    The True Negative Rate is 0.863, and the False Negative Rate is 0.45
    


Unnamed: 0,precision,recall,f1-score,support
0,0.754986,0.863192,0.805471,307.0
1,0.714286,0.549738,0.621302,191.0
accuracy,0.742972,0.742972,0.742972,0.742972
macro avg,0.734636,0.706465,0.713386,498.0
weighted avg,0.739376,0.742972,0.734836,498.0


In [79]:
# Let's check our other classification metrics
print(classification_report(y_validate, y_pred1))

              precision    recall  f1-score   support

           0       0.73      0.83      0.77       132
           1       0.64      0.50      0.56        82

    accuracy                           0.70       214
   macro avg       0.68      0.66      0.67       214
weighted avg       0.69      0.70      0.69       214



In [None]:
# Fit one KNN model per k from 1 to 9 and record its accuracy on each split
outcomes = []
for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_train, y_train)
    validate_accuracy = knn.score(X_validate, y_validate)
    test_accuracy = knn.score(X_test, y_test)
    output = {
        "k": k,
        "accuracy": accuracy,
        "validate_accuracy": validate_accuracy,
        "test_accuracy": test_accuracy,
    }
    outcomes.append(output)

In [None]:
# Fit one KNN model per k from 1 to 19 and record its accuracy on each split.
# This rebinds outcomes, replacing the list built for k = 1..9 in the previous cell.
outcomes = []
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_train, y_train)
    validate_accuracy = knn.score(X_validate, y_validate)
    test_accuracy = knn.score(X_test, y_test)
    output = dict(
        k=k,
        accuracy=accuracy,
        validate_accuracy=validate_accuracy,
        test_accuracy=test_accuracy,
    )
    outcomes.append(output)