In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score #classification_report

In [None]:
# Load the train/test feature and label objects that were pickled under ./data
# (presumably written by an earlier preprocessing notebook — TODO confirm the source).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
X_train = pd.read_pickle('./data/X_train_pickle.pkl')
X_test = pd.read_pickle('./data/X_test_pickle.pkl')
y_train = pd.read_pickle('./data/y_train_pickle.pkl')
y_test = pd.read_pickle('./data/y_test_pickle.pkl')

## KNN

In [None]:
# Preparing to hyper-tune the KNN classifier using GridSearchCV
cv = 5                             # Set how many cross validations you would like.
neighbors = list(range(1,13))      # Candidate n_neighbors values to search: 1 through 12.
scoring = ['accuracy', 'f1']       # Metrics GridSearchCV computes for every candidate.

In [None]:
# This only needs to be run on the first use of this notebook, or if changes have been made to the variables above.
knn = KNeighborsClassifier()

param_grid = [                     # GridSearchCV accepts a dict or a list of dicts; a single dict is wrapped in a list here.
    {'n_neighbors': neighbors}
] 

grid_search = GridSearchCV(knn, param_grid, cv=cv,          # Just passing in the variables declared above
                           scoring=scoring,
                           refit='f1',                       # with several scorers, refit names the metric used to pick and refit the best model
                           n_jobs=-2,                        # run fits in parallel on all CPUs but one
                           return_train_score=True,          # also record scores on the training folds
                           verbose=50)                       # verbose > 0 prints progress messages while fitting.

# Grid search over the 12 n_neighbors candidates with 5-fold CV: 12 * 5 = 60 fits, plus one final refit.
grid_search.fit(X_train, y_train) 

#### Find optimal value for K

In [None]:
# This only needs to be run on the first use of this notebook, or if changes have been made
# to the data or to the range of k below; the scores are persisted with %store for later cells.
# NOTE(review): k is chosen here by scoring against the test split, so the test score reported
# for the chosen k is optimistically biased — consider selecting k by cross-validation on the
# training data instead.
# NOTE(review): this cell uses average='weighted', while other f1 computations in this notebook
# use the default average — confirm which one is intended.
k_range = range(1, 13)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_predict = knn.predict(X_test)
    score = f1_score(y_test, y_predict, average='weighted')
    k_scores.append(score)   # position i holds the score for k = i + 1
    print("When k={}, f1 score={}".format(k,round(score, 4)))
%store k_scores

In [None]:
%store -r k_scores
high_score = 0
for index, element in enumerate(k_scores):
    if element > high_score:
        high_score = element
        ind = index
        
print("The highest score is {}; when k = {}".format(high_score, ind + 1))
plt.plot(range(1,13),k_scores)

#### The optimal k value is 11, as the plot above shows.

### Begin KNN modeling using optimal K

Reference: how to plot a confusion matrix (scikit-learn example):

https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

In [None]:
# Build the final classifier and fit it on the training data in one step
# (fit returns the fitted estimator). k = 11 is the value reported as optimal
# by the k sweep; update it here if the sweep is re-run with different results.
knn = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)

# Predicted labels (y hat) for the held-out test features.
y_pred_class = knn.predict(X_test)

# Tabulate true labels against predicted labels; for two classes this gives the
# counts of true negatives, false positives, false negatives and true positives.
cm = confusion_matrix(y_test, y_pred_class)

# Display names for the two income classes.
# NOTE(review): assumes the encoded labels sort as '<=50K' first, '>50K' second — TODO confirm.
classes = ['<=50K', '>50K']

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are drawn on the 'True label' axis and columns
        on the 'Predicted label' axis.
    classes : sequence of str
        Tick labels for the classes, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, divide every row by its sum so each cell is the fraction of
        that true class. A row whose sum is 0 is left as all zeros.
    title : str, default 'Confusion matrix'
        Title placed above the plot.
    cmap : matplotlib colormap, default plt.cm.Greens
        Colormap used to shade the cells.

    Returns
    -------
    None
        The matrix is printed and drawn on the current matplotlib figure.
    """
    cm = np.asarray(cm)

    if normalize:
        # Row-wise normalisation. Dividing only where the row total is non-zero
        # avoids NaN cells (and a RuntimeWarning) for a class with no true samples.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion Matrix, without normalization')

    print(cm)

    # interpolation='nearest' draws each cell as a solid block of colour
    # rather than blending neighbouring cells.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write each cell's value on top of it. The 'd' format only accepts integers,
    # so fall back to two decimals for any non-integer matrix.
    use_decimals = normalize or not np.issubdtype(cm.dtype, np.integer)
    fmt = '.2f' if use_decimals else 'd'
    # Cells darker than the midpoint get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run tight_layout last so the axis labels are included in the layout.
    plt.tight_layout()

In [None]:
# Plot the raw prediction counts for each (true, predicted) class pair.
plot_confusion_matrix(cm, classes=classes)

In [None]:
# Plot the same matrix with each row scaled to fractions of its true class.
plot_confusion_matrix(cm, classes=classes, normalize=True)

Of the actual positives in the dataset, 57% were correctly identified. This proportion is known as recall.

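We want to use recall as a way to compare the effectiveness of different algorithms. We would rather falsely predict a lower salary for an individual (a false negative), so that they can adequately prepare themselves financially, than falsely predict a higher salary (a false positive), which could leave them facing financial complications. Note that recall measures how few false negatives a model produces, while precision measures how few false positives it produces; since the stated goal is to avoid false positives, precision should be reported alongside recall when comparing models.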

In [None]:
# Storing KNN scores for comparisons.
knn = {
    'accuracy': accuracy_score(y_test, y_pred_class),
    'f1': f1_score(y_test, y_pred_class)
}
%store knn