In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

In [3]:
# Importing the method needed to apply KNN classification

from sklearn.neighbors import KNeighborsClassifier

### About the method KNeighborsClassifier() that we are about to apply

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

Minkowski_distance:

https://en.wikipedia.org/wiki/Minkowski_distance

# KNN classification on the Default dataset

### Finding a good K

The __book approach__ (= the standard approach) to find a good K is to try several Ks and __select the best one based on CV__. The best K is the one leading to the lowest cross-validated error (the CV estimate of the test error).

Let's try this approach now! 

We will be selecting the K leading to the highest accuracy (which is the same as the one leading to the lowest error)

__Choosing K via CV based on overall accuracy__

__CV is applied using GridSearchCV()__

In [4]:
# Absolute path to Default.csv on the original author's machine; edit it to match
# the location of the dataset before running this cell on another computer.
default_csv_path = 'C:\\Users\\jheredi2\\Documents\\PythonDataAnalytics\\1-Datasets\\Default.csv'
Default_df = pd.read_csv(default_csv_path)

In [5]:
# Replace the categorical 'student' column with indicator (dummy) columns.
# drop_first=True omits the indicator of the first category, which is redundant
# given the remaining ones.
Default_df_dummies = pd.get_dummies(
    data=Default_df,
    columns=['student'],
    drop_first=True,
)

In [6]:
# Split the data into training (80%) and test (20%) sets.
# The features are every column except the response 'default'. Dropping the target by
# name (instead of slicing by position with iloc[:, 1:]) guarantees that the response
# can never end up among the predictors if the column order of the file changes.
X_train_def, X_test_def, y_train_def, y_test_def = train_test_split(
    Default_df_dummies.drop(columns='default'),
    Default_df_dummies['default'],
    test_size=0.2,    # fraction of rows held out as the test set
    random_state=1,   # fixed seed so the split is reproducible
)

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [9]:
# Pipeline used for the hyperparameter search: the features are standardized first
# and the scaled values are then passed to a KNN classifier (default settings; the
# grid search overrides the KNN hyperparameters).
pipe_default_CVSearch = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)

In [10]:
# List the parameter names exposed by the pipeline. The '<step name>__<parameter>' names
# are the keys to use when building the hyperparameter grid for the search.
pipe_default_CVSearch.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'kneighborsclassifier', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'kneighborsclassifier__algorithm', 'kneighborsclassifier__leaf_size', 'kneighborsclassifier__metric', 'kneighborsclassifier__metric_params', 'kneighborsclassifier__n_jobs', 'kneighborsclassifier__n_neighbors', 'kneighborsclassifier__p', 'kneighborsclassifier__weights'])

In [11]:
# Candidate numbers of neighbors: the integers 1, 2, ..., 100
# (the stop value of np.arange is exclusive, hence 100 + 1)

k = np.arange(1, 100 + 1)

In [15]:
# Hyperparameter grid for the search. Each key is '<pipeline step>__<parameter>' and each
# value lists the settings to try; every combination of the listed values is evaluated.
hyperparam_grid = {
    'kneighborsclassifier__n_neighbors': k,  # all candidate values of K defined above
    'kneighborsclassifier__algorithm':['brute'],  # single option: brute-force neighbor search
    'kneighborsclassifier__weights':['uniform', 'distance']  # equal votes vs. distance-weighted votes
}

In [16]:
# Exhaustive search over hyperparam_grid, scoring every combination by its
# 10-fold cross-validated overall accuracy.
# n_jobs=-1 distributes the fits over all available CPU cores; it only shortens the
# running time of this expensive search and does not change the selected parameters.
grid_search = GridSearchCV(
    estimator=pipe_default_CVSearch,
    param_grid=hyperparam_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
)

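__DO NOT RUN NEXT CELL !!!!__ (the grid search fits 200 hyperparameter combinations × 10 folds = 2,000 models, which takes a long time; its saved output is shown below)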

In [17]:
# Run the cross-validated search on the training data only (the test set is not used here)
grid_search.fit(X=X_train_def, y=y_train_def)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             param_grid={'kneighborsclassifier__algorithm': ['brute'],
                         'kneighborsclassifier__n_neighbors': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100]),
                         'kneighborsclassifier__weights': ['uniform',
           

In [18]:
# Hyperparameter combination with the best mean cross-validated accuracy
grid_search.best_params_

{'kneighborsclassifier__algorithm': 'brute',
 'kneighborsclassifier__n_neighbors': 11,
 'kneighborsclassifier__weights': 'uniform'}

In [19]:
# Pipeline built with the hyperparameters reported by the accuracy-based grid search
# (K=11, uniform weights), written out explicitly so the search need not be re-run.
pipe_default = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=11, weights='uniform', algorithm='brute'),
)

In [20]:
# Fit the scaler on the training data and store the scaled training set in the KNN step
pipe_default.fit(X=X_train_def, y=y_train_def)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='brute', n_neighbors=11))])

__Note__: As we discussed when we went over the slides, the KNN method does not fit any model.

Therefore, when we call the fit() method on a KNN classifier, we are just telling the method to save the training data in memory. The method uses this data to classify new test observations. 

The training data stored in memory when fit() is called is the __reference data__ used to compute the distances and __identify the K nearest neighbors to the test observations that we want to classify.__

In [21]:
# Predicted class labels for the held-out test observations
y_predicted_test_default = pipe_default.predict(X=X_test_def)

In [22]:
# Rows are the true classes and columns the predicted classes
confusion_matrix(y_true=y_test_def, y_pred=y_predicted_test_default)

array([[1924,   17],
       [  40,   19]], dtype=int64)

In [23]:
# Per-class precision, recall and f1-score on the test set
report_default = classification_report(y_true=y_test_def, y_pred=y_predicted_test_default)
print(report_default)

              precision    recall  f1-score   support

          No       0.98      0.99      0.99      1941
         Yes       0.53      0.32      0.40        59

    accuracy                           0.97      2000
   macro avg       0.75      0.66      0.69      2000
weighted avg       0.97      0.97      0.97      2000



__Changing the probability threshold for the above classifier (the KNN with k=11)__

In [35]:
# Candidate probability thresholds 0.05, 0.10, ..., 0.50.
# They are built from integers and divided once, so every value is exactly the float
# of its two-decimal label. np.arange with a 0.05 step accumulates rounding error and
# produces values such as 0.15000000000000002, which then appear as dictionary keys.
array_prob = np.arange(5, 51, 5) / 100

In [36]:
# Estimated class probabilities for the test observations, one column per class.
# Column 1 is kept; with the labels 'No' and 'Yes' this is taken to be the 'Yes' column
# (assumes the classes are ordered 'No', 'Yes' - check pipe_default.classes_ to confirm).
proba_test_default = pipe_default.predict_proba(X_test_def)
prob_yes_default = proba_test_default[:, 1]

In [37]:
# Will map each probability threshold to the array of test predictions made with it
dict_predictions_default = {}

In [38]:
# Will map each probability threshold to the f1-score obtained with it
dict_f1_scores_default = {}

In [39]:
# f1_score is only imported in a later cell of this notebook, so this cell failed with a
# NameError when the notebook was run top to bottom on a fresh kernel. Importing it here
# makes the cell self-sufficient.
from sklearn.metrics import f1_score

# For every candidate threshold, classify a test observation as 'Yes' when its estimated
# probability of 'Yes' is strictly greater than the threshold, then record the f1-score
# of the 'Yes' class (rounded to 3 decimals).
for threshold in array_prob:
    # Vectorized comparison replaces the element-by-element loop; the result is stored
    # as an object array of 'Yes'/'No' strings, as before.
    dict_predictions_default[threshold] = np.where(
        prob_yes_default > threshold, 'Yes', 'No'
    ).astype(object)
    dict_f1_scores_default[threshold] = np.round(
        f1_score(y_test_def, dict_predictions_default[threshold], pos_label='Yes'), 3
    )

In [40]:
# Show the f1-score of the 'Yes' class obtained with each probability threshold
dict_f1_scores_default

{0.05: 0.261,
 0.1: 0.357,
 0.15000000000000002: 0.357,
 0.2: 0.369,
 0.25: 0.369,
 0.3: 0.393,
 0.35000000000000003: 0.393,
 0.4: 0.396,
 0.45: 0.396,
 0.5: 0.4}

Nothing to change, because the maximum f1-score is obtained with the default threshold of 0.5. The results with threshold 0.5 were already obtained above (when predict() is called, it uses a threshold of 0.5).

__Choosing K via CV based on the f1-score__

__CV is applied using GridSearchCV()__

In [24]:
from sklearn.metrics import f1_score

In [25]:
from sklearn.metrics import make_scorer

In [26]:
# Scorer that computes the f1-score of the 'Yes' class, so that it can be passed
# as the `scoring` argument of the cross-validation utilities.
f1_scorer = make_scorer(score_func=f1_score, pos_label='Yes')

In [27]:
# Same exhaustive search over hyperparam_grid, but every combination is now scored by
# its 10-fold cross-validated f1-score of the 'Yes' class.
# n_jobs=-1 distributes the fits over all available CPU cores; it only shortens the
# running time of this expensive search and does not change the selected parameters.
grid_search_f1 = GridSearchCV(
    estimator=pipe_default_CVSearch,
    param_grid=hyperparam_grid,
    cv=10,
    scoring=f1_scorer,
    n_jobs=-1,
)

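__DO NOT RUN NEXT CELL !!!!__ (the grid search fits 200 hyperparameter combinations × 10 folds = 2,000 models, which takes a long time; its saved output is shown below)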

In [28]:
# Run the f1-based cross-validated search on the training data only
grid_search_f1.fit(X=X_train_def, y=y_train_def)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             param_grid={'kneighborsclassifier__algorithm': ['brute'],
                         'kneighborsclassifier__n_neighbors': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100]),
                         'kneighborsclassifier__weights': ['uniform',
           

In [29]:
# Hyperparameter combination with the best mean cross-validated f1-score
grid_search_f1.best_params_

{'kneighborsclassifier__algorithm': 'brute',
 'kneighborsclassifier__n_neighbors': 7,
 'kneighborsclassifier__weights': 'uniform'}

In [30]:
# Pipeline built with the hyperparameters reported by the f1-based grid search
# (K=7, uniform weights), written out explicitly so the search need not be re-run.
pipe_default2 = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=7, weights='uniform', algorithm='brute'),
)

In [31]:
# Fit the scaler on the training data and store the scaled training set in the KNN step
pipe_default2.fit(X=X_train_def, y=y_train_def)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='brute', n_neighbors=7))])

In [32]:
# Predicted class labels for the held-out test observations (K=7 pipeline)
y_predicted_test_default2 = pipe_default2.predict(X=X_test_def)

In [33]:
# Rows are the true classes and columns the predicted classes (K=7 pipeline)
confusion_matrix(y_true=y_test_def, y_pred=y_predicted_test_default2)

array([[1920,   21],
       [  37,   22]], dtype=int64)

In [34]:
# Per-class precision, recall and f1-score on the test set (K=7 pipeline)
report_default2 = classification_report(y_true=y_test_def, y_pred=y_predicted_test_default2)
print(report_default2)

              precision    recall  f1-score   support

          No       0.98      0.99      0.99      1941
         Yes       0.51      0.37      0.43        59

    accuracy                           0.97      2000
   macro avg       0.75      0.68      0.71      2000
weighted avg       0.97      0.97      0.97      2000



__Choosing K via CV based on overall accuracy__

__CV is applied using cross_val_score() method__

In [42]:
from sklearn.model_selection import cross_val_score

In [43]:
# Mean 10-fold cross-validated accuracy (rounded to 4 decimals) for every candidate K.
# The results are stored in the same order as the values in k.
overall_accuracy_k = []
for n_neighbors in k:
    knn_step = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='brute', weights='uniform')
    fold_accuracies = cross_val_score(
        make_pipeline(StandardScaler(), knn_step),
        X_train_def,
        y_train_def,
        scoring='accuracy',
        cv=10,
    )
    overall_accuracy_k.append(np.round(fold_accuracies.mean(), 4))

In [44]:
# At which position of the list does the maximum accuracy occur?
# This index is used afterwards to look up the corresponding K.

best_cv_accuracy = max(overall_accuracy_k)
overall_accuracy_k.index(best_cv_accuracy)

10

In [46]:
# For what value of K does the maximum accuracy happen?

best_accuracy_position = overall_accuracy_k.index(max(overall_accuracy_k))
k[best_accuracy_position]

11

We also got k=11 when we applied CV via GridSearchCV() and used the accuracy as the metric.

__Choosing K via CV based on f1-score__

__CV is applied using cross_val_score() method__

WORK ON IT INDEPENDENTLY FOR 5 MINS !