# kNN Hyperparameter Tuning

k Nearest Neighbors has shown to be even better than SVMs at predicting anomalies in our dataset. As such, this notebook aims to determine the optimal hyperparameters for the kNN and our dataset.

In [35]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
from sklearn.metrics import log_loss

from sklearn.model_selection import GridSearchCV

random_state = 50

## Data Prep
We make sure that the training dataset and the testing dataset has the same ratio of hospitalizations, as there could be important anomalies which could be removed by random chance.

This has been copied from the other kNN notebook in this directory.

In [36]:
df = pd.read_csv('../data/frequencies.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ProviderID_127318.0,ProviderID_203236159.0,ProviderID_486278.0,ServiceTypeName_AC Agency with Choice,ServiceTypeName_AC Attendant Care,ServiceTypeName_AC Companion Care,ServiceTypeName_AC Family Non Resident U7,ServiceTypeName_AC Family Resident,ServiceTypeName_AC Family Resident U7,ServiceTypeName_AC Family non resident,ServiceTypeName_AC Home Maker,ServiceTypeName_AC Respite,ServiceTypeName_AC Spouse,ServiceTypeName_DDD Habilitation,ServiceTypeName_DDD-Attendant Care,ServiceTypeName_DDD-Respite,ServiceTypeName_VA Home Care,TaskCodeGroupName_Ambulation:Light exercise (11),TaskCodeGroupName_Ambulation:Walker (12),TaskCodeGroupName_Bath:Complete Shower (14),TaskCodeGroupName_Bath:Sponge Bath (15),TaskCodeGroupName_Catheter Care:Empty bag (16),TaskCodeGroupName_Companion Care Escort Visit:Companion Care Visit,TaskCodeGroupName_Dressing:Dressing (17),...,TaskCodeGroupName_Incontinence Care:Bowel brief - diaper (41),TaskCodeGroupName_Incontinence Care:Incontinence (42),TaskCodeGroupName_Medication:Reminder (43),TaskCodeGroupName_Nutrition:Assist with Feeding (46),TaskCodeGroupName_Nutrition:Encourage Fluids (47),TaskCodeGroupName_Nutrition:Meal Prep (48),TaskCodeGroupName_Nutrition:Restrict Fluids (49),TaskCodeGroupName_Oral Care:Brush Dentures (50),TaskCodeGroupName_Oral Care:Brush Teeth (51),TaskCodeGroupName_Positioning:Specify (52),TaskCodeGroupName_Respite Services:RSP Services (45),TaskCodeGroupName_Shave:Trim facial hair (53),TaskCodeGroupName_Skin:Deodorant (54),TaskCodeGroupName_Skin:Lotion (55),TaskCodeGroupName_Skin:Skin Care (57),TaskCodeGroupName_Toileting:Bathroom (58),TaskCodeGroupName_Toileting:Urinal (60),TaskCodeGroupName_Transfer:Chair (61),TaskCodeGroupName_Transfer:Gait Belt (62),TaskCodeGroupName_Transfer:Hoyer (63),TaskCodeGroupName_Transfer:Transfer (65),TaskCodeGroupName_Transfer:Walker (66),TaskCodeGroupName_Transfer:Wheelchair (67),TaskCodeGroupName_Transportation:Client Errands - Do Not Transport Client (68),hasHospitilization
0,0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,0.0
1,1,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,31.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,0.0
2,2,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,31.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,0.0
3,3,0.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,33.0,0.0,0.0,0.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,0.0
4,4,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,0.0


In [37]:
def balanceData(df, percentTrain):
   global random_state
   
   positive = shuffle(df[df['hasHospitilization'] == 1], random_state = random_state)
   negative = shuffle(df[df['hasHospitilization'] == 0], random_state = random_state)
   
   sizePos = int(len(positive) * percentTrain)
   sizeNeg = int(len(negative) * percentTrain)
   
   trainPos = positive[:sizePos]
   trainNeg = negative[:sizeNeg]
   testPos = positive[sizePos:]
   testNeg = negative[sizeNeg:]
   
   train = pd.concat([trainPos, trainNeg])
   test = pd.concat([testPos, testNeg])
   
   train_x = train.loc[:, train.columns != 'hasHospitilization'].to_numpy()
   train_y = train['hasHospitilization'].to_numpy()
   test_x = test.loc[:, test.columns != 'hasHospitilization'].to_numpy()
   test_y = test['hasHospitilization'].to_numpy()
   
   return train_x, train_y, test_x, test_y

In [38]:
train_x,train_y, test_x, test_y = balanceData(df, .7)

## Hyperparameter Grid Search
As with other anomaly detection models, we use the GridSearchCV method created by SciKitLearn to run many different models and determine which is the best using F1 score.

In [39]:
#parameters
n_neighbors = range(1,10)
weights = ["uniform", "distance"]
p = [1,2]

param_grid = {
   "n_neighbors" : n_neighbors,
   "weights" : weights,
   "p" : p
}

In [40]:
grid_search = GridSearchCV(estimator=KNeighborsClassifier(algorithm='ball_tree'), param_grid=param_grid, scoring='f1', n_jobs=5)
grid_search.fit(train_x, train_y)

GridSearchCV(estimator=KNeighborsClassifier(algorithm='ball_tree'), n_jobs=5,
             param_grid={'n_neighbors': range(1, 10), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             scoring='f1')

In [41]:
print("Best Parameters: ",grid_search.best_params_)
pd.set_option('display.max_columns', 50)
results = pd.DataFrame(grid_search.cv_results_)
print(results)

Best Parameters:  {'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.568012      0.085114         0.500645        0.090740   
1        0.419654      0.066813         0.256700        0.017024   
2        0.380083      0.020616         0.323033        0.009998   
3        0.306478      0.020565         0.131879        0.004671   
4        0.359769      0.059735         0.542600        0.113744   
5        0.405828      0.058719         0.287105        0.021774   
6        0.338015      0.028856         0.297328        0.020668   
7        0.385921      0.061439         0.175258        0.027908   
8        0.410314      0.016127         0.531929        0.052253   
9        0.451328      0.121824         0.287291        0.026547   
10       0.329919      0.008074         0.328947        0.028507   
11       0.371238      0.020998         0.182081        0.030991   
12       0.412036      0.033286         0.420476 

## Writing Results to CSV

In [42]:
results_file_path = "KNN_hyperparams_results.csv"
results.to_csv(results_file_path)

## Testing Best Parameter Model on New Dataset

In [43]:
preds = grid_search.predict(test_x)

print("Precision Score: ", precision_score(test_y, preds))
print("Recall Score: ", recall_score(test_y, preds))
print("F1 Score: ", f1_score(test_y, preds))

print(classification_report(test_y, preds))

Precision Score:  0.9423076923076923
Recall Score:  0.9074074074074074
F1 Score:  0.9245283018867925
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     13976
         1.0       0.94      0.91      0.92        54

    accuracy                           1.00     14030
   macro avg       0.97      0.95      0.96     14030
weighted avg       1.00      1.00      1.00     14030



In [44]:
prob_preds = grid_search.predict_proba(test_x)
print(prob_preds)
probs = prob_preds[:, 1]
loss = log_loss(test_y, probs)
print(loss)

[[0.33333333 0.66666667]
 [1.         0.        ]
 [0.         1.        ]
 ...
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]]
0.0060869842563392685
