In [56]:
##Importing libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

##Algorithms used for prediction
from sklearn.neighbors import KNeighborsClassifier

##sklearn tools used for standardizing, normalising, predicting and reporting
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn import decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [57]:
##Reading dataset (you can find it easily at my github project folder)

dataset = pd.read_csv('wifi_localization.csv')

In [58]:
##Peeking at data

dataset.head()

Unnamed: 0,WS1,WS2,WS3,WS4,WS5,WS6,WS7,Class
0,-64,-56,-61,-66,-71,-82,-81,1
1,-68,-57,-61,-65,-71,-85,-85,1
2,-63,-60,-60,-67,-76,-85,-84,1
3,-61,-60,-68,-62,-77,-90,-80,1
4,-63,-65,-60,-63,-77,-81,-87,1


In [59]:
##Splitting the dataset

features = dataset['Class']
outcome = dataset.drop(['Class'], axis=1)

In [60]:
##Splitting the data into test (30%) and training (70%)

outcome_train, outcome_test, features_train, features_test = train_test_split(outcome, features, test_size = 0.3, random_state = 0)

In [61]:
##KNN

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(outcome_train, features_train)

accuracy_knn = round(knn.score(outcome_train, features_train) * 100, 2)

accuracy_knn

99.36

In [62]:
##KNN with hyperparameters + cross validation

knn = KNeighborsClassifier()

k_range = range(1, 31)
leaf_range = 2 ** np.arange(10)
weight_options = ['uniform', 'distance']
algorithm_options = ['ball_tree', 'kd_tree', 'brute']

hyperK = dict(n_neighbors = k_range, algorithm = algorithm_options, weights = weight_options, leaf_size = leaf_range)

gridK = GridSearchCV(knn, hyperK, cv = 10, verbose = 1, n_jobs = -1)
bestK = gridK.fit(outcome_train, features_train)

bestK.best_estimator_.get_params()

Fitting 10 folds for each of 1800 candidates, totalling 18000 fits


{'algorithm': 'ball_tree',
 'leaf_size': 1,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 4,
 'p': 2,
 'weights': 'uniform'}

In [63]:
##Other metrics to use as reference

classes = bestK.predict(outcome_train)
    
accuracy = metrics.accuracy_score(classes, features_train)
precision = metrics.precision_score(classes, features_train, average='micro')
f1_score = metrics.f1_score(classes, features_train, average='micro')
recall = metrics.recall_score(classes, features_train, average='micro')

print(metrics.classification_report(classes, features_train))

              precision    recall  f1-score   support

           1       1.00      0.99      1.00       346
           2       0.99      1.00      0.99       346
           3       0.99      0.98      0.99       349
           4       0.99      1.00      0.99       359

    accuracy                           0.99      1400
   macro avg       0.99      0.99      0.99      1400
weighted avg       0.99      0.99      0.99      1400



In [64]:
##Model accuracy after use of GridSearch

print('Model accuracy is',bestK.score(outcome_train, features_train))

Model accuracy is 0.9921428571428571


In [65]:
##Using StandardScalar to normalise the dataset

standard_scalar = StandardScaler()
outcome_train = pd.DataFrame(standard_scalar.fit_transform(outcome_train), columns = outcome.columns)
outcome_test = pd.DataFrame(standard_scalar.transform(outcome_test), columns = outcome.columns)

In [66]:
outcome_train.head()

Unnamed: 0,WS1,WS2,WS3,WS4,WS5,WS6,WS7
0,-0.49918,-0.084794,0.55746,-0.041795,1.147387,-0.908277,-0.348835
1,-0.411703,-0.957676,2.065929,-0.55809,0.709811,-0.908277,-0.500361
2,0.375588,0.497127,0.55746,-0.041795,0.491023,-0.908277,-1.106465
3,-0.23675,0.788088,-0.385333,-1.160433,-0.165341,0.142994,0.560322
4,3.349797,0.497127,-0.385333,3.314118,-0.712312,-0.157369,1.621004


In [67]:
##KNN with hyperparameters + cross validation + normalised dataset

knn = KNeighborsClassifier()

k_range = range(1, 31)
leaf_range = 2 ** np.arange(10)
weight_options = ['uniform', 'distance']
algorithm_options = ['ball_tree', 'kd_tree', 'brute']

hyperK = dict(n_neighbors = k_range, algorithm = algorithm_options, weights = weight_options, leaf_size = leaf_range)

gridK = GridSearchCV(knn, hyperK, cv = 10, verbose = 1, n_jobs = -1)
bestK = gridK.fit(outcome_train, features_train)

bestK.best_estimator_.get_params()

Fitting 10 folds for each of 1800 candidates, totalling 18000 fits


{'algorithm': 'ball_tree',
 'leaf_size': 1,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 4,
 'p': 2,
 'weights': 'distance'}

In [68]:
##New metrics result

classes = bestK.predict(outcome_train)
    
accuracy = metrics.accuracy_score(classes, features_train)
precision = metrics.precision_score(classes, features_train, average='micro')
f1_score = metrics.f1_score(classes, features_train, average='micro')
recall = metrics.recall_score(classes, features_train, average='micro')

print(metrics.classification_report(classes, features_train))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00       343
           2       1.00      1.00      1.00       349
           3       1.00      1.00      1.00       347
           4       1.00      1.00      1.00       361

    accuracy                           1.00      1400
   macro avg       1.00      1.00      1.00      1400
weighted avg       1.00      1.00      1.00      1400



In [69]:
##Model accuracy after use of GridSearch with normalised data

print('Model accuracy is',bestK.score(outcome_train, features_train))

Model accuracy is 1.0
