In [142]:
##Importing libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import utils
from mlxtend.plotting import plot_decision_regions
%matplotlib inline

##Algorithms used for prediction
from sklearn.neighbors import KNeighborsClassifier

##sklearn tools used for standardizing, normalising, predicting and reporting
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn import decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [143]:
##Reading dataset (you can find it easily at my github project folder)

dataset = pd.read_csv('wifi_localization.csv')

In [144]:
##Peeking at data

dataset.head()

Unnamed: 0,WS1,WS2,WS3,WS4,WS5,WS6,WS7,Class
0,-64,-56,-61,-66,-71,-82,-81,1
1,-68,-57,-61,-65,-71,-85,-85,1
2,-63,-60,-60,-67,-76,-85,-84,1
3,-61,-60,-68,-62,-77,-90,-80,1
4,-63,-65,-60,-63,-77,-81,-87,1


In [145]:
##Splitting the dataset

features = dataset['Class']
outcome = dataset.drop(['Class'], axis=1)

In [146]:
##Splitting the data into test (30%) and training (70%)

outcome_train, outcome_test, features_train, features_test = train_test_split(outcome, features, test_size = 0.3, random_state = 0)

In [147]:
##KNN

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(outcome_train, features_train)

accuracy_knn = round(knn.score(outcome_test, features_test) * 100, 2)

accuracy_knn

97.83

In [148]:
##KNN with hyperparameters + cross validation

knn = KNeighborsClassifier()

k_range = range(1, 31)
leaf_range = 2 ** np.arange(10)
weight_options = ['uniform', 'distance']
algorithm_options = ['ball_tree', 'kd_tree', 'brute']

hyperK = dict(n_neighbors = k_range, algorithm = algorithm_options, weights = weight_options, leaf_size = leaf_range)

gridK = GridSearchCV(knn, hyperK, cv = 10, verbose = 1, n_jobs = -1)
bestK = gridK.fit(outcome_train, features_train)

bestK.best_estimator_.get_params()

Fitting 10 folds for each of 1800 candidates, totalling 18000 fits


{'algorithm': 'ball_tree',
 'leaf_size': 1,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 4,
 'p': 2,
 'weights': 'uniform'}

In [149]:
##Other metrics to use as reference

classes = bestK.predict(outcome_test)
    
accuracy = metrics.accuracy_score(classes, features_test)
precision = metrics.precision_score(classes, features_test, average='micro')
f1_score = metrics.f1_score(classes, features_test, average='micro')
recall = metrics.recall_score(classes, features_test, average='micro')

print(metrics.classification_report(classes, features_test))

              precision    recall  f1-score   support

           1       1.00      0.99      0.99       159
           2       0.97      0.99      0.98       148
           3       0.98      0.97      0.97       155
           4       0.99      1.00      1.00       138

    accuracy                           0.98       600
   macro avg       0.99      0.99      0.99       600
weighted avg       0.99      0.98      0.99       600



In [150]:
##Model accuracy after use of GridSearch

print('Model accuracy is',bestK.score(outcome_test, features_test))

Model accuracy is 0.985


In [151]:
##Reading dataset (you can find it easily at my github project folder)

dataset = pd.read_csv('wifi_localization.csv')

In [152]:
##Using StandardScalar to normalise the dataset

standard_scalar = StandardScaler()
dataset = pd.DataFrame(standard_scalar.fit_transform(dataset), columns = dataset.columns)

In [153]:
dataset.head()

Unnamed: 0,WS1,WS2,WS3,WS4,WS5,WS6,WS7,Class
0,-1.03098,-0.11019,-1.135684,-1.084085,-0.918342,-0.155793,0.111457,-1.341641
1,-1.384373,-0.402858,-1.135684,-0.996895,-0.918342,-0.616266,-0.502211,-1.341641
2,-0.942631,-1.280864,-0.947533,-1.171276,-1.467623,-0.616266,-0.348794,-1.341641
3,-0.765935,-1.280864,-2.452747,-0.735323,-1.577479,-1.383721,0.264874,-1.341641
4,-0.942631,-2.744208,-0.947533,-0.822513,-1.577479,-0.002302,-0.809045,-1.341641


In [154]:
##Splitting the dataset

features = dataset['Class']
outcome = dataset.drop(['Class'], axis=1)

lab_enc = preprocessing.LabelEncoder()
features_encoded = lab_enc.fit_transform(features)
print(training_scores_encoded)
print(utils.multiclass.type_of_target(features))
print(utils.multiclass.type_of_target(features.astype('int')))
print(utils.multiclass.type_of_target(training_scores_encoded))

[0 0 0 ... 3 3 3]
continuous
multiclass
multiclass


In [155]:
##Splitting the data into test (30%) and training (70%)

outcome_train, outcome_test, features_train, features_test = train_test_split(outcome, features_encoded, test_size = 0.3, random_state = 0)

In [156]:
##KNN

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(outcome_train, features_train)

accuracy_knn = round(knn.score(outcome_test, features_test) * 100, 2)

accuracy_knn

97.67

In [157]:
##KNN normalized with hyperparameters + cross validation

knn = KNeighborsClassifier()

k_range = range(1, 31)
leaf_range = 2 ** np.arange(10)
weight_options = ['uniform', 'distance']
algorithm_options = ['ball_tree', 'kd_tree', 'brute']

hyperK = dict(n_neighbors = k_range, algorithm = algorithm_options, weights = weight_options, leaf_size = leaf_range)

gridK = GridSearchCV(knn, hyperK, cv = 10, verbose = 1, n_jobs = -1)
bestK = gridK.fit(outcome_train, features_train)

bestK.best_estimator_.get_params()

Fitting 10 folds for each of 1800 candidates, totalling 18000 fits


{'algorithm': 'ball_tree',
 'leaf_size': 1,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 3,
 'p': 2,
 'weights': 'uniform'}

In [158]:
##Other metrics

classes = bestK.predict(outcome_test)
    
accuracy = metrics.accuracy_score(classes, features_test)
precision = metrics.precision_score(classes, features_test, average='micro')
f1_score = metrics.f1_score(classes, features_test, average='micro')
recall = metrics.recall_score(classes, features_test, average='micro')

print(metrics.classification_report(classes, features_test))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       158
           1       0.93      0.99      0.96       143
           2       0.98      0.93      0.96       161
           3       0.99      1.00      1.00       138

    accuracy                           0.98       600
   macro avg       0.98      0.98      0.98       600
weighted avg       0.98      0.98      0.98       600



In [159]:
##Model accuracy after use of GridSearch with normalised data

print('Model accuracy is',bestK.score(outcome_test, features_test))

Model accuracy is 0.9766666666666667
