## Classic Classifiers

In [1]:
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.ensemble.bagging import BaggingClassifier
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import NuSVC

from sklearn.model_selection import GridSearchCV

In [2]:
from utilities import *
from models import *

from sklearn.preprocessing import LabelEncoder
import joblib

In [3]:
# Path to pin.csv, relative to the notebook's working directory.
pin_file = "../Data/pin.csv"

# read_pin is not defined in this notebook; presumably it comes from the star
# imports of `utilities` / `models` — TODO confirm which module owns it.
pin = read_pin(pin_file)

In [4]:
# Input CSV (presumably RSSI readings, going by the file name) and the id of
# the beacon whose rows are kept.
filename = "../Data/rssi4.csv"
B1 = "0117C55D14E4"

# read_data is not defined in this notebook (presumably a star-imported helper).
# Per the cell output it reports all beacons found and the one being selected.
data = read_data(filename, B1)

All beacons: ['0117C55D14E4']
Selecting 0117C55D14E4


In [5]:
# Overwrite the scanner columns with their min-max scaled values.
# `scanners` and `minMaxScaling` are not defined in this notebook (presumably
# star-imported). NOTE(review): scaling happens before the train/validation/test
# split and rewrites `data` in place — confirm the scaler uses fixed bounds so
# that it neither leaks split statistics nor changes the data on a re-run.
data[scanners] = minMaxScaling(data[scanners])

## Train Validation Test Split

In [6]:
# Partition the scaled data into three frames. The helper is defined outside
# this notebook, so split ratios, shuffling and seeding are not visible here.
train, validation, test = train_validation_test_split(data)

In [7]:
# Number of non-null readings per scanner column for each location in the
# training split.
train.groupby("location")[scanners].count()

Unnamed: 0_level_0,C400A2E19293,CD4533FFC0E1,D2B6503554D7,DB8B36A69C56,DD697EA75B68,DF231643E227,E13B805C6CB0,E43355CA8B96,E6D9D20DD197,E8FD0B453DC4,E96AF2C858BA,EC72840D9AD3,F1307ECB3B90,F1EDAF28E08A,F69A86823B96,FB2EE01C18CE,FDAE5980F28C
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
V1_11,55,71,68,96,126,82,60,105,120,49,48,130,129,27,45,20,84
V1_12,67,76,43,84,79,76,56,58,153,51,74,151,126,79,106,56,71
V1_13,60,141,47,111,42,86,101,111,229,13,33,107,80,92,150,14,63
V1_14,36,154,45,82,56,89,261,66,210,14,105,50,78,83,164,63,86
V1_20,105,92,94,77,124,98,61,82,72,31,72,90,169,15,74,23,69
V1_21,64,132,78,72,90,90,120,52,132,17,86,93,78,64,85,70,62
V1_22,61,70,58,62,68,155,205,27,233,27,136,57,49,11,109,90,51
V1_23,57,86,53,47,35,202,123,46,290,31,194,58,79,9,107,116,61
V1_24,22,67,53,55,39,108,95,47,441,17,184,54,64,33,87,135,26
V1_28,99,96,93,52,109,75,96,63,108,70,46,67,57,97,86,27,64


In [8]:
# (rows, columns) of the validation split.
validation.shape

(6034, 19)

In [9]:
# (rows, columns) of the test split.
test.shape

(6035, 19)

In [10]:
# Order every split by its "time" column so that the row-order-dependent steps
# that follow (rolling windows, forward fill) act on readings in time order.
# The names are rebound instead of sorting with inplace=True: this leaves the
# frames returned by the split helper untouched and avoids pandas'
# SettingWithCopyWarning should those frames be slices of `data`.
train = train.sort_values("time")
validation = validation.sort_values("time")
test = test.sort_values("time")

In [11]:
# Per-location rolling mean over a 15-row window (min_periods=1, so a window
# yields a value as soon as it holds one non-null reading), computed the same
# way for each split and flattened back to ordinary columns.
# NOTE(review): the later cells visible in this notebook impute from
# train/validation/test, not from these *_rolled frames — confirm whether the
# rolled frames were meant to feed the imputation step.
train_rolled, validation_rolled, test_rolled = (
    split.groupby("location").rolling(15, min_periods=1).mean().reset_index()
    for split in (train, validation, test)
)

In [12]:
# Inspect the rolled training frame (the display is truncated to head and tail).
train_rolled

Unnamed: 0,location,level_1,C400A2E19293,CD4533FFC0E1,D2B6503554D7,DB8B36A69C56,DD697EA75B68,DF231643E227,E13B805C6CB0,E43355CA8B96,E6D9D20DD197,E8FD0B453DC4,E96AF2C858BA,EC72840D9AD3,F1307ECB3B90,F1EDAF28E08A,F69A86823B96,FB2EE01C18CE,FDAE5980F28C
0,V1_11,24304,,,,0.55,,,,,,,,,,,,,
1,V1_11,24306,,,,0.55,0.516667,,,,,,,,,,,,
2,V1_11,24307,,,,0.55,0.516667,,0.333333,,,,,,,,,,
3,V1_11,24308,,0.333333,,0.55,0.516667,,0.333333,,,,,,,,,,
4,V1_11,24310,,0.333333,,0.55,0.516667,0.15,0.333333,,,,0.100000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18097,V1_32,22547,0.183333,0.283333,0.333333,,,0.55,0.433333,,0.516667,,0.700000,0.25,,,0.258333,,
18098,V1_32,22549,0.183333,0.283333,0.333333,,0.233333,0.55,0.433333,,0.500000,,0.700000,0.25,0.3,,0.258333,,
18099,V1_32,22551,0.183333,0.283333,0.333333,,0.233333,0.55,0.433333,,0.505556,,0.700000,,0.3,,0.258333,,
18100,V1_32,22552,0.183333,0.283333,0.333333,,0.233333,,0.433333,,0.505556,,0.679167,,0.3,,0.258333,,


## Imputation
1. Forward-fill missing readings within each location
2. Fill any remaining NaN with 0

In [13]:
# Impute the training split: forward-fill gaps within each location, replace
# whatever is still missing with 0, then turn "location" back into a column.
# NOTE(review): this reads `train`, not `train_rolled` — confirm the rolling
# mean computed earlier was not meant to be the input here.
train_imputed = (
    train.set_index("location")
    .groupby("location")
    .ffill()
    .fillna(0)
    .reset_index()
)

In [14]:
# Impute the validation split: forward-fill gaps within each location, replace
# whatever is still missing with 0, then turn "location" back into a column.
# NOTE(review): this reads `validation`, not `validation_rolled` — confirm the
# rolling mean computed earlier was not meant to be the input here.
validation_imputed = (
    validation.set_index("location")
    .groupby("location")
    .ffill()
    .fillna(0)
    .reset_index()
)

In [15]:
# Impute the test split: forward-fill gaps within each location, replace
# whatever is still missing with 0, then turn "location" back into a column.
# NOTE(review): this reads `test`, not `test_rolled` — confirm the rolling
# mean computed earlier was not meant to be the input here.
test_imputed = (
    test.set_index("location")
    .groupby("location")
    .ffill()
    .fillna(0)
    .reset_index()
)

## Create Label Encoding for Each Fingerprint Location

In [16]:
# Feature matrices: the scanner columns of each imputed split as plain arrays.
X_train = train_imputed[scanners].values
X_validation = validation_imputed[scanners].values
X_test = test_imputed[scanners].values

# Targets: the raw location labels of the same rows (encoded in the next step).
y_train = train_imputed["location"].values
y_validation = validation_imputed["location"].values
y_test = test_imputed["location"].values

In [17]:
# Map each location label to an integer class id. The encoder is fitted on the
# training labels only and then reused unchanged for the other two splits.
# Labels are read from the imputed frames rather than from y_* themselves, so
# re-running this cell cannot fit the encoder on already-encoded integers.
enc = LabelEncoder()

y_train = enc.fit_transform(train_imputed["location"].values)
y_validation = enc.transform(validation_imputed["location"].values)
y_test = enc.transform(test_imputed["location"].values)

In [18]:
# Sanity check: number of encoded training labels.
y_train.shape

(18102,)

In [19]:
# Persist the fitted label encoder (presumably so predicted class ids can be
# mapped back to location names elsewhere).
# NOTE(review): the file name says "MLP_Classification" and "Rolling", yet this
# notebook trains classic classifiers on frames that were not rolled — confirm
# the path is intentional (e.g. an encoder shared with another notebook).
joblib.dump(enc, "../Models/MLP_Classification_Rolling_FFill_MinMax_Encoder.joblib")

['../Models/MLP_Classification_Rolling_FFill_MinMax_Encoder.joblib']

## Model Training

In [20]:
# Candidate models that the following cell fits and scores one after another.
# NOTE(review): the KNN, random-forest and SVC settings coincide with the
# grid-search winners reported further down, so they appear to have been copied
# back from an earlier run — confirm.
classifiers = [
    KNeighborsClassifier(n_neighbors=10, p=1),
    GradientBoostingClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(n_estimators=50),
    RandomForestClassifier(n_estimators=150),
    SVC(C=10)
]

In [21]:
# Fit every candidate on the training split and report, per model, its repr
# followed by the validation and test scores (`score` is mean accuracy for
# these classifiers).
# No random_state is set on the estimators, so the tree-based scores can vary
# from run to run.
for clf in classifiers:
    clf.fit(X_train, y_train)
    validation_score = clf.score(X_validation, y_validation)
    test_score = clf.score(X_test, y_test)
    print(clf, validation_score, test_score)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=1,
                     weights='uniform') 0.967848856479947 0.9676884838442419
GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False) 0.964534305601591 0.9821043910521955
BaggingClassifier(base_estimator=None, bootstrap=True, bootstr



SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False) 0.9353662578720583 0.9411764705882353


In [22]:
def run(clf):
    """Fit ``clf`` on the training split and print its validation/test scores.

    The data is taken from the notebook-level names ``X_train``/``y_train``,
    ``X_validation``/``y_validation`` and ``X_test``/``y_test``.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``score(X, y)``
        Fitted in place; may be a plain estimator or a search wrapper.

    Returns
    -------
    None
        The two scores are printed, not returned.
    """
    clf.fit(X_train, y_train)

    validation_score = clf.score(X_validation, y_validation)
    print("Validation score:", validation_score)

    test_score = clf.score(X_test, y_test)
    print("Test score", test_score)

In [23]:
# Hyper-parameter grid for k-nearest neighbours: neighbour count, vote
# weighting and Minkowski power (3 x 2 x 3 = 18 candidates).
parameters = dict(
    n_neighbors=[1, 5, 10],
    weights=['uniform', 'distance'],
    p=[1, 2, 3],
)

# 5-fold cross-validated search on the training split using 7 workers; run()
# fits the search and prints validation/test scores.
clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameters,
    n_jobs=7,
    verbose=10,
    cv=5,
)
run(clf)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:    1.4s
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:    1.9s
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed:    2.3s
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:   10.2s
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   11.9s
[Parallel(n_jobs=7)]: Done  47 tasks      | elapsed:   13.4s
[Parallel(n_jobs=7)]: Done  58 tasks      | elapsed:   23.5s
[Parallel(n_jobs=7)]: Done  71 tasks      | elapsed:   26.0s
[Parallel(n_jobs=7)]: Done  87 out of  90 | elapsed:   41.5s remaining:    1.4s
[Parallel(n_jobs=7)]: Done  90 out of  90 | elapsed:   44.8s finished


Validation score: 0.967848856479947
Test score 0.9676884838442419


In [24]:
# Show the KNN configuration the grid search selected.
clf.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=1,
                     weights='uniform')

In [25]:
# Refit KNN directly with the settings the grid search above selected
# (n_neighbors=10, p=1; weights is left at 'uniform', as in the selected repr).
clf = KNeighborsClassifier(n_neighbors=10, p=1)
run(clf)

Validation score: 0.967848856479947
Test score 0.9676884838442419


In [26]:
# Hyper-parameter grid for gradient boosting: a single loss and three learning
# rates (3 candidates).
parameters = dict(
    loss=['deviance'],
    learning_rate=[0.01, 0.1, 1],
)

# 5-fold cross-validated search on the training split using 7 workers; run()
# fits the search and prints validation/test scores.
clf = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=parameters,
    n_jobs=7,
    verbose=10,
    cv=5,
)
run(clf)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 out of  15 | elapsed:   29.7s remaining:  1.4min
[Parallel(n_jobs=7)]: Done   6 out of  15 | elapsed:   31.1s remaining:   46.7s
[Parallel(n_jobs=7)]: Done   8 out of  15 | elapsed:   40.4s remaining:   35.3s
[Parallel(n_jobs=7)]: Done  10 out of  15 | elapsed:   44.4s remaining:   22.2s
[Parallel(n_jobs=7)]: Done  12 out of  15 | elapsed:   49.4s remaining:   12.3s
[Parallel(n_jobs=7)]: Done  15 out of  15 | elapsed:  1.5min finished


Validation score: 0.9635399403380842
Test score 0.9821043910521955


In [27]:
# Show the gradient-boosting configuration the grid search selected.
clf.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [28]:
# Refit gradient boosting with library defaults; the grid search above selected
# learning_rate=0.1, which is what a default-constructed estimator reports.
clf = GradientBoostingClassifier()
run(clf)

Validation score: 0.963705667882002
Test score 0.9821043910521955


In [29]:
# Hyper-parameter grid for bagging: ensemble size only (4 candidates).
parameters = dict(
    n_estimators=[5, 10, 20, 50],
)

# 5-fold cross-validated search on the training split using 7 workers; run()
# fits the search and prints validation/test scores.
clf = GridSearchCV(
    estimator=BaggingClassifier(),
    param_grid=parameters,
    n_jobs=7,
    verbose=10,
    cv=5,
)
run(clf)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:    0.8s
[Parallel(n_jobs=7)]: Done  10 out of  20 | elapsed:    1.2s remaining:    1.2s
[Parallel(n_jobs=7)]: Done  13 out of  20 | elapsed:    1.8s remaining:    1.0s
[Parallel(n_jobs=7)]: Done  16 out of  20 | elapsed:    2.7s remaining:    0.7s
[Parallel(n_jobs=7)]: Done  20 out of  20 | elapsed:    3.4s finished


Validation score: 0.9405038117335102
Test score 0.9514498757249379


In [30]:
# Show the bagging configuration the grid search selected.
clf.best_estimator_

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=20,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)

In [31]:
# Fit a default-constructed bagging classifier.
# NOTE(review): the grid search above selected n_estimators=20, which is not
# passed here — confirm this cell is meant as a default baseline.
clf = BaggingClassifier()
run(clf)

Validation score: 0.9310573417301956
Test score 0.9415078707539354


In [32]:
# Hyper-parameter grid for extra trees: ensemble size only (3 candidates).
parameters = dict(
    n_estimators=[500, 1000, 1500],
)

# 5-fold cross-validated search on the training split using 7 workers; run()
# fits the search and prints validation/test scores.
clf = GridSearchCV(
    estimator=ExtraTreesClassifier(),
    param_grid=parameters,
    n_jobs=7,
    verbose=10,
    cv=5,
)
run(clf)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 out of  15 | elapsed:    4.6s remaining:   12.6s
[Parallel(n_jobs=7)]: Done   6 out of  15 | elapsed:    8.1s remaining:   12.2s
[Parallel(n_jobs=7)]: Done   8 out of  15 | elapsed:    9.1s remaining:    7.9s
[Parallel(n_jobs=7)]: Done  10 out of  15 | elapsed:   12.3s remaining:    6.2s
[Parallel(n_jobs=7)]: Done  12 out of  15 | elapsed:   15.6s remaining:    3.9s
[Parallel(n_jobs=7)]: Done  15 out of  15 | elapsed:   19.0s finished


Validation score: 0.9847530659595625
Test score 0.9830985915492958


In [33]:
# Show the extra-trees configuration the grid search selected.
clf.best_estimator_

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=1000,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [34]:
# Fit extra trees with 50 estimators.
# NOTE(review): the grid search above selected n_estimators=1000 and never
# tried 50 — confirm the smaller ensemble is a deliberate choice.
clf = ExtraTreesClassifier(n_estimators=50)
run(clf)

Validation score: 0.9821014252568777
Test score 0.9824357912178956


In [35]:
# Hyper-parameter grid for the random forest: ensemble size only (4 candidates).
parameters = dict(
    n_estimators=[100, 150, 200, 250],
)

# 5-fold cross-validated search on the training split using 7 workers; run()
# fits the search and prints validation/test scores.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
    n_jobs=7,
    verbose=10,
    cv=5,
)
run(clf)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:    1.9s
[Parallel(n_jobs=7)]: Done  10 out of  20 | elapsed:    3.4s remaining:    3.4s
[Parallel(n_jobs=7)]: Done  13 out of  20 | elapsed:    4.8s remaining:    2.6s
[Parallel(n_jobs=7)]: Done  16 out of  20 | elapsed:    5.5s remaining:    1.4s
[Parallel(n_jobs=7)]: Done  20 out of  20 | elapsed:    7.0s finished


Validation score: 0.9751408684123302
Test score 0.9827671913835957


In [36]:
# Show the random-forest configuration the grid search selected.
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [37]:
# Refit the random forest directly with the ensemble size the grid search
# above selected (n_estimators=150).
clf = RandomForestClassifier(n_estimators=150)
run(clf)

Validation score: 0.9764666887636725
Test score 0.9822700911350456


In [38]:
# Hyper-parameter grid for the support vector classifier: kernel type and
# regularisation strength C (4 x 3 = 12 candidates).
parameters = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[1, 10, 50],
)

# 5-fold cross-validated search on the training split using 7 workers; run()
# fits the search and prints validation/test scores.
clf = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    n_jobs=7,
    verbose=10,
    cv=5,
)
run(clf)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:    1.7s
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:   13.6s
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed:   15.2s
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:   21.3s
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   24.4s
[Parallel(n_jobs=7)]: Done  47 tasks      | elapsed:   26.7s
[Parallel(n_jobs=7)]: Done  54 out of  60 | elapsed:   27.7s remaining:    3.1s
[Parallel(n_jobs=7)]: Done  60 out of  60 | elapsed:   28.6s finished


Validation score: 0.9353662578720583
Test score 0.9411764705882353


In [39]:
# Show the SVC configuration the grid search selected.
clf.best_estimator_

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [40]:
# Refit the SVC directly with the C the grid search above selected (C=10); the
# kernel is left at its default, which the selected repr shows as 'rbf'.
clf = SVC(C=10)
run(clf)



Validation score: 0.9353662578720583
Test score 0.9411764705882353
