In [1]:
from Pipeline import BLEPipeline, WifiPipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random, time

from itertools import izip, combinations

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Plot ROC
from sklearn.metrics import roc_curve, roc_auc_score
import scikitplot as skplt

# Tuning 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [24]:
w = WifiPipeline()
b = BLEPipeline()

**Dataframe Creation**

In [3]:
df = w.make_dataframe()

Mini       104280
Router     103593
Dropcam     64568
Kasa        23753
Netcam3      4867
Netcam1      4446
Netcam2      4407
Switch2      3046
Switch1      2668
Switch3      2634
Insight      2556
Switch4      2206
Lifx2         627
TpPlug        587
Lifx1         540
TpBulb        202
Name: Name, dtype: int64


In [25]:
b_df = b.make_dataframe()

August2    224739
Home1       58810
Home2       58614
Push        32761
Kevo        21107
August1     17314
Weather      8643
Room2        8133
Room1        7728
Door1        7374
Door2        4154
Name: Name, dtype: int64


In [4]:
df_train = df[df["Set"]=="train"]
df_test = df[df["Set"]=="test"]

In [31]:
b_df_train = b_df[b_df["Set"]=="train"]
b_df_test = b_df[b_df["Set"]=="test"]




**Resampling**

In [5]:
# Wifi: Sampling by DeviceType
category = "DeviceType"

# Undersampling:
kind = "under"
df_train_sampled = w.resample(df_train, kind=kind, category=category)
df_test_sampled = w.resample(df_test, kind=kind, category=category)
df_undersample_devtype = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# Oversampling:
kind = "over"
df_train_sampled = w.resample(df_train, kind=kind, category=category)
df_test_sampled = w.resample(df_test, kind=kind, category=category)
df_oversample_devtype = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# Mid-sampling:
kind = "mid"
df_train_sampled = w.resample(df_train, kind=kind, category=category)
df_test_sampled = w.resample(df_test, kind=kind, category=category)
df_midsample_devtype = pd.concat([df_train_sampled, df_test_sampled], axis=0)

In [28]:
print "Undersample:", df_undersample_devtype["DeviceType"].value_counts()[0]
print "Midsample:", df_midsample_devtype["DeviceType"].value_counts()[0]
print "Oversample:", df_oversample_devtype["DeviceType"].value_counts()[0]

Undersample: 1369
Midsample: 76214
Oversample: 143804


In [32]:
# BLE: Sampling by DeviceType
category = "DeviceType"

# Undersampling:
kind = "under"
df_train_sampled = w.resample(b_df_train, kind=kind, category=category)
df_test_sampled = w.resample(b_df_test, kind=kind, category=category)
b_df_undersample_devtype = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# Oversampling:
kind = "over"
df_train_sampled = w.resample(b_df_train, kind=kind, category=category)
df_test_sampled = w.resample(b_df_test, kind=kind, category=category)
b_df_oversample_devtype = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# Mid-sampling:
kind = "mid"
df_train_sampled = w.resample(b_df_train, kind=kind, category=category)
df_test_sampled = w.resample(b_df_test, kind=kind, category=category)
b_df_midsample_devtype = pd.concat([df_train_sampled, df_test_sampled], axis=0)

In [33]:
print "Undersample:", b_df_undersample_devtype["DeviceType"].value_counts()[0]
print "Midsample:", b_df_midsample_devtype["DeviceType"].value_counts()[0]
print "Oversample:", b_df_oversample_devtype["DeviceType"].value_counts()[0]

Undersample: 42575
Midsample: 57265
Oversample: 349537


In [6]:
# # Sampling by Name
# category = "Name"

# # Undersampling:
# kind = "under"
# df_train_sampled = w.resample(df_train, kind=kind, category=category)
# df_test_sampled = w.resample(df_test, kind=kind, category=category)
# df_undersample_name = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# # Oversampling:
# kind = "over"
# df_train_sampled = w.resample(df_train, kind=kind, category=category)
# df_test_sampled = w.resample(df_test, kind=kind, category=category)
# df_oversample_name = pd.concat([df_train_sampled, df_test_sampled], axis=0)

In [10]:
# Set up vsall classification
# all_under_devtype = w.one_vs_all_classify(df_undersample, features_list, y_list, output='plot')

Total time (one vs all_classify): 3.20186901093



**Tuning using GridSearchCV**

**Wifi Devices**

In [116]:
# Wifi: Define which features to use
features_list = [
        # Packet info
        "PacketLength", "Duration", 
        
        # Vendor 
         "Belkin", "Dropcam", "Lifi", "Netgear", "Tp-link",
    
        # 802.11 Data subtype
        "Data", "QoS_Data", "QoS_Null",

        # Associated Packets
        "Assoc_Packets"]

# Define what the response classes are
y_list = ["camera", "bulb", "plug"]

In [117]:
# Define grid values
# KNN
n_neighbors = np.arange(1,11,2)
knn_param_grid = dict(n_neighbors=n_neighbors)

# RF
max_features = np.linspace(2, len(features_list), num=5, dtype=int)
rf_param_grid = dict(max_features=max_features)

# LDA
solver = ['lsqr']
n_components = np.arange(1,4)
lda_param_grid = dict(solver=solver, n_components=n_components)

# QDA
qda_param_grid = dict()

In [118]:
# DF Setup
df_to_use = df_midsample_devtype
df_train = df_to_use[df_to_use['Set']=='train']
df_test = df_to_use[df_to_use['Set']=='test']
X_train = df_train[features_list]
X_test = df_test[features_list]

folds = 10

In [119]:
start_time = time.time()
# KNN GridSearch Setup
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=folds)

w_knn_grid_results = []
for device in y_list:
    # Adjust y_train, y_test
    y_train = df_train[device]
    y_test = df_test[device]
    
    grid_result = knn_grid.fit(X_train, y_train)
    w_knn_grid_results.append(grid_result.cv_results_)
    print "Device: ", device
    print "Best params:", grid_result.best_params_, "score = ", grid_result.best_score_
    print ""   
    
end_time = time.time() -start_time
print end_time

Device:  camera
Best params: {'n_neighbors': 7} score =  0.9862618778460295

Device:  bulb
Best params: {'n_neighbors': 1} score =  0.9999863799185519

Device:  plug
Best params: {'n_neighbors': 7} score =  0.9850723907328965

17435.578325


In [120]:
start_time = time.time()
# RF GridSearch Setup
rf = RandomForestClassifier(n_estimators=1000)
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=folds)

w_rf_grid_results = []
for device in y_list:
    # Adjust y_train, y_test
    y_train = df_train[device]
    y_test = df_test[device]
    
    grid_result = rf_grid.fit(X_train, y_train)
    w_rf_grid_results.append(grid_result.cv_results_)
    print "Device: ", device
    print "Best params:", grid_result.best_params_, "score = ", grid_result.best_score_
    print ""
    
end_time = time.time() -start_time
print end_time

Device:  camera
Best params: {'max_features': 11} score =  0.9875603256107471

Device:  bulb
Best params: {'max_features': 2} score =  1.0

Device:  plug
Best params: {'max_features': 8} score =  0.9877510067510203

6986.4978931


In [121]:
start_time = time.time()
# LDA GridSearch Setup
lda = LinearDiscriminantAnalysis()
lda_grid = GridSearchCV(estimator=lda, param_grid=lda_param_grid, cv=folds)

w_lda_grid_results = []
# Run GridSearch for each device type
for device in y_list:
    # Adjust y_train, y_test
    y_train = df_train[device]
    y_test = df_test[device]
    
    grid_result = lda_grid.fit(X_train, y_train)
    w_lda_grid_results.append(grid_result.cv_results_)
    print "Device: ", device
    print "Best params:", grid_result.best_params_, "score = ", grid_result.best_score_
    print ""
    
end_time = time.time() -start_time
print end_time

Device:  camera
Best params: {'n_components': 1, 'solver': 'lsqr'} score =  0.9737904232667312

Device:  bulb
Best params: {'n_components': 1, 'solver': 'lsqr'} score =  0.9785256715835161

Device:  plug
Best params: {'n_components': 1, 'solver': 'lsqr'} score =  0.9737904232667312

7.1946709156


**BLE Devices**

In [122]:
# BLE: Feature list and response variables
features_list = [
#     'AccessAddr', 'AdvertAddr', 'ScanAddr',
    'BLE_LL_Length', 'TxAddr', 'CompanyID',
    'RFChannel',
    'PacketLength', 'Time', 'Assoc_Packets',
    'ADV_DIRECT_IND', 'ADV_IND', 'ADV_NONCONN_IND', 
    'ADV_SCAN_IND', 'CONNECT_REQ', 'SCAN_REQ', 'SCAN_RSP']

y_list = ["door", "lock", "temp"]

In [123]:
# Define grid values
# KNN
n_neighbors = np.arange(1,11,2)
knn_param_grid = dict(n_neighbors=n_neighbors)

# RF
max_features = np.linspace(2, len(features_list), num=5, dtype=int)
rf_param_grid = dict(max_features=max_features)

# LDA
solver = ['lsqr']
n_components = np.arange(1,4)
lda_param_grid = dict(solver=solver, n_components=n_components)

In [124]:
# DF Setup
df_to_use = b_df_midsample_devtype
df_train = df_to_use[df_to_use['Set']=='train']
df_test = df_to_use[df_to_use['Set']=='test']
X_train = df_train[features_list]
X_test = df_test[features_list]

folds = 10

In [125]:
start_time = time.time()
# KNN GridSearch Setup
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=folds)

b_knn_grid_results = []
for device in y_list:
    # Adjust y_train, y_test
    y_train = df_train[device]
    y_test = df_test[device]
    
    grid_result = knn_grid.fit(X_train, y_train)
    b_knn_grid_results.append(grid_result.cv_results_)
    print "Device: ", device
    print "Best params:", grid_result.best_params_, "score = ", grid_result.best_score_
    print ""
    
end_time = time.time() -start_time
print end_time

Device:  door
Best params: {'n_neighbors': 1} score =  0.950147900892833

Device:  lock
Best params: {'n_neighbors': 1} score =  0.9746872371027707

Device:  temp
Best params: {'n_neighbors': 1} score =  0.9747890037721512

491.940316916


In [126]:
start_time = time.time()
# RF GridSearch Setup
rf = RandomForestClassifier(n_estimators=1000)
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=folds)

b_rf_grid_results = []
for device in y_list:
    # Adjust y_train, y_test
    y_train = df_train[device]
    y_test = df_test[device]
    
    grid_result = rf_grid.fit(X_train, y_train)
    b_rf_grid_results.append(grid_result.cv_results_)
    print "Device: ", device
    print "Best params:", grid_result.best_params_, "score = ", grid_result.best_score_
    print ""
    
end_time = time.time() -start_time
print end_time

Device:  door
Best params: {'max_features': 11} score =  0.9789139461043719

Device:  lock
Best params: {'max_features': 2} score =  0.9999592933322479

Device:  temp
Best params: {'max_features': 5} score =  0.9794499172297756

9211.42286301


In [127]:
start_time = time.time()
# LDA GridSearch Setup
lda = LinearDiscriminantAnalysis()
lda_grid = GridSearchCV(estimator=lda, param_grid=lda_param_grid, cv=folds)

b_lda_grid_results = []
# Run GridSearch for each device type
for device in y_list:
    # Adjust y_train, y_test
    y_train = df_train[device]
    y_test = df_test[device]
    
    grid_result = lda_grid.fit(X_train, y_train)
    b_lda_grid_results.append(grid_result.cv_results_)
    print "Device: ", device
    print "Best params:", grid_result.best_params_, "score = ", grid_result.best_score_
    print ""
    
end_time = time.time() -start_time
print end_time

Device:  door
Best params: {'n_components': 1, 'solver': 'lsqr'} score =  0.9300862981356346

Device:  lock
Best params: {'n_components': 1, 'solver': 'lsqr'} score =  0.9660642079839344

Device:  temp
Best params: {'n_components': 1, 'solver': 'lsqr'} score =  0.980019810578306

5.17680811882


**Determine best hyperparameters**

In [128]:
wifi_grid_results = [w_knn_grid_results, w_rf_grid_results, w_lda_grid_results]
ble_grid_results = [b_knn_grid_results, b_rf_grid_results, b_lda_grid_results]
classifiers = ['knn', 'rf', 'lda']
wifi_devices = ["camera", "bulb", "plug"]
ble_devices = ["door", "lock", "temp"]

In [133]:
for x in w_knn_grid_results:
    print x['mean_test_score']

[0.97566999 0.98582604 0.98579426 0.98626188 0.98552639]
[0.99998638 0.99994552 0.99991374 0.99989558 0.99988196]
[0.98087741 0.98414623 0.98178087 0.98507239 0.98373308]


In [None]:
for results,c in zip(wifi_grid_results, classifiers):
    print c
    for result,w in zip(results, wifi_devices):
        print w
        print result.best_params_