In [125]:
from Pipeline import BLEPipeline, WifiPipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random, time

from itertools import izip, combinations

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

from imblearn.under_sampling import RandomUnderSampler

# Plot ROC
from sklearn.metrics import roc_curve, roc_auc_score
import scikitplot as skplt

# Tuning 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

## Wi-Fi

In [5]:
w = WifiPipeline()

In [6]:
df = w.make_dataframe()

Router     101880
Mini        75782
Dropcam     45912
Kasa        17156
Netcam2      3056
Netcam3      2961
Netcam1      2314
Switch2      2204
Switch1      1963
Switch3      1955
Insight      1738
Switch4      1504
Lifx2         545
Lifx1         495
TpPlug        366
TpBulb        191
Name: Name, dtype: int64


In [10]:
df_train = df[df["Set"]=="train"]
df_test = df[df["Set"]=="test"]

### Down-sampling

In [12]:
# List variables that do not work with resampling methods
dataframe_vars = {'Name', 'SourceAddr', 'Set', 'SubtypeNum'}
onehotEncoded_vars = {'Belkin', 'Data', 'Dropcam', 
                      'Lifi', 'Netgear', 'QoS_Data', 'QoS_Null', 'Tp-link'}
response_vars = {"DeviceType","bulb","camera","router","plug"}

# Prep X, y
y = df_train["DeviceType"]
col_drop = response_vars | dataframe_vars | set(['Subtype', 'Vendor'])
X = df_train.drop(col_drop, axis=1)

In [58]:
rds = RandomUnderSampler()
X_downsampled, y_downsampled = rds.fit_resample(X, y)

In [64]:
# Recreate dataframe
X_downsampled = pd.DataFrame(data=X_downsampled, columns=X.columns)
y_downsampled = pd.DataFrame(data=y_downsampled, columns=['DeviceType'])

# Onehot encode 'DeviceType'
devicetype_series = pd.get_dummies(y_downsampled['DeviceType'])
y_downsampled = pd.concat([y_downsampled, devicetype_series], axis=1)
y_downsampled = y_downsampled.drop(['DeviceType'],axis=1)

# Combine X and y into one dataframe
df_train_downsampled = pd.concat([X_downsampled, y_downsampled], axis=1)

### Hyperparameter tuning

In [67]:
def tune_gridsearch(classifier, param_grid, df_train, features_list, y_list):
    start_time = time.time()
    
    X_train = df_train[features_list]
    
    grid = GridSearchCV(estimator=classifier, scoring='roc_auc', 
                        param_grid=param_grid, cv=10)
    grid_results = dict()
    
    for device in y_list:
        # Adjust y_train, y_test
        y_train = df_train[device]

        grid_result = grid.fit(X_train, y_train)
        grid_results[device] = grid_result.cv_results_

    end_time = time.time() -start_time
    
    return dict(end_time=end_time, grid_results=grid_results)

In [68]:
# Wifi: Define which features to use
features_list = [
        # Packet info
        "PacketLength", 
        
        # Vendor 
         "Belkin", "Dropcam", "Lifi", "Netgear", "Tp-link",
    
        # 802.11 Data subtype
        "Data", "QoS_Data", "QoS_Null",

        # Associated Packets
        "Assoc_Packets"]

# Define what the response classes are
y_list = ["camera", "bulb", "plug"]

In [69]:
# Define grid values
# KNN
n_neighbors = np.arange(1,11,2)
knn_param_grid = dict(n_neighbors=n_neighbors)

# RF
max_features = np.linspace(2, len(features_list), num=5, dtype=int)
rf_param_grid = dict(max_features=max_features)

# LDA
n_components = np.arange(1,4)
lda_param_grid = dict(n_components=n_components)

In [70]:
# Set up classifiers
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
lda = LinearDiscriminantAnalysis()

In [71]:
df_train_downsampled.columns

Index([u'Time', u'PacketLength', u'Belkin', u'Dropcam', u'Lifi', u'Netgear',
       u'Tp-link', u'Data', u'QoS_Data', u'QoS_Null', u'Assoc_Packets',
       u'bulb', u'camera', u'plug'],
      dtype='object')

In [76]:
w_knn = tune_gridsearch(knn, knn_param_grid, df_train_downsampled, 
                        features_list, y_list)
w_rf = tune_gridsearch(rf, rf_param_grid, df_train_downsampled, 
                       features_list, y_list)
w_lda = tune_gridsearch(lda, lda_param_grid, df_train_downsampled, 
                        features_list, y_list)

## BLE

In [82]:
b = BLEPipeline()

### Dataframe Creation and Resampling

In [83]:
df = b.make_dataframe()

August2    210644
Home1       54902
Home2       54516
Push        30661
Kevo        19430
August1     15047
Weather      8101
Room2        7698
Room1        7239
Door1        6696
Door2        3587
Name: Name, dtype: int64


In [84]:
df_train = df[df["Set"]=="train"]
df_test = df[df["Set"]=="test"]

### Down-sampling

In [85]:
# List variables that do not work with resampling methods
dataframe_vars = {'Name', 'DeviceName', 'Set', 'AccessAddr', 
                  'AdvertAddr','PDUTypeNum'}
onehotEncoded_vars = {'ADV_DIRECT_IND', 'ADV_IND', 'ADV_NONCONN_IND', 
                      'ADV_SCAN_IND', 'CONNECT_REQ', 'SCAN_REQ', 'SCAN_RSP'}
response_vars = {"DeviceType","door","lock","temp"}

# Prep X, y
y = df_train["DeviceType"]
col_drop = response_vars | dataframe_vars | onehotEncoded_vars
X = df_train.drop(col_drop, axis=1)

In [111]:
rds = RandomUnderSampler()
X_downsampled, y_downsampled = rds.fit_resample(X, y)

In [112]:
# Onehot encode RFChannel and PDUType
X_downsampled = pd.DataFrame(data=X_downsampled, 
                             columns=['BLE_LL_Length', 'RFChannel', 
                                      'PacketLength', 'Time', 'PDUType',
                                      'Assoc_Packets'])
rfchannel_series = pd.get_dummies(X_downsampled["RFChannel"])
pdutype_series = pd.get_dummies(X_downsampled["PDUType"])
X_downsampled = pd.concat([X_downsampled, rfchannel_series, pdutype_series], 
                          axis=1)
X_downsampled = X_downsampled.drop(['RFChannel','PDUType'], axis=1)

# Onehot encode 'DeviceType'
y_downsampled = pd.DataFrame(data=y_downsampled, columns=['DeviceType'])
devicetype_series = pd.get_dummies(y_downsampled['DeviceType'])
y_downsampled = pd.concat([y_downsampled, devicetype_series], axis=1)
y_downsampled = y_downsampled.drop(['DeviceType'],axis=1)

# Combine X and y into one dataframe
df_train_downsampled = pd.concat([X_downsampled, y_downsampled], axis=1)

### Hyperparameter tuning

In [118]:
# Define grid values
# KNN
n_neighbors = np.arange(1,11,2)
knn_param_grid = dict(n_neighbors=n_neighbors)

# RF
max_features = np.linspace(2, len(features_list), num=5, dtype=int)
rf_param_grid = dict(max_features=max_features)

# LDA
n_components = np.arange(1,4)
lda_param_grid = dict(n_components=n_components)

In [119]:
# Set up classifiers
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
lda = LinearDiscriminantAnalysis()

In [121]:
b_knn = tune_gridsearch(knn, knn_param_grid, df_train_downsampled, 
                        X_downsampled.columns, y_downsampled.columns)
b_rf = tune_gridsearch(rf, rf_param_grid, df_train_downsampled, 
                       X_downsampled.columns, y_downsampled.columns)
b_lda = tune_gridsearch(lda, lda_param_grid, df_train_downsampled, 
                        X_downsampled.columns, y_downsampled.columns)

# Find best hyperparameters

In [122]:
# Find best performing hyperparameter across all device types
def find_best_params(grid_result, labels, print_tuning=False):
    # Get list of all test scores for each device
    mean_test_scores = [val['mean_test_score'] for key,val 
                        in grid_result['grid_results'].items()]

    # Store into dataframe
    tuning_results = (pd.DataFrame(data=mean_test_scores, index=labels)).T

    # Calculate mean test score across all devices
    tuning_results['Mean'] = tuning_results.mean(axis=1)
    
    if print_tuning:
        print tuning_results
        
    # Find best mean test score
    best_mean_score = max(tuning_results['Mean'])
    best_mean_params_index = tuning_results['Mean'].idxmax(axis=0)

    # Get a key from the grid_result dict to access the 'params'
    key = grid_result['grid_results'].keys()[0]
    
    # Return best parameters    
    return dict(best_params=grid_result['grid_results'][key]
                ['params'][best_mean_params_index], 
                best_mean_score=best_mean_score)

In [123]:
wifi_grid_results = [w_knn, w_rf, w_lda]
classifiers = ['knn', 'rf', 'lda']

print "Best performing hyperparameters for Wifi devices:\n"
for result, classifier in zip(wifi_grid_results,classifiers):
    x = find_best_params(result, ["camera", "bulb", "plug"])
    print '{classifier}: {best_parameter}     {best_score}'.format(
        classifier=classifier, best_parameter=x['best_params'], 
        best_score=x['best_mean_score'])


Best performing hyperparameters for Wifi devices:

knn: {'n_neighbors': 9}     0.995989406049
rf: {'max_features': 4}     0.9988501889
lda: {'n_components': 1}     0.996893196182


In [124]:
ble_grid_results = [b_knn, b_rf, b_lda]
classifiers = ['knn', 'rf', 'lda']
print "Best performing hyperparameters for BLE devices:\n"
for result, classifier in zip(ble_grid_results,classifiers):
    x = find_best_params(result, ["door", "lock", "temp"])
    print '{classifier}: {best_parameter}     {best_score}'.format(
        classifier=classifier, best_parameter=x['best_params'], 
        best_score=x['best_mean_score'])

Best performing hyperparameters for BLE devices:

knn: {'n_neighbors': 1}     0.682970945855
rf: {'max_features': 7}     0.997270464851
lda: {'n_components': 1}     0.849935029112
