In [4]:
from Pipeline import BLEPipeline, WifiPipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random, time

from itertools import izip, combinations

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Plot ROC
from sklearn.metrics import roc_curve, roc_auc_score
import scikitplot as skplt

# Tuning 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [5]:
# One pipeline helper per radio technology; both are reused throughout the notebook.
w, b = WifiPipeline(), BLEPipeline()

**Dataframe Creation**

In [6]:
# Build the Wifi dataframe. The per-device ("Name") row counts recorded in this
# cell's output appear to be printed by make_dataframe() itself, since the
# assignment has no displayed value.
df = w.make_dataframe()

Mini       104280
Router     103593
Dropcam     64568
Kasa        23753
Netcam3      4867
Netcam1      4446
Netcam2      4407
Switch2      3046
Switch1      2668
Switch3      2634
Insight      2556
Switch4      2206
Lifx2         627
TpPlug        587
Lifx1         540
TpBulb        202
Name: Name, dtype: int64


In [7]:
# Build the BLE dataframe. The per-device ("Name") row counts recorded in this
# cell's output appear to be printed by make_dataframe() itself, since the
# assignment has no displayed value.
b_df = b.make_dataframe()

August2    224739
Home1       58810
Home2       58614
Push        32761
Kevo        21107
August1     17314
Weather      8643
Room2        8133
Room1        7728
Door1        7374
Door2        4154
Name: Name, dtype: int64


In [8]:
# Split the Wifi frame into its train and test partitions using the "Set" column.
wifi_set = df["Set"]
df_train = df.loc[wifi_set == "train"]
df_test = df.loc[wifi_set == "test"]

In [9]:
# Split the BLE frame into its train and test partitions using the "Set" column.
ble_set = b_df["Set"]
b_df_train = b_df.loc[ble_set == "train"]
b_df_test = b_df.loc[ble_set == "test"]

**Resampling**

In [10]:
# Wifi: Sampling by DeviceType
category = "DeviceType"

# For every strategy the train and test partitions are resampled separately and
# then stacked back into one frame. The strategies run in the original order
# (under, over, mid) so the sequence of resample() calls is unchanged.
wifi_resampled = {}
for kind in ("under", "over", "mid"):
    df_train_sampled = w.resample(df_train, kind=kind, category=category)
    df_test_sampled = w.resample(df_test, kind=kind, category=category)
    wifi_resampled[kind] = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# Undersampling:
df_undersample_devtype = wifi_resampled["under"]

# Oversampling:
df_oversample_devtype = wifi_resampled["over"]

# Mid-sampling:
df_midsample_devtype = wifi_resampled["mid"]

In [11]:
# Show the first entry of the DeviceType value counts for each sampling scheme.
# A single pre-formatted argument keeps the printed text identical whether
# print is a statement or a function.
for label, frame in [("Undersample", df_undersample_devtype),
                     ("Midsample", df_midsample_devtype),
                     ("Oversample", df_oversample_devtype)]:
    print("{}: {}".format(label, frame["DeviceType"].value_counts()[0]))

Undersample: 1369
Midsample: 76214
Oversample: 143804


In [12]:
# BLE: Sampling by DeviceType
category = "DeviceType"

# NOTE(review): the BLE frames are resampled through the Wifi pipeline object
# (w.resample), not b.resample. Presumably both pipelines share the same
# resample implementation -- confirm this is intentional.
#
# For every strategy the train and test partitions are resampled separately and
# then stacked back into one frame. The strategies run in the original order
# (under, over, mid) so the sequence of resample() calls is unchanged.
ble_resampled = {}
for kind in ("under", "over", "mid"):
    df_train_sampled = w.resample(b_df_train, kind=kind, category=category)
    df_test_sampled = w.resample(b_df_test, kind=kind, category=category)
    ble_resampled[kind] = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# Undersampling:
b_df_undersample_devtype = ble_resampled["under"]

# Oversampling:
b_df_oversample_devtype = ble_resampled["over"]

# Mid-sampling:
b_df_midsample_devtype = ble_resampled["mid"]

In [13]:
# Show the first entry of the DeviceType value counts for each sampling scheme.
# A single pre-formatted argument keeps the printed text identical whether
# print is a statement or a function.
for label, frame in [("Undersample", b_df_undersample_devtype),
                     ("Midsample", b_df_midsample_devtype),
                     ("Oversample", b_df_oversample_devtype)]:
    print("{}: {}".format(label, frame["DeviceType"].value_counts()[0]))

Undersample: 42575
Midsample: 57265
Oversample: 349537


In [6]:
# # Sampling by Name
# category = "Name"

# # Undersampling:
# kind = "under"
# df_train_sampled = w.resample(df_train, kind=kind, category=category)
# df_test_sampled = w.resample(df_test, kind=kind, category=category)
# df_undersample_name = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# # Oversampling:
# kind = "over"
# df_train_sampled = w.resample(df_train, kind=kind, category=category)
# df_test_sampled = w.resample(df_test, kind=kind, category=category)
# df_oversample_name = pd.concat([df_train_sampled, df_test_sampled], axis=0)

In [10]:
# Set up vsall classification
# all_under_devtype = w.one_vs_all_classify(df_undersample, features_list, y_list, output='plot')

Total time (one vs all_classify): 3.20186901093



**Tuning using GridSearchCV**

In [147]:
def tune_gridsearch(classifier, param_grid, df_train, features_list, y_list,
                    cv=10, scoring='roc_auc'):
    """Run one cross-validated grid search per response column.

    The same feature matrix is used for every search; only the target column
    changes. The search object is refit once per entry of ``y_list``.

    Parameters
    ----------
    classifier : estimator
        Estimator passed to ``GridSearchCV`` as ``estimator``.
    param_grid : dict
        Parameter grid passed to ``GridSearchCV``.
    df_train : pandas.DataFrame
        Training frame containing every column named in ``features_list`` and
        ``y_list``.
    features_list : list of str
        Columns of ``df_train`` used as predictors.
    y_list : list of str
        Columns of ``df_train`` used, one at a time, as the target.
    cv : int or cross-validation splitter, optional
        Forwarded to ``GridSearchCV``; defaults to the previous hard-coded 10.
    scoring : str or callable, optional
        Forwarded to ``GridSearchCV``; defaults to the previous hard-coded
        ``'roc_auc'``.

    Returns
    -------
    dict
        ``end_time``: wall-clock seconds spent on all searches together.
        ``grid_results``: mapping of target column -> ``cv_results_`` of the
        search fitted on that column, in the order given by ``y_list``.
    """
    # Local import: keeps this cell self-contained without touching the import cell.
    from collections import OrderedDict

    start_time = time.time()

    X_train = df_train[features_list]

    # Scoring documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    grid = GridSearchCV(estimator=classifier, scoring=scoring, param_grid=param_grid, cv=cv)

    # Ordered so that iterating the results follows y_list. A plain dict has an
    # arbitrary order on Python 2, which lets positional labelling downstream
    # attach scores to the wrong device.
    grid_results = OrderedDict()

    for device in y_list:
        # Only the target changes between searches.
        y_train = df_train[device]

        grid_result = grid.fit(X_train, y_train)
        grid_results[device] = grid_result.cv_results_

    # The key stays 'end_time' for existing callers although the value is an
    # elapsed duration, not a timestamp.
    elapsed = time.time() - start_time

    return dict(end_time=elapsed, grid_results=grid_results)

**Wifi Devices**

In [148]:
# Wifi: Define which features to use, grouped by the kind of column.
# NOTE(review): the vendor column is spelled "Lifi" while the device names in
# the dataframe output are Lifx1/Lifx2 -- confirm the column name matches what
# the pipeline produces.
wifi_packet_features = ["PacketLength", "Duration"]
wifi_vendor_features = ["Belkin", "Dropcam", "Lifi", "Netgear", "Tp-link"]
wifi_subtype_features = ["Data", "QoS_Data", "QoS_Null"]  # 802.11 Data subtype
wifi_assoc_features = ["Assoc_Packets"]

features_list = (
    wifi_packet_features
    + wifi_vendor_features
    + wifi_subtype_features
    + wifi_assoc_features
)

# Define what the response classes are
y_list = ["camera", "bulb", "plug"]

In [149]:
# Hyperparameter grids, one per classifier, for the grid search.

# KNN: odd neighbour counts 1, 3, 5, 7, 9
n_neighbors = np.arange(1, 11, 2)
knn_param_grid = {"n_neighbors": n_neighbors}

# RF: five integer values spread evenly between 2 and the number of features
max_features = np.linspace(2, len(features_list), num=5, dtype=int)
rf_param_grid = {"max_features": max_features}

# LDA: 1, 2, 3 discriminant components
# NOTE(review): scikit-learn documents n_components as bounded by
# min(n_classes - 1, n_features) and as affecting only transform(). If every
# response column is binary, 2 and 3 are out of range and no value can change
# the ROC AUC -- confirm this grid is what was intended.
n_components = np.arange(1, 4)
lda_param_grid = {"n_components": n_components}

In [150]:
# DF Setup: tune on the mid-sampled Wifi frame.
# NOTE: this rebinds df_train / df_test, which the earlier split cell defined
# from the unsampled Wifi frame and which the Wifi resampling cell reads.
# Re-running that resampling cell after this one would resample the
# already-resampled data.
df_to_use = df_midsample_devtype
set_column = df_to_use['Set']
df_train = df_to_use.loc[set_column == 'train']
df_test = df_to_use.loc[set_column == 'test']
X_train, X_test = df_train[features_list], df_test[features_list]

In [151]:
# Set up classifiers (default hyperparameters; the grid search overrides the tuned ones)
knn = KNeighborsClassifier()
# The forest is randomized; a fixed seed makes the tuning scores repeatable
# between kernel restarts instead of drifting from run to run.
rf = RandomForestClassifier(random_state=0)
lda = LinearDiscriminantAnalysis()

In [152]:
# Tune each classifier on the Wifi training set. Every call returns a dict with
# 'end_time' (elapsed seconds) and 'grid_results' (per-device cv_results_).
# The calls are kept as separate statements so that finished results survive a
# failure in a later call. In the recorded run, KNN alone took about 17,800 s,
# versus about 78 s for RF and 17 s for LDA.
w_knn = tune_gridsearch(knn, knn_param_grid, df_train, features_list, y_list)
w_rf = tune_gridsearch(rf, rf_param_grid, df_train, features_list, y_list)
w_lda = tune_gridsearch(lda, lda_param_grid, df_train, features_list, y_list)

**BLE Devices**

In [153]:
# BLE: Feature list and response variables, grouped by the kind of column.
# Address columns deliberately left out: 'AccessAddr', 'AdvertAddr', 'ScanAddr'
ble_header_features = ['BLE_LL_Length', 'TxAddr', 'CompanyID']
ble_channel_features = ['RFChannel']
ble_packet_features = ['PacketLength', 'Time', 'Assoc_Packets']
ble_pdu_type_features = [
    'ADV_DIRECT_IND', 'ADV_IND', 'ADV_NONCONN_IND',
    'ADV_SCAN_IND', 'CONNECT_REQ', 'SCAN_REQ', 'SCAN_RSP',
]

features_list = (
    ble_header_features
    + ble_channel_features
    + ble_packet_features
    + ble_pdu_type_features
)

y_list = ["door", "lock", "temp"]

In [154]:
# Hyperparameter grids, one per classifier, for the grid search.
# Rebuilt here because the RF grid depends on the length of the current features_list.

# KNN: odd neighbour counts 1, 3, 5, 7, 9
n_neighbors = np.arange(1, 11, 2)
knn_param_grid = {"n_neighbors": n_neighbors}

# RF: five integer values spread evenly between 2 and the number of features
max_features = np.linspace(2, len(features_list), num=5, dtype=int)
rf_param_grid = {"max_features": max_features}

# LDA: 1, 2, 3 discriminant components
# NOTE(review): scikit-learn documents n_components as bounded by
# min(n_classes - 1, n_features) and as affecting only transform(). If every
# response column is binary, 2 and 3 are out of range and no value can change
# the ROC AUC -- confirm this grid is what was intended.
n_components = np.arange(1, 4)
lda_param_grid = {"n_components": n_components}

In [155]:
# DF Setup: tune on the mid-sampled BLE frame.
# NOTE: this rebinds df_train / df_test to BLE data. The Wifi resampling cell
# reads those same names, so re-running it after this cell would feed it BLE
# rows instead of the Wifi split.
df_to_use = b_df_midsample_devtype
set_column = df_to_use['Set']
df_train = df_to_use.loc[set_column == 'train']
df_test = df_to_use.loc[set_column == 'test']
X_train, X_test = df_train[features_list], df_test[features_list]

In [156]:
# Set up classifiers (default hyperparameters; the grid search overrides the tuned ones)
knn = KNeighborsClassifier()
# The forest is randomized; a fixed seed makes the tuning scores repeatable
# between kernel restarts instead of drifting from run to run.
rf = RandomForestClassifier(random_state=0)
lda = LinearDiscriminantAnalysis()

In [157]:
# Tune each classifier on the BLE training set. Every call returns a dict with
# 'end_time' (elapsed seconds) and 'grid_results' (per-device cv_results_).
# The calls are kept as separate statements so that finished results survive a
# failure in a later call. In the recorded run, KNN took about 590 s, RF about
# 100 s and LDA about 14 s.
b_knn = tune_gridsearch(knn, knn_param_grid, df_train, features_list, y_list)
b_rf = tune_gridsearch(rf, rf_param_grid, df_train, features_list, y_list)
b_lda = tune_gridsearch(lda, lda_param_grid, df_train, features_list, y_list)

**Determine best hyperparameters**

In [158]:
# Find best performing hyperparameter across all device types
def find_best_params(grid_result, labels, print_tuning=False):
    """Return the parameter setting with the best score averaged over devices.

    Parameters
    ----------
    grid_result : dict
        Must contain ``'grid_results'``: a mapping of device name to a
        ``cv_results_``-style dict holding ``'mean_test_score'`` (one score per
        parameter setting) and ``'params'`` (the matching list of settings).
    labels : list of str
        Column labels for the per-device scores. When the labels are exactly
        the device names, each device's scores are looked up by name so the
        columns cannot be mislabelled; otherwise they are applied positionally
        in the mapping's iteration order.
    print_tuning : bool, optional
        If true, print the per-device score table including the ``'Mean'``
        column.

    Returns
    -------
    dict
        ``best_params``: the setting whose mean score across devices is highest.
        ``best_mean_score``: that mean score as a float.
    """
    per_device = grid_result['grid_results']

    # Plain-dict iteration order is arbitrary on Python 2, so prefer an explicit
    # lookup by label over relying on the order the devices happen to come out in.
    if len(labels) == len(per_device) and set(labels) == set(per_device):
        devices = list(labels)
    else:
        devices = list(per_device)

    # One row of scores per device
    mean_test_scores = [per_device[device]['mean_test_score'] for device in devices]

    # Store into dataframe: rows become parameter settings, columns become devices
    tuning_results = (pd.DataFrame(data=mean_test_scores, index=labels)).T

    # Calculate mean test score across all devices
    tuning_results['Mean'] = tuning_results.mean(axis=1)

    if print_tuning:
        # Parenthesised single argument: valid as both statement and function call.
        print(tuning_results)

    # Find best mean test score. Series.max()/idxmax() both skip NaN, so the
    # reported score always belongs to the reported setting. The builtin max()
    # returns NaN whenever the first mean is NaN.
    best_mean_params_index = tuning_results['Mean'].idxmax()
    best_mean_score = float(tuning_results['Mean'].max())

    # Every device was searched over the same grid, so any entry's 'params' will do.
    key = next(iter(per_device))

    # Return best parameters
    return dict(best_params=per_device[key]['params'][best_mean_params_index], best_mean_score=best_mean_score)

In [159]:
# Short classifier names, in the same order as the entries of each result list below.
classifiers = ['knn', 'rf', 'lda']

# Tuning outputs grouped per radio technology.
wifi_grid_results = [w_knn, w_rf, w_lda]
ble_grid_results = [b_knn, b_rf, b_lda]

In [160]:
# Print the elapsed tuning time per classifier: Wifi results first, then BLE.
for grid_results_group in (wifi_grid_results, ble_grid_results):
    for result, c in zip(grid_results_group, classifiers):
        print("{} :  {}".format(c, result['end_time']))

knn :  17792.3922579
rf :  78.4105529785
lda :  17.0793259144
knn :  588.389669895
rf :  101.951203823
lda :  14.065310955


In [161]:
# Report, for each technology, the best setting and mean score of every classifier.
report_sections = [
    ("Best performing hyperparameters for Wifi devices:\n", wifi_grid_results, ["camera", "bulb", "plug"]),
    ("Best performing hyperparameters for BLE devices:\n", ble_grid_results, ["door", "lock", "temp"]),
]

for section_number, (heading, section_results, section_labels) in enumerate(report_sections):
    if section_number > 0:
        # Blank lines separating the two sections
        print("\n\n")
    print(heading)
    for result, classifier in zip(section_results, classifiers):
        x = find_best_params(result, section_labels)
        line_template = '{classifier}: {best_parameter}     {best_score}'
        print(line_template.format(
            classifier=classifier, best_parameter=x['best_params'], best_score=x['best_mean_score']))

Best performing hyperparameters for Wifi devices:

knn: {'n_neighbors': 9}     0.998520117677
rf: {'max_features': 8}     0.999548723989
lda: {'n_components': 1}     0.998712625449



Best performing hyperparameters for BLE devices:

knn: {'n_neighbors': 3}     0.973305152936
rf: {'max_features': 14}     0.998289121573
lda: {'n_components': 1}     0.979018881822
