In [1]:
from Pipeline import BLEPipeline, WifiPipeline

# General data processing
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

# ML libraries
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_curve, roc_auc_score

# System libraries
from itertools import izip, combinations
import random, time

# Warning filtering
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Create dataset

In [2]:
# Record the wall-clock time at which the notebook run begins.
# NOTE(review): `time_start` is not read again in any cell visible here.
time_start = time.time()

In [3]:
# Pipeline object (from the project-local `Pipeline` module) used below to build
# the dataframe, downsample the training data and run the classifiers.
b = BLEPipeline()

In [4]:
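# Disabled step, kept for reference only.
# NOTE(review): `w` is never defined in this notebook (only `b = BLEPipeline()` is);
# if this extraction is re-enabled, confirm which pipeline object it should be called on.
# w.extract_packet_features(create_master=True)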

In [5]:
# Build the working dataframe through the pipeline. The output shown under this
# cell is a count per `Name` value (presumably packets per device -- confirm).
df = b.make_dataframe()

August2    210644
Home1       54902
Home2       54516
Push        30661
Kevo        19430
August1     15047
Weather      8101
Room2        7698
Room1        7239
Door1        6696
Door2        3587
Name: Name, dtype: int64


## Resample

### Downsampling

In [6]:
# Keep every row whose device type is neither "router" nor "plug"
# (plug is left out because there are not enough plug devices).
df = df[~((df["DeviceType"] == "router") | (df["DeviceType"] == "plug"))]

# Training partition; X and y for the resampling step are taken from it.
df_train = df.loc[df["Set"] == "train"]

# Columns that do not work with the resampling methods, grouped by origin.
ble_devicetypes = ["door", "lock", "temp"]
response_vars = set(["DeviceType"] + ble_devicetypes)
dataframe_vars = set(["Name", "DeviceName", "Set",
                      "AccessAddr", "AdvertAddr", "PDUTypeNum"])
onehotEncoded_vars = set(["ADV_DIRECT_IND", "ADV_IND",
                          "ADV_NONCONN_IND", "ADV_SCAN_IND",
                          "CONNECT_REQ", "SCAN_REQ", "SCAN_RSP"])

# Prep X, y: X is the training frame minus all of the columns above,
# y is the device-type label of each training row.
col_drop = set().union(response_vars, dataframe_vars, onehotEncoded_vars)
X = df_train.drop(col_drop, axis=1)
y = df_train["DeviceType"]

### Create trial dataframes

In [35]:
# Call the pipeline's downsampler ten times and keep every returned frame.
# Each call receives the training X / y plus the rows of `df` marked as the test set.
# NOTE(review): no random seed is set in this notebook; if `b.downsample` samples
# randomly, the ten trials are not reproducible across runs -- confirm.
df_trials = []
for i in range(10):
    df_downsampled = b.downsample(X, y, df.loc[df["Set"] == "test"])
    df_trials += [df_downsampled]

In [33]:
# Show which columns a downsampled trial frame carries (the Index displayed below).
# The previous content of this cell was a dangling `.remove(...)` call with no
# receiver, which is a SyntaxError and stops Restart & Run All.
df_trials[0].columns

Index([u'BLE_LL_Length', u'PacketLength', u'Time', u'Assoc_Packets',
       u'Channel_0', u'Channel_12', u'Channel_39', u'ADV_DIRECT_IND',
       u'ADV_IND', u'ADV_NONCONN_IND', u'ADV_SCAN_IND', u'SCAN_REQ',
       u'SCAN_RSP', u'door', u'lock', u'temp', u'Set'],
      dtype='object')

## Run trials

In [16]:
# Candidate predictor columns, assembled group by group.
# NOTE(review): "ADV_SCAN_IND" shows up in the trial columns printed above but is
# not listed here, while "CONNECT_REQ" is listed but absent from those columns --
# confirm both are intentional.
features_list = (
    # Packet info
    ["PacketLength", "BLE_LL_Length", "Time"]
    # Associate Packets
    + ["Assoc_Packets"]
    # Channel number
    + ["Channel_0", "Channel_12", "Channel_39"]
    # PDU Type
    + ["SCAN_RSP", "ADV_IND", "SCAN_REQ",
       "CONNECT_REQ", "ADV_NONCONN_IND", "ADV_DIRECT_IND"]
)

# Response columns, one per device type, handed to the one-vs-all classifier.
y_list = [
    "door",
    "lock",
    "temp",
]

In [48]:
# Run the one-vs-all classification once per downsampled trial and collect
# whatever `b.one_vs_all_classify` returns for each.
df_results = []
for i, trial in enumerate(df_trials):
    # Single-argument print(...) emits the same text ("Trial <i>") under both
    # Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
    print("Trial %d" % i)

    # Only pass features that exist in this trial frame; listed features that
    # are missing from its columns are skipped rather than raising a KeyError.
    trial_features = [x for x in features_list if x in trial.columns]
    result = b.one_vs_all_classify(trial, trial_features, y_list)
    df_results.append(result)

Trial 0
Device Type: door
RF


Metrics
          FN      FP    TN    TP  Accuracy  Precision    Recall        F1
door    3416  211090  7252   171  0.033448   0.000809  0.047672  0.001592
rest  211090    3416   171  7252  0.033448   0.679790  0.033214  0.063333
---------------------------------------------------------
KNN


Metrics
          FN      FP     TN     TP  Accuracy  Precision    Recall        F1
door    2762  190518  27824    825  0.129091   0.004312  0.229997  0.008465
rest  190518    2762    825  27824  0.129091   0.909697  0.127433  0.223551
---------------------------------------------------------
LDA


Metrics
          FN      FP    TN    TP  Accuracy  Precision    Recall        F1
door    3573  210662  7680    14  0.034669   0.000066  0.003903  0.000131
rest  210662    3573    14  7680  0.034669   0.682485  0.035174  0.066900
---------------------------------------------------------
Total time (classifiers): 6.9734120369

Device Type: lock
RF


Metrics
          FN    

Device Type: door
RF


Metrics
          FN      FP    TN    TP  Accuracy  Precision    Recall        F1
door    3409  209684  8658   178  0.039815   0.000848  0.049624  0.001668
rest  209684    3409   178  8658  0.039815   0.717494  0.039653  0.075153
---------------------------------------------------------
KNN


Metrics
          FN      FP     TN     TP  Accuracy  Precision    Recall        F1
door    2812  191003  27339    775   0.12668   0.004041  0.216058  0.007934
rest  191003    2812    775  27339   0.12668   0.906736  0.125212  0.220038
---------------------------------------------------------
LDA


Metrics
          FN      FP    TN    TP  Accuracy  Precision    Recall        F1
door    3573  210662  7680    14  0.034669   0.000066  0.003903  0.000131
rest  210662    3573    14  7680  0.034669   0.682485  0.035174  0.066900
---------------------------------------------------------
Total time (classifiers): 7.00360894203

Device Type: lock
RF


Metrics
          FN      FP   

Device Type: door
RF


Metrics
          FN      FP    TN    TP  Accuracy  Precision    Recall        F1
door    3373  210610  7732   214  0.035804   0.001015  0.059660  0.001996
rest  210610    3373   214  7732  0.035804   0.696263  0.035412  0.067397
---------------------------------------------------------
KNN


Metrics
          FN      FP     TN     TP  Accuracy  Precision    Recall        F1
door    2794  190362  27980    793   0.12965   0.004148  0.221076  0.008144
rest  190362    2794    793  27980   0.12965   0.909209  0.128148  0.224634
---------------------------------------------------------
LDA


Metrics
          FN      FP    TN    TP  Accuracy  Precision    Recall        F1
door    3573  210662  7680    14  0.034669   0.000066  0.003903  0.000131
rest  210662    3573    14  7680  0.034669   0.682485  0.035174  0.066900
---------------------------------------------------------
Total time (classifiers): 6.55016899109

Device Type: lock
RF


Metrics
          FN      FP   

Device Type: door
RF


Metrics
          FN      FP    TN    TP  Accuracy  Precision    Recall        F1
door    3401  210545  7797   186  0.035971   0.000883  0.051854  0.001736
rest  210545    3401   186  7797  0.035971   0.696285  0.035710  0.067936
---------------------------------------------------------
KNN


Metrics
          FN      FP     TN     TP  Accuracy  Precision    Recall        F1
door    2797  190080  28262    790  0.130907   0.004139  0.220240  0.008125
rest  190080    2797    790  28262  0.130907   0.909946  0.129439  0.226639
---------------------------------------------------------
LDA


Metrics
          FN      FP    TN    TP  Accuracy  Precision    Recall        F1
door    3573  210662  7680    14  0.034669   0.000066  0.003903  0.000131
rest  210662    3573    14  7680  0.034669   0.682485  0.035174  0.066900
---------------------------------------------------------
Total time (classifiers): 7.07037901878

Device Type: lock
RF


Metrics
          FN      FP   

## Reports

### Confusion Matrix

In [None]:
# Extract confusion matrices
# The report cells read `all_under_devtype`, but no cell in this notebook assigns
# it, so a fresh Restart & Run All fails with NameError. Bind it to the result
# of the first trial.
# NOTE(review): assumes `b.one_vs_all_classify` returns (results_by_device, total_time),
# which is the layout the indexing in the report cells requires -- confirm.
all_under_devtype = df_results[0]

classifiers = ['KNN', 'LDA', 'RF']

# device_cms[d][c] is the 'CM' entry for the d-th device type and c-th
# classifier, both taken in sorted order.
device_cms = []
for device in sorted(ble_devicetypes):
    classifier_cms = []
    for classifier in sorted(classifiers):
        classifier_cms.append(all_under_devtype[0][device][classifier]['CM'])
    device_cms.append(classifier_cms)


In [None]:
# Draw one annotated heatmap per extracted confusion matrix, each in its own figure.
sns.set(font_scale=1.5)
heatmap_options = dict(annot=True, annot_kws={"size":40}, square=True,
                       fmt='d', cmap='Blues', cbar=False)

for device_list in device_cms:
    for classifier_cm in device_list:
        # The 'temp' matrix does not come out in alphabetical order (per the
        # original note): put the 'temp' column first and sort the rows descending.
        if 'temp' in classifier_cm.columns:
            classifier_cm = classifier_cm[['temp','rest']].sort_index(ascending=False)

        fig, ax = plt.subplots(figsize=(11,8))
        ax.tick_params(labelsize='large')
        ax.xaxis.set_ticks_position('top')  # column labels along the top edge

        sns.heatmap(classifier_cm, ax=ax, **heatmap_options);

### Accuracy, Precision, Recall

In [None]:
# Extract metrics
# For every (device type, classifier) pair, take the Accuracy / Precision / Recall
# values from the row of the 'Metrics' table labelled with the device type itself.
# Reads `all_under_devtype`, the same result object every other report cell uses;
# `all_over_device` is not referenced anywhere else in this notebook.
metric_columns = ['Accuracy','Precision','Recall']
device_metrics = []
for device in sorted(ble_devicetypes):
    classifier_metrics = []
    for classifier in sorted(classifiers):
        metrics_table = all_under_devtype[0][device][classifier]['Metrics']
        classifier_metrics.append(metrics_table[metric_columns].loc[device])
    device_metrics.append(classifier_metrics)


In [None]:
# Print the extracted metrics, one block per classifier within each device type.
# The rows of `device_metrics` were built by iterating sorted(classifiers), so the
# labels are paired against that same sorted order; zipping against the raw
# `classifiers` would mislabel rows whenever it is not already sorted.
for device_list in device_metrics:
    for classifier_metrics, c in zip(device_list, sorted(classifiers)):
        # Single-argument print(...) gives identical text on Python 2 and 3.
        print(c)
        print(str(classifier_metrics) + " \n")

### AUC

In [None]:
# Plotting ROC curves
sns.set(font_scale=1)

# Extract devices and classifiers.
# sorted(...) yields a real list on both Python 2 and 3 (dict.keys() cannot be
# indexed on Python 3) and matches the sorted iteration used by the other report cells.
dev_pairs = sorted(all_under_devtype[0])
classifiers = sorted(all_under_devtype[0][dev_pairs[0]])

for device in dev_pairs:
    for classifier in classifiers:
        # Extract predicted probas and y_true
        fitted = all_under_devtype[0][device][classifier]['Classifier']
        pred_proba = fitted['Pred_Proba']
        true = fitted['True']

        # Plot ROC curve for class 1 only. The separate roc_curve / roc_auc_score
        # calls were dropped: their results were never used in this notebook.
        title = str(device).capitalize() + " vs Rest -- " + str(classifier).upper()
        skplt.metrics.plot_roc(true, pred_proba, title=title, plot_micro=False, plot_macro=False,
                               classes_to_plot=[1], figsize=(8,6));

### Time Performance

In [None]:
# Report timings: first the overall value stored next to the results, then the
# 'Time' entry recorded for each (device type, classifier) pair.
# Single-argument print(...) with %-formatting emits exactly the text the old
# print statements produced, and is valid on both Python 2 and Python 3.
print("Total time: %s seconds" % (all_under_devtype[1],))

for device in ble_devicetypes:
    for classifier in classifiers:
        title = "("+ str(device).capitalize() + ", "+str(classifier).upper() + ")"
        elapsed = all_under_devtype[0][device][classifier]['Classifier']['Time']
        print("%s : %s seconds" % (title, elapsed))

### Feature Importance

In [None]:
# Source: https://towardsdatascience.com/running-random-forests-inspect-the-feature-importances-with-this-code-2b00dd72b92e
# `X_downsampled` is never defined in this notebook, so the importances are
# labelled with the feature list that was actually passed to the classifier
# for the last trial.
# NOTE(review): assumes `b.randomforest` is the forest fitted last by
# `one_vs_all_classify` and that it was trained on these features in this order
# -- confirm against the Pipeline module. A length mismatch raises in pd.DataFrame.
rf_feature_names = [x for x in features_list if x in df_trials[-1].columns]
feature_importances = pd.DataFrame(b.randomforest.feature_importances_,
                                   index=rf_feature_names,
                                   columns=['importance']).sort_values('importance', ascending=False)
display(feature_importances)

### Plots

In [None]:
# Bar chart of the number of rows per DeviceType in `df`, before any resampling.
# Background on resampling imbalanced data:
# https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets#
devtype_counts_unsorted = df["DeviceType"].value_counts()
devtype_df = devtype_counts_unsorted.sort_index()  # order bars by device-type label
devtype_df.plot(title="Packet Counts Prior to Resampling", kind='bar');

In [None]:
# Two panels side by side: rows per DeviceType in the train and test splits.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(14,4)) # 1 row, 2 columns, figure size=(width, height)

# Left panel: training set
df_train = df[df["Set"]=="train"]
train_counts = df_train["DeviceType"].value_counts().sort_index()
train_counts.plot(kind='bar', ax=ax_train, title="Training Packet Counts Prior to Resampling");

# Right panel: test set
df_test = df[df["Set"]=="test"]
test_counts = df_test["DeviceType"].value_counts().sort_index()
test_counts.plot(kind='bar', ax=ax_test, title="Test Packet Counts Prior to Resampling");


In [None]:
# Two panels side by side: rows per DeviceType after downsampling (train) and in
# the test split.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(14,4)) # 1 row, 2 columns, figure size=(width, height)

# Plot training set
# `df_train_downsampled` is never assigned in this notebook, so it is derived
# from the first downsampled trial.
# NOTE(review): assumes the trial frame marks the downsampled training rows with
# Set == "train" -- confirm against BLEPipeline.downsample.
first_trial = df_trials[0]
df_train_downsampled = first_trial[first_trial["Set"]=="train"].copy()  # copy: a column is added below
# Label each row with whichever of the door / lock / temp columns holds the
# largest value (presumably 0/1 indicator columns -- confirm).
df_train_downsampled['DeviceType'] = df_train_downsampled[ble_devicetypes].idxmax(axis=1)
df_train_downsampled['DeviceType'].value_counts().sort_index().plot(
    kind='bar', ax=ax_train, title="Training Packet Counts After Resampling");

# Plot test set
df_test = df[df["Set"]=="test"]
df_test["DeviceType"].value_counts().sort_index().plot(
    kind='bar', ax=ax_test, title="Test Packet Counts After Resampling");
