In [170]:
from Pipeline import BLEPipeline, WifiPipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random, time

from itertools import izip, combinations

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Plot ROC
from sklearn.metrics import roc_curve, roc_auc_score
import scikitplot as skplt

# Tuning 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [3]:
w = WifiPipeline()

In [4]:
df = w.make_dataframe()

Mini       104280
Router     103593
Dropcam     64568
Kasa        23753
Netcam3      4867
Netcam1      4446
Netcam2      4407
Switch2      3046
Switch1      2668
Switch3      2634
Insight      2556
Switch4      2206
Lifx2         627
TpPlug        587
Lifx1         540
TpBulb        202
Name: Name, dtype: int64


In [23]:
df_train = df[df["Set"]=="train"]
df_test = df[df["Set"]=="test"]

In [None]:
# Sampling by DeviceType
category = "DeviceType"

# Undersampling:
kind = "under"
df_train_sampled = w.resample(df_train, kind=kind, category=category)
df_test_sampled = w.resample(df_test, kind=kind, category=category)
df_undersample_devtype = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# Oversampling:
kind = "over"
df_train_sampled = w.resample(df_train, kind=kind, category=category)
df_test_sampled = w.resample(df_test, kind=kind, category=category)
df_oversample_devtype = pd.concat([df_train_sampled, df_test_sampled], axis=0)

In [None]:
# Sampling by Name
category = "Name"

# Undersampling:
kind = "under"
df_train_sampled = w.resample(df_train, kind=kind, category=category)
df_test_sampled = w.resample(df_test, kind=kind, category=category)
df_undersample_name = pd.concat([df_train_sampled, df_test_sampled], axis=0)

# Oversampling:
kind = "over"
df_train_sampled = w.resample(df_train, kind=kind, category=category)
df_test_sampled = w.resample(df_test, kind=kind, category=category)
df_oversample_name = pd.concat([df_train_sampled, df_test_sampled], axis=0)

In [6]:
# Define which features to use
features_list = [
        # Packet info
        "PacketLength", "Duration", 
        
        # Vendor 
         "Belkin", "Dropcam", "Lifi", "Netgear", "Tp-link",
    
        # 802.11 Data subtype
        "Data", "QoS_Data", "QoS_Null",

        # Associated Packets
        "Assoc_Packets"]

# Define what the response classes are
y_list = ["camera", "bulb", "plug"]

In [10]:
# Set up vsall classification
all_under_devtype = w.one_vs_all_classify(df_undersample, features_list, y_list, output='plot')

Total time (one vs all_classify): 3.20186901093



**Tuning using GridSearchCV**

In [129]:
device = 'camera'
df_to_use = df
# df_to_use = df_undersample
df_train = df_to_use[df_to_use['Set']=='train']
X_train = df_train[features_list]
y_train = df_train[device]

df_test = df_to_use[df_to_use['Set']=='test']
X_test = df_test[features_list]
y_test = df_test[device]

In [130]:
# Set up control classifier (no tuning)
knn = KNeighborsClassifier()
knn_model = knn.fit(X_train, y_train)
print knn.score(X_test, y_test)

0.2833645443196005


In [193]:
# Set up classifiers
knn = KNeighborsClassifier()
rf = RandomForestClassifier(n_estimators=1000)
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()

In [138]:
k_max = np.sqrt(len(X_train))/2
k_max

217.5701036447793

In [223]:
# Define grid values
# KNN
n_neighbors = np.arange(1,11,2)
knn_param_grid = dict(n_neighbors=n_neighbors)

# RF
max_features = np.linspace(2, len(features_list), num=5, dtype=int)
rf_param_grid = dict(max_features=max_features)

# LDA
solver = ['lsqr']
n_components = np.arange(1,4)
print n_components
lda_param_grid = dict(solver=solver, n_components=n_components)

# QDA
qda_param_grid = dict()

[1 2 3]


In [224]:
# Init grid searches
folds = 10
knn_grid = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=folds)
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=folds)
lda_grid = GridSearchCV(estimator=lda, param_grid=lda_param_grid, cv=folds)
qda_grid = GridSearchCV(estimator=qda, param_grid=qda_param_grid, cv=folds)

In [154]:
%%time
knn_grid_result = knn_grid.fit(X_train, y_train)
print "KNN_Grid Best params:", knn_grid_result.best_params_, "score = ", knn_grid_result.best_score_

KNN_Grid Best params: {'n_neighbors': 3} score =  0.9828357460112914
CPU times: user 2h 29min 12s, sys: 230 ms, total: 2h 29min 13s
Wall time: 2h 29min 5s


In [155]:
knn_grid_result.cv_results_

{'mean_fit_time': array([70.67118621, 70.2830287 , 70.33850598, 70.70554957, 70.15340869]),
 'mean_score_time': array([10.49609258, 10.51325469, 10.50272162, 10.51065383, 10.51851656]),
 'mean_test_score': array([0.9643987 , 0.98283575, 0.96588274, 0.96593556, 0.96565037]),
 'mean_train_score': array([0.97339688, 0.97603754, 0.97372725, 0.97359228, 0.97390036]),
 'param_n_neighbors': masked_array(data=[1, 3, 5, 7, 9],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 1},
  {'n_neighbors': 3},
  {'n_neighbors': 5},
  {'n_neighbors': 7},
  {'n_neighbors': 9}],
 'rank_test_score': array([5, 1, 3, 2, 4], dtype=int32),
 'split0_test_score': array([0.68441065, 0.87626743, 0.70600972, 0.70574567, 0.70595691]),
 'split0_train_score': array([0.97208514, 0.99283497, 0.97026014, 0.97023666, 0.97021906]),
 'split1_test_score': array([0.98304727, 0.9832057 , 0.98331133, 0.98410351, 0.98410351]),
 'split1_train_sco

In [156]:
%%time
rf_grid_result = rf_grid.fit(X_train, y_train)
print "RF_Grid Best params:", rf_grid_result.best_params_, "score = ", rf_grid_result.best_score_

RF_Grid Best params: {'max_features': 11} score =  0.9752623490205813
CPU times: user 29min 5s, sys: 19.6 s, total: 29min 24s
Wall time: 40min 9s


In [158]:
rf_grid_result.cv_results_

{'mean_fit_time': array([83.58195257, 23.46767893, 24.73769701, 26.20158   , 28.62654252]),
 'mean_score_time': array([0.94631853, 0.91396887, 0.89171066, 0.88854291, 0.87409623]),
 'mean_test_score': array([0.97463915, 0.974655  , 0.97466028, 0.97496131, 0.97526235]),
 'mean_train_score': array([0.98207173, 0.98207173, 0.98207173, 0.98207173, 0.98207173]),
 'param_max_features': masked_array(data=[2, 4, 6, 8, 11],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_features': 2},
  {'max_features': 4},
  {'max_features': 6},
  {'max_features': 8},
  {'max_features': 11}],
 'rank_test_score': array([5, 4, 3, 2, 1], dtype=int32),
 'split0_test_score': array([0.85144698, 0.85144698, 0.85139417, 0.85128855, 0.85181665]),
 'split0_train_score': array([0.99459542, 0.99459542, 0.99459542, 0.99459542, 0.99459542]),
 'split1_test_score': array([0.96186955, 0.96186955, 0.96197518, 0.9650911 , 0.96524954]),
 'split1_train_

In [225]:
%%time
lda_grid_result = lda_grid.fit(X_train, y_train)
print "LDA_Grid Best params:", lda_grid_result.best_params_, "score = ", lda_grid_result.best_score_

LDA_Grid Best params: {'n_components': 1, 'solver': 'lsqr'} score =  0.9518555878889023
CPU times: user 4.12 s, sys: 40 ms, total: 4.16 s
Wall time: 2.12 s


In [226]:
lda_grid_result.cv_results_

{'mean_fit_time': array([0.04995093, 0.05008829, 0.04886222]),
 'mean_score_time': array([0.0018441 , 0.00172205, 0.00175855]),
 'mean_test_score': array([0.95185559, 0.95185559, 0.95185559]),
 'mean_train_score': array([0.96949518, 0.96949518, 0.96949518]),
 'param_n_components': masked_array(data=[1, 2, 3],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_solver': masked_array(data=['lsqr', 'lsqr', 'lsqr'],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_components': 1, 'solver': 'lsqr'},
  {'n_components': 2, 'solver': 'lsqr'},
  {'n_components': 3, 'solver': 'lsqr'}],
 'rank_test_score': array([1, 1, 1], dtype=int32),
 'split0_test_score': array([0.60583016, 0.60583016, 0.60583016]),
 'split0_train_score': array([0.99092782, 0.99092782, 0.99092782]),
 'split1_test_score': array([0.94290996, 0.94290996, 0.94290996]),
 'split1_train_score': array([0.97239631, 0.97239631,

In [None]:
# Run GridSearch for each device type
# for device in y_list:
#     y_train = df_train[device]
#     y_test = df_test[device]
#     grid_result = grid.fit(X_train, y_train)
    
#     print "Best params:", grid_result.best_params_, "score = ", grid_result.best_score_