# Initialization

In [1]:
import imp
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
from pull_data import Pull
import os
from sklearn.metrics import confusion_matrix
from prettytable import PrettyTable
from statistics import mean
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [9]:
# List of available classess in dataset
# Uncomment any class to include it into evaluation
DATA_CLASS = {
    # CESNET DATASET
    "IKEA_APP" : "dev-annotated-datasets/ikea-app/train",
    "IKEA_HOMEKIT_CLEAR" : "dev-annotated-datasets/ikea-app/anomaly",
#    "IKEA_HOMEKIT" : "dev-annotated-datasets/ikea-homekit/train",
    "IP_CAM" : "dev-annotated-datasets/ipcam/train",
#    "IP_CAM_MISCONFIG" : "dev-annotated-datasets/ipcam/anomaly_cam",
    "NORMAL_USER" : "dev-annotated-datasets/normal-user/train",
    "VOICE_ASSISTANT" : "dev-annotated-datasets/voice-assistant/train",
    
    # UNSW DATASET (https://iotanalytics.unsw.edu.au/iottraces)
    ## HUBS
    "AMAZON_ECHO" : "../data-sets/unsw-traces-device/Amazon-Echo",
    "SMART_THINGS" : "../data-sets/unsw-traces-device/Smart-Things",
    ## CAMERAS
    "NETAMO_WELCOME" : "../data-sets/unsw-traces-device/Netatmo-Welcome",
    #"TP-Link-Day-Night-Cloud-Camera" : "../data-sets/unsw-traces-device/TP-Link-Day-Night-Cloud-Camera",
    "Samsung-Smart-Cam" : "../data-sets/unsw-traces-device/Samsung-Smart-Cam",
    "INSTEON_CAM" : "../data-sets/unsw-traces-device/Insteon-Camera",
    #"DROP_CAM" : "../data-sets/unsw-traces-device/Dropcam",
    "WITHINGS_SMART_BABY_MONITOR" : "../data-sets/unsw-traces-device/Withings-Smart-Baby-Monitor",
    ## SWITCHES AND TRIGGERS
    "BELKIN_WEMO_SWITCH" : "../data-sets/unsw-traces-device/Belkin-Wemo-Switch",
    "TP-Link-Smart-Plug" : "../data-sets/unsw-traces-device/TP-Link-Smart-Plug",
    #"iHome" : "../data-sets/unsw-traces-device/iHome", -> not available in recorded first week
    "BELKIN_WEMO_MOTION_SENSOR" : "../data-sets/unsw-traces-device/Belkin-Wemo-Motion-Sensor",
    ## AIR QUALITY SENSORS
    #"NEST-Protect-Smoke-Alarm" : "../data-sets/unsw-traces-device/NEST-Protect-Smoke-Alarm", not available in recorded first one week
    "Netatmo-Weather-Station" : "../data-sets/unsw-traces-device/Netatmo-Weather-Station",
    ## HEATLTHCARE DEVICE
    #"Withings-Smart-Scale" : "../data-sets/unsw-traces-device/Withings-Smart-Scale", -> not available in recorded first one week
    #"Blipcare-Blood-Pressure-Meter" : "../data-sets/unsw-traces-device/Blipcare-Blood-Pressure-Meter", -> not available in recorded first one week
    "Withings-Aura-Smart-Sleep-Sensor" : "../data-sets/unsw-traces-device/Withings-Aura-Smart-Sleep-Sensor",
    ## LIGHT BULBS
    "Light-Bulbs-LiFX-Smart-Bulb" : "../data-sets/unsw-traces-device/Light-Bulbs-LiFX-Smart-Bulb",
    ## ELECTRONIC
    #"Triby-Speaker" : "../data-sets/unsw-traces-device/Triby-Speaker", -> not available in recorded first one week
    #"PIX-STAR-Photo-Frame" : "../data-sets/unsw-traces-device/PIX-STAR-Photo-Frame", -> not available in recorded first one week
    "HP-Printer" : "../data-sets/unsw-traces-device/HP-Printer",
    ## NON-IOT
    "Laptop" : "../data-sets/unsw-traces-device/Laptop",
    "ANDROID_PHONE" : "../data-sets/unsw-traces-device/Android-Phone",
    "Samsung-Galaxy-Tab" : "../data-sets/unsw-traces-device/Samsung-Galaxy-Tab",
    #"IPhone" : "../data-sets/unsw-traces-device/IPhone", -> not available in recorded first one week
    
    # CTU13 BOTNET ATTACKS DATASET (https://www.stratosphereips.org/datasets-ctu13)
#    "BOTNET_SOGOU" : "../data-sets/botnet/sogou",
#    "BOTNET_RBOT" : "../data-sets/botnet/rbot",
#    "BOTNET_NERIS" : "../data-sets/botnet/neris"
}

# Function Definitions

In [3]:
# Basic initial evaluation of different models
def runModel(models):
    for key, model in models.items():
        print("### Model Name:",key," ###")
        kf = KFold(5, True)
        iteration_cnt = 0
        for train_index, test_index in kf.split(c_data,c_target):
            iteration_cnt += 1
            #Train
            model.fit(c_data[train_index],c_target[train_index])
            #Evaluate 
            p_class = model.predict(c_data[test_index])
            y_pred_valid = model.predict_proba(c_data[test_index])

# Pull Datasets

# MODELS

In [4]:
MODELS = {}
MODELS["RandomForest"] = {}
MODELS["AdaBoost"] = {}
MODELS["GradientBoosting"] = {}
rng = np.random.RandomState(12345)

MODELS["RandomForest"]["RF1"] = RandomForestClassifier(random_state=rng,n_estimators=10)
MODELS["AB1"] = AdaBoostClassifier(random_state=rng)
MODELS["GradientBoosting"] = GradientBoostingClassifier(random_state=rng)


In [5]:
# run specific set of models based on above definition
#runModel(MODELS["RandomForest"])

# Evaluation
Generates results for experiments section in the paper

In [13]:
UPPER_LIMIT = 400 # set upper limit for flows training dataset
ANOMALY = ["IKEA_HOMEKIT_CLEAR"]
class_index = 1
c_data = None
c_target = None
c_anomaly_data = None
c_anomaly_target = None

for data_cl in DATA_CLASS: 
    
    # Pull anomaly classes
    if data_cl in ANOMALY:
        print("Loading Anomaly",data_cl," dataset")
        a = Pull(DATA_CLASS[data_cl],999)
        # Set upper limit due to possible unbiased results
        if len(a.data) < UPPER_LIMIT:
            max_limit = len(a.data)
        else:
            max_limit = UPPER_LIMIT
        # uniform random sample
        idx = np.random.choice(len(a.data), max_limit, replace=False)
        
        if c_anomaly_data is None:
            c_anomaly_data = np.array(a.data)[idx]
            c_anomaly_target = np.array(a.labels)[idx]
        else:
            c_anomaly_data = np.concatenate((c_anomaly_data,np.array(a.data)[idx]))
            c_anomaly_target = np.concatenate((c_anomaly_target,np.array(a.labels)[idx]),axis=None)
        continue
        
    # Pull valid classes 
    a = Pull(DATA_CLASS[data_cl],class_index)
    # Set upper limit due to possible unbiased results
    if len(a.data) < UPPER_LIMIT:
        max_limit = len(a.data)
    else:
        max_limit = UPPER_LIMIT
    # uniform random sample
    idx = np.random.choice(len(a.data), max_limit, replace=False)
        
    print("Loading",data_cl," dataset")
    if c_data is None:
        c_data = np.array(a.data)[idx]
        c_target = np.array(a.labels)[idx]
    else:
        c_data = np.concatenate((c_data,np.array(a.data)[idx]))
        c_target = np.concatenate((c_target,np.array(a.labels)[idx]),axis=None)
        
    class_index += 1

Loading IKEA_APP  dataset
Loading Anomaly IKEA_HOMEKIT_CLEAR  dataset
Loading IP_CAM  dataset
Loading NORMAL_USER  dataset
Loading VOICE_ASSISTANT  dataset
Loading AMAZON_ECHO  dataset
Loading SMART_THINGS  dataset
Loading NETAMO_WELCOME  dataset
Loading Samsung-Smart-Cam  dataset
Loading INSTEON_CAM  dataset
Loading WITHINGS_SMART_BABY_MONITOR  dataset
Loading BELKIN_WEMO_SWITCH  dataset
Loading TP-Link-Smart-Plug  dataset
Loading BELKIN_WEMO_MOTION_SENSOR  dataset
Loading Netatmo-Weather-Station  dataset
Loading Withings-Aura-Smart-Sleep-Sensor  dataset
Loading Light-Bulbs-LiFX-Smart-Bulb  dataset
Loading HP-Printer  dataset
Loading Laptop  dataset
Loading ANDROID_PHONE  dataset
Loading Samsung-Galaxy-Tab  dataset


# Single Models

## Random Forest

In [7]:
# Generate format of structure for evaluation  
def createResultDict(no_classes):
    tmp_struct = {}
    for i in range(no_classes):
        tmp_struct[i+1] = {}
        for j in range(no_classes):
            tmp_struct[i+1][j+1] = 0
    return tmp_struct
        

In [11]:
rng = np.random.RandomState(12345)
model = RandomForestClassifier(random_state=rng,n_estimators=10)

kf = KFold(5, True)
iteration_cnt = 0

# Generate struct for defined number of input classes
result_struct = createResultDict(20)

# Split dataset 
for train_index, test_index in kf.split(c_data,c_target):
    tmp_result_struct = {}
    iteration_cnt += 1
    # Train
    model.fit(c_data[train_index],c_target[train_index])
    # Evaluate 
    y_class = model.predict(c_data[test_index])
    y_pred = model.predict_proba(c_data[test_index])
    
    # Count classification frequency
    for i in range(len(test_index)):
        try:
            tmp_result_struct[c_target[test_index[i]]][y_class[i]] += 1
        except Exception as e:
            try:
                tmp_result_struct[c_target[test_index[i]]][y_class[i]] = 1
            except Exception as e3:
                tmp_result_struct[c_target[test_index[i]]] = {}
                tmp_result_struct[c_target[test_index[i]]][y_class[i]] = 1
                
    # Create classification frequency in percentage
    for t_class,t_value in tmp_result_struct.items():
        p_sum = 0
        for p_class,p_value in t_value.items():
            p_sum += p_value
        for p_class,p_value in t_value.items():
            if iteration_cnt == 1:
                result_struct[t_class][p_class] = (p_value/p_sum)
            else:
                result_struct[t_class][p_class] = (result_struct[t_class][p_class]+(p_value/p_sum))/2

# print classification frequency results
for key,val in result_struct.items():
    print(key,":",val)
  
print(classification_report(y_class,c_target[test_index],output_dict=False))
print("Cross-validation score:",cross_val_score(model, c_data, c_target, cv=5))
print("Total classification accuracy:",metrics.accuracy_score(y_class,c_target[test_index]))
print("===================")
# Predict class for anomaly (unknown) traffic
y_class = model.predict(c_anomaly_data)
y_pred = model.predict_proba(c_anomaly_data)
# Measure accuracy (classification frequency) against defined classess
print(metrics.accuracy_score(y_class,[1]*len(c_anomaly_data.data)))
print(metrics.accuracy_score(y_class,[2]*len(c_anomaly_data.data)))
print(metrics.accuracy_score(y_class,[3]*len(c_anomaly_data.data)))
print(metrics.accuracy_score(y_class,[4]*len(c_anomaly_data.data)))


1 : {1: 0.9936032957486792, 2: 0, 3: 0.007352941176470588, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0.012281494876431584, 15: 0, 16: 0, 17: 0, 18: 0, 19: 0, 20: 0.012658227848101266}
2 : {1: 0.030509935773093667, 2: 0.6197546358730569, 3: 0.012469287469287469, 4: 0.019079229605545395, 5: 0.33791674877201194, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0.006756756756756757, 18: 0, 19: 0.008771929824561403, 20: 0.00909090909090909}
3 : {1: 0.02125506072874494, 2: 0.0058823529411764705, 3: 0.8790877788362308, 4: 0.01282051282051282, 5: 0.016443895371913948, 6: 0, 7: 0.01636302294197031, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0.00641025641025641, 17: 0, 18: 0.012767424783678653, 19: 0.04742299753909661, 20: 0.012593276176867507}
4 : {1: 0.03595627242939616, 2: 0.03636162687886826, 3: 0.012558836531960264, 4: 0.8184491535340928, 5: 0.039109390440526345, 6: 0.014705882352941176, 7: 0.018073542414313207, 8: 0, 9: 0

# Below models don't have sufficient results

## AdaBoost

## GradientBoosting