In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras 

In [2]:
train = pd.read_csv('Data/train.csv')
test  = pd.read_csv('Data/test.csv')

x_train = train.drop(columns=['target']).values
y_train = train['target'].values

x_test = test.drop(columns=['target']).values
y_test = test['target'].values

In [3]:
train.columns

Index(['tcp.flags', 'tcp.time_delta', 'tcp.len', 'mqtt.conack.flags',
       'mqtt.conack.val', 'mqtt.conflag.passwd', 'mqtt.conflags',
       'mqtt.dupflag', 'mqtt.hdrflags', 'mqtt.kalive', 'mqtt.len',
       'mqtt.msgid', 'mqtt.msgtype', 'mqtt.retain', 'target'],
      dtype='object')

# 0. Feature Scaling

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [5]:
import Resampler as res
import Metrics as met

dist = res.class_distribution(y_train)
dist

[(0, 10170), (1, 428), (2, 8340692), (3, 91465), (4, 7637), (5, 6433)]

In [6]:
def build_model():
    model = keras.Sequential()
    model.add(keras.layers.Dense(50, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(keras.layers.Dense(30, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(keras.layers.Dense(20, kernel_initializer='normal'))
    model.add(keras.layers.Dense(6,activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

monitor = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

# 1. No Resampling

In [7]:
model = build_model()
model.fit(x_train,y_train,
        validation_data=(x_test,y_test),
        callbacks=[monitor],
        verbose=1,
        epochs=20,
        batch_size=1024)

2022-12-02 17:33:52.486404: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-02 17:33:52.487371: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Epoch 1/20


2022-12-02 17:33:53.090598: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-02 17:33:53.442391: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-12-02 17:35:02.619294: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 6: early stopping


<keras.callbacks.History at 0x29545df70>

In [8]:
met.eval_tf(model, x_train, x_test, y_train, y_test)

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-12-02 17:48:32.952663: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-02 17:48:32.952925: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


  14/1033 [..............................] - ETA: 4s  

2022-12-02 17:48:33.461092: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-02 17:48:33.536644: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Training Result:  Accuracy: 0.9822 F1 Score: [0.0003, 0.0, 0.9923, 0.0035, 0.1352, 0.0]
Testing  Result:  Accuracy: 0.9824 F1 Score: [0.0, 0.0, 0.9924, 0.0032, 0.1486, 0.0]


## 2. Undersampling + Oversampling

In [9]:
n_majority = int(x_train.shape[0] * 0.7)

n_minorities = np.array(dist)[[0,1,3,4,5],1]
weights = n_minorities / n_minorities.sum()
n_minorities = (weights * n_majority).astype(int)

undersample_strategy = {2: n_majority}
oversample_strategy = {
    0: n_minorities[0],
    1: n_minorities[1],
    3: n_minorities[2],
    4: n_minorities[3],
    5: n_minorities[4],
}


In [22]:
x_resampled, y_resampled = res.random_undersample(x_train, y_train, strategy=undersample_strategy)

x_resampled, y_resampled = res.random_oversample(x_resampled, y_resampled, strategy=oversample_strategy)
res.class_distribution(y_resampled)

[(0, 518406), (1, 21816), (2, 5919777), (3, 4662347), (4, 389289), (5, 327916)]

In [23]:
model2 = build_model()
model2.fit(x_resampled, y_resampled)

In [24]:
met.eval(model2, x_resampled, x_test, y_resampled, y_test)

Training Result:  Accuracy: 0.9335 F1 Score: [0.8035, 0.6722, 0.9598, 0.9376, 0.7743, 0.763]
Testing  Result:  Accuracy: 0.9849 F1 Score: [0.7399, 0.6263, 0.9928, 0.5766, 0.667, 0.7092]


## 3. Undersampling + SMOTE (Synthetic Minority Over-sampling )

In [13]:
x_resampled, y_resampled = res.random_undersample(x_train, y_train, strategy=undersample_strategy)

x_resampled, y_resampled = res.smote(x_resampled, y_resampled, strategy=oversample_strategy)
res.class_distribution(y_resampled)

[(0, 518406), (1, 21816), (2, 5919777), (3, 4662347), (4, 389289), (5, 327916)]

In [14]:
model3 = build_model()
model3.fit(x_resampled, y_resampled)

In [15]:
met.eval(model3, x_resampled, x_test, y_resampled, y_test)

Training Result:  Accuracy: 0.8833 F1 Score: [0.7765, 0.678, 0.9165, 0.8719, 0.7037, 0.7322]
Testing  Result:  Accuracy: 0.9936 F1 Score: [0.7491, 0.635, 0.9972, 0.7487, 0.6601, 0.7035]


## 4. NearMiss + SMOTE

In [19]:
x_resampled, y_resampled = res.near_miss(x_train, y_train, strategy=undersample_strategy)

x_resampled, y_resampled = res.smote(x_resampled, y_resampled, strategy=oversample_strategy)
res.class_distribution(y_resampled)

[(0, 518406), (1, 21816), (2, 5919777), (3, 4662347), (4, 389289), (5, 327916)]

In [20]:
model4 = build_model()
model4.fit(x_resampled, y_resampled)

In [21]:
met.eval(model4, x_resampled, x_test, y_resampled, y_test)

Training Result:  Accuracy: 0.8827 F1 Score: [0.7764, 0.6779, 0.9159, 0.8711, 0.7037, 0.7322]
Testing  Result:  Accuracy: 0.9895 F1 Score: [0.7498, 0.635, 0.9951, 0.7486, 0.1702, 0.7035]
