In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the pre-split training and test tables from the local Data/ directory.
train, test = map(pd.read_csv, ('Data/train.csv', 'Data/test.csv'))

In [3]:
# Separate the label column ('target') from the feature columns for each split,
# converting both to plain NumPy arrays.
x_train, y_train = train.drop('target', axis=1).values, train['target'].values
x_test, y_test = test.drop('target', axis=1).values, test['target'].values

# 1. Feature Scaling

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so no test-set statistics leak into the scaling.
scaler = StandardScaler()
scaler.fit(x_train)

x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

# 2. Augmentation

In [5]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek
from collections import Counter

In [6]:
# Show how many training rows carry each class label, in ascending label order.
label_counts = Counter(y_train)
print(sorted(label_counts.items()))

[(0, 10170), (1, 428), (2, 8340692), (3, 91465), (4, 7637), (5, 6433)]


In [8]:
# Derive the per-class resampling targets from the training labels instead of
# hard-coding the majority-class size, so this cell stays correct if the
# training CSV changes.
labels, counts = np.unique(y_train, return_counts=True)
major_count = int(counts.max())

# Every non-majority class is raised to one fifth of the majority size.
minority_target = major_count // 5

# Mapping {class label -> desired number of rows after resampling}.
# The majority class keeps its size. max(...) keeps a class at its current size
# if it is already above the minority target, because a target below the
# current count is not a valid over-sampling request.
strategy = {
    label: major_count if count == major_count else max(count, minority_target)
    for label, count in zip(labels.tolist(), counts.tolist())
}

## Oversampling

In [9]:
# Random over-sampling of the training set toward the per-class targets in `strategy`.
ros = RandomOverSampler(random_state=42, sampling_strategy=strategy)
x_over, y_over = ros.fit_resample(x_train, y_train)

# Class distribution after resampling, in ascending label order.
over_counts = Counter(y_over)
print(sorted(over_counts.items()))

[(0, 1668138), (1, 1668138), (2, 8340692), (3, 1668138), (4, 1668138), (5, 1668138)]


## SMOTE (Synthetic Minority Over-sampling Technique)

In [15]:
# SMOTE resampling of the training set toward the per-class targets in `strategy`.
sm = SMOTE(random_state=42, sampling_strategy=strategy)
x_smote, y_smote = sm.fit_resample(x_train, y_train)

# Class distribution after resampling, in ascending label order.
smote_counts = Counter(y_smote)
print(sorted(smote_counts.items()))

[(0, 1668138), (1, 1668138), (2, 8340692), (3, 1668138), (4, 1668138), (5, 1668138)]


## SMOTE + Tomek links

In [11]:
# Combined SMOTE + Tomek-links resampling of the training set, using the same
# per-class targets as the other samplers.
sme = SMOTETomek(random_state=42, sampling_strategy=strategy)
x_smotetomek, y_smotetomek = sme.fit_resample(x_train, y_train)

# Class distribution after resampling, in ascending label order.
smotetomek_counts = Counter(y_smotetomek)
print(sorted(smotetomek_counts.items()))

# Example ML

In [10]:
from sklearn.metrics import accuracy_score, f1_score

def eval(model, x_tr, x_te, y_tr, y_te):
    """Print accuracy and per-class F1 for `model` on the train split, then the test split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_tr, y_tr : features and true labels of the training split.
    x_te, y_te : features and true labels of the test split.

    Prints one line per split: the accuracy followed by the array of per-class
    F1 scores (``average=None``). Returns nothing.

    NOTE: this name shadows Python's builtin ``eval`` for the rest of the notebook.
    """
    for features, labels in ((x_tr, y_tr), (x_te, y_te)):
        predicted = model.predict(features)
        acc = accuracy_score(labels, predicted)
        per_class_f1 = f1_score(labels, predicted, average=None)
        print(acc, per_class_f1)

In [11]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
# Baseline: a decision tree fitted on the scaled training data without any resampling.
# NOTE: the variable is named `rf`, but the estimator is a single decision tree.
rf = DecisionTreeClassifier(random_state=42)

# All feature columns are used. The unused `cols` selection and the no-op
# `[:, :]` slices from the earlier draft were removed.
rf.fit(x_train, y_train)
eval(rf, x_train, x_test, y_train, y_test)

0.9969765248778354 [0.79892359 0.66976744 0.99873069 0.88107685 0.76840467 0.75960118]
0.9967489378796883 [0.7457137  0.62773723 0.99871309 0.8763329  0.66593767 0.70846941]


In [13]:
# Decision tree fitted on the randomly over-sampled training set.
# Train metrics are reported on the resampled data; test metrics on the untouched test split.
rf = DecisionTreeClassifier(random_state=42).fit(x_over, y_over)
eval(rf, x_tr=x_over, x_te=x_test, y_tr=y_over, y_te=y_test)

0.8830413451355529 [0.80791488 0.80623752 0.9469674  0.88430157 0.79225773 0.77939193]
0.9797224553672186 [0.6897007  0.00714789 0.9903089  0.78993171 0.48037338 0.21985158]


In [16]:
# Decision tree fitted on the SMOTE-resampled training set.
# Train metrics are reported on the resampled data; test metrics on the untouched test split.
rf = DecisionTreeClassifier(random_state=42).fit(x_smote, y_smote)
eval(rf, x_tr=x_smote, x_te=x_test, y_tr=y_smote, y_te=y_test)

0.8328004837968461 [0.7470074  0.67977858 0.90474285 0.835983   0.70998428 0.73961155]
0.9955454130584375 [0.68422212 0.50285714 0.99821513 0.83284148 0.5271208  0.6706684 ]


In [None]:
# Decision tree fitted on the SMOTE + Tomek-links resampled training set.
# Train metrics are reported on the resampled data; test metrics on the untouched test split.
rf = DecisionTreeClassifier(random_state=42).fit(x_smotetomek, y_smotetomek)
eval(rf, x_tr=x_smotetomek, x_te=x_test, y_tr=y_smotetomek, y_te=y_test)

In [None]:
# Count the test rows that the currently bound `rf` model misclassifies.
# np.count_nonzero does the count in vectorised NumPy code; the builtin sum()
# would iterate the boolean array element by element in Python.
yh = rf.predict(x_test)
np.count_nonzero(yh != y_test)

11783

In [None]:
# Inspect the column names of the training frame.
train.columns

Index(['tcp.flags', 'tcp.time_delta', 'tcp.len', 'mqtt.conack.flags',
       'mqtt.conack.flags.reserved', 'mqtt.conack.flags.sp', 'mqtt.conack.val',
       'mqtt.conflag.cleansess', 'mqtt.conflag.passwd', 'mqtt.conflag.qos',
       'mqtt.conflag.reserved', 'mqtt.conflag.retain', 'mqtt.conflag.uname',
       'mqtt.conflag.willflag', 'mqtt.conflags', 'mqtt.dupflag',
       'mqtt.hdrflags', 'mqtt.kalive', 'mqtt.len', 'mqtt.msg', 'mqtt.msgid',
       'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.protoname', 'mqtt.qos',
       'mqtt.retain', 'mqtt.sub.qos', 'mqtt.suback.qos', 'mqtt.ver', 'target'],
      dtype='object')