In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb

In [4]:
# Read the pre-split datasets from disk.
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

# Training split: the 'target' column is the label vector, every other column is a feature.
y_train = train['target'].to_numpy()
x_train = train.drop(columns='target').to_numpy()

# Test split, separated the same way.
y_test = test['target'].to_numpy()
x_test = test.drop(columns='target').to_numpy()

In [5]:
# Display the column names of the training frame (the feature columns plus 'target').
train.columns

Index(['tcp.flags', 'tcp.time_delta', 'tcp.len', 'mqtt.conack.flags',
       'mqtt.conack.val', 'mqtt.conflag.passwd', 'mqtt.conflags',
       'mqtt.dupflag', 'mqtt.hdrflags', 'mqtt.kalive', 'mqtt.len',
       'mqtt.msgid', 'mqtt.msgtype', 'mqtt.retain', 'target'],
      dtype='object')

## 0. Feature Scaling

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler's statistics are learned from the
# training split only and then reused, unchanged, on the test split so that
# no information from the test data leaks into preprocessing.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [7]:
import Resampler as res
import Metrics as met

# Count the samples of each class in the training labels. As the displayed
# output shows, the result is a list of (class_label, count) pairs; it is kept
# in `dist` because the resampling targets are derived from it later on.
dist = res.class_distribution(y_train)
dist

[(0, 10170), (1, 428), (2, 8340692), (3, 91465), (4, 7637), (5, 6433)]

In [11]:
def build_model(n_estimators=20, random_state=42, n_jobs=-1, **xgb_params):
    """Create an untrained XGBoost classifier for the resampling experiments.

    Calling ``build_model()`` with no arguments yields the same configuration
    for every experiment, so differences in the reported metrics come from the
    training data rather than from the model setup.

    Parameters
    ----------
    n_estimators : int, default 20
        Number of boosting rounds (trees).
    random_state : int, default 42
        Seed passed to XGBoost.
    n_jobs : int, default -1
        Number of parallel threads passed to XGBoost.
    **xgb_params
        Any further keyword arguments, forwarded unchanged to
        ``xgb.XGBClassifier``.

    Returns
    -------
    xgb.XGBClassifier
        A new, unfitted classifier.
    """
    return xgb.XGBClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=n_jobs,
        **xgb_params,
    )

## 1. No Resampling

In [12]:
# Baseline: fit a fresh classifier on the scaled training data with its
# original (imbalanced) class distribution.
model = build_model()
model.fit(x_train, y_train)

In [13]:
# Report the baseline on the training and test splits. Judging by the printed
# output, met.eval shows accuracy and one F1 score per class for each split.
met.eval(model, x_train, x_test, y_train, y_test)

Training Result:  Accuracy: 0.9968 F1 Score: [0.7592, 0.6594, 0.9987, 0.8796, 0.6756, 0.7256]
Testing  Result:  Accuracy: 0.9967 F1 Score: [0.7539, 0.6397, 0.9987, 0.8779, 0.656, 0.7065]


## 2. Undersampling + Oversampling

In [14]:
# Share of the original training-set size that the majority class is reduced to.
MAJORITY_FRACTION = 0.7

# `dist` is a list of (class_label, count) pairs; turn it into a lookup table so
# that classes are addressed by label instead of by their position in the list.
class_counts = {int(label): int(count) for label, count in dist}
majority_class = max(class_counts, key=class_counts.get)
minority_classes = [label for label in class_counts if label != majority_class]

# Target size of the majority class after undersampling.
n_majority = int(x_train.shape[0] * MAJORITY_FRACTION)

# The minority classes together receive the same budget (n_majority samples),
# split in proportion to their original counts so their relative sizes are kept.
n_minorities = np.array([class_counts[label] for label in minority_classes])
weights = n_minorities / n_minorities.sum()
n_minorities = (weights * n_majority).astype(int)

# Per-class target sample counts, keyed by class label.
undersample_strategy = {majority_class: n_majority}
oversample_strategy = dict(zip(minority_classes, n_minorities))


In [15]:
# Step 1: randomly undersample the scaled training data according to
# undersample_strategy (which sets the target count for the majority class).
x_resampled, y_resampled = res.random_undersample(x_train, y_train, strategy=undersample_strategy)

# Step 2: randomly oversample the minority classes up to the counts in
# oversample_strategy. The names x_resampled / y_resampled are rebound on
# purpose so the intermediate undersampled arrays are not kept around.
x_resampled, y_resampled = res.random_oversample(x_resampled, y_resampled, strategy=oversample_strategy)
# Display the resulting per-class counts as a sanity check.
res.class_distribution(y_resampled)

[(0, 518406), (1, 21816), (2, 5919777), (3, 4662347), (4, 389289), (5, 327916)]

In [16]:
# Fit a fresh classifier on the randomly under-/over-sampled training data.
# NOTE: x_resampled / y_resampled are reused by every resampling section, so
# this cell must run right after this section's resampling cell.
model2 = build_model()
model2.fit(x_resampled, y_resampled)

In [17]:
# The "training" metrics here are computed on the resampled set, so they are
# not directly comparable with the baseline's training metrics; the test set
# passed in is the original, untouched one.
met.eval(model2, x_resampled, x_test, y_resampled, y_test)

Training Result:  Accuracy: 0.9282 F1 Score: [0.7701, 0.6666, 0.9596, 0.9363, 0.6872, 0.7373]
Testing  Result:  Accuracy: 0.9849 F1 Score: [0.7404, 0.6296, 0.9927, 0.5766, 0.6593, 0.7156]


## 3. Undersampling + SMOTE (Synthetic Minority Over-sampling Technique)

In [18]:
# Step 1: randomly undersample the scaled training data according to
# undersample_strategy, starting again from the original x_train / y_train.
x_resampled, y_resampled = res.random_undersample(x_train, y_train, strategy=undersample_strategy)

# Step 2: grow the minority classes to the counts in oversample_strategy with
# res.smote instead of random duplication.
# NOTE(review): res.smote is presumably a wrapper around SMOTE that creates
# synthetic samples -- confirm in the Resampler module.
x_resampled, y_resampled = res.smote(x_resampled, y_resampled, strategy=oversample_strategy)
# Display the resulting per-class counts as a sanity check.
res.class_distribution(y_resampled)

[(0, 518406), (1, 21816), (2, 5919777), (3, 4662347), (4, 389289), (5, 327916)]

In [19]:
# Fit a fresh classifier on the undersampled + SMOTE training data.
# NOTE: x_resampled / y_resampled are reused by every resampling section, so
# this cell must run right after this section's resampling cell.
model3 = build_model()
model3.fit(x_resampled, y_resampled)

In [20]:
# Training metrics are measured on the resampled set; test metrics use the
# original, untouched test split.
met.eval(model3, x_resampled, x_test, y_resampled, y_test)

Training Result:  Accuracy: 0.9280 F1 Score: [0.7676, 0.674, 0.9596, 0.9363, 0.6873, 0.7339]
Testing  Result:  Accuracy: 0.9850 F1 Score: [0.7539, 0.6347, 0.9928, 0.5766, 0.6616, 0.7139]


## 4. NearMiss + SMOTE

In [21]:
# Step 1: undersample with res.near_miss according to undersample_strategy,
# starting again from the original x_train / y_train.
# NOTE(review): res.near_miss is presumably a wrapper around the NearMiss
# undersampling method -- confirm in the Resampler module.
x_resampled, y_resampled = res.near_miss(x_train, y_train, strategy=undersample_strategy)

# Step 2: grow the minority classes to the counts in oversample_strategy with
# res.smote.
x_resampled, y_resampled = res.smote(x_resampled, y_resampled, strategy=oversample_strategy)
# Display the resulting per-class counts as a sanity check.
res.class_distribution(y_resampled)

[(0, 518406), (1, 21816), (2, 5919777), (3, 4662347), (4, 389289), (5, 327916)]

In [22]:
# Fit a fresh classifier on the NearMiss + SMOTE training data.
# NOTE: x_resampled / y_resampled are reused by every resampling section, so
# this cell must run right after this section's resampling cell.
model4 = build_model()
model4.fit(x_resampled, y_resampled)

In [23]:
# Training metrics are measured on the resampled set; test metrics use the
# original, untouched test split.
met.eval(model4, x_resampled, x_test, y_resampled, y_test)

Training Result:  Accuracy: 0.9254 F1 Score: [0.768, 0.6672, 0.9568, 0.9331, 0.6869, 0.7352]
Testing  Result:  Accuracy: 0.9806 F1 Score: [0.7549, 0.6296, 0.9905, 0.5302, 0.3358, 0.7111]
