In [1]:
import pandas as pd
import numpy as np
from EIMTC.preprocessing import OneHotEncoderEIMTC
from EIMTC.selection import train_test_split
from EIMTC.metrics import classification_report
from EIMTC.models import M1CNN
from EIMTC.plugins.n_pkts_byte_freq import NPacketsByteFrequency
from EIMTC.plugins.n_bytes import NBytes

In [2]:
# Location of the CSV read by this notebook.
filepath = './data/out.csv'

# Ground-truth columns available in the CSV.
label_columns = ['os', 'browser']

# Per-flow feature columns. The ORDER matters: a later cell selects columns
# from this list by position (indices 0, 2 and 3).
feature_columns = [
    'udps.n_bytes',
    'udps.stnn_image',
    'udps.src2dst_n_packets_byte_frequency',
    'udps.dst2src_n_packets_byte_frequency',
]

# Only read the columns that are actually needed: features first, then labels.
columns_to_load = [*feature_columns, *label_columns]
df = pd.read_csv(filepath, usecols=columns_to_load)
df

Unnamed: 0,udps.src2dst_n_packets_byte_frequency,udps.dst2src_n_packets_byte_frequency,udps.n_bytes,udps.stnn_image,os,browser
0,"[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 3.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 44999.0, 2271.65, 10057.03, 4.1294065, ...",Linux,Firefox
1,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 1.0, 1.0, 3.0, 2.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 30001.0, 1510.45, 6706.0234, 4.129393, ...",Linux,Chrome
2,"[3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 1.0, 3.0, 3.0, 1.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 42.0, 6.5, 12.9147, 2.0129657, 66.0, 14...",OSX,Safari
3,"[3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 1.0, 2.0, ...","[3.0, 3.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 199.0, 22.2, 48.05545, 2.7663713, 54.0,...",Windows,Firefox
4,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 9982.0, 685.73334, 2571.9824, 3.4732416...",Linux,Chrome
...,...,...,...,...,...,...
20628,"[3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, 0.0, 1.0, ...","[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 218.0, 80.2, 88.81536, 0.50415426, 54.0...",Windows,IExplorer
20629,"[3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, ...","[3.0, 3.0, 2.0, 3.0, 1.0, 1.0, 3.0, 0.0, 2.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 330.0, 70.2, 80.19883, 1.7531445, 54.0,...",Windows,IExplorer
20630,"[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 67.0, 9.25, 19.191212, 2.4089437, 66.0,...",Linux,Firefox
20631,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 78.0, 11.9, 28.0674, 1.9586945, 66.0, 1...",Linux,Firefox


In [4]:
def preprocessing(df):
    """Clean the loaded frame and prepare its label and feature columns.

    Steps, in order:
      1. Drop every row that contains a missing value and work on a copy,
         so the frame passed in by the caller is not modified here.
      2. Fit ``OneHotEncoderEIMTC`` on the ``'browser'`` column and store one
         encoded row per sample in a new ``'browser_ohc'`` column.
      3. Run ``NBytes.preprocess`` and then ``NPacketsByteFrequency.preprocess``
         on the working frame. Their return values are discarded, so they are
         presumably expected to update the frame in place -- TODO confirm
         against the plugin implementations.

    Args:
        df: DataFrame holding at least a ``'browser'`` column plus whatever
            columns the two plugins operate on.

    Returns:
        The cleaned copy with the extra ``'browser_ohc'`` column.
    """
    cleaned = df.dropna().copy()

    browser_encoder = OneHotEncoderEIMTC()
    # fit_transform(...).toarray() yields one row per sample; list() splits
    # it into per-row arrays so each DataFrame cell holds a single vector.
    encoded_browsers = browser_encoder.fit_transform(cleaned['browser']).toarray()
    cleaned['browser_ohc'] = list(encoded_browsers)

    # Same plugin order as before: NBytes first, then the byte-frequency plugin.
    for plugin in (NBytes, NPacketsByteFrequency):
        plugin.preprocess(cleaned)

    return cleaned

In [5]:
# Replace the loaded frame with its preprocessed version (rows with missing
# values dropped, 'browser_ohc' column added) and display it.
# NOTE(review): `df` is rebound here, so re-running this cell alone feeds the
# already-processed frame back into preprocessing(); whether the plugin
# preprocess steps tolerate that is not visible here -- re-run the load cell first.
df = preprocessing(df)
df

Unnamed: 0,udps.src2dst_n_packets_byte_frequency,udps.dst2src_n_packets_byte_frequency,udps.n_bytes,udps.stnn_image,os,browser,browser_ohc
0,"[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 3.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 44999.0, 2271.65, 10057.03, 4.1294065, ...",Linux,Firefox,"[0.0, 1.0, 0.0, 0.0]"
1,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 1.0, 1.0, 3.0, 2.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 30001.0, 1510.45, 6706.0234, 4.129393, ...",Linux,Chrome,"[1.0, 0.0, 0.0, 0.0]"
2,"[3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 1.0, 3.0, 3.0, 1.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 42.0, 6.5, 12.9147, 2.0129657, 66.0, 14...",OSX,Safari,"[0.0, 0.0, 0.0, 1.0]"
3,"[3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 1.0, 2.0, ...","[3.0, 3.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 199.0, 22.2, 48.05545, 2.7663713, 54.0,...",Windows,Firefox,"[0.0, 1.0, 0.0, 0.0]"
4,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 9982.0, 685.73334, 2571.9824, 3.4732416...",Linux,Chrome,"[1.0, 0.0, 0.0, 0.0]"
...,...,...,...,...,...,...,...
20628,"[3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, 0.0, 1.0, ...","[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 218.0, 80.2, 88.81536, 0.50415426, 54.0...",Windows,IExplorer,"[0.0, 0.0, 1.0, 0.0]"
20629,"[3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, ...","[3.0, 3.0, 2.0, 3.0, 1.0, 1.0, 3.0, 0.0, 2.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 330.0, 70.2, 80.19883, 1.7531445, 54.0,...",Windows,IExplorer,"[0.0, 0.0, 1.0, 0.0]"
20630,"[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 67.0, 9.25, 19.191212, 2.4089437, 66.0,...",Linux,Firefox,"[0.0, 1.0, 0.0, 0.0]"
20631,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 78.0, 11.9, 28.0674, 1.9586945, 66.0, 1...",Linux,Firefox,"[0.0, 1.0, 0.0, 0.0]"


In [6]:
# Pick the feature columns used by this model by their position in
# feature_columns (positions 0, 2 and 3; position 1 is not used here).
n_bytes_col, src2dst_col, dst2src_col = (feature_columns[i] for i in (0, 2, 3))

# Each DataFrame cell holds one per-sample vector; np.stack turns a column of
# such vectors into a single 2-D array with one row per sample.
n_bytes_features = np.stack(df[n_bytes_col].values)
src2dst_byte_freq = np.stack(df[src2dst_col].values)
dst2src_byte_freq = np.stack(df[dst2src_col].values)

# Byte-frequency block: src->dst columns followed by dst->src columns.
byte_freq_features = np.concatenate([src2dst_byte_freq, dst2src_byte_freq], axis=1)

# Final model input: n_bytes columns first, then the byte-frequency block.
features = np.concatenate([n_bytes_features, byte_freq_features], axis=1)

In [8]:
# Targets are the one-hot browser vectors; the raw browser names are passed
# as the `stratify` argument of the split.
browser_targets = df['browser_ohc'].values
browser_names = df['browser'].values

# 25% of the samples are requested for the test side, with a fixed
# random_state passed to the splitter.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    browser_targets,
    test_size=0.25,
    stratify=browser_names,
    random_state=42,
)

In [9]:
# Width of one feature row, passed to the model as its payload size.
payload_size = len(features[0])

# One output class per distinct browser label.
browser_classes = df['browser'].unique()
n_classes = len(browser_classes)

model = M1CNN(payload_size=payload_size, n_classes=n_classes)
print(payload_size, n_classes)

1296 4


In [10]:
# Training hyper-parameters.
epochs = 10
batch_size = 128

# Multi-class setup: the targets are one-hot vectors, hence categorical
# cross-entropy; accuracy is reported per epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# np.stack turns the per-sample arrays into dense in-memory NumPy batches.
# The former `use_multiprocessing=True, workers=4` arguments were dropped:
# Keras documents them as applying to generator / Sequence inputs only, so
# they had no effect on array inputs, and Keras 3 no longer accepts them in
# fit() at all.
# NOTE(review): assumes M1CNN.fit forwards its arguments to Keras Model.fit
# (it returns a Keras History object) -- confirm against the M1CNN source.
model.fit(
    np.stack(x_train),
    np.stack(y_train),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2
)

Epoch 1/10
121/121 - 4s - loss: 0.4401 - accuracy: 0.8377
Epoch 2/10
121/121 - 1s - loss: 0.1251 - accuracy: 0.9509
Epoch 3/10
121/121 - 1s - loss: 0.0841 - accuracy: 0.9668
Epoch 4/10
121/121 - 1s - loss: 0.0587 - accuracy: 0.9769
Epoch 5/10
121/121 - 1s - loss: 0.0472 - accuracy: 0.9824
Epoch 6/10
121/121 - 1s - loss: 0.0380 - accuracy: 0.9853
Epoch 7/10
121/121 - 1s - loss: 0.0294 - accuracy: 0.9896
Epoch 8/10
121/121 - 1s - loss: 0.0248 - accuracy: 0.9914
Epoch 9/10
121/121 - 1s - loss: 0.0239 - accuracy: 0.9916
Epoch 10/10
121/121 - 1s - loss: 0.0156 - accuracy: 0.9947


<tensorflow.python.keras.callbacks.History at 0x2530f97a208>

In [11]:
# Per-class scores for every held-out sample, from the wrapped model object.
class_scores = model.model.predict(np.stack(x_test))

# Predicted class = index of the highest score in each row.
predictions = np.argmax(class_scores, axis=1)

# True class = index of the largest entry in each one-hot target vector.
one_hot_targets = np.stack(y_test)
y_test_true = np.argmax(one_hot_targets, axis=1)

# Text report comparing true and predicted class indices.
report = classification_report(y_test_true, predictions)
report

'              precision    recall  f1-score   support\n\n           0       0.96      0.95      0.96      1265\n           1       0.96      0.97      0.97      1465\n           2       0.99      0.99      0.99      1781\n           3       1.00      1.00      1.00       647\n\n    accuracy                           0.98      5158\n   macro avg       0.98      0.98      0.98      5158\nweighted avg       0.98      0.98      0.98      5158\n'

In [20]:
# Persist the text report to a file in the current working directory.
# Plain write mode is enough (nothing is read back, so "w+" was unnecessary),
# and the explicit encoding keeps the file independent of the platform's
# default text encoding.
with open('m1cnn+bytefreq_boa2016_browser_report.txt', 'w', encoding='utf-8') as report_file:
    report_file.write(report)