In [20]:
import pandas as pd
import numpy as np
from EIMTC.preprocessing import OneHotEncoderEIMTC
from EIMTC.selection import train_test_split
from EIMTC.models import CustomDistiller
from EIMTC.plugins.n_pkts_byte_freq import NPacketsByteFrequency
from EIMTC.plugins.n_bytes import NBytes
from EIMTC.metrics import classification_report

In [21]:
filepath = './data/out.csv'
cols_to_load = ['udps.n_bytes', 'udps.protocol_header_fields', 'udps.stnn_image',
    'udps.src2dst_n_packets_byte_frequency',
    'udps.dst2src_n_packets_byte_frequency','os', 'browser']
df = pd.read_csv(filepath, usecols=cols_to_load)
df

Unnamed: 0,udps.src2dst_n_packets_byte_frequency,udps.dst2src_n_packets_byte_frequency,udps.protocol_header_fields,udps.n_bytes,udps.stnn_image,os,browser
0,"[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 3.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 80, 42540], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 44999.0, 2271.65, 10057.03, 4.1294065, ...",Linux,Firefox
1,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 1.0, 1.0, 3.0, 2.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 67, 14480], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 30001.0, 1510.45, 6706.0234, 4.129393, ...",Linux,Chrome
2,"[3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 1.0, 3.0, 3.0, 1.0, 3.0, ...","[[0, 0, 0, 65535], [1, 0, 15, 28960], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 42.0, 6.5, 12.9147, 2.0129657, 66.0, 14...",OSX,Safari
3,"[3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 1.0, 2.0, ...","[3.0, 3.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, ...","[[0, 0, 0, 8192], [1, 0, 73, 65535], [0, 0, 0,...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 199.0, 22.2, 48.05545, 2.7663713, 54.0,...",Windows,Firefox
4,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 89, 28240], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 9982.0, 685.73334, 2571.9824, 3.4732416...",Linux,Chrome
...,...,...,...,...,...,...,...
20628,"[3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, 0.0, 1.0, ...","[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, ...","[[0, 0, 0, 8192], [1, 0, 189, 8192], [0, 0, 0,...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 218.0, 80.2, 88.81536, 0.50415426, 54.0...",Windows,IExplorer
20629,"[3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, ...","[3.0, 3.0, 2.0, 3.0, 1.0, 1.0, 3.0, 0.0, 2.0, ...","[[0, 0, 0, 65535], [1, 0, 86, 14600], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 330.0, 70.2, 80.19883, 1.7531445, 54.0,...",Windows,IExplorer
20630,"[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 17, 28960], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 67.0, 9.25, 19.191212, 2.4089437, 66.0,...",Linux,Firefox
20631,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 77, 65535], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 78.0, 11.9, 28.0674, 1.9586945, 66.0, 1...",Linux,Firefox


In [22]:
def preprocessing(df):
    df.dropna(inplace=True)
    NBytes.preprocess(df)
    NPacketsByteFrequency.preprocess(df)
    enc = OneHotEncoderEIMTC()
    df['browser_ohc'] = list(enc.fit_transform(df['browser']).toarray())
    df['os_ohc'] = list(enc.fit_transform(df['os']).toarray())
    return df

In [23]:
df = preprocessing(df)
df

Unnamed: 0,udps.src2dst_n_packets_byte_frequency,udps.dst2src_n_packets_byte_frequency,udps.protocol_header_fields,udps.n_bytes,udps.stnn_image,os,browser,browser_ohc,os_ohc
0,"[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 3.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 80, 42540], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 44999.0, 2271.65, 10057.03, 4.1294065, ...",Linux,Firefox,"[0.0, 1.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
1,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 1.0, 1.0, 3.0, 2.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 67, 14480], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 30001.0, 1510.45, 6706.0234, 4.129393, ...",Linux,Chrome,"[1.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
2,"[3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 1.0, 3.0, 3.0, 1.0, 3.0, ...","[[0, 0, 0, 65535], [1, 0, 15, 28960], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 42.0, 6.5, 12.9147, 2.0129657, 66.0, 14...",OSX,Safari,"[0.0, 0.0, 0.0, 1.0]","[0.0, 1.0, 0.0]"
3,"[3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 1.0, 2.0, ...","[3.0, 3.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, ...","[[0, 0, 0, 8192], [1, 0, 73, 65535], [0, 0, 0,...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 199.0, 22.2, 48.05545, 2.7663713, 54.0,...",Windows,Firefox,"[0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]"
4,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 89, 28240], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 9982.0, 685.73334, 2571.9824, 3.4732416...",Linux,Chrome,"[1.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
...,...,...,...,...,...,...,...,...,...
20628,"[3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, 0.0, 1.0, ...","[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, ...","[[0, 0, 0, 8192], [1, 0, 189, 8192], [0, 0, 0,...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 218.0, 80.2, 88.81536, 0.50415426, 54.0...",Windows,IExplorer,"[0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]"
20629,"[3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, ...","[3.0, 3.0, 2.0, 3.0, 1.0, 1.0, 3.0, 0.0, 2.0, ...","[[0, 0, 0, 65535], [1, 0, 86, 14600], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 330.0, 70.2, 80.19883, 1.7531445, 54.0,...",Windows,IExplorer,"[0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]"
20630,"[3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 17, 28960], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 67.0, 9.25, 19.191212, 2.4089437, 66.0,...",Linux,Firefox,"[0.0, 1.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
20631,"[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, ...","[[0, 0, 0, 29200], [1, 0, 77, 65535], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 78.0, 11.9, 28.0674, 1.9586945, 66.0, 1...",Linux,Firefox,"[0.0, 1.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"


In [24]:
n_bytes_features = np.stack(df['udps.n_bytes'].values)
byte_freq_features = np.concatenate([
        np.stack(df['udps.src2dst_n_packets_byte_frequency'].values),
        np.stack(df['udps.dst2src_n_packets_byte_frequency'].values)
    ], 
    axis=1
)

features = np.concatenate([n_bytes_features, byte_freq_features], axis=1)
features.shape

(20632, 1296)

In [25]:
x_train, x_test, y_train, y_test = train_test_split(
    features,
    df[['os_ohc', 'browser_ohc']].values,
    test_size = 0.25,
    stratify=df['browser'],
    random_state=42
)

In [27]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, ReLU, MaxPooling1D, Flatten


def stack(layers):
    '''
    Using the Functional-API of Tensorflow to build a sequential
    network (stacked layers) from list of layers.
    '''
    layer_stack = None
    for layer in layers:
        if layer_stack is None:
            layer_stack = layer
        else:
            layer_stack = layer(layer_stack)
    return layer_stack



def wang_payload_modality(payload_size=784):
    input_layer_payload_modality = Input(shape=(payload_size,1), name='input_payload')
    return Model(
        name='nbytes',
        inputs=input_layer_payload_modality,
        outputs=stack([
            input_layer_payload_modality,
            Conv1D(16, 25, name='Conv1D_payload_1'),
            ReLU(),
            MaxPooling1D(3, name='MaxPooling1D_payload_1'),
            Conv1D(32, 35, name='Conv1D_payload_2'),
            ReLU(),
            MaxPooling1D(3, name='MaxPooling1D_payload_2'),
            Flatten(), 
        ])
    )


def byte_freq_modality():
    input_layer_payload_modality = Input(shape=(512,1), name='input_bytefreq')
    return Model(
        name='Byte freq',
        inputs=input_layer_payload_modality,
        outputs=stack([
            input_layer_payload_modality,
            Conv1D(16, 25, name='Conv1D_bytefreq'),
            ReLU(),
            MaxPooling1D(3, name='MaxPooling1D_bytefreq'),
            Conv1D(32, 35, name='Conv1D_bytefreq_2'),
            ReLU(),
            MaxPooling1D(3, name='MaxPooling1D_bytefreq_2'),
            Flatten(), 
        ])
    )


In [47]:
model = CustomDistiller(
    modalities=[
        wang_payload_modality(),
        byte_freq_modality(),
    ],
    adapter_size=32,
    n_classes=[
    len(df['os'].unique()),
    len(df['browser'].unique())
])

In [48]:
epochs = 10
batch_size = 128
number_of_trains = len(model.modalities) +1
model.compile(
    optimizer=['adam'] *number_of_trains,
    loss=['categorical_crossentropy'] *number_of_trains,
    metrics=[['accuracy']] *number_of_trains,
)
model.fit(
    x = [
        x_train[:,:784],
        x_train[:,784:]
    ], 
    y = [
        np.stack(y_train[:,0]), 
        np.stack(y_train[:,1])
    ],
    epochs=[epochs] *number_of_trains,
    batch_size=[batch_size] *number_of_trains,
    use_multiprocessing=[True] *number_of_trains,
    workers=[4] *number_of_trains,
    verbose=[2] *number_of_trains,
)

##################### PRETRAINING_MODEL_NBYTES ##########################
Epoch 1/10
121/121 - 2s - loss: 0.8941 - dense_73_loss: 0.3869 - dense_74_loss: 0.5072 - dense_73_accuracy: 0.8280 - dense_74_accuracy: 0.7902
Epoch 2/10
121/121 - 1s - loss: 0.3655 - dense_73_loss: 0.1700 - dense_74_loss: 0.1955 - dense_73_accuracy: 0.9275 - dense_74_accuracy: 0.9247
Epoch 3/10
121/121 - 1s - loss: 0.2886 - dense_73_loss: 0.1338 - dense_74_loss: 0.1548 - dense_73_accuracy: 0.9428 - dense_74_accuracy: 0.9391
Epoch 4/10
121/121 - 1s - loss: 0.2484 - dense_73_loss: 0.1167 - dense_74_loss: 0.1316 - dense_73_accuracy: 0.9486 - dense_74_accuracy: 0.9465
Epoch 5/10
121/121 - 1s - loss: 0.2292 - dense_73_loss: 0.1077 - dense_74_loss: 0.1214 - dense_73_accuracy: 0.9502 - dense_74_accuracy: 0.9502
Epoch 6/10
121/121 - 1s - loss: 0.2121 - dense_73_loss: 0.0974 - dense_74_loss: 0.1147 - dense_73_accuracy: 0.9566 - dense_74_accuracy: 0.9535
Epoch 7/10
121/121 - 1s - loss: 0.1990 - dense_73_loss: 0.0926 - den

In [49]:
predictions_ohc = model.predict(
    [x_test[:,:784], x_test[:,784:]], 
    verbose=1
)
predictions_os = np.argmax(predictions_ohc[0], axis=1)
predictions_browser = np.argmax(predictions_ohc[1], axis=1)
labels_os = np.argmax(np.stack(y_test[:,0]), axis=1)
labels_browser = np.argmax(np.stack(y_test[:,1]), axis=1)



In [50]:
report_os = classification_report(labels_os, predictions_os, target_names=df['os'].unique())
report_os

'              precision    recall  f1-score   support\n\n       Linux       1.00      1.00      1.00      1403\n         OSX       1.00      0.99      0.99       669\n     Windows       1.00      1.00      1.00      3086\n\n    accuracy                           1.00      5158\n   macro avg       1.00      1.00      1.00      5158\nweighted avg       1.00      1.00      1.00      5158\n'

In [51]:
report_browser = classification_report(labels_browser, predictions_browser, target_names=df['browser'].unique())
report_browser

'              precision    recall  f1-score   support\n\n     Firefox       0.94      0.97      0.95      1265\n      Chrome       0.98      0.94      0.96      1465\n      Safari       0.98      0.99      0.98      1781\n   IExplorer       1.00      1.00      1.00       647\n\n    accuracy                           0.97      5158\n   macro avg       0.97      0.97      0.97      5158\nweighted avg       0.97      0.97      0.97      5158\n'

In [31]:
with open('customdist_boa2016_os_report.txt', 'w+') as f:
    f.write(report_os)

with open('customdist_boa2016_browser_report.txt', 'w+') as f:
    f.write(report_browser)