In [1]:
import pandas as pd
import numpy as np
from EIMTC.preprocessing import OneHotEncoderEIMTC, MalDistPreprocessing
from EIMTC.selection import train_test_split
from EIMTC.models import MalDist

In [2]:
filepath = './data/out.csv'
cols_to_load = ['udps.n_bytes', 'udps.protocol_header_fields', 'udps.stnn_image','os', 'browser']
df = pd.read_csv(filepath, usecols=cols_to_load)
df

Unnamed: 0,udps.protocol_header_fields,udps.n_bytes,udps.stnn_image,os,browser
0,"[[0, 0, 0, 29200], [1, 0, 80, 42540], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 44999.0, 2271.65, 10057.03, 4.1294065, ...",Linux,Firefox
1,"[[0, 0, 0, 29200], [1, 0, 67, 14480], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 30001.0, 1510.45, 6706.0234, 4.129393, ...",Linux,Chrome
2,"[[0, 0, 0, 65535], [1, 0, 15, 28960], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 42.0, 6.5, 12.9147, 2.0129657, 66.0, 14...",OSX,Safari
3,"[[0, 0, 0, 8192], [1, 0, 73, 65535], [0, 0, 0,...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 199.0, 22.2, 48.05545, 2.7663713, 54.0,...",Windows,Firefox
4,"[[0, 0, 0, 29200], [1, 0, 89, 28240], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 9982.0, 685.73334, 2571.9824, 3.4732416...",Linux,Chrome
...,...,...,...,...,...
20628,"[[0, 0, 0, 8192], [1, 0, 189, 8192], [0, 0, 0,...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 218.0, 80.2, 88.81536, 0.50415426, 54.0...",Windows,IExplorer
20629,"[[0, 0, 0, 65535], [1, 0, 86, 14600], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 330.0, 70.2, 80.19883, 1.7531445, 54.0,...",Windows,IExplorer
20630,"[[0, 0, 0, 29200], [1, 0, 17, 28960], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 67.0, 9.25, 19.191212, 2.4089437, 66.0,...",Linux,Firefox
20631,"[[0, 0, 0, 29200], [1, 0, 77, 65535], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 78.0, 11.9, 28.0674, 1.9586945, 66.0, 1...",Linux,Firefox


In [3]:
def preprocessing(df):
    df.dropna(inplace=True)
    MalDistPreprocessing.preprocess_features(dataframe=df)
    enc = OneHotEncoderEIMTC()
    df['browser_ohc'] = list(enc.fit_transform(df['browser']).toarray())
    df['os_ohc'] = list(enc.fit_transform(df['os']).toarray())
    return df

In [4]:
df = preprocessing(df)
df

Unnamed: 0,udps.protocol_header_fields,udps.n_bytes,udps.stnn_image,os,browser,browser_ohc,os_ohc
0,"[[0, 0, 0, 29200], [1, 0, 80, 42540], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 44999.0, 2271.65, 10057.03, 4.1294065, ...",Linux,Firefox,"[0.0, 1.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
1,"[[0, 0, 0, 29200], [1, 0, 67, 14480], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 30001.0, 1510.45, 6706.0234, 4.129393, ...",Linux,Chrome,"[1.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
2,"[[0, 0, 0, 65535], [1, 0, 15, 28960], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 42.0, 6.5, 12.9147, 2.0129657, 66.0, 14...",OSX,Safari,"[0.0, 0.0, 0.0, 1.0]","[0.0, 1.0, 0.0]"
3,"[[0, 0, 0, 8192], [1, 0, 73, 65535], [0, 0, 0,...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 199.0, 22.2, 48.05545, 2.7663713, 54.0,...",Windows,Firefox,"[0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 1.0]"
4,"[[0, 0, 0, 29200], [1, 0, 89, 28240], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 9982.0, 685.73334, 2571.9824, 3.4732416...",Linux,Chrome,"[1.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
...,...,...,...,...,...,...,...
20628,"[[0, 0, 0, 8192], [1, 0, 189, 8192], [0, 0, 0,...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 218.0, 80.2, 88.81536, 0.50415426, 54.0...",Windows,IExplorer,"[0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]"
20629,"[[0, 0, 0, 65535], [1, 0, 86, 14600], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 330.0, 70.2, 80.19883, 1.7531445, 54.0,...",Windows,IExplorer,"[0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 1.0]"
20630,"[[0, 0, 0, 29200], [1, 0, 17, 28960], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 67.0, 9.25, 19.191212, 2.4089437, 66.0,...",Linux,Firefox,"[0.0, 1.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
20631,"[[0, 0, 0, 29200], [1, 0, 77, 65535], [0, 0, 0...","[0.08627450980392157, 0.011764705882352941, 0....","[[0.0, 78.0, 11.9, 28.0674, 1.9586945, 66.0, 1...",Linux,Firefox,"[0.0, 1.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"


In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    df[['udps.n_bytes', 'udps.protocol_header_fields', 'udps.stnn_image']].values,
    df[['os_ohc', 'browser_ohc']].values,
    test_size = 0.25,
    stratify=df['browser'],
    random_state=42
)

In [6]:
model = MalDist(n_classes=[
    len(df['os'].unique()),
    len(df['browser'].unique())
])

In [7]:
epochs = 10
batch_size = 128
model.compile(
    optimizer=['adam'] *4,
    loss=['categorical_crossentropy'] *4,
    metrics=[['accuracy']] *4,
)
model.fit(
    x = [
        np.stack(x_train[:,0]),
        np.stack(x_train[:,1]), 
        np.stack(x_train[:,2])
    ], 
    y = [
        np.stack(y_train[:,0]), 
        np.stack(y_train[:,1])
    ],
    epochs=[epochs] *4,
    batch_size=[batch_size] *4,
    use_multiprocessing=[True] *4,
    workers=[4] *4,
    verbose=[2] *4,
)

##################### PRETRAINING_MODEL_WANG_PAYLOAD_MODALITY_-_NBYTES ##########################
Epoch 1/10
121/121 - 5s - loss: 0.6851 - dense_10_loss: 0.3135 - dense_11_loss: 0.3716 - dense_10_accuracy: 0.8591 - dense_11_accuracy: 0.8501
Epoch 2/10
121/121 - 1s - loss: 0.2995 - dense_10_loss: 0.1397 - dense_11_loss: 0.1598 - dense_10_accuracy: 0.9405 - dense_11_accuracy: 0.9374
Epoch 3/10
121/121 - 1s - loss: 0.2402 - dense_10_loss: 0.1126 - dense_11_loss: 0.1276 - dense_10_accuracy: 0.9511 - dense_11_accuracy: 0.9489
Epoch 4/10
121/121 - 1s - loss: 0.2061 - dense_10_loss: 0.0941 - dense_11_loss: 0.1120 - dense_10_accuracy: 0.9552 - dense_11_accuracy: 0.9547
Epoch 5/10
121/121 - 1s - loss: 0.1858 - dense_10_loss: 0.0838 - dense_11_loss: 0.1020 - dense_10_accuracy: 0.9599 - dense_11_accuracy: 0.9582
Epoch 6/10
121/121 - 1s - loss: 0.1715 - dense_10_loss: 0.0773 - dense_11_loss: 0.0942 - dense_10_accuracy: 0.9641 - dense_11_accuracy: 0.9608
Epoch 7/10
121/121 - 1s - loss: 0.1597 - den

In [8]:
predictions_ohc = model.predict(
    [np.stack(x_test[:,0]),np.stack(x_test[:,1]),np.stack(x_test[:,2])], 
    verbose=1
)
predictions_os = np.argmax(predictions_ohc[0], axis=1)
predictions_browser = np.argmax(predictions_ohc[1], axis=1)
labels_os = np.argmax(np.stack(y_test[:,0]), axis=1)
labels_browser = np.argmax(np.stack(y_test[:,1]), axis=1)



In [9]:
from sklearn.metrics import classification_report
report_os = classification_report(labels_os, predictions_os, target_names=df['os'].unique())
report_os

'              precision    recall  f1-score   support\n\n       Linux       1.00      1.00      1.00      1403\n         OSX       1.00      1.00      1.00       669\n     Windows       1.00      1.00      1.00      3086\n\n    accuracy                           1.00      5158\n   macro avg       1.00      1.00      1.00      5158\nweighted avg       1.00      1.00      1.00      5158\n'

In [10]:
from sklearn.metrics import classification_report
report_browser = classification_report(labels_browser, predictions_browser, target_names=df['browser'].unique())
report_browser

'              precision    recall  f1-score   support\n\n     Firefox       0.96      0.98      0.97      1265\n      Chrome       0.99      0.92      0.95      1465\n      Safari       0.95      0.99      0.97      1781\n   IExplorer       1.00      1.00      1.00       647\n\n    accuracy                           0.97      5158\n   macro avg       0.98      0.97      0.97      5158\nweighted avg       0.97      0.97      0.97      5158\n'

In [28]:
with open('maldist_boa2016_os_report.txt', 'w+') as f:
    f.write(report_os)

with open('maldist_boa2016_browser_report.txt', 'w+') as f:
    f.write(report_browser)