In [5]:
import pandas as pd
import numpy as np
from EIMTC.preprocessing import OneHotEncoderEIMTC, M1CNNPreprocessing
from EIMTC.selection import train_test_split
from EIMTC.metrics import classification_report
from EIMTC.models import M1CNN

In [45]:
# Load only the two columns this notebook works with: 'udps.n_bytes' (used
# later as the model input) and 'application' (used later to build the labels).
filepath = './data/out.csv'
df = pd.read_csv(
    filepath,
    usecols=['udps.n_bytes', 'application'],
)
df

Unnamed: 0,udps.n_bytes,application
0,"[0.08627450980392157, 0.011764705882352941, 0....",google
1,"[0.08627450980392157, 0.011764705882352941, 0....",facebook
2,"[0.08627450980392157, 0.011764705882352941, 0....",youtube
3,"[0.08627450980392157, 0.011764705882352941, 0....",twitter
4,"[0.08627450980392157, 0.011764705882352941, 0....",unknown
...,...,...
20628,"[0.08627450980392157, 0.011764705882352941, 0....",microsoft
20629,"[0.08627450980392157, 0.011764705882352941, 0....",twitter
20630,"[0.08627450980392157, 0.011764705882352941, 0....",google
20631,"[0.08627450980392157, 0.011764705882352941, 0....",twitter


In [46]:
def preprocessing(df, excluded_applications=('wireshark', 'whatsapp')):
    """Clean the loaded flows and attach one-hot labels without modifying the input.

    Steps, in order:
      1. Drop rows that contain missing values.
      2. Drop rows whose 'application' value is in ``excluded_applications``.
      3. Fit a ``OneHotEncoderEIMTC`` on the remaining 'application' values and
         store one encoded vector per row in a new 'application_ohc' column.
      4. Pass the frame to ``M1CNNPreprocessing.preprocess_features``.

    Args:
        df: DataFrame with at least an 'application' column. It is left
            untouched; all work happens on a filtered copy.
        excluded_applications: Application labels treated as unrelated samples
            and removed. The default is the pair that used to be hard-coded.

    Returns:
        A new DataFrame containing the surviving rows (original index labels
        are kept) with the added 'application_ohc' column.
    """
    without_missing = df.dropna()
    # remove unrelated samples:
    is_unrelated = without_missing['application'].isin(list(excluded_applications))
    # .copy() yields an independent frame, so the column assignment below and
    # the feature preprocessing never write into the caller's data.
    cleaned = without_missing.loc[~is_unrelated].copy()

    enc = OneHotEncoderEIMTC()
    cleaned['application_ohc'] = list(enc.fit_transform(cleaned['application']).toarray())
    # NOTE(review): the return value is ignored here (as it always was), so this
    # call presumably updates the frame it receives in place - confirm.
    M1CNNPreprocessing.preprocess_features(dataframe=cleaned)

    return cleaned

In [47]:
# Run the cleaning / label-encoding step and rebind `df` to its result; the
# cells below read 'udps.n_bytes', 'application' and 'application_ohc' from it.
# After this line the freshly loaded frame is no longer reachable under `df`,
# so re-run the loading cell before re-running this one.
df = preprocessing(df)
df

Unnamed: 0,udps.n_bytes,application,application_ohc
0,"[0.08627450980392157, 0.011764705882352941, 0....",google,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.08627450980392157, 0.011764705882352941, 0....",facebook,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[0.08627450980392157, 0.011764705882352941, 0....",youtube,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[0.08627450980392157, 0.011764705882352941, 0....",twitter,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
4,"[0.08627450980392157, 0.011764705882352941, 0....",unknown,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
...,...,...,...
20628,"[0.08627450980392157, 0.011764705882352941, 0....",microsoft,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20629,"[0.08627450980392157, 0.011764705882352941, 0....",twitter,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
20630,"[0.08627450980392157, 0.011764705882352941, 0....",google,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20631,"[0.08627450980392157, 0.011764705882352941, 0....",twitter,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [35]:
# Split inputs ('udps.n_bytes') and one-hot targets ('application_ohc') with
# test_size=0.25. The string labels are passed as the stratification key and
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df['udps.n_bytes'].values,
    df['application_ohc'].values,
    test_size=0.25,
    stratify=df['application'].values,
    random_state=42,
)

In [41]:
# Size the network from the data itself: the input length is the length of the
# first row's 'udps.n_bytes' value, the output width is the number of distinct
# 'application' labels.
n_bytes = len(df['udps.n_bytes'].iat[0])
n_classes = df['application'].nunique(dropna=False)
model = M1CNN(payload_size=n_bytes, n_classes=n_classes)
print(n_bytes, n_classes)

784 10


In [42]:
# Training hyper-parameters.
epochs = 10
batch_size = 128

# Targets are one-hot vectors, hence the categorical cross-entropy loss.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# np.stack joins the per-sample vectors into one array per split before they
# are handed to fit(). The call is the last expression, so its return value is
# what the cell displays.
# NOTE(review): in Keras, `workers` / `use_multiprocessing` only matter for
# generator-style inputs - confirm they have any effect with array inputs here.
model.fit(
    np.stack(x_train),
    np.stack(y_train),
    batch_size=batch_size,
    epochs=epochs,
    workers=4,
    use_multiprocessing=True,
    verbose=2,
)

Epoch 1/10
121/121 - 1s - loss: 0.4947 - accuracy: 0.8383
Epoch 2/10
121/121 - 1s - loss: 0.1195 - accuracy: 0.9602
Epoch 3/10
121/121 - 1s - loss: 0.0703 - accuracy: 0.9742
Epoch 4/10
121/121 - 1s - loss: 0.0520 - accuracy: 0.9799
Epoch 5/10
121/121 - 1s - loss: 0.0394 - accuracy: 0.9846
Epoch 6/10
121/121 - 1s - loss: 0.0313 - accuracy: 0.9875
Epoch 7/10
121/121 - 1s - loss: 0.0297 - accuracy: 0.9881
Epoch 8/10
121/121 - 1s - loss: 0.0252 - accuracy: 0.9897
Epoch 9/10
121/121 - 1s - loss: 0.0169 - accuracy: 0.9937
Epoch 10/10
121/121 - 1s - loss: 0.0152 - accuracy: 0.9946


<tensorflow.python.keras.callbacks.History at 0x1b147419240>

In [43]:
# Predicted class index = position of the largest model output in each row.
predictions = np.argmax(model.model.predict(np.stack(x_test)), axis=1)
# True class index = position of the largest entry in each one-hot target row.
y_test_true = np.stack(y_test).argmax(axis=1)

# The report is built on integer class indices, not on application names.
report = classification_report(y_test_true, predictions)
report

'              precision    recall  f1-score   support\n\n           0       1.00      0.97      0.99        35\n           1       1.00      0.98      0.99        45\n           2       0.99      0.99      0.99      1278\n           3       0.98      1.00      0.99       410\n           4       0.98      0.95      0.97       103\n           5       1.00      1.00      1.00        37\n           6       1.00      1.00      1.00      2610\n           7       0.97      0.94      0.96       331\n           8       0.96      0.82      0.88        28\n           9       0.97      0.98      0.98       281\n\n    accuracy                           0.99      5158\n   macro avg       0.98      0.96      0.97      5158\nweighted avg       0.99      0.99      0.99      5158\n'

In [44]:
# Save the text report to the current working directory.
# Plain 'w' is sufficient: nothing is read back, so the '+' of the former 'w+'
# mode was unused. The explicit encoding makes the written bytes independent
# of the platform's locale default.
with open('m1cnn_boa2016_app_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)
