In [1]:
import pandas as pd
import numpy as np
from EIMTC.preprocessing import OneHotEncoderEIMTC, M1CNNPreprocessing
from EIMTC.selection import train_test_split
from EIMTC.metrics import classification_report
from EIMTC.models import M1CNN

In [2]:
# Read only the columns this experiment uses from the merged flow CSV.
filepath = './data/flowpic_selected_flows_merged.csv'
wanted_columns = ['ip_version', 'udps.n_bytes', 'encapsulation', 'traffic_type']
df = pd.read_csv(filepath, usecols=wanted_columns)
df

Unnamed: 0,ip_version,udps.n_bytes,encapsulation,traffic_type
0,4,"[48, 130, 0, 176, 2, 1, 0, 4, 6, 112, 117, 98,...",nonvpn,chat
1,4,"[23, 3, 3, 0, 36, 0, 0, 0, 0, 0, 0, 0, 10, 138...",nonvpn,chat
2,4,"[22, 3, 1, 0, 181, 1, 0, 0, 177, 3, 3, 252, 61...",nonvpn,chat
3,4,"[23, 3, 3, 0, 36, 0, 0, 0, 0, 0, 0, 0, 16, 80,...",nonvpn,chat
4,4,"[22, 3, 1, 0, 181, 1, 0, 0, 177, 3, 3, 199, 82...",nonvpn,chat
...,...,...,...,...
199,4,"[68, 111, 53, 57, 142, 157, 210, 145, 236, 72,...",nonvpn,audio
200,4,"[0, 1, 0, 100, 33, 18, 164, 66, 83, 54, 88, 11...",vpn,audio
201,4,"[169, 25, 2, 65, 31, 97, 146, 94, 166, 37, 204...",vpn,audio
202,4,"[169, 25, 2, 65, 31, 97, 146, 94, 166, 37, 204...",vpn,audio


In [3]:
# Show the rows whose payload column is missing; an empty frame means there are none.
missing_payload = df['udps.n_bytes'].isna()
df[missing_payload]

Unnamed: 0,ip_version,udps.n_bytes,encapsulation,traffic_type


In [4]:
def preprocessing(df):
    """Drop incomplete rows, run the M1CNN feature preprocessing, and add one-hot labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'traffic_type' column. Any further columns required by
        ``M1CNNPreprocessing.preprocess_features`` are not visible from here.

    Returns
    -------
    pandas.DataFrame
        A new frame without NaN rows, with an added 'traffic_type_ohc' column
        holding one encoded vector per row.
    """
    # Take an explicit copy so the column assignment below never writes to a
    # derived object (avoids pandas' SettingWithCopyWarning) and the caller's
    # frame is left alone.
    df = df.dropna().copy()
    # The return value is ignored, so this call presumably edits `df` in place
    # -- TODO confirm against the EIMTC implementation.
    M1CNNPreprocessing.preprocess_features(dataframe=df)
    enc = OneHotEncoderEIMTC()
    # NOTE(review): the fitted encoder is local and discarded, so the mapping from
    # one-hot position back to the class name is not available to later cells.
    df['traffic_type_ohc'] = list(enc.fit_transform(df['traffic_type']).toarray())
    return df

In [5]:
# Rebind `df` to the preprocessed frame and display it.
# NOTE: re-running this cell feeds the already-preprocessed frame back into
# `preprocessing`, because the same name is used for input and output.
df = df.pipe(preprocessing)
df

Unnamed: 0,ip_version,udps.n_bytes,encapsulation,traffic_type,traffic_type_ohc
0,4,"[48, 130, 0, 176, 2, 1, 0, 4, 6, 112, 117, 98,...",nonvpn,chat,"[0.0, 1.0, 0.0, 0.0]"
1,4,"[23, 3, 3, 0, 36, 0, 0, 0, 0, 0, 0, 0, 10, 138...",nonvpn,chat,"[0.0, 1.0, 0.0, 0.0]"
2,4,"[22, 3, 1, 0, 181, 1, 0, 0, 177, 3, 3, 252, 61...",nonvpn,chat,"[0.0, 1.0, 0.0, 0.0]"
3,4,"[23, 3, 3, 0, 36, 0, 0, 0, 0, 0, 0, 0, 16, 80,...",nonvpn,chat,"[0.0, 1.0, 0.0, 0.0]"
4,4,"[22, 3, 1, 0, 181, 1, 0, 0, 177, 3, 3, 199, 82...",nonvpn,chat,"[0.0, 1.0, 0.0, 0.0]"
...,...,...,...,...,...
199,4,"[68, 111, 53, 57, 142, 157, 210, 145, 236, 72,...",nonvpn,audio,"[1.0, 0.0, 0.0, 0.0]"
200,4,"[0, 1, 0, 100, 33, 18, 164, 66, 83, 54, 88, 11...",vpn,audio,"[1.0, 0.0, 0.0, 0.0]"
201,4,"[169, 25, 2, 65, 31, 97, 146, 94, 166, 37, 204...",vpn,audio,"[1.0, 0.0, 0.0, 0.0]"
202,4,"[169, 25, 2, 65, 31, 97, 146, 94, 166, 37, 204...",vpn,audio,"[1.0, 0.0, 0.0, 0.0]"


In [6]:
# Derive the model dimensions from the data instead of hard-coding them:
# payload length comes from the first row, class count from the distinct labels.
first_payload = df['udps.n_bytes'].iloc[0]
n_bytes = len(first_payload)
n_classes = len(df['traffic_type'].unique())
model = M1CNN(payload_size=n_bytes, n_classes=n_classes)
print(n_bytes, n_classes)

784 4


In [7]:
# Hold out 25% of the flows for evaluation; the raw traffic_type labels are
# passed as the `stratify` argument and the seed is fixed for repeatability.
features = df['udps.n_bytes'].values
labels_ohc = df['traffic_type_ohc'].values
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels_ohc,
    test_size=0.25,
    stratify=df['traffic_type'].values,
    random_state=42,
)

In [8]:
# Training hyper-parameters.
epochs = 10
batch_size = 128

# x_train / y_train are arrays of per-sample sequences (presumably equal-length
# -- TODO confirm); np.stack joins them into one array each before fitting.
train_inputs = np.stack(x_train)
train_targets = np.stack(y_train)

# Kept as the last expression so the returned History object is displayed.
model.fit(
    train_inputs,
    train_targets,
    epochs=epochs,
    batch_size=batch_size,
    use_multiprocessing=True,
    workers=4,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x297febb0240>

In [9]:
# Score the held-out flows with the wrapped model, then reduce both the scores
# and the one-hot targets to integer class indices for the report.
test_inputs = np.stack(x_test)
class_scores = model.model.predict(test_inputs)
predictions = np.argmax(class_scores, axis=1)
y_test_true = np.stack(y_test).argmax(axis=1)

# The report is built from integer indices, so classes appear as 0..n-1.
report = classification_report(y_test_true, predictions)
report

'              precision    recall  f1-score   support\n\n           0       0.50      0.25      0.33         8\n           1       0.52      0.83      0.64        18\n           2       1.00      0.47      0.64        15\n           3       0.55      0.60      0.57        10\n\n    accuracy                           0.59        51\n   macro avg       0.64      0.54      0.54        51\nweighted avg       0.66      0.59      0.58        51\n'

In [10]:
# Persist the text report next to the notebook. Only a write happens, so plain
# "w" is enough (no read access needed); the encoding is pinned so the file does
# not depend on the platform default.
with open('m1cnn_iscx2016flowpic_traffictype_report.txt', "w", encoding="utf-8") as f:
    f.write(report)
