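To store our data files, we create a local working directory (`demonstration`). It is kept between runs so that the download, the extracted CSV and the pre-processing cache can be reused.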

In [2]:
import os

from implementations.classification_heads import LastTokenClassificationHead
from implementations.transformers.basic_transformers import BasicTransformer

# Working directory, relative to the notebook's current directory. Later cells
# place the downloaded archive, the extracted CSV and the dataset cache in it.
demonstration_folder = "demonstration"

# exist_ok=True keeps the cell idempotent without a separate existence check,
# which could race with another process creating the same directory. It also
# fails loudly if the name is already taken by a regular file.
os.makedirs(demonstration_folder, exist_ok=True)

We begin by downloading the UNSW-NB15 flow dataset. We can obtain this from [here](https://staff.itee.uq.edu.au/marius/NIDS_datasets/)

In [3]:
import requests

zip_path = os.path.join(demonstration_folder, "dataset.zip")
dataset_url = "https://api.rdm.uq.edu.au/production/files/8c6e2a00-ef9c-11ed-827d-e762de186848/download"

# The archive is only fetched once; later runs reuse the file on disk.
if not os.path.exists(zip_path):
    # Download to a side file first so that an interrupted or failed transfer
    # never leaves a truncated "dataset.zip" that the check above would accept.
    partial_zip_path = zip_path + ".part"

    # timeout=(connect, read) in seconds, so a stalled server cannot hang the cell.
    with requests.get(dataset_url, stream=True, timeout=(10, 300)) as r:
        # Fail here on 4xx/5xx instead of saving the error page as the dataset.
        r.raise_for_status()

        with open(partial_zip_path, "wb") as w:
            # Stream in 1 MiB chunks rather than holding the whole body in memory.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                w.write(chunk)

    # Atomic rename: zip_path only ever exists as a complete download.
    os.replace(partial_zip_path, zip_path)

    print(f"Downloaded file to {zip_path}, size = {os.path.getsize(zip_path):,}")

Downloaded file to demonstration\dataset.zip, size = 33,877,423


We can now extract the dataset

In [4]:
import zipfile

import shutil

csv_path = os.path.join(demonstration_folder, "dataset.csv")

# Path of the flow CSV inside the downloaded archive.
csv_member = "fe6cb615d161452c_MOHANAD_A4706/data/NF-UNSW-NB15-v2.csv"

# The CSV is only extracted once; later runs reuse the file on disk.
if not os.path.exists(csv_path):
    # Stream the member straight to its final location instead of extracting
    # and renaming, which would leave the archive's nested directories behind.
    # Writing to a side file first means an interrupted extraction never leaves
    # a truncated "dataset.csv" that the check above would accept.
    partial_csv_path = csv_path + ".part"

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        with zip_ref.open(csv_member) as src, open(partial_csv_path, "wb") as dst:
            shutil.copyfileobj(src, dst)

    os.replace(partial_csv_path, csv_path)

print(f"Dataset is available at {csv_path}, size = {os.path.getsize(csv_path):,}")

Dataset is available at demonstration\dataset.csv, size = 441,867,785


We can now define the dataset format, which will allow us to ingest this dataset with FlowTransformer

In [5]:
from framework.dataset_specification import DatasetSpecification

# Describes which CSV columns to load and how to treat them. The 9 names in
# `categorical_fields` all appear in `include_fields` too; the remaining 28
# included fields are not listed as categorical.
flow_format = DatasetSpecification(
    include_fields=[
        'NUM_PKTS_UP_TO_128_BYTES',
        'SRC_TO_DST_SECOND_BYTES',
        'OUT_PKTS',
        'OUT_BYTES',
        'NUM_PKTS_128_TO_256_BYTES',
        'DST_TO_SRC_AVG_THROUGHPUT',
        'DURATION_IN',
        'L4_SRC_PORT',
        'ICMP_TYPE',
        'PROTOCOL',
        'SERVER_TCP_FLAGS',
        'IN_PKTS',
        'NUM_PKTS_512_TO_1024_BYTES',
        'CLIENT_TCP_FLAGS',
        'TCP_WIN_MAX_IN',
        'NUM_PKTS_256_TO_512_BYTES',
        'SHORTEST_FLOW_PKT',
        'MIN_IP_PKT_LEN',
        'LONGEST_FLOW_PKT',
        'L4_DST_PORT',
        'MIN_TTL',
        'DST_TO_SRC_SECOND_BYTES',
        'NUM_PKTS_1024_TO_1514_BYTES',
        'DURATION_OUT',
        'FLOW_DURATION_MILLISECONDS',
        'TCP_FLAGS',
        'MAX_TTL',
        'SRC_TO_DST_AVG_THROUGHPUT',
        'ICMP_IPV4_TYPE',
        'MAX_IP_PKT_LEN',
        'RETRANSMITTED_OUT_BYTES',
        'IN_BYTES',
        'RETRANSMITTED_IN_BYTES',
        'TCP_WIN_MAX_OUT',
        'L7_PROTO',
        'RETRANSMITTED_OUT_PKTS',
        'RETRANSMITTED_IN_PKTS',
    ],
    categorical_fields=[
        'CLIENT_TCP_FLAGS',
        'L4_SRC_PORT',
        'TCP_FLAGS',
        'ICMP_IPV4_TYPE',
        'ICMP_TYPE',
        'PROTOCOL',
        'SERVER_TCP_FLAGS',
        'L4_DST_PORT',
        'L7_PROTO',
    ],
    # Column holding the class label, and the label value that marks benign traffic.
    class_column="Attack",
    benign_label="Benign",
)

We can now set up a version of FlowTransformer:

In [6]:
from framework.flow_transformer_parameters import FlowTransformerParameters
from framework.flow_transformer import FlowTransformer
from implementations.input_encodings import RecordLevelEmbed
from implementations.pre_processings import StandardPreProcessing

# Assemble the model from the framework's standard building blocks:
# pre-processing, input encoding, sequence model and classification head.
pre_processing = StandardPreProcessing(n_categorical_levels=32)
encoding = RecordLevelEmbed(64)
transformer = BasicTransformer(
    n_layers=2,
    internal_size=128,
    n_heads=2,
)
classification_head = LastTokenClassificationHead()

# Remaining hyper-parameters for the FlowTransformer itself.
transformer_params = FlowTransformerParameters(
    window_size=8,
    mlp_layer_sizes=[128],
    mlp_dropout=0.1,
)

# Wire the components together
ft = FlowTransformer(
    pre_processing=pre_processing,
    input_encoding=encoding,
    sequential_model=transformer,
    classification_head=classification_head,
    params=transformer_params,
)

After defining the transformer architecture, we can ingest the dataset using the FlowTransformer instance. This processes the dataset into a normalised format, partitioning it into training and testing data so that only the training data is used to fit the pre-processing:

In [13]:
from framework.enumerations import EvaluationDatasetSampling
from IPython.display import display

# Ingest the CSV through the FlowTransformer instance using the specification
# defined earlier. The demonstration folder doubles as the cache location.
# NOTE(review): LastRows with evaluation_percent=0.1 presumably holds out the
# final 10% of rows for evaluation; confirm against the framework.
df = ft.load_dataset(
    "UNSW-NB15",
    csv_path,
    specification=flow_format,
    evaluation_dataset_sampling=EvaluationDatasetSampling.LastRows,
    evaluation_percent=0.1,
    cache_path=demonstration_folder,
)

# Preview the first 500 rows of the processed frame.
display(df.head(500))

Using cache file path: demonstration\UNSW-NB15_0_QdLmZHuh8yOmlGcKBEkf7hepImY0_5EjmvToFWKee8t20u0dFpVzNu4s0.feather
Attempting to read dataset from path demonstration\dataset.csv...
Set y to = Attack
Converting numerical columns to floats, and removing out of range values...
Applying pre-processing to numerical values
[Numerical 1 / 28] Processing numerical column DURATION_IN...
[Numerical 2 / 28] Processing numerical column SRC_TO_DST_AVG_THROUGHPUT...
[Numerical 3 / 28] Processing numerical column TCP_WIN_MAX_OUT...
[Numerical 4 / 28] Processing numerical column NUM_PKTS_128_TO_256_BYTES...
[Numerical 5 / 28] Processing numerical column MAX_IP_PKT_LEN...
[Numerical 6 / 28] Processing numerical column MAX_TTL...
[Numerical 7 / 28] Processing numerical column NUM_PKTS_512_TO_1024_BYTES...
[Numerical 8 / 28] Processing numerical column IN_PKTS...
[Numerical 9 / 28] Processing numerical column SRC_TO_DST_SECOND_BYTES...
[Numerical 10 / 28] Processing numerical column NUM_PKTS_UP_TO_128_BY

Unnamed: 0,DURATION_IN,SRC_TO_DST_AVG_THROUGHPUT,TCP_WIN_MAX_OUT,NUM_PKTS_128_TO_256_BYTES,MAX_IP_PKT_LEN,MAX_TTL,NUM_PKTS_512_TO_1024_BYTES,IN_PKTS,SRC_TO_DST_SECOND_BYTES,NUM_PKTS_UP_TO_128_BYTES,...,L7_PROTO_23,L7_PROTO_24,L7_PROTO_25,L7_PROTO_26,L7_PROTO_27,L7_PROTO_28,L7_PROTO_29,L7_PROTO_30,L7_PROTO_31,L7_PROTO_32
0,0.0,0.729139,0.801390,0.0,0.565324,0.630549,0.0,0.000000,0.295560,0.326896,...,False,False,False,False,False,False,False,False,False,False
1,0.0,0.740912,0.817828,0.0,0.565324,0.630549,0.0,0.160324,0.316753,0.374701,...,False,False,False,False,False,False,False,False,False,False
2,0.0,0.749613,0.831726,0.0,0.575948,0.630549,0.0,0.218877,0.329795,0.408620,...,False,False,False,False,False,False,False,False,False,False
3,0.0,0.756864,0.843766,0.0,0.598516,0.630549,0.0,0.255508,0.340052,0.434929,...,False,False,False,False,False,False,False,False,False,False
4,0.0,0.764200,0.854385,0.0,0.598516,0.630549,0.0,0.293311,0.352235,0.465863,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.0,0.770252,0.863885,0.0,0.598516,0.630549,0.0,0.312343,0.358927,0.482735,...,False,False,False,False,False,False,False,False,False,False
496,0.0,0.740912,0.817828,0.0,0.565324,0.630549,0.0,0.160324,0.316753,0.374701,...,False,False,False,False,False,False,False,False,False,False
497,0.0,0.775548,0.872479,0.0,0.598516,0.630549,0.0,0.328315,0.365353,0.497492,...,False,False,False,False,False,False,False,False,False,False
498,0.0,0.749613,0.831726,0.0,0.575948,0.630549,0.0,0.218877,0.329795,0.408620,...,False,False,False,False,False,False,False,False,False,False


We are now ready to build the transformer model that can ingest this dataset:

In [14]:
# Construct the model from the configured FlowTransformer and print its layers.
m = ft.build_model()
m.summary()

# Configure training: Adam optimiser, binary cross-entropy loss, and binary
# accuracy as the reported metric.
# NOTE(review): jit_compile=True presumably requests XLA compilation, assuming
# this is a Keras model; confirm it is supported on the target hardware.
m.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
    jit_compile=True,
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_DURATION_IN (InputLa  [(None, 8, 1)]               0         []                            
 yer)                                                                                             
                                                                                                  
 input_SRC_TO_DST_AVG_THROU  [(None, 8, 1)]               0         []                            
 GHPUT (InputLayer)                                                                               
                                                                                                  
 input_TCP_WIN_MAX_OUT (Inp  [(None, 8, 1)]               0         []                            
 utLayer)                                                                                     

This model can now be used for machine learning training