In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

### **Google Colab environment**

In [2]:
try:
  from google.colab import drive
  from google.colab import drive
  IN_COLAB = True
  !git clone https://github.com/BSc-WUT/ML-API
  %cd ML-API/packetbaseml
  !pip install -e .
except:
  IN_COLAB = False

In [None]:
# Install the AWS CLI (presumably needed by the S3 sync step below — confirm).
!pip install awscli

In [None]:
# Run the project's Makefile target `sync_data_from_s3`.
!make sync_data_from_s3

In [None]:
%cd /content/GZSL/
!mkdir packetbaseml/data/interim
!python3 packetbaseml/src/data/make_dataset.py

In [2]:
# Select the compute device. Always build a `torch.device` so `device` has the
# same type whether or not CUDA is available (a bare 'cpu' string on the CPU
# path would make the type depend on the machine).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

'cpu'

### **Create Train and Test DataLoaders from IDS-2018 Dataset**

In [7]:
from src.data.dataset import DatasetIDS2018 
from src.data.parse_dataset import dataframe_to_dataloader
from src.data.labels import train_labels, test_labels, labels

In [9]:
# Load the merged CSV through the project's IDS-2018 dataset wrapper.
ids_dataset: DatasetIDS2018 = DatasetIDS2018(csv_file_name='./data/interim/merged_data.csv')

In [None]:
# Presumably cleans/repairs the loaded data in place — confirm in
# DatasetIDS2018.fix_dataset what exactly is changed.
ids_dataset.fix_dataset()

In [10]:
# Map every label name to its position in `labels`.
labels_idx: dict = {name: position for position, name in enumerate(labels)}

# Indices of the training labels, in `labels` order; the final match is dropped.
train_labels_idx: list = [
    labels_idx[name] for name in labels_idx if name in train_labels
][:-1] # Not including `Label` class

# Indices of the test labels, in `labels` order.
test_labels_idx: list = [
    labels_idx[name] for name in labels_idx if name in test_labels
]

In [11]:
# Split the dataset into two frames by label index: one holding the training
# labels and one holding the test labels.
train_dataframe: pd.DataFrame = ids_dataset.filter_data_by_labels(train_labels_idx)
test_dataframe: pd.DataFrame = ids_dataset.filter_data_by_labels(test_labels_idx)

In [12]:
# Display the training subset (pandas truncates long frames in the rendered output).
train_dataframe

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
1234,6.0,16079.0,3.0,4.0,326.0,129.0,326.0,0.0,108.666667,188.216188,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1235,6.0,577.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1236,6.0,10279.0,3.0,4.0,326.0,129.0,326.0,0.0,108.666667,188.216188,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1237,6.0,457.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1240,6.0,10271.0,3.0,4.0,326.0,129.0,326.0,0.0,108.666667,188.216188,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15619886,6.0,10855.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
15619887,6.0,1330.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
15619888,6.0,1638.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
15619891,6.0,1480.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [11]:
# Display the test subset (pandas truncates long frames in the rendered output).
test_dataframe

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,6.0,141385.0,9.0,7.0,553.0,3773.0,202.0,0.0,61.444444,87.534438,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,6.0,281.0,2.0,1.0,38.0,0.0,38.0,0.0,19.000000,26.870058,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,6.0,279824.0,11.0,15.0,1086.0,10527.0,385.0,0.0,98.727273,129.392497,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,6.0,132.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,6.0,274016.0,9.0,13.0,1285.0,6141.0,517.0,0.0,142.777778,183.887722,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16232997,6.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14
16232998,6.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14
16232999,6.0,732728.0,2.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
16233000,6.0,22.0,1.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14


In [13]:
# Number of samples per batch, shared by both loaders.
batch_size = 32
# Wrap each frame in a DataLoader via the project helper.
train_data_loader = dataframe_to_dataloader(train_dataframe, batch_size)
test_data_loader = dataframe_to_dataloader(test_dataframe, batch_size)

In [12]:
# Peek at one training batch to sanity-check the loader output.
# NOTE(review): in the displayed batch the last feature column equals the
# label tensor — the `Label` column may be included in the features; confirm
# in dataframe_to_dataloader.
next(iter(train_data_loader))

[tensor([[6.0000e+00, 6.5100e+02, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          1.0000e+00],
         [6.0000e+00, 7.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          1.0000e+01],
         [6.0000e+00, 6.0066e+05, 3.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          5.0000e+00],
         ...,
         [6.0000e+00, 8.6835e+07, 2.0000e+00,  ..., 8.6800e+07, 8.6800e+07,
          5.0000e+00],
         [6.0000e+00, 1.0060e+06, 3.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          5.0000e+00],
         [6.0000e+00, 9.0890e+03, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00]]),
 tensor([ 1., 10.,  5.,  3., 10.,  3.,  3.,  5.,  5.,  5.,  5.,  1.,  7.,  5.,
         10.,  7.,  3.,  7.,  3.,  7., 10., 10.,  3.,  7.,  7.,  7.,  5.,  3.,
          3.,  5.,  5.,  3.])]

In [24]:
# Peek at one test batch to sanity-check the loader output.
next(iter(test_data_loader))

[tensor([[6.0000e+00, 5.2026e+06, 1.4000e+01,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [1.7000e+01, 2.9645e+05, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [1.7000e+01, 1.0120e+03, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         ...,
         [1.7000e+01, 3.0700e+02, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [1.7000e+01, 1.4464e+05, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 8.7000e+01, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00]]),
 tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 14.,  0.,  0.,  0.,  0.,
          0.,  0.,  0., 13.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.])]

### **Variables**

In [14]:
# Dimensionality of each label word embedding.
word_vector_size = 100
# Number of columns in the training frame. `shape[1]` does not require the
# frame to contain any rows, unlike indexing its first row.
# NOTE(review): the displayed frame includes the `Label` column, so it is
# counted here too — confirm that is intended.
input_dim = train_dataframe.shape[1]
# One output unit per known label.
output_dim = len(labels)

In [14]:
# Report the layer sizes derived from the data and the label set.
print(f'Input dim: {input_dim}\nOutput dim: {output_dim}')

Input dim: 78
Output dim: 16


## **Word2Vec**

In [15]:
from src.models.GZSL.word2vec import LabelsEmbeddings

In [16]:
# `word_vector_size` is defined once in the Variables cell; it is not
# redefined here so the embedding size has a single source of truth.
labels_embeddings = LabelsEmbeddings(word_vector_size)
# Generate the word vectors for every label.
labels_vectors = labels_embeddings.generate_vectors(labels)

In [17]:
# Number of word vectors stored for each label.
list(map(len, labels_vectors))

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [18]:
# Every word vector of every label must have the configured embedding size.
assert all(
    len(word) == word_vector_size
    for vector in labels_vectors
    for word in vector
)


### **Map layer**
The last layer of the model should act as a map between incident embeddings and label embeddings: we want to map the given input data to the closest-matching Word2Vec label vector. To do this, we initialize the weights of this layer from the label vectors and freeze them.

In [17]:
from src.models.GZSL.utils import map_layer_init

# Build the map-layer tensor from the label word vectors using the project helper.
map_layer = map_layer_init(labels_vectors)
# Show its size and dtype as a quick sanity check.
print(f"Size: {map_layer.size()}\nType: {map_layer.dtype}")


Size: torch.Size([16, 100])
Type: torch.float32


## **Neural Network for Network Data**

In [18]:
from src.models.GZSL.model import NetNet

In [25]:
# Instantiate the network with the dimensions and label embeddings defined in
# the earlier cells.
model = NetNet(
    device=device,
    input_dim=input_dim,
    words_embeddings_dim=word_vector_size,
    output_dim=output_dim,
    labels_vectors=labels_vectors
)
# Move the model to the selected device; as the cell's last expression this
# also displays the module structure.
model.to(device)

NetNet(
  (linear1): Linear(in_features=78, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear3): Linear(in_features=256, out_features=100, bias=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear4): Linear(in_features=100, out_features=16, bias=True)
  (ReLU): LeakyReLU(negative_slope=0.01)
)

In [26]:
# Step size for the Adam optimiser.
learning_rate = 1e-06
# NOTE(review): nn.NLLLoss expects log-probabilities as input — confirm that
# NetNet's forward pass ends in a log-softmax.
loss_fn = nn.NLLLoss()
# The optimiser is attached to the model as the `optim` attribute.
model.optim = optim.Adam(model.parameters(), lr=learning_rate)
# Anomaly detection is a debugging aid that adds overhead to autograd;
# consider disabling it for full training runs.
torch.autograd.set_detect_anomaly(True)


<torch.autograd.anomaly_mode.set_detect_anomaly at 0x1d69e2a1fd0>

### **Training**

In [None]:
# Train on the training loader for 5 epochs with the configured loss function.
model.train_model(epochs=5, data_loader=train_data_loader, loss_fn=loss_fn)

### **Removing last layer**

In [None]:
# Rebuild the network as a plain Sequential from the trained model's child
# modules, skipping the child at index 6 (`linear4` in the printed model).
# NOTE(review): Sequential applies the children in registration order, which
# may differ from NetNet's own forward pass — confirm.
zsl_model = nn.Sequential(
    *(layer for position, layer in enumerate(model.children()) if position != 6)
)
zsl_model.to(device)

Sequential(
  (0): Linear(in_features=78, out_features=512, bias=True)
  (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Linear(in_features=512, out_features=256, bias=True)
  (3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (4): Linear(in_features=256, out_features=100, bias=True)
  (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (6): LeakyReLU(negative_slope=0.01)
)

In [None]:
# Print both architectures so they can be compared.
print(model)
print(zsl_model)

NetNet(
  (linear1): Linear(in_features=78, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear3): Linear(in_features=256, out_features=100, bias=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear4): Linear(in_features=100, out_features=16, bias=True)
  (ReLU): LeakyReLU(negative_slope=0.01)
)
Sequential(
  (0): Linear(in_features=78, out_features=512, bias=True)
  (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Linear(in_features=512, out_features=256, bias=True)
  (3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (4): Linear(in_features=256, out_features=100, bias=True)
  (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, a

### **Evaluation**
For this step we will calculate the Euclidean distance and find the vector that is closest.

In [None]:
from src.models.GZSL.model_eval import evaluate_model

# Evaluate the truncated model on the test loader, passing the map-layer
# tensor built from the label word vectors.
evaluate_model(device, zsl_model, test_data_loader, map_layer)