In [1]:
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import os
import pandas as pd

In [12]:
# Detect whether the notebook runs on Google Colab and, if so, mount Drive.
# Only a missing `google.colab` package means "not on Colab"; a failed mount
# is a real error and must surface instead of silently falling back to the
# local paths (the former bare `except:` also swallowed KeyboardInterrupt).
try:
  from google.colab import drive
except ImportError:
  IN_COLAB = False
else:
  drive.mount('/content/drive')
  IN_COLAB = True

Mounted at /content/drive


In [3]:
# Always expose `device` as a torch.device (the CPU fallback used to be the
# plain string 'cpu', so the variable's type depended on the hardware).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## **About data classes**

### **Train data classes**
- Benign                      
- Bot                                           
- Brute Force -XSS                 
- DDOS attack-HOIC              
- DDOS attack-LOIC-UDP            
- DDoS attacks-LOIC-HTTP      
- DoS attacks-GoldenEye          
- DoS attacks-Hulk                    
- DoS attacks-Slowloris          
- FTP-BruteForce                                
- Label                                                
- SSH-Bruteforce


### **Test data classes**
- Brute Force -Web                 
- DoS attacks-SlowHTTPTest      
- Infilteration                                              
- SQL Injection                     


In [9]:
from src.vars import TRAIN_PICKLE, TEST_PICKLE
from src.data.dataset import DatasetIDS2018, group_dataset_by_labels, save_pickle_dataset, dataframe_to_tensor_dataset, tensor_dataset_to_dataloader
from src.data.labels import train_labels, test_labels, labels_emb, labels

In [15]:
# Directory holding the cached dataset pickles: the mounted Google Drive
# folder when running on Colab, the repo-relative folder otherwise.
PICKLE_PATH = '/content/drive/MyDrive/STUDIA/INZ/data/pickle' if IN_COLAB else '../data/pickle/'

In [17]:
# Load the train/test splits from the pickle cache, rebuilding the cache from
# the raw CSV when it is incomplete.
train_pickle_path = os.path.join(PICKLE_PATH, TRAIN_PICKLE)
test_pickle_path = os.path.join(PICKLE_PATH, TEST_PICKLE)

# The cache is usable only when BOTH pickles exist. The previous check
# (`not in ... and not in ...`) rebuilt only when both were missing, so a
# single missing pickle fell through to `read_pickle` and crashed.
pickles_cached = os.path.isfile(train_pickle_path) and os.path.isfile(test_pickle_path)

if pickles_cached:
    # NOTE: unpickling executes arbitrary code; only load pickles produced by
    # this notebook.
    train_dataset = pd.read_pickle(train_pickle_path)
    test_dataset = pd.read_pickle(test_pickle_path)
else:
    # Make sure the cache directory exists before anything is written to it.
    os.makedirs(PICKLE_PATH, exist_ok=True)

    # NOTE(review): this CSV path is repo-relative even when IN_COLAB is True —
    # confirm the raw data is reachable from the Colab working directory.
    ids_dataset = DatasetIDS2018(csv_file_name='../data/raw/small_merge_data.csv')

    train_dataset = group_dataset_by_labels(dataset=ids_dataset, labels=train_labels, labels_emb=labels_emb, labels_column_name='Label')
    save_pickle_dataset(dataset=train_dataset, pickle_path=train_pickle_path)

    test_dataset = group_dataset_by_labels(dataset=ids_dataset, labels=test_labels, labels_emb=labels_emb, labels_column_name='Label')
    save_pickle_dataset(dataset=test_dataset, pickle_path=test_pickle_path)



In [18]:
# Take the last column of the test frame (used here as the label column) and
# expose it as a float32 tensor.
test_label_column = test_dataset.values[:, -1]
all_labels = torch.from_numpy(test_label_column.astype(float)).float()
all_labels

tensor([ 0.,  0.,  0.,  ...,  0., 14., 14.])

In [None]:
# Convert both frames with the project helper. The names are rebound to the
# converted objects, so this cell must run exactly once per kernel session.
# NOTE(review): in the recorded training batch further down, the last feature
# column matches the label tensor value-for-value — confirm that
# dataframe_to_tensor_dataset drops the label column from the features.
test_dataset, train_dataset = (
    dataframe_to_tensor_dataset(frame) for frame in (test_dataset, train_dataset)
)

In [24]:
# Wrap both tensor datasets in loaders that share one batch size.
batch_size = 32
train_data_loader, test_data_loader = (
    tensor_dataset_to_dataloader(dataset, batch_size)
    for dataset in (train_dataset, test_dataset)
)

In [25]:
# Sanity check: display the first batch produced by the training loader.
train_batch_iter = iter(train_data_loader)
next(train_batch_iter)

[tensor([[6.0000e+00, 1.2883e+04, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00],
         [6.0000e+00, 4.8300e+02, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          1.0000e+00],
         [6.0000e+00, 1.2561e+05, 3.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          7.0000e+00],
         ...,
         [6.0000e+00, 1.3680e+03, 3.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00],
         [6.0000e+00, 1.5440e+03, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00],
         [6.0000e+00, 6.3924e+04, 3.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          7.0000e+00]]),
 tensor([ 3.,  1.,  7.,  3.,  9.,  9.,  7.,  3.,  7.,  3.,  3.,  1.,  1.,  3.,
          3.,  3.,  1.,  1., 10.,  9.,  1.,  1.,  3.,  3.,  1.,  7.,  7.,  7.,
          3.,  3.,  3.,  7.])]

In [26]:
# Sanity check: display the first batch produced by the test loader.
test_batch_iter = iter(test_data_loader)
next(test_batch_iter)

[tensor([[ 6.0000e+00,  5.3839e+07,  2.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           3.7148e-02,  5.3839e+07,  0.0000e+00,  5.3839e+07,  5.3839e+07,
           5.3839e+07,  5.3839e+07,  0.0000e+00,  5.3839e+07,  5.3839e+07,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  4.0000e+01,
           0.0000e+00,  3.7148e-02,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  2.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           2.7900e+02, -1

### **Variables**

In [27]:
# Size every word vector of a label embedding is expected to have.
word_vector_size = 100

# Input width = length of the first element of the first training sample.
first_sample = train_dataset[0]
input_dim = len(first_sample[0])

# One output unit per known label.
output_dim = len(labels)

In [28]:
# Report the network's input and output widths, one per line.
print(f'Input dim: {input_dim}', f'Output dim: {output_dim}', sep='\n')

Input dim: 78
Output dim: 16


## **Word2Vec**

In [None]:
from src.models.GZSL.word2vec import LabelsEmbeddings, gensim_model

In [33]:
# Wrap the project's gensim model in the label-embedding helper.
labels_embeddings = LabelsEmbeddings(gensim_model=gensim_model)
# Generate the embedding vectors for every entry of `labels`.
# Presumably yields one sequence of word vectors per label — the following
# cells inspect the lengths; verify against LabelsEmbeddings.generate_vectors.
labels_vectors = labels_embeddings.generate_vectors(labels=labels)

In [34]:
# Number of word vectors stored for each label.
list(map(len, labels_vectors))

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [35]:
# Every word vector of every label must have exactly `word_vector_size` entries.
assert all(
    len(word) == word_vector_size
    for vector in labels_vectors
    for word in vector
)


### **Map layer**
The last layer of the model should be a mapping between incident embeddings and label embeddings: we want to map the given input data to the closest Word2Vec label vector. To do this, we have to initialize the weights of this layer and freeze them.

In [37]:
from src.models.GZSL.utils import map_layer_init

# Build the label-embedding map from the per-label word vectors.
# Per the recorded output the result is a float32 tensor of size [16, 100];
# how the word vectors of one label are combined is defined inside
# map_layer_init and is not visible here.
map_layer = map_layer_init(labels_vectors)
# Show the shape and dtype of the map as a quick sanity check.
print(f"Size: {map_layer.size()}\nType: {map_layer.dtype}")


Size: torch.Size([16, 100])
Type: torch.float32


## **Neural Network for Network Data**

### **Training**

In [40]:
from src.models.GZSL.model import NetNet, train_model, evaluate_model

In [41]:
# Constructor arguments for the classifier, gathered in one place.
net_kwargs = dict(
    input_dim=input_dim,
    words_embeddings_dim=word_vector_size,
    output_dim=output_dim,
    labels_vectors=labels_vectors,
)

# Build the network, then move it to the selected device (the call is the
# cell's last expression, so its result is displayed).
model = NetNet(**net_kwargs)
model.to(device)

NetNet(
  (linear1): Linear(in_features=78, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear3): Linear(in_features=256, out_features=100, bias=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear4): Linear(in_features=100, out_features=16, bias=True)
  (ReLU): LeakyReLU(negative_slope=0.01)
)

In [43]:
# Train the classifier on the seen (training) classes.
learning_rate = 1e-06

# NOTE(review): NLLLoss expects log-probabilities as input. The recorded
# losses are negative (about -0.08 to -0.10), which is what NLLLoss yields when
# it is fed plain probabilities — confirm that NetNet.forward ends with
# log_softmax (or switch to the matching loss).
loss_fn = nn.NLLLoss()

# The optimizer is attached to the model object as `model.optim`.
model.optim = optim.Adam(model.parameters(), lr=learning_rate)

# Autograd anomaly detection is intentionally NOT enabled here: it is a
# debugging aid that slows down every forward/backward pass. Enable it
# temporarily with `torch.autograd.set_detect_anomaly(True)` only while
# hunting NaN/inf gradients.
train_model(model=model, epochs=5, data_loader=train_data_loader, loss_fn=loss_fn)

Epoch: 0, loss: -0.0777
Epoch: 1, loss: -0.0861
Epoch: 2, loss: -0.0876
Epoch: 3, loss: -0.0931
Epoch: 4, loss: -0.0956


### **Removing last layer**

In [44]:
# Rebuild the network without its classification head: keep the child modules
# at positions 0-5 and 7+, skipping position 6 (`linear4` in the printed model).
# NOTE(review): nn.Sequential applies the children strictly in this order, so
# the single LeakyReLU runs once at the very end. NetNet.forward is not visible
# here — confirm the truncated model computes the same embedding as the
# trained network does before its last layer.
layers = list(model.children())
zsl_layers = layers[:6] + layers[7:]
zsl_model = nn.Sequential(*zsl_layers)
zsl_model.to(device)

Sequential(
  (0): Linear(in_features=78, out_features=512, bias=True)
  (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Linear(in_features=512, out_features=256, bias=True)
  (3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (4): Linear(in_features=256, out_features=100, bias=True)
  (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (6): LeakyReLU(negative_slope=0.01)
)

In [45]:
# Print the full classifier followed by the truncated ZSL model for comparison.
for network in (model, zsl_model):
    print(network)

NetNet(
  (linear1): Linear(in_features=78, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear3): Linear(in_features=256, out_features=100, bias=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear4): Linear(in_features=100, out_features=16, bias=True)
  (ReLU): LeakyReLU(negative_slope=0.01)
)
Sequential(
  (0): Linear(in_features=78, out_features=512, bias=True)
  (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Linear(in_features=512, out_features=256, bias=True)
  (3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (4): Linear(in_features=256, out_features=100, bias=True)
  (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, a

### **Evaluation**
For this step, we will calculate the Euclidean distance to each label vector and pick the closest one.

In [48]:
# Evaluate the truncated model on the test loader, passing the label-embedding
# map; the helper reports the accuracy (see the recorded output).
# NOTE(review): zsl_model contains BatchNorm layers — confirm evaluate_model
# switches the model to eval mode before running inference.
evaluate_model(zsl_model, test_data_loader, map_layer)

Accuracy of the model: 70.12%
