In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
import numpy as np

## **Load and Prepare Data**

In [45]:
import import_ipynb
import dataset
from dataset import parse_dataset
import os
import pandas as pd

In [46]:
labels = [
    'Benign',
    'Bot',
    'Brute Force -XSS',
    'DDOS attack-HOIC',
    'DDOS attack-LOIC-UDP',
    'DDoS attacks-LOIC-HTTP',
    'DoS attacks-GoldenEye',
    'DoS attacks-Hulk',
    'DoS attacks-Slowloris',
    'FTP-BruteForce',
    'SSH-Bruteforce',
    'Label',
    'Brute Force -Web',
    'DoS attacks-SlowHTTPTest',
    'Infilteration',
    'SQL Injection'                      
]

labels_emb = {}
for i, label in enumerate(labels):
    labels_emb[label] = i

### **Train data classes**
- Benign                      
- Bot                                           
- Brute Force -XSS                 
- DDOS attack-HOIC              
- DDOS attack-LOIC-UDP            
- DDoS attacks-LOIC-HTTP      
- DoS attacks-GoldenEye          
- DoS attacks-Hulk                    
- DoS attacks-Slowloris          
- FTP-BruteForce                                
- Label                                                
- SSH-Bruteforce 


In [47]:
train_labels = [
    'Bot',
    'Brute Force -XSS',
    'DDOS attack-HOIC',
    'DDOS attack-LOIC-UDP',
    'DDoS attacks-LOIC-HTTP',
    'DoS attacks-GoldenEye',
    'DoS attacks-Hulk',
    'DoS attacks-Slowloris',
    'FTP-BruteForce',
    'SSH-Bruteforce',
    'Label',
                          
]


### **Test data will consists of classes**
- Brute Force -Web                 
- DoS attacks-SlowHTTPTest      
- Infilteration                                              
- SQL Injection                     


In [48]:
test_labels = [label for label in labels if label not in train_labels]

In [49]:
columns_to_drop = ['Timestamp', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port']

In [50]:
PICKLE_PATH = '../data/pickle/'
TRAIN_PICKLE = 'train_dataset.pkl'
TEST_PICKLE = 'test_dataset.pkl'

In [51]:
train_pickle_path = os.path.join(PICKLE_PATH, TRAIN_PICKLE)
test_pickle_path = os.path.join(PICKLE_PATH, TEST_PICKLE)

if TRAIN_PICKLE not in os.listdir(PICKLE_PATH) and TEST_PICKLE not in os.listdir(PICKLE_PATH):
    ids_dataset = dataset.DatasetIDS2018(csv_file_name='../data/raw/small_merge_data.csv')
    train = ids_dataset.get_data_by_labels(labels=train_labels)
    train_dataset = parse_dataset(dataset=train, columns_to_drop=columns_to_drop, fixed_type=float, labels_emb=labels_emb, labels_column_name='Label')
    train_dataset.to_pickle(train_pickle_path)
    test = ids_dataset.get_data_by_labels(labels=test_labels)
    test_dataset = parse_dataset(dataset=test, columns_to_drop=columns_to_drop, fixed_type=float, labels_emb=labels_emb, labels_column_name='Label')
    test_dataset.to_pickle(test_pickle_path)
else:
    train_dataset = pd.read_pickle(train_pickle_path)
    test_dataset = pd.read_pickle(test_pickle_path)



In [52]:
test_dataset = data.TensorDataset(torch.from_numpy(test_dataset.values).float(),torch.from_numpy(test_dataset.values[:,-1].astype(float)).float())
train_dataset = data.TensorDataset(torch.from_numpy(train_dataset.values).float(),torch.from_numpy(train_dataset.values[:,-1].astype(float)).float())

In [53]:
batch_size = 32

In [54]:
train_data_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 
test_data_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True) 

In [55]:
next(iter(train_data_loader))

[tensor([[6.0000e+00, 5.9130e+03, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00],
         [6.0000e+00, 2.3201e+04, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00],
         [6.0000e+00, 3.9921e+05, 2.2000e+01,  ..., 0.0000e+00, 0.0000e+00,
          1.0000e+01],
         ...,
         [6.0000e+00, 1.4908e+04, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00],
         [6.0000e+00, 7.8260e+03, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          7.0000e+00],
         [6.0000e+00, 8.8290e+03, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00]]),
 tensor([ 3.,  3., 10.,  1.,  3.,  1.,  1.,  3.,  7.,  1.,  3.,  3.,  1.,  9.,
          3.,  3.,  1.,  9.,  7.,  7., 10.,  3.,  7., 10.,  7.,  3.,  7.,  3.,
          1.,  3.,  7.,  3.])]

In [56]:
next(iter(test_data_loader))

[tensor([[6.0000e+00, 3.8700e+02, 3.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 4.7719e+06, 5.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 6.1075e+07, 1.4000e+01,  ..., 1.0012e+07, 1.0008e+07,
          0.0000e+00],
         ...,
         [6.0000e+00, 1.1525e+08, 2.1000e+01,  ..., 1.0015e+07, 1.0001e+07,
          0.0000e+00],
         [6.0000e+00, 1.0000e+01, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 4.3229e+06, 5.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00]]),
 tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 14.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.])]

### **Variables**

In [57]:
word_vector_size = 100
input_dim = len(train_dataset[0][0])
output_dim = len(labels)

In [58]:
input_dim

78

## **Word2Vec**

In [59]:
import string
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

In [60]:
corpus = []
for label in labels:
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in label.split()]
    corpus.append(words)

In [61]:
gensim_model = Word2Vec(corpus, vector_size=word_vector_size, min_count=1)

In [62]:
class LabelsEmbeddings():
    def __init__(self, gensim_model: Word2Vec):
        self.model = gensim_model

    def fix_vectors_sizes(self, vectors: list) -> list:
        fixed_vectors = []
        max_size = max([len(v) for v in vectors])
        for vector in vectors:
            size_diff = max_size - len(vector)
            vector.extend([[0] * word_vector_size] * size_diff)
            fixed_vectors.append(vector)
        return fixed_vectors

    def generate_vectors(self, labels: dict):
        vectors = []
        for label in labels:
            description_vector = []
            words = [word.translate(str.maketrans('', '', string.punctuation)) for word in label.split()]
            for word in words:
                if word in self.model.wv.index_to_key:
                    description_vector.append(self.model.wv[word])
                else:
                    description_vector.append(word_vector_size * [0])
            vectors.append(description_vector)
        return self.fix_vectors_sizes(vectors)

In [63]:
labels_embeddings = LabelsEmbeddings(gensim_model=gensim_model)
labels_vectors = labels_embeddings.generate_vectors(labels=labels)

In [64]:
[len(v) for v in labels_vectors]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [65]:
for vector in labels_vectors:
    for word in vector:
        assert len(word) == word_vector_size


In [66]:
labels_vectors

[[array([ 1.3325238e-03,  6.5408563e-03,  9.9846032e-03,  9.0624550e-03,
         -8.0153607e-03,  6.4913859e-03, -5.7147373e-03, -9.7157480e-04,
          4.8282265e-04,  6.5819337e-03,  4.4701435e-03,  4.6037268e-03,
          9.4831241e-03,  3.8266421e-04, -6.0387133e-03, -6.3301004e-03,
          6.4317896e-03, -5.2425815e-03, -2.8498126e-03,  4.0752841e-03,
         -2.2902358e-03, -6.0252128e-03, -2.3236549e-03,  1.2069190e-03,
          2.1833598e-03,  6.0837734e-03, -5.2140011e-03,  3.0779613e-03,
          7.2406759e-03,  2.1951138e-03,  5.3974902e-03, -4.8453333e-03,
          6.1526122e-03, -7.6012816e-03,  3.4928655e-03, -9.3218042e-03,
         -2.6043104e-03, -9.0731988e-03, -1.5882683e-03, -5.3647519e-03,
         -3.9439187e-03,  1.1536527e-03,  2.8003477e-03, -1.5263951e-03,
         -8.1705153e-03, -5.9180222e-03,  8.1929564e-04, -3.9462578e-03,
         -9.4304476e-03, -7.7497482e-04,  6.6332687e-03,  5.9788441e-03,
         -9.9172592e-03,  3.1185830e-03, -5.9873010

### **Map layer**
Last layer of the model should be a map between incident embeddings and labels embeddings - we want to map given input data to the most corresponding Word2Vec label vector. To do this we have to initialize weights of this layer and freeze them.

In [67]:
def map_layer_init(w2c_vectors: list) -> torch.Tensor:
    vectors = np.asarray(w2c_vectors, dtype=float)
    vectors = torch.from_numpy(vectors)
    return vectors[:, -1, :].to(torch.float32)

In [68]:
map_layer = map_layer_init(labels_vectors)
print(f"Size: {map_layer.size()}\nType: {map_layer.dtype}")


Size: torch.Size([16, 100])
Type: torch.float32


## **Neural Network for Network Data**

In [109]:
class PrintLayer(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        print("torch.isfinite(x).all(): {}, min. {:.5f}, max. {:.5f}".format(
            torch.isfinite(x).all(), x.min(), x.max()))
        return x

In [110]:
class NetNet(nn.Module):
    def __init__(self, input_dim: int, words_embeddings_dim: int, output_dim: int, labels_vectors: torch.Tensor):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.linear2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.linear3 = nn.Linear(256, words_embeddings_dim)
        self.bn3 = nn.BatchNorm1d(words_embeddings_dim)
        self.linear4 = nn.Linear(words_embeddings_dim, output_dim)
        self.ReLU = nn.LeakyReLU()
        self.print = PrintLayer()

        # weight initialize
        self.linear4.weight.data = map_layer_init(w2c_vectors=labels_vectors)
        # freeze layer weights
        self.linear4.weight.requires_grad = False

        self.prev_x = torch.empty([])
        

    def forward(self, x):
        x = self.ReLU(self.bn1(self.linear1(x)))
        x = self.ReLU(self.bn2(self.linear2(x)))
        x = self.ReLU(self.bn3(self.linear3(x)))
        x = self.linear4(x)
        x = F.softmax(x, dim=1)
        return x

In [104]:
def print_model_layer_gradients(model):
    for name, layer in model.named_modules():
        if len(list(layer.named_modules())) == 1 and name != 'ReLU':
            print(f"Layer: {name}\nGradients: {layer.weight.grad}")

### **Training** 

In [105]:
from alive_progress import alive_bar

def train_model(model: NetNet, epochs: int, data_loader: data.DataLoader, loss_fn: nn.MSELoss, additional_eps=1e-06):
        model.train()
        with alive_bar(epochs) as bar:
            for epoch in range(epochs):
                for inputs, labels in data_loader:
                    if not torch.isfinite(inputs).all():
                         inputs = torch.nan_to_num(inputs) # removing nan values
                    if additional_eps:
                         inputs = torch.add(inputs, additional_eps)
                    
                    outputs = model(inputs)
                    loss = loss_fn(outputs, labels.to(torch.long))
                    loss.backward(retain_graph=True)
                    model.optim.step()
                    model.optim.zero_grad()
                    '''
                    try:
                        outputs = model(inputs)
                        previous_inputs = inputs
                        loss = loss_fn(outputs, labels.to(torch.long))
                        loss.backward(retain_graph=True)
                        model.optim.step()
                        model.optim.zero_grad()
                    except Exception as e:
                         print_model_layer_gradients(model)
                         print(f'Invalid input?\nInput: {inputs}')
                         print(f'Previous inputs: {previous_inputs}')
                         raise ValueError(str(e))
                    '''

                print(f"Epoch: {epoch}, loss: {loss.item():.3}")               
                print(bar())

In [106]:
model = NetNet(
    input_dim=input_dim, 
    words_embeddings_dim=word_vector_size, 
    output_dim=output_dim, 
    labels_vectors=labels_vectors
)

In [107]:
learning_rate = 1e-03

In [108]:
loss_fn = nn.NLLLoss()
model.optim = optim.Adam(model.parameters(), lr=learning_rate)
torch.autograd.set_detect_anomaly(True)
train_model(model=model, epochs=5, data_loader=train_data_loader, loss_fn=loss_fn)

on 0: Epoch: 0, loss: -0.308
on 1: None
on 1: Epoch: 1, loss: -0.462
on 2: None
on 2: Epoch: 2, loss: -0.308
on 3: None
on 3: Epoch: 3, loss: -0.231
on 4: None
on 4: Epoch: 4, loss: -0.538
on 5: None
|████████████████████████████████████████| 5/5 [100%] in 2:04:57.7 (0.00/s) 


### **Removing last layer**

In [108]:
zsl_model = nn.Sequential(*(list(model.children())[:3] + list(model.children())[4:]))

In [109]:
print(model)
print(zsl_model)

NetNet(
  (linear1): Linear(in_features=78, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=100, bias=True)
  (linear4): Linear(in_features=100, out_features=16, bias=True)
  (ReLU): ReLU()
)
Sequential(
  (0): Linear(in_features=78, out_features=512, bias=True)
  (1): Linear(in_features=512, out_features=256, bias=True)
  (2): Linear(in_features=256, out_features=100, bias=True)
  (3): ReLU()
)


### **Evaluation** 
For this step we will calculate euclidean distance and find the vector which is the closest.

In [None]:
def find_closest_vector(vector: torch.Tensor, classes: torch.Tensor) -> tuple(str, torch.Tensor):
    min_dist = float('inf')
    min_dist_label = ''
    for label, class_vector in classes:
        dist = torch.cdist(class_vector, vector, p=2)
        dist = dist.squeeze(dim=0)
        if dist < min_dist:
            min_dist = dist
            min_dist_label = label
    return min_dist_label, min_dist

In [None]:
def evaluate_model(model: NetNet, data_loader: data.DataLoader, dataset: data.Dataset):
        model.eval()
        true_predictions, predicitons_amount = 0., 0.
        labels = dataset.targets
             
        with torch.no_grad():
            for inputs, label in data_loader:
                pred_input = model(inputs)
                pred_label, dist = find_closest_vector(vector=pred_input, classes=labels)
                predictions = predictions.squeeze(dim=1)
                true_predictions += int(pred_label == label)
                predicitons_amount += 1

            accuracy = 100.0 * true_predictions / predicitons_amount
        
        print(f"Accuracy of the model: {accuracy:4.2f}%")

## **Training**

In [None]:
model.train_model(epochs=150, data_loader=[])