In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
import numpy as np

## **Load and Prepare Data**

In [2]:
import import_ipynb
import dataset
from dataset import parse_dataset
import os
import pandas as pd

importing Jupyter notebook from dataset.ipynb


In [3]:
labels = [
    'Benign',
    'Bot',
    'Brute Force -XSS',
    'DDOS attack-HOIC',
    'DDOS attack-LOIC-UDP',
    'DDoS attacks-LOIC-HTTP',
    'DoS attacks-GoldenEye',
    'DoS attacks-Hulk',
    'DoS attacks-Slowloris',
    'FTP-BruteForce',
    'SSH-Bruteforce',
    'Label',
    'Brute Force -Web',
    'DoS attacks-SlowHTTPTest',
    'Infilteration',
    'SQL Injection'                      
]

labels_emb = {}
for i, label in enumerate(labels):
    labels_emb[label] = i

### **Train data classes**
- Benign                      
- Bot                                           
- Brute Force -XSS                 
- DDOS attack-HOIC              
- DDOS attack-LOIC-UDP            
- DDoS attacks-LOIC-HTTP      
- DoS attacks-GoldenEye          
- DoS attacks-Hulk                    
- DoS attacks-Slowloris          
- FTP-BruteForce                                
- Label                                                
- SSH-Bruteforce 


In [4]:
train_labels = [
    'Bot',
    'Brute Force -XSS',
    'DDOS attack-HOIC',
    'DDOS attack-LOIC-UDP',
    'DDoS attacks-LOIC-HTTP',
    'DoS attacks-GoldenEye',
    'DoS attacks-Hulk',
    'DoS attacks-Slowloris',
    'FTP-BruteForce',
    'SSH-Bruteforce',
    'Label',
                          
]


### **Test data will consists of classes**
- Brute Force -Web                 
- DoS attacks-SlowHTTPTest      
- Infilteration                                              
- SQL Injection                     


In [5]:
test_labels = [label for label in labels if label not in train_labels]

In [6]:
columns_to_drop = ['Timestamp', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port']

In [7]:
PICKLE_PATH = '../data/pickle/'
TRAIN_PICKLE = 'train_dataset.pkl'
TEST_PICKLE = 'test_dataset.pkl'

In [8]:
train_pickle_path = os.path.join(PICKLE_PATH, TRAIN_PICKLE)
test_pickle_path = os.path.join(PICKLE_PATH, TEST_PICKLE)

if TRAIN_PICKLE not in os.listdir(PICKLE_PATH) and TEST_PICKLE not in os.listdir(PICKLE_PATH):
    ids_dataset = dataset.DatasetIDS2018(csv_file_name='../data/raw/small_merge_data.csv')
    train = ids_dataset.get_data_by_labels(labels=train_labels)
    train_dataset = parse_dataset(dataset=train, columns_to_drop=columns_to_drop, fixed_type=float, labels_emb=labels_emb, labels_column_name='Label')
    train_dataset.to_pickle(train_pickle_path)
    test = ids_dataset.get_data_by_labels(labels=test_labels)
    test_dataset = parse_dataset(dataset=test, columns_to_drop=columns_to_drop, fixed_type=float, labels_emb=labels_emb, labels_column_name='Label')
    test_dataset.to_pickle(test_pickle_path)
else:
    train_dataset = pd.read_pickle(train_pickle_path)
    test_dataset = pd.read_pickle(test_pickle_path)





In [9]:
test_dataset = data.TensorDataset(torch.from_numpy(test_dataset.values).float(),torch.from_numpy(test_dataset.values[:,-1].astype(float)).float())
train_dataset = data.TensorDataset(torch.from_numpy(train_dataset.values).float(),torch.from_numpy(train_dataset.values[:,-1].astype(float)).float())

In [10]:
batch_size = 32

In [11]:
train_data_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 
test_data_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True) 

In [12]:
next(iter(train_data_loader))

[tensor([[6.0000e+00, 1.3970e+03, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00],
         [6.0000e+00, 5.1800e+02, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          1.0000e+00],
         [6.0000e+00, 3.0398e+06, 3.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          8.0000e+00],
         ...,
         [6.0000e+00, 1.3403e+04, 3.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          3.0000e+00],
         [6.0000e+00, 1.3099e+04, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          7.0000e+00],
         [6.0000e+00, 4.4244e+04, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          7.0000e+00]]),
 tensor([ 3.,  1.,  8.,  7.,  1.,  1.,  3.,  9.,  9.,  7.,  1.,  3.,  9.,  3.,
          3.,  3.,  1.,  1.,  6.,  7.,  7.,  3., 10.,  1.,  6.,  7.,  6.,  9.,
          7.,  3.,  7.,  7.])]

In [13]:
next(iter(test_data_loader))

[tensor([[6.0000e+00, 2.1464e+04, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 5.8772e+05, 5.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 2.5160e+06, 8.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         ...,
         [6.0000e+00, 2.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          1.3000e+01],
         [6.0000e+00, 3.3000e+01, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 7.6000e+01, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00]]),
 tensor([ 0.,  0.,  0.,  0.,  0., 14.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 13.,  0., 13.,
          0., 13.,  0.,  0.])]

### **Variables**

In [14]:
word_vector_size = 100
input_dim = len(train_dataset[0][0])
output_dim = len(labels)
learning_rate = 0.001

In [15]:
input_dim

78

## **Word2Vec**

In [16]:
import string
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

In [17]:
corpus = []
for label in labels:
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in label.split()]
    corpus.append(words)

In [18]:
gensim_model = Word2Vec(corpus, vector_size=word_vector_size, min_count=1)

In [19]:
class LabelsEmbeddings():
    def __init__(self, gensim_model: Word2Vec):
        self.model = gensim_model

    def fix_vectors_sizes(self, vectors: list) -> list:
        fixed_vectors = []
        max_size = max([len(v) for v in vectors])
        for vector in vectors:
            size_diff = max_size - len(vector)
            vector.extend([[0] * word_vector_size] * size_diff)
            fixed_vectors.append(vector)
        return fixed_vectors

    def generate_vectors(self, labels: dict):
        vectors = []
        for label in labels:
            description_vector = []
            words = [word.translate(str.maketrans('', '', string.punctuation)) for word in label.split()]
            for word in words:
                if word in self.model.wv.index_to_key:
                    description_vector.append(self.model.wv[word])
                else:
                    description_vector.append(word_vector_size * [0])
            vectors.append(description_vector)
        return self.fix_vectors_sizes(vectors)

In [20]:
labels_embeddings = LabelsEmbeddings(gensim_model=gensim_model)
labels_vectors = labels_embeddings.generate_vectors(labels=labels)

In [21]:
[len(v) for v in labels_vectors]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [22]:
for vector in labels_vectors:
    for word in vector:
        assert len(word) == word_vector_size


### **Map layer**
Last layer of the model should be a map between incident embeddings and labels embeddings - we want to map given input data to the most corresponding Word2Vec label vector. To do this we have to initialize weights of this layer and freeze them.

In [23]:
def map_layer_init(w2c_vectors: list) -> torch.Tensor:
    vectors = np.asarray(w2c_vectors, dtype=float)
    vectors = torch.from_numpy(vectors)
    return vectors[:, -1, :].to(torch.float32)

In [24]:
map_layer = map_layer_init(labels_vectors)
print(f"Size: {map_layer.size()}\nType: {map_layer.dtype}")


Size: torch.Size([16, 100])
Type: torch.float32


## **Neural Network for Network Data**

In [25]:
class NetNet(nn.Module):
    def __init__(self, input_dim: int, words_embeddings_dim: int, output_dim: int, labels_vectors: torch.Tensor):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, 512)
        self.linear2 = nn.Linear(512, 256)
        self.linear3 = nn.Linear(256, words_embeddings_dim)
        self.linear4 = nn.Linear(words_embeddings_dim, output_dim)
        self.ReLU = nn.ReLU()

        # weight initialize
        self.linear4.weight.data = map_layer_init(w2c_vectors=labels_vectors)
        # freeze layer weights
        self.linear4.weight.requires_grad = False


    def forward(self, x):
        x = self.ReLU(self.linear1(x))
        x = self.ReLU(self.linear2(x))
        x = self.ReLU(self.linear3(x))
        x = self.ReLU(self.linear4(x))
        x = F.softmax(x, dim=1)
        return x

In [26]:
model = NetNet(
    input_dim=input_dim, 
    words_embeddings_dim=word_vector_size, 
    output_dim=output_dim, 
    labels_vectors=labels_vectors
)

In [27]:
print(model)

NetNet(
  (linear1): Linear(in_features=78, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=100, bias=True)
  (linear4): Linear(in_features=100, out_features=16, bias=True)
  (ReLU): ReLU()
)


### **Training** 

In [28]:
from alive_progress import alive_bar

def train_model(model: NetNet, epochs: int, data_loader: data.DataLoader, loss_fn: nn.MSELoss):
        model.train()
        with alive_bar(epochs) as bar:
            for epoch in range(epochs):
                for inputs, labels in data_loader:
                    outputs = model(inputs)
                    loss = loss_fn(outputs, labels.to(torch.long))
                    loss.backward()
                    model.optim.step()
                    model.optim.zero_grad()

                print(f"Epoch: {epoch}, loss: {loss.item():.3}")               
                bar()

In [29]:
loss_fn = nn.CrossEntropyLoss()
model.optim = optim.Adam(model.parameters(), lr=learning_rate)
train_model(model=model, epochs=10, data_loader=train_data_loader, loss_fn=loss_fn)

on 0: Epoch: 0, loss: nan
|████⚠︎                                   | (!) 1/10 [10%] in 3:55.9 (0.00/s) 


KeyboardInterrupt: 

### **Removing last layer**

In [108]:
zsl_model = nn.Sequential(*(list(model.children())[:3] + list(model.children())[4:]))

In [109]:
print(model)
print(zsl_model)

NetNet(
  (linear1): Linear(in_features=78, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=100, bias=True)
  (linear4): Linear(in_features=100, out_features=16, bias=True)
  (ReLU): ReLU()
)
Sequential(
  (0): Linear(in_features=78, out_features=512, bias=True)
  (1): Linear(in_features=512, out_features=256, bias=True)
  (2): Linear(in_features=256, out_features=100, bias=True)
  (3): ReLU()
)


### **Evaluation** 
For this step we will calculate euclidean distance and find the vector which is the closest.

In [None]:
def find_closest_vector(vector: torch.Tensor, classes: torch.Tensor) -> tuple(str, torch.Tensor):
    min_dist = float('inf')
    min_dist_label = ''
    for label, class_vector in classes:
        dist = torch.cdist(class_vector, vector, p=2)
        dist = dist.squeeze(dim=0)
        if dist < min_dist:
            min_dist = dist
            min_dist_label = label
    return min_dist_label, min_dist

In [None]:
def evaluate_model(model: NetNet, data_loader: data.DataLoader, dataset: data.Dataset):
        model.eval()
        true_predictions, predicitons_amount = 0., 0.
        labels = dataset.targets
             
        with torch.no_grad():
            for inputs, label in data_loader:
                pred_input = model(inputs)
                pred_label, dist = find_closest_vector(vector=pred_input, classes=labels)
                predictions = predictions.squeeze(dim=1)
                true_predictions += int(pred_label == label)
                predicitons_amount += 1

            accuracy = 100.0 * true_predictions / predicitons_amount
        
        print(f"Accuracy of the model: {accuracy:4.2f}%")

## **Training**

In [None]:
model.train_model(epochs=150, data_loader=[])