In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
import numpy as np

## **Load and Prepare Data**

In [2]:
import import_ipynb
import dataset
from dataset import parse_dataset

importing Jupyter notebook from dataset.ipynb


In [3]:
ids_dataset = dataset.DatasetIDS2018()



In [4]:
columns_to_drop = ['Timestamp', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port']

In [5]:
labels = [
    'Benign',
    'Bot',
    'Brute Force -XSS',
    'DDOS attack-HOIC',
    'DDOS attack-LOIC-UDP',
    'DDoS attacks-LOIC-HTTP',
    'DoS attacks-GoldenEye',
    'DoS attacks-Hulk',
    'DoS attacks-Slowloris',
    'FTP-BruteForce',
    'SSH-Bruteforce',
    'Label',
    'Brute Force -Web',
    'DoS attacks-SlowHTTPTest',
    'Infilteration',
    'SQL Injection'                      
]

labels_emb = {}
for i, label in enumerate(labels):
    labels_emb[label] = i

Train data will consists of classes:
- Benign                      
- Bot                                           
- Brute Force -XSS                 
- DDOS attack-HOIC              
- DDOS attack-LOIC-UDP            
- DDoS attacks-LOIC-HTTP      
- DoS attacks-GoldenEye          
- DoS attacks-Hulk                    
- DoS attacks-Slowloris          
- FTP-BruteForce                                
- Label                                                
- SSH-Bruteforce 


In [6]:
train_labels = [
    'Benign',
    'Bot',
    'Brute Force -XSS',
    'DDOS attack-HOIC',
    'DDOS attack-LOIC-UDP',
    'DDoS attacks-LOIC-HTTP',
    'DoS attacks-GoldenEye',
    'DoS attacks-Hulk',
    'DoS attacks-Slowloris',
    'FTP-BruteForce',
    'SSH-Bruteforce',
    'Label',
                          
]
train = ids_dataset.get_data_by_labels(labels=train_labels)
train


Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

In [7]:
train_dataset = parse_dataset(dataset=train, columns_to_drop=columns_to_drop, fixed_type=float, labels_emb=labels_emb, labels_column_name='Label')
train_dataset

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,6.0,141385.0,9.0,7.0,553.0,3773.0,202.0,0.0,61.444444,87.534438,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,6.0,281.0,2.0,1.0,38.0,0.0,38.0,0.0,19.000000,26.870058,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,6.0,279824.0,11.0,15.0,1086.0,10527.0,385.0,0.0,98.727273,129.392497,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,6.0,132.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,6.0,274016.0,9.0,13.0,1285.0,6141.0,517.0,0.0,142.777778,183.887722,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8284223,6.0,104258.0,2.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8284224,6.0,103022.0,2.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8284229,6.0,105445.0,2.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8284232,6.0,733880.0,2.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


Test data will consists of classes:
- Brute Force -Web                 
- DoS attacks-SlowHTTPTest      
- Infilteration                                              
- SQL Injection                     


In [8]:
test_labels = [label for label in labels if label not in train_labels]
test = ids_dataset.get_data_by_labels(labels=test_labels)
test

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
1048672,21,6,16/02/2018 10:12:14,21,1,1,0,0,0,0,...,40,0,0.0,0,0,0.0,0.0,0,0,DoS attacks-SlowHTTPTest
1048673,21,6,16/02/2018 10:12:14,3,1,1,0,0,0,0,...,40,0,0.0,0,0,0.0,0.0,0,0,DoS attacks-SlowHTTPTest
1048674,21,6,16/02/2018 10:12:14,3,1,1,0,0,0,0,...,40,0,0.0,0,0,0.0,0.0,0,0,DoS attacks-SlowHTTPTest
1048675,21,6,16/02/2018 10:12:14,3,1,1,0,0,0,0,...,40,0,0.0,0,0,0.0,0.0,0,0,DoS attacks-SlowHTTPTest
1048676,21,6,16/02/2018 10:12:14,2,1,1,0,0,0,0,...,40,0,0.0,0,0,0.0,0.0,0,0,DoS attacks-SlowHTTPTest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8284248,20000,6,28/02/2018 10:50:04,2,1,1,0,0,0,0,...,24,0,0,0,0,0,0.0,0,0,Infilteration
8284249,23,6,28/02/2018 11:59:12,3,1,1,0,0,0,0,...,24,0,0,0,0,0,0.0,0,0,Infilteration
8284250,425,6,28/02/2018 10:50:04,2,1,1,0,0,0,0,...,24,0,0,0,0,0,0.0,0,0,Infilteration
8284252,23,6,28/02/2018 11:10:50,22,1,1,0,0,0,0,...,24,0,0,0,0,0,0.0,0,0,Infilteration


In [9]:
test_dataset = parse_dataset(dataset=test, columns_to_drop=columns_to_drop, fixed_type=float, labels_emb=labels_emb, labels_column_name='Label')
test_dataset

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
1048672,6,21,1,1,0,0,0,0,0.0,0.0,...,40,0.0,0.0,0,0,0.0,0.0,0,0,13
1048673,6,3,1,1,0,0,0,0,0.0,0.0,...,40,0.0,0.0,0,0,0.0,0.0,0,0,13
1048674,6,3,1,1,0,0,0,0,0.0,0.0,...,40,0.0,0.0,0,0,0.0,0.0,0,0,13
1048675,6,3,1,1,0,0,0,0,0.0,0.0,...,40,0.0,0.0,0,0,0.0,0.0,0,0,13
1048676,6,2,1,1,0,0,0,0,0.0,0.0,...,40,0.0,0.0,0,0,0.0,0.0,0,0,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8284248,6,2,1,1,0,0,0,0,0.0,0.0,...,24,0.0,0.0,0,0,0.0,0.0,0,0,14
8284249,6,3,1,1,0,0,0,0,0.0,0.0,...,24,0.0,0.0,0,0,0.0,0.0,0,0,14
8284250,6,2,1,1,0,0,0,0,0.0,0.0,...,24,0.0,0.0,0,0,0.0,0.0,0,0,14
8284252,6,22,1,1,0,0,0,0,0.0,0.0,...,24,0.0,0.0,0,0,0.0,0.0,0,0,14


In [10]:
test_dataset.to_pickle('../data/pickle/test_dataset.pkl')
train_dataset.to_pickle('../data/pickle/train_dataset.pkl')

In [13]:
test_dataset = data.TensorDataset(torch.from_numpy(test_dataset.values).float(),torch.from_numpy(test_dataset.values[:,-1].astype(float)).float())
train_dataset = data.TensorDataset(torch.from_numpy(train_dataset.values).float(),torch.from_numpy(train_dataset.values[:,-1].astype(float)).float())

In [14]:
train_data_loader = data.DataLoader(train_dataset, batch_size=256, shuffle=True) 
test_data_loader = data.DataLoader(test_dataset, batch_size=256, shuffle=True) 

In [15]:
next(iter(train_data_loader))

[tensor([[6.0000e+00, 4.2344e+06, 5.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 1.0800e+02, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 8.6122e+07, 2.0000e+00,  ..., 8.6122e+07, 8.6122e+07,
          0.0000e+00],
         ...,
         [6.0000e+00, 5.5943e+05, 6.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 1.4100e+02, 2.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [6.0000e+00, 3.5000e+01, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00]]),
 tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  7.,  0.,  0.,  0.,  3.,  0.,
          9.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  7.,  0., 10.,  0.,  0.,  0.,
          0.,  0.,  3.,  3.,  0.,  0.,  0.,  3.,  0.,  0.,  3.,  0.,  0.,  0.,
          0.,  7.,  0.,  0.,  0.,  3.,  0.,  3.,  7.,  0.,  0.,  0.,  3.,  1.,
          0.,  9.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  1.,  9.,  0., 

In [16]:
next(iter(test_data_loader))

[tensor([[  6.,   2.,   1.,  ...,   0.,   0.,  14.],
         [ 17., 508.,   1.,  ...,   0.,   0.,  14.],
         [  6.,  27.,   1.,  ...,   0.,   0.,  14.],
         ...,
         [  6.,   3.,   1.,  ...,   0.,   0.,  13.],
         [  6.,  45.,   1.,  ...,   0.,   0.,  14.],
         [  6.,   2.,   1.,  ...,   0.,   0.,  13.]]),
 tensor([14., 14., 14., 13., 14., 14., 14., 13., 14., 14., 14., 13., 14., 14.,
         13., 12., 13., 14., 14., 14., 14., 13., 13., 13., 14., 13., 14., 14.,
         13., 13., 14., 14., 14., 14., 13., 14., 14., 14., 13., 13., 13., 14.,
         13., 13., 14., 14., 13., 14., 13., 14., 13., 13., 14., 14., 13., 13.,
         14., 14., 14., 14., 13., 13., 13., 13., 13., 13., 13., 13., 14., 14.,
         14., 14., 13., 14., 13., 13., 14., 14., 14., 14., 13., 14., 13., 14.,
         14., 14., 14., 13., 13., 14., 13., 14., 14., 14., 14., 14., 14., 13.,
         14., 13., 13., 14., 14., 14., 13., 14., 14., 14., 13., 14., 13., 13.,
         13., 13., 13., 13., 13., 

### **Variables**

In [None]:
word_vector_size = 100
input_dim = 64
output_dim = 16
learning_rate = 0.001

## **Word2Vec**

In [6]:
import string
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

In [7]:
corpus = []
for label in labels:
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in label.split()]
    corpus.append(words)

In [8]:
gensim_model = Word2Vec(corpus, vector_size=word_vector_size, min_count=1)

In [9]:
class LabelsEmbeddings():
    def __init__(self, gensim_model: Word2Vec):
        self.model = gensim_model

    def fix_vectors_sizes(self, vectors: list) -> list:
        fixed_vectors = []
        max_size = max([len(v) for v in vectors])
        for vector in vectors:
            size_diff = max_size - len(vector)
            vector.extend([[0] * word_vector_size] * size_diff)
            fixed_vectors.append(vector)
        return fixed_vectors

    def generate_vectors(self, labels: dict):
        vectors = []
        for label, _ in labels.items():
            description_vector = []
            words = [word.translate(str.maketrans('', '', string.punctuation)) for word in label.split()]
            for word in words:
                if word in self.model.wv.index_to_key:
                    description_vector.append(self.model.wv[word])
                else:
                    description_vector.append(word_vector_size * [0])
            vectors.append(description_vector)
        return self.fix_vectors_sizes(vectors)

In [10]:
labels_embeddings = LabelsEmbeddings(gensim_model=gensim_model)
labels_vectors = labels_embeddings.generate_vectors(labels=[])
#TODO need to pad?

In [11]:
[len(v) for v in labels_vectors]

[4, 4, 4, 4]

In [13]:
for vector in labels_vectors:
    for word in vector:
        assert len(word) == word_vector_size


### **Map layer**
Last layer of the model should be a map between incident embeddings and labels embeddings - we want to map given input data to the most corresponding Word2Vec label vector. To do this we have to initialize weights of this layer and freeze them.

In [19]:
def map_layer_init(shape: int, w2c_vectors: list) -> torch.Tensor:
    vectors = np.asarray(w2c_vectors, dtype=float)
    return torch.from_numpy(vectors)

## **Neural Network for Network Data**

In [20]:
class NetNet(nn.Module):
    def __init__(self, input_dim: int, words_embeddings_dim: int, output_dim: int, labels_vectors: torch.Tensor):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, 512)
        self.linear2 = nn.Linear(512, 256)
        self.linear3 = nn.Linear(256, words_embeddings_dim)
        self.linear4 = nn.Linear(words_embeddings_dim, output_dim)
        self.ReLU = nn.ReLU()
        self.softmax = nn.Softmax()

        # weight initialize
        self.linear4.weight.data = map_layer_init(shape=words_embeddings_dim, w2c_vectors=labels_vectors)
        # freeze layer weights
        self.linear4.weight.requires_grad = False


    def forward(self, x):
        x = self.ReLU(self.linear1(x))
        x = self.ReLU(self.linear2(x))
        x = self.ReLU(self.linear3(x))
        x = self.softmax(self.linear4(x))
        return x

In [21]:
model = NetNet(
    input_dim=input_dim, 
    words_embeddings_dim=word_vector_size, 
    output_dim=output_dim, 
    labels_vectors=labels_vectors
)

### **Training** 

In [None]:
def train_model(model: NetNet, epochs: int, data_loader: data.DataLoader, loss_fn: nn.MSELoss):
        model.train()

        for epoch in range(epochs):
            for inputs, labels in data_loader:
                outputs = model(inputs)
                loss = loss_fn(outputs, labels)
                loss.backward()
                model.optim.step()
                model.optim.zero_grad()

            if epoch % 10 == 1:
                print(f"Epoch: {epoch}, loss: {loss.item():.3}")

In [None]:
loss_fn = nn.MSELoss()
model.optim = optim.Adam(model.parameters(), lr=learning_rate)
train_model(model=model, epochs=150, data_loader=train_data_loader, loss_fn=loss_fn)

In [22]:
print(model)

NetNet(
  (linear1): Linear(in_features=64, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=100, bias=True)
  (linear4): Linear(in_features=100, out_features=1, bias=True)
  (ReLU): ReLU()
  (softmax): Softmax(dim=None)
)


### **Removing last layer**

In [32]:
zsl_model = nn.Sequential(*(list(model.children())[:3] + list(model.children())[4:-1]))

In [33]:
print(zsl_model)

Sequential(
  (0): Linear(in_features=64, out_features=512, bias=True)
  (1): Linear(in_features=512, out_features=256, bias=True)
  (2): Linear(in_features=256, out_features=100, bias=True)
  (3): ReLU()
)


### **Evaluation** 
For this step we will calculate euclidean distance and find the vector which is the closest.

In [None]:
def find_closest_vector(vector: torch.Tensor, classes: torch.Tensor) -> tuple(str, torch.Tensor):
    min_dist = float('inf')
    min_dist_label = ''
    for label, class_vector in classes:
        dist = torch.cdist(class_vector, vector, p=2)
        dist = dist.squeeze(dim=0)
        if dist < min_dist:
            min_dist = dist
            min_dist_label = label
    return min_dist_label, min_dist

In [None]:
def evaluate_model(model: NetNet, data_loader: data.DataLoader, dataset: data.Dataset):
        model.eval()
        true_predictions, predicitons_amount = 0., 0.
        labels = dataset.targets
             
        with torch.no_grad():
            for inputs, label in data_loader:
                pred_input = model(inputs)
                pred_label, dist = find_closest_vector(vector=pred_input, classes=labels)
                predictions = predictions.squeeze(dim=1)
                true_predictions += int(pred_label == label)
                predicitons_amount += 1

            accuracy = 100.0 * true_predictions / predicitons_amount
        
        print(f"Accuracy of the model: {accuracy:4.2f}%")

## **Training**

In [None]:
model.train_model(epochs=150, data_loader=[])