In [1]:
import os
from pathlib import Path
import torch
import re
import random
import tqdm
from torch.utils.data import Dataset, DataLoader
import itertools
import math
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
from collections import Counter
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
## for cut length
from itertools import zip_longest

In [2]:
# Google Colab helper: report whether a GPU is attached to this runtime.
# `!nvidia-smi` is IPython shell-capture syntax (not plain Python); the
# command's output is returned as a list of lines.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in the command output is treated as "no GPU".
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Jul  9 12:31:49 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4080        Off | 00000000:01:00.0  On |                  N/A |
|  0%   42C    P8              13W / 320W |    963MiB / 16376MiB |     16%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
### absolute positional encoding
class PositionalEmbedding(torch.nn.Module):
    """Fixed (non-learned) sinusoidal positional-encoding table.

    For position ``pos`` and every even column ``i`` the table holds

        pe[pos, i]     = sin(pos / 10000 ** (2 * i / d_model))
        pe[pos, i + 1] = cos(pos / 10000 ** (2 * (i + 1) / d_model))

    which is the exact formula this notebook has always used (note that it
    differs from the textbook one, where the exponent uses the pair index).

    :param d_model: width of the encoding (number of columns); may be odd,
        in which case the last column only receives the sine term
    :param max_len: number of positions pre-computed in the table
    """

    def __init__(self, d_model, max_len=2000):
        super().__init__()

        # Build in float64 so the values agree with a scalar math.sin/math.cos
        # evaluation, then store as float32 like the original table.
        positions = torch.arange(max_len, dtype=torch.float64).unsqueeze(1)  # (max_len, 1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float64)         # i = 0, 2, 4, ...

        sin_divisor = torch.pow(10000.0, (2 * even_dims) / d_model)
        cos_divisor = torch.pow(10000.0, (2 * (even_dims + 1)) / d_model)

        pe = torch.zeros(max_len, d_model, dtype=torch.float64)
        pe[:, 0::2] = torch.sin(positions / sin_divisor)
        # With an odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(positions / cos_divisor[: d_model // 2])

        # Registered as a buffer: it follows .to(device) / state_dict but is
        # never a trainable parameter, so no requires_grad handling is needed.
        # Leading dimension of 1 lets it broadcast over the batch.
        self.register_buffer('pe', pe.float().unsqueeze(0))

    def forward(self, x):
        """Return the encodings for the positions present in ``x``.

        :param x: tensor whose dimension 1 is the sequence length, e.g.
            ``(batch, seq_len)``; only its shape is used
        :return: tensor of shape ``(1, min(seq_len, max_len), d_model)``
        """
        if x.dim() < 2:
            # No sequence axis to size against: keep the historical behaviour
            # of returning the whole table.
            return self.pe
        return self.pe[:, :x.size(1)]



## embeddings : positional + segmentation + packet embedding
class NetformerEmbedding(torch.nn.Module):
    """
    Input embedding for Netformer: the element-wise sum of three terms,
    followed by dropout.
        1. token embedding    : learned lookup table over ``feature_size`` ids
        2. positional encoding: fixed sin/cos table (PositionalEmbedding)
        3. segment embedding  : learned lookup table with 2 rows

    NOTE(review): the segment table has exactly 2 rows, so segment ids must
    be 0 or 1; an id of 2 would be out of range. Confirm against the data.
    """

    def __init__(self, feature_size, embed_size, seq_len=2000, dropout=0.1):
        """
        :param feature_size: number of distinct token ids (rows of the token table)
        :param embed_size: width of every embedding vector
        :param seq_len: number of positions held by the positional table
        :param dropout: dropout rate applied to the summed embedding
        """
        super().__init__()
        self.embed_size = embed_size
        # (batch, seq_len) --> (batch, seq_len, embed_size)
        # No padding_idx is configured, so every row of the tables is trainable.
        self.token = torch.nn.Embedding(feature_size, embed_size)
        self.segment = torch.nn.Embedding(2, embed_size)
        self.position = PositionalEmbedding(d_model=embed_size, max_len=seq_len)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, sequence, segment_label):
        """
        :param sequence: token ids; cast to long before the lookup
        :param segment_label: segment ids; cast to long before the lookup
        :return: dropout(token + position + segment)
        """
        token_ids = sequence.to(torch.long)
        segment_ids = segment_label.to(torch.long)

        token_part = self.token(token_ids)
        position_part = self.position(token_ids)
        segment_part = self.segment(segment_ids)

        summed = token_part + position_part + segment_part
        return self.dropout(summed)

In [4]:
### multi-head attention
class MultiHeadedAttention(torch.nn.Module):
    """Scaled dot-product attention computed over several heads in parallel.

    :param heads: number of attention heads; must divide ``d_model``
    :param d_model: model width (input and output feature size)
    :param dropout: dropout rate applied to the attention weights
    """

    def __init__(self, heads, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()

        assert d_model % heads == 0
        self.d_k = d_model // heads  # per-head width
        self.heads = heads
        self.dropout = torch.nn.Dropout(dropout)

        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)
        self.output_linear = torch.nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        # (batch, len, d_model) --> (batch, len, h, d_k) --> (batch, h, len, d_k)
        batch = projected.shape[0]
        return projected.view(batch, -1, self.heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask):
        """
        query, key, value of shape: (batch_size, max_len, d_model)
        mask: anything broadcastable to (batch_size, h, max_len, max_len);
              entries equal to 0 are excluded from attention
        returns: (batch_size, max_len, d_model)
        """
        q = self._split_heads(self.query(query))
        k = self._split_heads(self.key(key))
        v = self._split_heads(self.value(value))

        # (batch, h, len, d_k) x (batch, h, d_k, len) --> (batch, h, len, len)
        scale = math.sqrt(q.size(-1))
        scores = torch.matmul(q, k.transpose(-2, -1)) / scale

        # Masked positions get a very negative score, hence ~0 softmax weight.
        scores = scores.masked_fill(mask == 0, -1e9)

        attention = self.dropout(F.softmax(scores, dim=-1))

        # (batch, h, len, len) x (batch, h, len, d_k) --> (batch, h, len, d_k)
        context = torch.matmul(attention, v)

        # (batch, h, len, d_k) --> (batch, len, h, d_k) --> (batch, len, d_model)
        batch = context.shape[0]
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.heads * self.d_k)

        return self.output_linear(merged)

class FeedForward(torch.nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Dropout -> Linear.

    :param d_model: input and output feature size
    :param middle_dim: width of the hidden layer
    :param dropout: dropout rate applied to the hidden activation
    """

    def __init__(self, d_model, middle_dim=500, dropout=0.1):
        super(FeedForward, self).__init__()

        self.fc1 = torch.nn.Linear(d_model, middle_dim)
        self.fc2 = torch.nn.Linear(middle_dim, d_model)
        self.dropout = torch.nn.Dropout(dropout)
        self.activation = torch.nn.GELU()

    def forward(self, x):
        """Map ``(..., d_model)`` to ``(..., d_model)`` through the hidden layer."""
        hidden = self.fc1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

class EncoderLayer(torch.nn.Module):
    """One transformer encoder block: self-attention and a feed-forward
    network, each followed by dropout, a residual connection and LayerNorm.

    :param d_model: model width
    :param heads: number of attention heads
    :param feed_forward_hidden: hidden width of the feed-forward network
    :param dropout: dropout rate used on both residual branches AND inside
        the attention / feed-forward sub-modules
    """

    def __init__(
        self,
        d_model=10,
        heads=10,
        feed_forward_hidden=10 * 4,
        dropout=0.1
        ):
        super(EncoderLayer, self).__init__()
        # A single LayerNorm instance (one set of affine parameters) is applied
        # after both sub-layers; kept as-is so existing checkpoints still load.
        self.layernorm = torch.nn.LayerNorm(d_model)
        # Forward the configured rate; previously the sub-modules silently
        # stayed at their own default of 0.1 whatever `dropout` was.
        self.self_multihead = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.feed_forward = FeedForward(d_model, middle_dim=feed_forward_hidden, dropout=dropout)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, embeddings, mask):
        """
        :param embeddings: (batch_size, max_len, d_model)
        :param mask: attention mask handed unchanged to the attention module
        :return: (batch_size, max_len, d_model)
        """
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        # residual connection + norm around the attention sub-layer
        interacted = self.layernorm(interacted + embeddings)
        # residual connection + norm around the feed-forward sub-layer
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded

In [5]:
class NetFormer(torch.nn.Module):
    """
    Netformer model : Native transformer for network traffic.

    Embeds the input ids (token + position + segment) and runs the result
    through a stack of encoder layers. No classification head is applied
    here; the output is the encoded sequence.
    """

    def __init__(self, feature_size = 1030, d_model=10, n_layers=2, heads=2, dropout=0.1, number_of_class=4,
                 seq_len=2000):
        """
        :param feature_size: number of distinct token ids (total bins)
        :param d_model: model width (embedding size and encoder hidden size)
        :param n_layers: number of encoder layers
        :param heads: number of attention heads per layer
        :param dropout: dropout rate handed to every encoder layer
        :param number_of_class: stored on the instance only; not used by this module
        :param seq_len: number of positions in the positional table
            (default keeps the previously hard-coded 2000)
        """

        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.heads = heads
        self.number_of_class = number_of_class
        # feed-forward hidden size is 4 * model width
        self.feed_forward_hidden = d_model * 4
        # embedding for Netformer: sum of positional, segment and token embeddings
        # (its dropout stays at NetformerEmbedding's own default, as before)
        self.embedding = NetformerEmbedding(feature_size=feature_size, embed_size=d_model, seq_len=seq_len)

        # stack of transformer encoder blocks
        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderLayer(d_model, heads, self.feed_forward_hidden, dropout) for _ in range(n_layers)])

    def forward(self, x, segment_info):
        """
        :param x: token ids, (batch_size, seq_len)
        :param segment_info: segment ids, (batch_size, seq_len); positions
            whose id is not > 0 are masked out as attention keys
        :return: encoded sequence, (batch_size, seq_len, d_model)
        """
        assert segment_info.dim() == 2, "segment_info should have 2 dimensions: [batch_size, seq_len]"

        # Key-padding mask of shape (batch_size, 1, 1, seq_len). It broadcasts
        # over heads and query positions inside the attention, which yields the
        # same result as an explicit (batch_size, 1, seq_len, seq_len) copy
        # without allocating seq_len times the memory.
        mask = (segment_info > 0).unsqueeze(1).unsqueeze(2)

        # embedding the indexed sequence to a sequence of vectors
        x = self.embedding(x, segment_info)

        # Call the modules (not .forward) so registered hooks are honoured.
        for encoder in self.encoder_blocks:
            x = encoder(x, mask)
        return x

In [6]:
class Classification_head(torch.nn.Module):
    """
    Classification head applied to the encoder output.

    Each position's hidden vector is reduced to one scalar, then the
    resulting length-``seq_len`` vector is mapped to class log-probabilities.

    :param hidden: width of the encoder output (last dimension of the input)
    :param num_of_classes: number of output classes
    :param drop: accepted for interface compatibility; currently unused
    :param seq_len: sequence length the head expects (default keeps the
        previously hard-coded 2000)
    """

    def __init__(self, hidden, num_of_classes, drop=0., seq_len=2000):
        super().__init__()
        self.layer1 = torch.nn.Linear(hidden, 1)
        self.mlp_head = torch.nn.Linear(seq_len, num_of_classes)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """
        :param x: (batch_size, seq_len, hidden)
        :return: log-probabilities, (batch_size, num_of_classes)
        """
        # (batch, seq_len, hidden) --> (batch, seq_len, 1)
        x = self.layer1(x)
        # Drop only the trailing singleton. A bare squeeze() would also drop
        # the batch dimension when batch_size == 1 and return a 1-D result.
        x = x.squeeze(-1)
        # (batch, seq_len) --> (batch, num_of_classes)
        x = self.mlp_head(x)
        return self.softmax(x)

In [7]:
class NetFormerLM(torch.nn.Module):
    """
    Netformer encoder followed by a sequence-classification head.

    The wrapped encoder produces one vector per position; the head turns
    that into one score vector per input sequence.
    """

    def __init__(self, netformer: NetFormer, number_of_class):
        """
        :param netformer: encoder to wrap; its ``d_model`` sizes the head
        :param number_of_class: number of classes the head predicts
        """

        super().__init__()
        self.netformer = netformer
        hidden_size = self.netformer.d_model
        self.chead = Classification_head(hidden_size, number_of_class)

    def forward(self, x, segment_label):
        """Encode ``x`` with its segment labels, then classify the sequence."""
        encoded = self.netformer(x, segment_label)
        return self.chead(encoded)




## schedule learning rate

class ScheduledOptim():
    '''Wraps an optimizer and sets its learning rate before every step.

    lr = d_model ** -0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)

    The value is written to every param group, replacing whatever learning
    rate the wrapped optimizer was created with.
    '''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Refresh the learning rate, then step the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Clear the gradients held by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        # Inverse-square-root decay versus linear warm-up; the smaller one wins.
        decay = np.power(self.n_current_steps, -0.5)
        warmup = np.power(self.n_warmup_steps, -1.5) * self.n_current_steps
        return np.min([decay, warmup])

    def _update_learning_rate(self):
        ''' Advance the step counter and push the new rate to all param groups '''

        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr

In [8]:
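# ## trainer (earlier draft, fully commented out; superseded by the NetformerTrainer class defined in the next cell)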
# class NetformerTrainer:
#     def __init__(
#         self,
#         model,
#         train_dataloader,
#         test_dataloader=None,
#         lr= 1e-4,
#         weight_decay=0.01,
#         betas=(0.9, 0.999),
#         warmup_steps=1000,
#         log_freq=200,
#         num_of_class = 6,
#         device='cuda'
#         ):

#         self.device = device
#         self.model = model
#         self.train_data = train_dataloader
#         self.test_data = test_dataloader
#         self.num_of_class = num_of_class

#         # Setting the Adam optimizer with hyper-param
#         self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
#         self.optim_schedule = ScheduledOptim(
#             self.optim, self.model.netformer.d_model, n_warmup_steps=warmup_steps
#             )

#         # Using Negative Log Likelihood Loss function for predicting the masked_token
#         self.criterion = torch.nn.CrossEntropyLoss()
#         # self.criterion_2 = torch.nn.MSELoss()
#         self.log_freq = log_freq
#         print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

#     def train(self, epoch):
#         self.iteration(epoch, self.train_data)

#     def test(self, epoch):
#         self.iteration(epoch, self.test_data, train=False)

#     def iteration(self, epoch, data_loader, train=True):

#         avg_loss = 0.0
#         total_correct = 0
#         total_element = 0
#         all_labels = []
#         all_preds = []
#         mode = "train" if train else "test"

#         # progress bar
#         data_iter = tqdm.tqdm(
#             enumerate(data_loader),
#             desc="EP_%s:%d" % (mode, epoch),
#             total=len(data_loader),
#             bar_format="{l_bar}{r_bar}"
#         )

#         for i, data in data_iter:

#             # 0. batch_data will be sent into the device(GPU or cpu)
#             data = {key: value.to(self.device) for key, value in data.items()}

#             # 1. forward the next_sentence_prediction and masked_lm model
#             classification_output = self.model.forward(data["netformer_input"], data["segment_label"])
#             # class_output = mask_lm_output.transpose(1, 2)
#             # print(mask_lm_output.shape)
#             # print(type(mask_lm_output))
#             # print("classification_output shape:", classification_output.shape)
#             # print("sequence_label shape:", data["sequence_label"].shape)

#             classification_loss = self.criterion(classification_output, data["sequence_label"].to(torch.long))

#             # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
#             loss = classification_loss 

#             # 3. backward and optimization only in train
#             if train:
#                 self.optim_schedule.zero_grad()
#                 loss.backward()
#                 self.optim_schedule.step_and_update_lr()

#             correct = classification_output.argmax(dim=-1).eq(data["sequence_label"]).sum().item()
#             total_correct += correct
#             total_element += data["sequence_label"].nelement()

#             all_labels.extend(data["sequence_label"].cpu().numpy())
#             all_preds.extend(classification_output.argmax(dim=-1).cpu().numpy())
            
#             post_fix = {
#                 "epoch": epoch,
#                 "iter": i,
#                 # "avg_loss": avg_loss / (i + 1),
#                 # "avg_acc": total_correct / total_element * 100,
#                 "loss": loss.item()
#             }

#             if i % self.log_freq == 0:
#                 data_iter.write(str(post_fix))
#         print( "total_acc=",
#               total_correct * 100.0 / total_element)

#         # Calculate metrics
#         all_labels = np.array(all_labels)
#         all_preds = np.array(all_preds)
        
#         accuracy = accuracy_score(all_labels, all_preds)
#         precision = precision_score(all_labels, all_preds, average=None, zero_division=0)
#         recall = recall_score(all_labels, all_preds, average=None, zero_division=0)
#         f1 = f1_score(all_labels, all_preds, average=None, zero_division=0)
#         # roc_auc = roc_auc_score(all_labels, self.one_hot_encode(all_preds, self.num_of_class), average=None, multi_class='ovr')

#         print(f"Accuracy: {accuracy:.4f}")
#         for i in range(self.num_of_class):
#             print(f"Class {i}: Precision: {precision[i]:.4f}, Recall: {recall[i]:.4f}, F1-Score: {f1[i]:.4f}, ROC-AUC: {roc_auc[i]:.4f}")

#     @staticmethod
#     def one_hot_encode(labels, num_classes):
#         return np.eye(num_classes)[labels]

In [9]:
# from sklearn.metrics import confusion_matrix
# import numpy as np

class NetformerTrainer:
    """Runs training / evaluation epochs for a Netformer classifier and
    prints accuracy, per-class precision / recall / F1 and a confusion matrix.

    :param model: classifier called as ``model(netformer_input, segment_label)``;
        must expose ``model.netformer.d_model`` (used by the LR schedule)
    :param train_dataloader: iterable of batch dicts with the keys
        ``netformer_input``, ``segment_label`` and ``sequence_label``
    :param test_dataloader: same format, optional
    :param lr: initial Adam learning rate (the schedule replaces it on the
        first optimisation step)
    :param weight_decay: Adam weight decay
    :param betas: Adam betas
    :param warmup_steps: warm-up steps of the learning-rate schedule
    :param log_freq: print the running loss every ``log_freq`` batches
    :param num_of_class: number of classes reported in the metrics
    :param device: device the model and every batch are moved to
    """

    def __init__(
        self,
        model,
        train_dataloader,
        test_dataloader=None,
        lr= 1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=1000,
        log_freq=200,
        num_of_class = 6,
        device='cuda'
        ):

        self.device = device
        # Batches are moved to `device` in iteration(); keep the model there too
        # (a no-op when the caller has already moved it).
        self.model = model.to(device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader
        self.num_of_class = num_of_class

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(
            self.optim, self.model.netformer.d_model, n_warmup_steps=warmup_steps
            )

        # Cross-entropy over the class scores returned by the model.
        # NOTE(review): if the model already ends in LogSoftmax, this equals
        # NLL loss, because log-softmax of log-probabilities is a no-op.
        self.criterion = torch.nn.CrossEntropyLoss()
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        """Run one optimisation epoch over the training loader."""
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        """Run one evaluation epoch over the test loader.

        :raises ValueError: if the trainer was built without a test loader
        """
        if self.test_data is None:
            raise ValueError("test() called but no test_dataloader was provided")
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """Process every batch of ``data_loader`` once and print the metrics.

        :param epoch: epoch number, used for display only
        :param data_loader: iterable of batch dicts (see class docstring)
        :param train: when True, back-propagate and step the optimizer;
            when False, run in eval mode with gradients disabled
        """
        mode = "train" if train else "test"

        # Dropout must be active only while training.
        self.model.train(train)

        total_correct = 0
        total_element = 0
        all_labels = []
        all_preds = []

        # progress bar
        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        for i, data in data_iter:

            # 0. send the batch to the device (GPU or CPU)
            data = {key: value.to(self.device) for key, value in data.items()}
            labels = data["sequence_label"].to(torch.long)

            # 1. forward pass; no autograd graph is built during evaluation
            with torch.set_grad_enabled(train):
                classification_output = self.model(data["netformer_input"], data["segment_label"])
                loss = self.criterion(classification_output, labels)

            # 2. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # 3. bookkeeping for the epoch metrics
            preds = classification_output.argmax(dim=-1)
            total_correct += preds.eq(labels).sum().item()
            total_element += labels.nelement()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

            # loss.item() forces a device sync, so only read it when logging
            if i % self.log_freq == 0:
                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "loss": loss.item()
                }
                data_iter.write(str(post_fix))

        if total_element == 0:
            # Empty loader: nothing to average, and the metrics are undefined.
            print("total_acc= n/a (no samples in %s loader)" % mode)
            return

        print("total_acc=", total_correct * 100.0 / total_element)

        # Calculate metrics
        all_labels = np.array(all_labels)
        all_preds = np.array(all_preds)

        # Pin the label set so each per-class array has exactly num_of_class
        # entries even when a class never occurs in this epoch.
        class_ids = list(range(self.num_of_class))

        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)
        recall = recall_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)
        f1 = f1_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)

        print(f"Accuracy: {accuracy:.4f}")
        for i in class_ids:
            print(f"Class {i}: Precision: {precision[i]:.4f}, Recall: {recall[i]:.4f}, F1-Score: {f1[i]:.4f}")

        # Calculate and print confusion matrix (rows: true class, columns: predicted)
        conf_matrix = confusion_matrix(all_labels, all_preds, labels=class_ids)
        print("Confusion Matrix:")
        print(conf_matrix)

    @staticmethod
    def one_hot_encode(labels, num_classes):
        """Return the one-hot rows of a ``num_classes`` identity matrix for integer ``labels``."""
        return np.eye(num_classes)[labels]


In [10]:
class NetformerDatasetDownstream(Dataset):
    """Dataset for the downstream classification task.

    Item ``i`` is built from ``input_sequences[i]``, ``input_segments[i]`` and
    ``input_labels[i]``. Token masking is implemented but disabled because
    ``mask_ratio`` is set to 0, so ``netformer_input`` equals the stored tokens.
    """

    def __init__(self, input_sequences, input_labels, input_segments, seq_len = 2000):
        self.seq_len = seq_len  # stored only; nothing in this class reads it
        self.session_flows = len(input_sequences)
        self.sessions = input_sequences
        self.segments = input_segments
        self.labels = input_labels
        self.special_token_dict =  {'PAD': 0, 'MASK': 1028}
        self.mask_ratio = 0  # probability that a token is selected for masking

    def __len__(self):
        return self.session_flows

    def __getitem__(self, item):
        """Return one sample as a dict of float32 tensors."""
        # step 1: fetch the stored tokens, segment ids and sequence label
        tokens, segments, seq_label = self.get_session_flow(item)

        # step 2: apply (optional) random token masking
        masked_tokens, token_targets, masked_flags = self.random_word(tokens)

        fields = (
            ("netformer_input", masked_tokens),
            ("netformer_label", token_targets),
            ("netformer_idx", masked_flags),
            ("segment_label", segments),
            ("sequence_label", seq_label),
        )
        return {name: torch.tensor(payload, dtype=torch.float32) for name, payload in fields}

    def random_word(self, sentence):
        """Randomly corrupt tokens with probability ``mask_ratio`` each.

        A selected token becomes MASK (80%), a random token drawn from the
        stored sessions (10%) or stays unchanged (10%).

        :return: (tokens after corruption,
                  original token where selected else 0,
                  1 where selected else 0)
        """
        corrupted = []
        targets = []
        flags = []

        for token in sentence:
            draw = random.random()

            if not (draw < self.mask_ratio):
                # token left untouched
                corrupted.append(token)
                targets.append(0)
                flags.append(0)
                continue

            # rescale the draw to [0, 1) inside the selected branch
            draw /= self.mask_ratio
            if draw < 0.8:
                replacement = self.special_token_dict['MASK']
            elif draw < 0.9:
                replacement = self.random_selection(self.sessions)
            else:
                replacement = token

            corrupted.append(replacement)
            targets.append(token)
            flags.append(1)

        assert len(corrupted) == len(targets)
        return corrupted, targets, flags

    def random_selection(self, input_sequences):
        """Pick a uniformly random session, then a uniformly random element of it."""
        session = input_sequences[random.randrange(len(input_sequences))]
        return session[random.randrange(len(session))]

    def get_session_flow(self, item):
        '''Return the tokens, segment ids and label stored at index ``item``'''
        return self.sessions[item], self.segments[item], self.labels[item]