In [1]:
import torch
import torch.nn as nn
from datasets import load_dataset
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fetch the train split first, then the test split, of the packet dataset.
ds_train, ds_test = (
    load_dataset("rdpahalavan/network-packet-flow-header-payload", split=split_name)
    for split_name in ("train", "test")
)

In [3]:
# Show the dataset summary followed by its column schema, one per line.
print(ds_train, ds_train.features, sep="\n")

Dataset({
    features: ['packet_dat', 'attack_cat'],
    num_rows: 1187781
})
{'packet_dat': Value('string'), 'attack_cat': Value('string')}


In [4]:
# Look at the first training row. Both columns are still raw strings at this
# point (see the schema printed above); nothing has been converted to tensors.
sample = ds_train[0]
print(sample, sample["packet_dat"], sample["attack_cat"], sep="\n")

{'packet_dat': '0 0 141 -1 80 63713 2960 2920 64 0 5 0 -1 119 10 32 32 32 32 32 32 32 32 60 47 100 105 118 62 10 32 32 32 32 32 32 32 32 60 100 105 118 32 99 108 97 115 115 61 34 99 111 110 116 101 110 116 95 115 101 99 116 105 111 110 95 116 101 120 116 34 62 10 32 32 32 32 32 32 32 32 32 32 60 112 62 10 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 85 98 117 110 116 117 39 115 32 65 112 97 99 104 101 50 32 100 101 102 97 117 108 116 32 99 111 110 102 105 103 117 114 97 116 105 111 110 32 105 115 32 100 105 102 102 101 114 101 110 116 32 102 114 111 109 32 116 104 101 10 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 117 112 115 116 114 101 97 109 32 100 101 102 97 117 108 116 32 99 111 110 102 105 103 117 114 97 116 105 111 110 44 32 97 110 100 32 115 112 108 105 116 32 105 110 116 111 32 115 101 118 101 114 97 108 32 102 105 108 101 115 32 111 112 116 105 109 105 122 101 100 32 102 111 114 10 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 105 110 116 101 114 97 99 116 105 111 11

In [5]:
def get_label_categories(labels):
    """Assign an integer id to every distinct label, numbered by first appearance.

    Args:
        labels: Iterable of hashable label values; duplicates are allowed.

    Returns:
        dict mapping each distinct label to an id, starting at 0 and
        increasing in the order the labels were first encountered.
    """
    # dict.fromkeys drops duplicates while keeping first-seen order.
    first_seen = dict.fromkeys(labels)
    return {label: index for index, label in enumerate(first_seen)}

In [6]:
# Build the label -> class-index mapping from the training split's labels only.
# Indices follow the order in which labels first appear in that column, so the
# mapping depends on the row order of the training split.
categories = get_label_categories(ds_train["attack_cat"])
print(categories)

{'DDoS': 0, 'Normal': 1, 'DoS Hulk': 2, 'DoS': 3, 'Bot': 4, 'Exploits': 5, 'Fuzzers': 6, 'Reconnaissance': 7, 'Web Attack - XSS': 8, 'Heartbleed': 9, 'SSH Patator': 10, 'DoS SlowHTTPTest': 11, 'FTP Patator': 12, 'Generic': 13, 'Web Attack - Brute Force': 14, 'DoS GoldenEye': 15, 'Analysis': 16, 'Worms': 17, 'Infiltration': 18, 'DoS Slowloris': 19, 'Shellcode': 20, 'Backdoor': 21, 'Port Scan': 22, 'Web Attack - SQL Injection': 23}


In [7]:
def parse_packet(example, label_map=None):
    """Convert one raw dataset row into numeric tensors.

    Args:
        example: Mapping with a "packet_dat" string of whitespace-separated
            integers and an "attack_cat" label string.
        label_map: Optional label -> class-index mapping. When omitted, the
            notebook-level ``categories`` dict is used, so existing
            ``ds.map(parse_packet)`` calls keep working unchanged.

    Returns:
        dict with "packet_tensor" (float32 tensor of the parsed values) and
        "attack_tensor" (0-dim int64 tensor holding the class index).

    Raises:
        ValueError: If a token in "packet_dat" is not an integer literal.
        KeyError: If the row's label is missing from the mapping (for example
            a category that never occurred in the split the mapping was
            built from).
    """
    if label_map is None:
        label_map = categories

    values = [int(token) for token in example["packet_dat"].split()]

    label = example["attack_cat"]
    if label not in label_map:
        raise KeyError(f"attack category {label!r} is missing from the label mapping")

    return {
        "packet_tensor": torch.tensor(values, dtype=torch.float),
        # int64 on purpose: nn.CrossEntropyLoss requires class-index targets
        # to be torch.long and rejects int32 tensors.
        "attack_tensor": torch.tensor(label_map[label], dtype=torch.long),
    }

In [8]:
# Run parse_packet over every row of the train split, then the test split,
# adding the derived tensor columns to each.
ds_train, ds_test = (split.map(parse_packet) for split in (ds_train, ds_test))

In [9]:
# Switch both splits to the "torch" output format, limited to the two derived
# columns; the raw string columns are not listed here.
# NOTE(review): `device` is forwarded to the formatter, presumably so that
# returned tensors are created on that device — confirm against the datasets
# documentation, and check it is compatible with the DataLoader settings used.
ds_train.set_format(type="torch", columns=["packet_tensor", "attack_tensor"], device=device)
ds_test.set_format(type="torch", columns=["packet_tensor", "attack_tensor"], device=device)

In [10]:
# model
class MLP(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    The layer sizes are constructor arguments; the defaults reproduce the
    previously hard-coded 513 -> 256 -> 128 -> 24 architecture, so ``MLP()``
    builds the same network as before.

    Args:
        in_features: Length of each input vector.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output scores (one per class).
    """

    def __init__(self, in_features=513, hidden1=256, hidden2=128, num_classes=24):
        super().__init__()
        # Attribute names are kept (fc1, fc2, output) so saved state_dicts
        # from the original class still load.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.output = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Return raw class scores for ``x``; no softmax is applied here."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.output(x)

In [11]:
# Seed PyTorch's global RNG before any weights are created so that the initial
# parameters are the same on every "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

model = MLP().to(device)
# CrossEntropyLoss takes the raw scores produced by the model together with
# integer class-index targets.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

In [12]:
# Batch both splits; only the training split is shuffled.
train_loader = DataLoader(ds_train, batch_size=8, shuffle=True)
test_loader = DataLoader(ds_test, batch_size=8)

NUM_EPOCHS = 1   # a single pass over the training data, as before; raise to train longer
train_loss = []  # mean per-sample training loss, one entry per epoch

for epoch in range(NUM_EPOCHS):
    model.train(True)
    running_loss = 0.0   # sum of per-sample losses seen this epoch
    samples_seen = 0

    for batch in train_loader:
        inputs = batch["packet_tensor"]   # tensor of shape [B, L]
        labels = batch["attack_tensor"]   # true attack category labels

        # zero gradients
        optimizer.zero_grad()

        # forward pass
        output = model(inputs)
        loss = criterion(output, labels)

        # backward pass and optimize
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size;
        # the last batch may be smaller than the others.
        batch_count = inputs.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count

    # Report a per-sample average rather than the raw sum, which grows with
    # dataset size and is hard to compare between runs.
    epoch_loss = running_loss / max(samples_seen, 1)
    train_loss.append(epoch_loss)
    print(f"epoch {epoch + 1}/{NUM_EPOCHS}: mean train loss = {epoch_loss:.6f}")

KeyboardInterrupt: 