In [1]:
import fastai
import os
import torch

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

from sklearn.preprocessing import (
    StandardScaler
)

from typing import (
    Callable,
    Dict,
    List,
    Tuple
)

from Quick.cleaning.loading import (
    examine_dataset,
    remove_infs_and_nans
)

from Quick.cleaning.utils import (
    get_file_path
)

from Quick.runners.deep import (
    run_deep_nn_experiment
)

from Quick.runners.residual import (
    run_residual_deep_nn_experiment
)

from Quick.runners.sk import (
    run_sk_experiment
)

from Quick.runners.torch import (
    run_torch_nn_experiment
)

from rff.layers import (
    GaussianEncoding,
)


from Quick.constants import *



In [17]:
import torch.nn as nn
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
# device = torch.device('cpu')

True
True


In [18]:
data_path_1: str = './data/'
data_path_2: str = './data/03-11/'
data_sets_1: list = [
    'Darknet_experiments_base.csv',
    'airline-passengers.csv',
]
data_sets_2: list = [
    'Syn.csv',
]

file_path_1: callable = get_file_path(data_path_1)
file_path_2: callable = get_file_path(data_path_2)
file_set_1: list = list(map(file_path_1, data_sets_1))
file_set_2: list = list(map(file_path_2, data_sets_2))
current_job: int = 0

# dataset_1 = examine_dataset(0, file_set_1, data_sets_1)
# dataset_2 = examine_dataset(1, file_set_1, data_sets_1)
dataset_3 = examine_dataset(0, file_set_2, data_sets_2)

Dataset 0/1: We now look at ./data/03-11/Syn.csv


Loading Dataset: ./data/03-11/Syn.csv
	To Dataset Cache: ./cache/Syn.csv.pickle


        File:				./data/03-11/Syn.csv  
        Job Number:			0
        Shape:				(4320541, 88)
        Samples:			4320541 
        Features:			88
    


#### Now we try to use the darknet dataset to train the model

In [None]:
darknet = dataset_2['Dataset']

In [None]:
darknet.head()

In [None]:
darknet_no_label = darknet.drop(columns=['Traffic Type', 'Application Type'])

In [None]:
darknet_no_label.head()

In [None]:
# train-test split for time series
train_size = int(len(darknet_no_label) * 0.67)
test_size = len(darknet_no_label) - train_size
train, test = darknet_no_label[:train_size], darknet_no_label[train_size:]

In [None]:
test.shape

In [None]:
train.shape

In [None]:
# now we standardize the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


# train-test split for time series
# df_downsampled = darknet_no_label.sample(frac=0.1, random_state=42)
df_downsampled = darknet_no_label.copy()

scaler.fit(df_downsampled)
df_downsampled = scaler.transform(df_downsampled)

train_size = int(len(df_downsampled) * 0.67)
test_size = len(df_downsampled) - train_size
train, test = df_downsampled[:train_size], df_downsampled[train_size:]


In [None]:
def create_dataset(dataset, lookback, device):
    """Transform a time series into a prediction dataset
    
    Args:
        dataset: A numpy array of time series, first dimension is the time steps
        lookback: Size of window for prediction
    """
    X, y = [], []
    for i in range(len(dataset)-lookback):
        feature = dataset[i:i+lookback]
        target = dataset[i+lookback:i+lookback+1]
        X.append(feature)
        y.append(target)



    # print(len(X), len(y))
    print((X:=np.array(X)).shape, (y := np.array(y)).shape)
    return torch.tensor(X.astype('float32'), device=device), torch.tensor(y.astype('float32'), device=device)


lookback = 10
X_train, y_train = create_dataset(train, lookback=lookback, device = device)
X_test, y_test = create_dataset(test, lookback=lookback, device = device)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# for i in range(1, 10):
#     print(X_train.shape)
#     # print(X_train[i])
#     # ts, vs = X_train[i].shape
#     print(X_train[i])
#     print(X_train[i].permute(1, 0))
#     break

In [None]:
import os

In [None]:
import torch.nn as nn
# nn.MultiheadAttention()

In [None]:
import torch.nn as nn
class AirModel(nn.Module):
    def __init__(self, input_size, lookback_size, embedding_size=200):
        super().__init__()
        # self.lstm = nn.LSTM(input_size=input_size, hidden_size=embedding_size, num_layers=1, batch_first=True)
        self.flat = nn.Flatten()
        self.unflat = nn.Unflatten(0, (-1, 1))
        self.cont_embed = nn.Sequential(
            nn.Flatten(0, 1),
            nn.Linear(input_size, embedding_size),
            # nn.ReLU(),
            # nn.Linear(embedding_size, embedding_size),
            nn.Unflatten(0, (-1, lookback_size))
        )

        # we need a regression head to predict the next item in the sequence of shape 
        self.head = nn.Sequential(
            nn.Linear(lookback*embedding_size, lookback*embedding_size),
            # nn.BatchNorm1d(lookback_size*embedding_size),
            # nn.LayerNorm(lookback*embedding_size),
            nn.ReLU(),
            nn.Linear(lookback*embedding_size, lookback*embedding_size),
            nn.ReLU(),
            nn.Linear(lookback*embedding_size, input_size),
            # nn.ReLU(),
        )
        self.norm1 = nn.LayerNorm((lookback_size, embedding_size))

        self.query = nn.Linear(embedding_size, embedding_size)
        self.key = nn.Linear(embedding_size, embedding_size)
        self.value = nn.Linear(embedding_size, embedding_size)

        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(0.1)
        self.norm2 = nn.LayerNorm(embedding_size)
        self.norm3 = nn.LayerNorm(embedding_size)

        

        self.pos_embed = nn.Embedding(lookback_size, embedding_size)
        # self.linear = nn.Linear(lookback*embedding_size, input_size)
    def forward(self, x):
        # print(x.shape)
        # x = self.cont_embed(x)
        x = self.cont_embed(x) + self.pos_embed(torch.arange(x.shape[1]))
        x = self.norm1(x)

        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # print(query.shape, key.shape, value.shape)

        pre_attention = self.softmax(torch.matmul(query, key.transpose(-2, -1)))
        # print(pre_attention.shape)

        attention = torch.matmul(pre_attention, value)
        # print(attention.shape)

        x = self.norm2(x + attention)

        # print(x.shape)

        # we mix the signals together
        x = self.flat(x)
        x = self.head(x)
        x = self.unflat(x)

        # print(x.shape)

        
        # x = self.flat(x)
        # x, _ = self.lstm(x)
        # # print(x.shape)
        # x = self.flat(x)
        # # print(x.shape)
        # x = self.linear(x)
        # x = self.unflat(x)
        # # print(x.shape)

        # we stop for debugging
        # print('stopping')
        # raise Exception('stop')
        return x


# model = AirModel(input_size=62, lookback_size=lookback)
# y_pred = model(X_batch)


In [None]:
import torch.nn as nn
class AirModel(nn.Module):
    def __init__(self, input_size, lookback_size, device, embedding_size=200):
        super().__init__()
        # self.lstm = nn.LSTM(input_size=input_size, hidden_size=embedding_size, num_layers=1, batch_first=True)
        self.embedding_size = embedding_size
        self.flat = nn.Flatten()
        self.unflat = nn.Unflatten(0, (-1, 1))
        self.cont_embed = nn.Sequential(
            nn.Flatten(0, 1),
            nn.Linear(input_size, embedding_size, device = device),
            # nn.ReLU(),
            # nn.Linear(embedding_size, embedding_size),
            nn.Unflatten(0, (-1, lookback_size))
        )

        self.device = device

        # we need a regression head to predict the next item in the sequence of shape 
        self.head = nn.Sequential(
            nn.Linear(lookback*embedding_size, lookback*embedding_size, device = device),
            # nn.BatchNorm1d(lookback_size*embedding_size),
            # nn.LayerNorm(lookback*embedding_size),
            nn.ReLU(),
            nn.Linear(lookback*embedding_size, lookback*embedding_size, device = device),
            nn.ReLU(),
            nn.Linear(lookback*embedding_size, input_size, device = device),
            # nn.ReLU(),
        )
        self.norm1 = nn.LayerNorm((lookback_size, embedding_size), device = device)

        self.query = nn.Linear(embedding_size, embedding_size, device = device)
        self.key = nn.Linear(embedding_size, embedding_size, device = device)
        self.value = nn.Linear(embedding_size, embedding_size, device = device)

        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(0.1)
        self.norm2 = nn.LayerNorm(embedding_size, device = device)
        self.norm3 = nn.LayerNorm(embedding_size, device = device)

        

        self.pos_embed = nn.Embedding(lookback_size, embedding_size, device = device)
        # self.linear = nn.Linear(lookback*embedding_size, input_size)
    def forward(self, x):
        # print(x.shape)
        # x = self.cont_embed(x)
        x = self.cont_embed(x) + self.pos_embed(torch.arange(x.shape[1], device = self.device))
        x = self.norm1(x)

        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # print(query.shape, key.shape, value.shape)

        pre_attention = self.softmax(torch.matmul(query, key.transpose(-2, -1))/torch.sqrt(torch.tensor(self.embedding_size, dtype=torch.float32)))
        
        # print(pre_attention.shape)

        attention = torch.matmul(pre_attention, value)
        # print(attention.shape)

        x = self.norm2(x + attention)

        # print(x.shape)

        # we mix the signals together
        x = self.flat(x)
        x = self.head(x)
        x = self.unflat(x)

        # print(x.shape)

        
        # x = self.flat(x)
        # x, _ = self.lstm(x)
        # # print(x.shape)
        # x = self.flat(x)
        # # print(x.shape)
        # x = self.linear(x)
        # x = self.unflat(x)
        # # print(x.shape)

        # we stop for debugging
        # print('stopping')
        # raise Exception('stop')
        return x


# model = AirModel(input_size=62, lookback_size=lookback, device = device)
# y_pred = model(X_batch)


In [None]:
# failed model
class AirModel(nn.Module):
    def __init__(self, input_size, lookback_size, device, embedding_size=80, heads = 2):
        super().__init__()
        # self.lstm = nn.LSTM(input_size=input_size, hidden_size=embedding_size, num_layers=1, batch_first=True)
        self.embedding_size = embedding_size
        self.device = device
        self.flat1 = nn.Flatten(0, 1)
        self.flat2 = nn.Flatten()
        self.unflat1 = nn.Unflatten(0, (-1, lookback_size, embedding_size))
        self.unflat2 = nn.Unflatten(0, (-1, 1))
        self.cont_embed = nn.Sequential(
            nn.Flatten(0, 1),
            nn.Linear(input_size, embedding_size, device = device),
            # nn.ReLU(),
            # nn.Linear(embedding_size, embedding_size),
            nn.Unflatten(0, (-1, lookback_size))
        )

        # we need a regression head to predict the next item in the sequence of shape 
        self.head = nn.Sequential(
            nn.Linear(lookback*embedding_size, lookback*embedding_size, device = device),
            # nn.BatchNorm1d(lookback_size*embedding_size),
            # nn.LayerNorm(lookback*embedding_size),
            nn.ReLU(),
            nn.Linear(embedding_size, embedding_size, device = device),
            nn.ReLU(),
            nn.Linear(embedding_size, input_size, device = device),
            # nn.ReLU(),
        )
        self.norm1 = nn.LayerNorm((lookback_size, embedding_size), device = device)
        self.norm2 = nn.LayerNorm(embedding_size, device = device)


        self.mh_attention = nn.MultiheadAttention(embed_dim=embedding_size, num_heads=heads, dropout=0.1, device = device)
        

        self.pos_embed = nn.Embedding(lookback_size, embedding_size, device = device)
        # self.linear = nn.Linear(lookback*embedding_size, input_size)
    def forward(self, x):
        batch_size, lookback_size, input_size = x.shape

        x = self.cont_embed(x) + self.pos_embed(torch.arange(x.shape[1], device = self.device))
        x = self.norm1(x)

        attention, _ = self.mh_attention(x, x, x)

        x = self.norm1(x + attention)

        # print(x.shape)


        print(x.shape)

        # x = x.reshape(batch_size, lookback_size, self.embedding_size)
        # print(x.shape)


        # we mix the signals together
        x = self.flat2(x)
        x = self.head(x)
        x = self.unflat2(x)

        # print(x.shape)

        
        # x = self.flat(x)
        # x, _ = self.lstm(x)
        # # print(x.shape)
        # x = self.flat(x)
        # # print(x.shape)
        # x = self.linear(x)
        # x = self.unflat(x)
        # # print(x.shape)

        # we stop for debugging
        # print('stopping')
        # raise Exception('stop')
        return x


model = AirModel(input_size=62, lookback_size=lookback)
y_pred = model(X_batch)


In [None]:
# failed model
class AirModel(nn.Module):
    def __init__(self, input_size, lookback_size, device, embedding_size=100, heads = 2):
        super().__init__()
        # self.lstm = nn.LSTM(input_size=input_size, hidden_size=embedding_size, num_layers=1, batch_first=True)
        self.embedding_size = embedding_size
        self.device = device
        self.flat1 = nn.Flatten(0, 1)
        self.flat2 = nn.Flatten()
        self.unflat1 = nn.Unflatten(0, (-1, lookback_size, embedding_size))
        self.unflat2 = nn.Unflatten(0, (-1, 1))
        self.cont_embed = nn.Sequential(
            nn.Flatten(0, 1),
            nn.Linear(input_size, embedding_size, device = device),
            nn.ReLU(),
            nn.Linear(embedding_size, embedding_size),
            nn.Unflatten(0, (-1, lookback_size))
        )

        # we need a regression head to predict the next item in the sequence of shape 
        self.head = nn.Sequential(
            nn.Linear(lookback*embedding_size, lookback*embedding_size, device = device),
            # nn.BatchNorm1d(lookback_size*embedding_size),
            # nn.LayerNorm(lookback*embedding_size),
            nn.ReLU(),
            nn.Linear(embedding_size, embedding_size, device = device),
            nn.ReLU(),
            nn.Linear(embedding_size, input_size, device = device),
            # nn.ReLU(),
        )
        self.norm1 = nn.LayerNorm((lookback_size, embedding_size), device = device)
        self.norm2 = nn.LayerNorm(embedding_size, device = device)


        self.mh_attention = nn.MultiheadAttention(embed_dim=embedding_size, num_heads=heads, dropout=0.1, device = device)
        

        self.pos_embed = nn.Embedding(lookback_size, embedding_size, device = device)
        # self.linear = nn.Linear(lookback*embedding_size, input_size)
    def forward(self, x):
        batch_size, lookback_size, input_size = x.shape

        x = self.cont_embed(x) + self.pos_embed(torch.arange(x.shape[1], device = self.device))
        x = self.norm1(x)

        attention, _ = self.mh_attention(x, x, x)

        x = self.norm1(x + attention)

        # print(x.shape)


        print(x.shape)

        # x = x.reshape(batch_size, lookback_size, self.embedding_size)
        # print(x.shape)


        # we mix the signals together
        x = self.flat2(x)
        x = self.head(x)
        x = self.unflat2(x)

        # print(x.shape)

        
        # x = self.flat(x)
        # x, _ = self.lstm(x)
        # # print(x.shape)
        # x = self.flat(x)
        # # print(x.shape)
        # x = self.linear(x)
        # x = self.unflat(x)
        # # print(x.shape)

        # we stop for debugging
        # print('stopping')
        # raise Exception('stop')
        return x


model = AirModel(input_size=62, lookback_size=lookback)
y_pred = model(X_batch)


In [None]:
import numpy as np
import torch.optim as optim
import torch.utils.data as data

batch_size = 16
model = AirModel(input_size=62, lookback_size=lookback, device = device)
optimizer = optim.Adam(model.parameters(), lr=.01)

# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# we want to use the ranger optimizer
# optimizer = optim.RAdam(model.parameters())
loss_fn = nn.MSELoss()
# loss_fn = nn.L1Loss()
# log likelihood loss
# loss_fn = nn.GaussianNLLLoss()
# we need to gradient clip because the LSTM is very sensitive to exploding gradients
# https://stackoverflow.com/questions/54716377/pytorch-why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch
# https://stackoverflow.com/questions/55735643/how-to-use-clip-grad-norm-in-pytorch
loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)

avg_train_loss = []
avg_test_loss = []

n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation
    # if epoch % 100 != 0:
    #     continue
    model.eval()
    with torch.no_grad():
        y_train_pred = model(X_train).cpu()
        train_rmse = np.sqrt(loss_fn(y_train_pred, y_train.cpu()))
        y_test_pred = model(X_test).cpu()
        test_rmse = np.sqrt(loss_fn(y_test_pred, y_test.cpu()))
    avg_train_loss.append(train_rmse)
    avg_test_loss.append(test_rmse)
    difference_train = avg_train_loss[-1] - avg_train_loss[-2] if len(avg_train_loss) > 1 else 0
    difference_test = avg_test_loss[-1] - avg_test_loss[-2] if len(avg_test_loss) > 1 else 0
    print("Epoch %d: train RMSE %.4f, test RMSE %.4f, diff train %.4f, diff test %.4f" % (epoch, train_rmse, test_rmse, difference_train, difference_test))

In [None]:
# batch_size = 64
# loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
optimizer = optim.Adam(model.parameters(), lr=.1)
n_epochs = 50
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation
    # if epoch % 100 != 0:
    #     continue
    model.eval()
    with torch.no_grad():
        y_train_pred = model(X_train).cpu()
        train_rmse = np.sqrt(loss_fn(y_train_pred, y_train.cpu()))
        y_test_pred = model(X_test).cpu()
        test_rmse = np.sqrt(loss_fn(y_test_pred, y_test.cpu()))
    avg_train_loss.append(train_rmse)
    avg_test_loss.append(test_rmse)
    difference_train = avg_train_loss[-1] - avg_train_loss[-2] if len(avg_train_loss) > 1 else 0
    difference_test = avg_test_loss[-1] - avg_test_loss[-2] if len(avg_test_loss) > 1 else 0
    print("Epoch %d: train RMSE %.4f, test RMSE %.4f, diff train %.4f, diff test %.4f" % (epoch, train_rmse, test_rmse, difference_train, difference_test))

In [None]:
import numpy as np
import torch.optim as optim
import torch.utils.data as data

batch_size = 32
model = AirModel(input_size=62, lookback_size=lookback)
# optimizer = optim.Adam(model.parameters(), lr=0.01)

# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# we want to use the ranger optimizer
# optimizer = optim.RAdam(model.parameters())
loss_fn = nn.MSELoss()
# we need to gradient clip because the LSTM is very sensitive to exploding gradients
# https://stackoverflow.com/questions/54716377/pytorch-why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch
# https://stackoverflow.com/questions/55735643/how-to-use-clip-grad-norm-in-pytorch
loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)

avg_train_loss = []
avg_test_loss = []

n_epochs = 20
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        loss = loss_fn(y_pred, y_batch)
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.zero_grad()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
    # Validation
    # if epoch % 100 != 0:
    #     continue
    model.eval()
    with torch.no_grad():
        y_pred = model(X_train)
        train_rmse = np.sqrt(loss_fn(y_pred, y_train))
        y_pred = model(X_test)
        test_rmse = np.sqrt(loss_fn(y_pred, y_test))
    avg_train_loss.append(train_rmse)
    avg_test_loss.append(test_rmse)
    difference_train = avg_train_loss[-1] - avg_train_loss[-2] if len(avg_train_loss) > 1 else 0
    difference_test = avg_test_loss[-1] - avg_test_loss[-2] if len(avg_test_loss) > 1 else 0
    print("Epoch %d: train RMSE %.4f, test RMSE %.4f, diff train %.4f, diff test %.4f" % (epoch, train_rmse, test_rmse, difference_train, difference_test))
    #  train avg %.1f, test avg %.1f" % (epoch, train_rmse, test_rmse, np.mean(avg_train_loss), np.mean(avg_test_loss)))

In [None]:
import numpy as np
import torch.optim as optim
import torch.utils.data as data

model = AirModel(input_size=62, lookback_size=lookback)
optimizer = optim.Adam(model.parameters())
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# we want to use the ranger optimizer
# optimizer = optim.RAdam(model.parameters())
loss_fn = nn.MSELoss()
# we need to gradient clip because the LSTM is very sensitive to exploding gradients
# https://stackoverflow.com/questions/54716377/pytorch-why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch
# https://stackoverflow.com/questions/55735643/how-to-use-clip-grad-norm-in-pytorch
loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=8)

n_epochs = 2000
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation
    # if epoch % 100 != 0:
    #     continue
    model.eval()
    with torch.no_grad():
        y_pred = model(X_train)
        train_rmse = np.sqrt(loss_fn(y_pred, y_train))
        y_pred = model(X_test)
        test_rmse = np.sqrt(loss_fn(y_pred, y_test))
    print("Epoch %d: train RMSE %.4f, test RMSE %.4f" % (epoch, train_rmse, test_rmse))

In [None]:
import numpy as np
import torch.optim as optim
import torch.utils.data as data

model = AirModel(input_size=62, lookback_size=lookback)
optimizer = optim.Adam(model.parameters())
loss_fn = nn.MSELoss()
# we need to gradient clip because the LSTM is very sensitive to exploding gradients
# https://stackoverflow.com/questions/54716377/pytorch-why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch
# https://stackoverflow.com/questions/55735643/how-to-use-clip-grad-norm-in-pytorch
loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=8)

n_epochs = 2000
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation
    # if epoch % 100 != 0:
    #     continue
    model.eval()
    with torch.no_grad():
        y_pred = model(X_train)
        train_rmse = np.sqrt(loss_fn(y_pred, y_train))
        y_pred = model(X_test)
        test_rmse = np.sqrt(loss_fn(y_pred, y_test))
    print("Epoch %d: train RMSE %.4f, test RMSE %.4f" % (epoch, train_rmse, test_rmse))

In [None]:
# Epoch 0: train RMSE 225961433890816.0000

Now we experiment with the ddos syn dataset. Unlike the darknet dataset, which was pre cleaned for a different experiment, the syn dataset has its timestamps, so it can be ordered by time. We will use this to our advantage to create a time series model. 

In [None]:
from datetime import datetime
ddos: pd.DataFrame = remove_infs_and_nans(dataset_3)
ddos.columns = [column.strip() for column in ddos.columns]

timestamps = ddos['Timestamp']
std_time = timestamps.apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f').timestamp())
ddos['Timestamp'] = std_time

ddos = ddos.sort_values(by=['Timestamp'])

deleting 282982 rows with Infinity in column Flow Bytes/s


In [5]:
# configure pandas to show float64 with 10 decimal places
pd.options.display.float_format = '{:.10f}'.format

In [6]:
time_cols= [
    'Timestamp',
]

speed_cols = [
    'Flow Bytes/s',

    'Bwd Packets/s',
    'Fwd Packets/s',
    'Flow Packets/s',
    
    'Down/Up Ratio',
]

length_cols = [
    'ACK Flag Count',
    'CWE Flag Count',
    'ECE Flag Count',
    'FIN Flag Count',
    'PSH Flag Count',
    'RST Flag Count',
    'SYN Flag Count',
    'URG Flag Count',
    
    'Flow Duration',

    'Bwd PSH Flags',
    'Fwd PSH Flags',
    'Bwd URG Flags',
    'Fwd URG Flags',

    'Bwd Header Length',
    
    'Total Length of Fwd Packets',
    'Total Length of Bwd Packets',
    'Fwd Header Length',
    
    'Total Backward Packets',
    'Total Fwd Packets',
    'Subflow Bwd Packets',
    'Subflow Fwd Packets',
    
    'Subflow Fwd Bytes',
    'Subflow Bwd Bytes',

    'Bwd IAT Total',
    'Fwd IAT Total',

    'act_data_pkt_fwd',
    'Init_Win_bytes_backward',
    'Init_Win_bytes_forward',
]

categorical_cols = {
    'Protocol': 16,
    'Inbound': 2,
}

min_cols = [
    'Active Min',
    'Idle Min',

    'Bwd IAT Min',
    'Fwd IAT Min',
    'Flow IAT Min',
    
    'Bwd Packet Length Min',
    'Fwd Packet Length Min',
    'Min Packet Length',

    'min_seg_size_forward',
]

max_cols = [
    'Active Max',
    'Idle Max',

    'Bwd IAT Max',
    'Fwd IAT Max',
    'Flow IAT Max',
    
    'Max Packet Length',
    'Bwd Packet Length Max',
    'Fwd Packet Length Max',
]

mean_cols = [
    'Active Mean',
    'Idle Mean',

    'Bwd Avg Bulk Rate',
    'Fwd Avg Bulk Rate',
    
    'Bwd Avg Bytes/Bulk',
    'Fwd Avg Bytes/Bulk',
    
    'Bwd IAT Mean',
    'Fwd IAT Mean',
    'Flow IAT Mean',
    
    'Packet Length Mean',
    'Bwd Packet Length Mean',
    'Fwd Packet Length Mean',
    
    'Average Packet Size',
    'Bwd Avg Packets/Bulk',
    'Fwd Avg Packets/Bulk',
    
    'Avg Bwd Segment Size',
    'Avg Fwd Segment Size',
]

std_cols = [
    'Active Std',
    'Idle Std',
    
    'Bwd IAT Std',
    'Fwd IAT Std',
    'Flow IAT Std',

    'Packet Length Std',
    'Packet Length Variance',
    'Bwd Packet Length Std',
    'Fwd Packet Length Std',
]

port_cols = [
    'Destination Port',
    'Source Port',
]

unused_cols = [
    'Unnamed: 0',
    'Flow ID',
    'Source IP',
    'Destination IP',
    'SimillarHTTP',
    'Label',
    'Fwd Header Length.1'
] + port_cols

# concatenate all column groups together to get a list of all columns
all_cols = speed_cols + port_cols + length_cols + time_cols + list(categorical_cols.keys()) + min_cols + max_cols + mean_cols + std_cols + unused_cols

# all_cols = time_cols + categorical_cols + min_cols + max_cols + mean_cols + std_cols + length_cols + unused_cols

set(ddos.columns)- set(all_cols)

set()

In [7]:
column_groups = {
    'speed_cols': speed_cols,
    'length_cols': length_cols,
    'min_cols': min_cols,
    'max_cols': max_cols,
    'mean_cols': mean_cols,
    'std_cols': std_cols,
}

In [8]:
ddos_clean = ddos.copy().drop(columns=unused_cols)

In [9]:
train_size = int(len(ddos_clean) * 0.8)
test_size = len(ddos_clean) - train_size
train = ddos_clean[:train_size]
test = ddos_clean[train_size:]

In [10]:
class Preprocessor:
    map_index_key = '__map_index__'
    category_map = {}
    old_maps = []

    def __init__(
        self,
        pipeline: List[Callable[[pd.DataFrame], pd.DataFrame]],
        device: torch.device,
    ) -> None:

        self.pipeline = pipeline
        self.device = device

        self.fitted = False

    def fit(self, X: pd.DataFrame, leave_out: List[str] = ['Timestamp']):

        X = X.copy()
        for col in leave_out:
            if col in X.columns:
                X = X.drop(columns=[col])

        for step in self.pipeline:
            step.fit(X)

        self.fitted = True

        return self

    def transform(
        self, 
        X: pd.DataFrame,
        leave_out: List[str] = ['Timestamp']
    ) -> pd.DataFrame:

        if not self.fitted:
            raise Exception('E1: You must fit the preprocessor before transforming')
    
        X = X.copy()
        
        if leave_out != []:
            left_out = X[leave_out]
            X = X.drop(columns=leave_out)

        columns = X.columns

        for col in leave_out:
            if col in columns:
                raise Exception('E2: Column %s was found in the transformed dataset' % col)


        for step in self.pipeline:
            X = step.transform(X)

        X = pd.DataFrame(X, columns=columns)

        if leave_out != []:
            for col in leave_out:
                X[col] = left_out[col]
            # X[leave_out] = left_out

        for col in leave_out:
            if col not in X.columns:
                raise Exception('E3: Column %s was not found in the transformed dataset' % col)

        return X

    def index_categories(
        self, 
        X: pd.DataFrame, 
        categorical_cols: List[str]
    ) -> pd.DataFrame:
        '''
            We need to index the categorical columns so that they are in the range [0, n_categories) and save the mapping
        '''
        X = X.copy()

        if self.map_index_key in categorical_cols:
            raise Exception('Cannot use the reserved key %s as a column name' % self.map_index_key)

        if self.category_map != {}:
            self.old_maps.append(self.category_map)
        
        old_mapping_index = len(self.old_maps)

        for col in categorical_cols:
            X[col], self.category_map[col] = pd.factorize(X[col])
            self.category_map[self.map_index_key] = old_mapping_index    

        return X

    def create_dataset(
        self, 
        dataset: pd.DataFrame,
        categorical_cols: List[str],
        relative_cols: List[str],
        other_cols: Dict[str, List[str]],
        context_sequence_length: int = 1,
        target_sequence_length: int = 1,
        target_sequence_offset: int = 0,
    ) -> Dict[str, torch.Tensor]:
        '''
            We create a dataset for each column group where each shares the same index
            in the first dimension

            This is because the groups will have different lengths in the last dimension
            

            We also need to create a context and target sequence for each column group
                where the context sequence is the input and the target sequence is the output
            
                for each sample s in the dataset:
                    context_sequence = df[s:s+context_sequence_length]
                    target_sequence = df[s+target_sequence_offset:s+target_sequence_offset+target_sequence_length]


        '''
        
        if not self.fitted:
            self.fit(
                dataset, 
                leave_out=relative_cols + categorical_cols
            )

        dataset = self.transform(
            dataset,
            leave_out=relative_cols + categorical_cols
        )

        bounds = max(
            context_sequence_length, 
            target_sequence_length + target_sequence_offset
        )

        dataset_groups = {}

        for i, (name, group) in enumerate(other_cols.items()):
            dataset_groups[name] = dataset[group]

        categorical_data = dataset[categorical_cols]
        categorical_data = self.index_categories(categorical_data, categorical_cols)

        relative_data = dataset[relative_cols]

        output = {}

        src = '_source'
        tgt = '_target'

        output['categorical_data' + src] = []
        output['categorical_data' + tgt] = []
        output['relative_data' + src] = []
        output['relative_data' + tgt] = []
    
        for name in other_cols.keys():
            output[name + src] = []
            output[name + tgt] = []

        for i in range(len(dataset) - bounds):
            
            output['categorical_data' + src].append(
                categorical_data[i:i+context_sequence_length].values
            )

            output['categorical_data' + tgt].append(
                categorical_data[i+target_sequence_offset:i+target_sequence_offset+target_sequence_length].values
            )

            relative_source = relative_data[i:i+context_sequence_length].values

            relative_target = relative_data[i+target_sequence_offset:i+target_sequence_offset+target_sequence_length].values

            relative_source = relative_source - relative_source[0]
            relative_target = relative_target - relative_source[0]

            output['relative_data' + src].append(
                relative_source
            )

            output['relative_data' + tgt].append(
                relative_target
            )
            

            for name in dataset_groups.keys():
                output[name + src].append(
                    dataset_groups[name][i:i+context_sequence_length].values
                )

                output[name + tgt].append(
                    dataset_groups[name][i+target_sequence_offset:i+target_sequence_offset+target_sequence_length].values
                )

        for name in output.keys():
            if name == 'categorical_data' + src or name == 'categorical_data' + tgt:
                output[name] = torch.tensor(output[name], dtype=torch.long, device=self.device)
            else:
                output[name] = torch.tensor(output[name], dtype=torch.float32, device=self.device)

        return output

In [11]:
time_cols= [
    'Timestamp',
]

speed_cols = [
    'Flow Bytes/s',

    'Bwd Packets/s',
    'Fwd Packets/s',
    'Flow Packets/s',
    
    'Down/Up Ratio',
]

length_cols = [
    'ACK Flag Count',
    'CWE Flag Count',
    'ECE Flag Count',
    'FIN Flag Count',
    'PSH Flag Count',
    'RST Flag Count',
    'SYN Flag Count',
    'URG Flag Count',
    
    'Flow Duration',

    'Bwd PSH Flags',
    'Fwd PSH Flags',
    'Bwd URG Flags',
    'Fwd URG Flags',

    'Bwd Header Length',
    
    'Total Length of Fwd Packets',
    'Total Length of Bwd Packets',
    'Fwd Header Length',
    
    'Total Backward Packets',
    'Total Fwd Packets',
    'Subflow Bwd Packets',
    'Subflow Fwd Packets',
    
    'Subflow Fwd Bytes',
    'Subflow Bwd Bytes',

    'Bwd IAT Total',
    'Fwd IAT Total',

    'act_data_pkt_fwd',
    'Init_Win_bytes_backward',
    'Init_Win_bytes_forward',
]

min_cols = [
    'Active Min',
    'Idle Min',

    'Bwd IAT Min',
    'Fwd IAT Min',
    'Flow IAT Min',
    
    'Bwd Packet Length Min',
    'Fwd Packet Length Min',
    'Min Packet Length',

    'min_seg_size_forward',
]

max_cols = [
    'Active Max',
    'Idle Max',

    'Bwd IAT Max',
    'Fwd IAT Max',
    'Flow IAT Max',
    
    'Max Packet Length',
    'Bwd Packet Length Max',
    'Fwd Packet Length Max',
]

mean_cols = [
    'Active Mean',
    'Idle Mean',

    'Bwd Avg Bulk Rate',
    'Fwd Avg Bulk Rate',
    
    'Bwd Avg Bytes/Bulk',
    'Fwd Avg Bytes/Bulk',
    
    'Bwd IAT Mean',
    'Fwd IAT Mean',
    'Flow IAT Mean',
    
    'Packet Length Mean',
    'Bwd Packet Length Mean',
    'Fwd Packet Length Mean',
    
    'Average Packet Size',
    'Bwd Avg Packets/Bulk',
    'Fwd Avg Packets/Bulk',
    
    'Avg Bwd Segment Size',
    'Avg Fwd Segment Size',
]

std_cols = [
    'Active Std',
    'Idle Std',
    
    'Bwd IAT Std',
    'Fwd IAT Std',
    'Flow IAT Std',

    'Packet Length Std',
    'Packet Length Variance',
    'Bwd Packet Length Std',
    'Fwd Packet Length Std',
]

column_groups = {
    'speed_cols': speed_cols,
    'length_cols': length_cols,
    'min_cols': min_cols,
    'max_cols': max_cols,
    'mean_cols': mean_cols,
    'std_cols': std_cols,
}

categorical_cols = {
        'Protocol': 16,
        'Inbound': 2,
}

process = Preprocessor([StandardScaler()], device)

context_length = 10
target_length = 3
target_offset = 8

data_dict = process.create_dataset(
    ddos_clean[:1000],
    list(categorical_cols.keys()),
    time_cols,
    column_groups,
    context_sequence_length=context_length,
    target_sequence_length=target_length,
    target_sequence_offset=target_offset
)

for name, tensor in data_dict.items():
    print(name, tensor.shape)

categorical_data_source torch.Size([989, 10, 2])
categorical_data_target torch.Size([989, 3, 2])
relative_data_source torch.Size([989, 10, 1])
relative_data_target torch.Size([989, 3, 1])
speed_cols_source torch.Size([989, 10, 5])
speed_cols_target torch.Size([989, 3, 5])
length_cols_source torch.Size([989, 10, 28])
length_cols_target torch.Size([989, 3, 28])
min_cols_source torch.Size([989, 10, 9])
min_cols_target torch.Size([989, 3, 9])
max_cols_source torch.Size([989, 10, 8])
max_cols_target torch.Size([989, 3, 8])
mean_cols_source torch.Size([989, 10, 17])
mean_cols_target torch.Size([989, 3, 17])
std_cols_source torch.Size([989, 10, 9])
std_cols_target torch.Size([989, 3, 9])


  output[name] = torch.tensor(output[name], dtype=torch.long, device=self.device)


In [12]:
tensor_dataset = data.TensorDataset(
    data_dict['relative_data_source'],
    data_dict['categorical_data_source'],
    data_dict['speed_cols_source'],
    data_dict['length_cols_source'],
    data_dict['min_cols_source'],
    data_dict['max_cols_source'],
    data_dict['mean_cols_source'],
    data_dict['std_cols_source'],
    data_dict['relative_data_target'],
    data_dict['categorical_data_target'],
    data_dict['speed_cols_target'],
    data_dict['length_cols_target'],
    data_dict['min_cols_target'],
    data_dict['max_cols_target'],
    data_dict['mean_cols_target'],
    data_dict['std_cols_target'],
)

In [13]:

train_size = int(len(tensor_dataset) * 0.8)
test_size = len(tensor_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(tensor_dataset, [train_size, test_size])    

In [14]:
batch_size = 64

train_dataloader = data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataloader = data.DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [15]:
class GeneratedLoss(nn.Module):
    "Measures how well we have generated the sequence item"

    def __init__(
        self, 
        categorical_cols
    ):
        super(GeneratedLoss, self).__init__()

        self.ce = nn.CrossEntropyLoss()
        self.mse = nn.MSELoss()

        self.categorical_cols = categorical_cols

    def forward(
        self, 
        preds, 
        cat_targs, 
        cont_targs
    ):

        cats, conts = preds
        tot_ce, pos = cats.new([0]), 0
        for i, (k,v) in enumerate(self.cat_dict.items()):
            tot_ce += self.ce(cats[:, pos:pos+v], cat_targs[:,i])
            pos += v
        
        norm_cats = cats.new([len(self.cat_dict)])
        norm_conts = conts.new([conts.size(1)])
        cat_loss = tot_ce/norm_cats
        cont_loss = self.mse(conts, cont_targs)/norm_conts
        total = cat_loss+cont_loss

        return total / cats.size(0)

class MultiheadAttention(nn.Module):
    def __init__(
        self,
        embedding_size     : int,
        heads              : int,
        device             : torch.device,
    ) -> None:
    
        super(MultiheadAttention, self).__init__()

        self.embedding_size      = embedding_size
        self.heads               = heads

        if embedding_size % heads != 0:
            raise Exception(f'embedding_size must be divisible by heads. embedding_size: {embedding_size}, heads: {heads}')

        self.query = nn.Linear(embedding_size, embedding_size, device = device)
        self.key   = nn.Linear(embedding_size, embedding_size, device = device)
        self.value = nn.Linear(embedding_size, embedding_size, device = device)

        self.softmax = nn.Softmax(dim=-1)

    def forward(
        self, 
        q: torch.Tensor, 
        v: torch.Tensor,
        k: torch.Tensor
    ):
        '''
            We perform multihead attention on the query embeddings using the encoder state as the key and value

            shape:
                x: (batch_size, total_num_queries, embedding_size)
                encoder_state: (batch_size, lookback_size, embedding_size)
                output: (batch_size, total_num_queries, embedding_size)
        '''

        # print(f"q: {q.shape}")

        key  : torch.Tensor = self.key(k)
        value: torch.Tensor = self.value(v)
        query: torch.Tensor = self.query(q)

        # we split the query, key, and value into heads
        query = query.reshape(
            query.shape[0],
            query.shape[1],
            self.heads,
            self.embedding_size // self.heads
        ).transpose(1, 2)

        key = key.reshape(
            key.shape[0],
            key.shape[1],
            self.heads,
            self.embedding_size // self.heads
        ).transpose(1, 2)

        value = value.reshape(
            value.shape[0],
            value.shape[1],
            self.heads,
            self.embedding_size // self.heads
        ).transpose(1, 2)

        # we perform the attention
        pre_attention = torch.matmul(query, key.transpose(-2, -1))/torch.sqrt(torch.tensor(self.embedding_size, dtype=torch.float32))

        # print(f"pre_attention: {pre_attention.shape}")

        # we apply the softmax to the pre_attention

        distribution = self.softmax(pre_attention)

        # print(f"distribution: {distribution.shape}")
        # print(f"value: {value.shape}")

        attention = torch.einsum('bhqk,bhkd->bhqd', distribution, value)

        # attention = torch.matmul(self.softmax(pre_attention), value)

        # print(f"attention: {attention.shape}")


        # attention shape: batch_size, heads, total_num_queries, embedding_size // heads
        # final shape: batch_size, total_num_queries, embedding_size

        full_attention = attention.transpose(1, 2).reshape(
            attention.shape[0],
            attention.shape[-2],
            self.embedding_size
        )

        # print(f"full_attention: {full_attention.shape}")

        return full_attention

class RandomFourierTimeEncoding(nn.Module):

    def __init__(
        self,
        device: torch.device,
        input_size: int = 1,
        encoding_half_size: int = 50,
        sigma: float = 4,
    ) -> None:
        super(RandomFourierTimeEncoding, self).__init__()

        self.fourier = GaussianEncoding(
            sigma=sigma,
            input_size=input_size,
            encoded_size=encoding_half_size,
        ).to(device)
        # )

    def forward(
        self,
        X: torch.Tensor,
    ) -> torch.Tensor:
        '''
            X: (batch_size, sequence_length, input_size)
            output: (batch_size, sequence_length, 2*encoding_half_size + input_size)
        '''

        output = self.fourier(X)
        output = torch.cat([X, output], dim=-1)

        return output

class TrafficFlowEncoderBlock(nn.Module):
    def __init__(
        self,
        embedding_size: int,
        heads: int,
        forward_expansion: int,
        device: torch.device,
        dropout_rate: float = 0.1,
    ) -> None:
        super(TrafficFlowEncoderBlock, self).__init__()

        self.attention = MultiheadAttention(
            embedding_size=embedding_size,
            heads=heads,
            device=device,
        )

        self.norm1 = nn.LayerNorm(embedding_size)
        self.norm2 = nn.LayerNorm(embedding_size)

        self.ffw = nn.Sequential(
            nn.Linear(embedding_size, forward_expansion*embedding_size),
            nn.ReLU(),
            nn.Linear(forward_expansion*embedding_size, embedding_size),
        )

        self.dropout = nn.Dropout(dropout_rate)

    def forward(
        self,
        X: torch.Tensor,
    ) -> torch.Tensor:
        '''
            X: (batch_size, sequence_length, embedding_size)
            output: (batch_size, sequence_length, embedding_size)
        '''

        X_norm = self.norm1(X)

        attention = self.attention(X_norm, X_norm, X_norm)

        X = X + self.dropout(attention)

        X_norm = self.norm2(X)

        ffw = self.ffw(X_norm)

        X = X + self.dropout(ffw)

        return X

class TrafficFlowEncoder(nn.Module):
    def __init__(
        self,
        embedding_size: int,
        heads: int,
        forward_expansion: int,
        num_layers: int,
        device: torch.device,
        dropout_rate: float = 0.1,
    ) -> None:
        super(TrafficFlowEncoder, self).__init__()

        self.layers = nn.Sequential(
            *[TrafficFlowEncoderBlock(
                embedding_size=embedding_size,
                heads=heads,
                forward_expansion=forward_expansion,
                device=device,
                dropout_rate=dropout_rate,
            ) for _ in range(num_layers)]
        )


    def forward(
        self,
        X: torch.Tensor,
    ) -> torch.Tensor:
        '''
            X: (batch_size, sequence_length, embedding_size)
            output: (batch_size, sequence_length, embedding_size)
        '''

        return self.layers(X)



class TrafficFlowTransformer(nn.Module):
    def __init__(
        self,
        categorical_cols: Dict[str, int],
        column_groups: Dict[str, List[str]],
        device: torch.device,
        embedding_size: int = 100,
        encoding_size: int = 50,
        encoder_heads: int = 2,
        encoder_layers: int = 2,
        encoder_forward_expansion: int = 2,
    ) -> None:
    
        super(TrafficFlowTransformer, self).__init__()
    
        self.categorical_cols = categorical_cols

        self.time_embedding = nn.Sequential(
            RandomFourierTimeEncoding(
                device=device,
                input_size=1,
                encoding_half_size=encoding_size,
            ),
            nn.Linear(2*encoding_size + 1, embedding_size),
            nn.ReLU(),
            nn.Linear(embedding_size, embedding_size),
        ).to(device)

        self.categorical_embeddings = nn.ModuleDict({
            col: nn.Embedding(num_embeddings=n_categories, embedding_dim=embedding_size)
            for col, n_categories in categorical_cols.items()
        }).to(device)

        self.continuous_embeddings = nn.ModuleDict({
            group: nn.Sequential(
                nn.Linear(len(cols), embedding_size),
                nn.ReLU(),
                nn.Linear(embedding_size, embedding_size),
            ) for group, cols in column_groups.items()
        }).to(device)

        self.encoder = TrafficFlowEncoder(
            embedding_size=embedding_size,
            heads=encoder_heads,
            forward_expansion=encoder_forward_expansion,
            num_layers=encoder_layers,
            device=device,
        ).to(device)

        # self.transformer = nn.Transformer(
        #     d_model=embedding_size,
        #     nhead=encoder_heads,
        #     num_encoder_layers=encoder_layers,
        #     num_decoder_layers=decoder_layers,
        #     dim_feedforward=encoder_forward_expansion*embedding_size,
        #     dropout=dropout_rate,
        # ).to(device)

        # self.encoder = self.transformer.encoder


    def forward(
        self,
        X_time,
        X_cat,
        X_cont_group,
        y_time
    ):
        print(f"X_time: {X_time.shape}")
        print(f"X_cat: {X_cat.shape}")
        # print(f"X_cont_group: {X_cont_group.shape}")
        print(f"X_cont_group: {len(X_cont_group)}")
        print(f"y_time: {y_time.shape}")

        # we embed the time
        X_time_emb = self.time_embedding(X_time)
        Y_time_emb = self.time_embedding(y_time)

        print(f"X_time_emb: {X_time_emb.shape}")
        print(f"y_time_emb: {Y_time_emb.shape}")

        # we embed the categorical columns
        X_cat_embs = [
            self.categorical_embeddings[col](X_cat[:, :, i])
            for i, col in enumerate(self.categorical_cols.keys())
        ]

        print(*[f"X_cat_embs[{i}]: {emb.shape}" for i, emb in enumerate(X_cat_embs)], sep='\n')

        # we embed the continuous columns
        X_cont_embs = [
            self.continuous_embeddings[group](X_cont_group[group])
            for group in self.continuous_embeddings.keys()
        ]

        print(*[f"X_cont_embs[{i}]: {emb.shape}" for i, emb in enumerate(X_cont_embs)], sep='\n')

        # print(f"X_cat_embs: {[emb.shape for emb in X_cat_embs]}")
        # print(f"X_cont_embs: {[emb.shape for emb in X_cont_embs]}")

        X_emb = X_time_emb + sum(X_cat_embs) + sum(X_cont_embs)

        print(f"X_emb: {X_emb.shape}")

        X_enc = self.encoder(X_emb)

        print(f"X_enc: {X_enc.shape}")

        X_fourier = torch.fft.fft(X_enc, dim=-1)

        print(f"X_fourier: {X_fourier.shape}")

        return 

In [16]:

n_epochs = 10

column_groups = {
    'speed_cols': speed_cols,
    'length_cols': length_cols,
    'min_cols': min_cols,
    'max_cols': max_cols,
    'mean_cols': mean_cols,
    'std_cols': std_cols,
}

categorical_cols = {
        'Protocol': 16,
        'Inbound': 2,
}

epoch_length = len(train_dataloader.dataset)

train_losses = []
test_losses  = []
train_counter = []
test_counter = [i*epoch_length for i in range(n_epochs + 1)]


loss_fn = GeneratedLoss(
    categorical_cols
)

batch_0 = None

model = TrafficFlowTransformer(
    categorical_cols,
    column_groups,
    embedding_size=100,
    device=device,
)

# print(model)

for epoch in range(n_epochs):
    model.train()

    for batch in train_dataloader:
        X_time = batch[0]
        y_time = batch[8]

        X_cat = batch[1]
        y_cat = batch[9]

        X_cont_group = batch[2:8]
        X_cont_group_dict = {
            group: X_cont_group[i] for i, group in enumerate(column_groups.keys())
        }
        y_cont_group = batch[10:]

        y_cat_pred, y_cont_group = model(
            X_time,
            X_cat,
            X_cont_group_dict,
            y_time,
        )

        batch_0 = batch
        break
    break


X_time: torch.Size([64, 10, 1])
X_cat: torch.Size([64, 10, 2])
X_cont_group: 6
y_time: torch.Size([64, 3, 1])
X_time_emb: torch.Size([64, 10, 100])
y_time_emb: torch.Size([64, 3, 100])


IndexError: index out of range in self

In [46]:
len(batch_0)

16

In [None]:
from typing import (
    Dict,
    List
)

class DebugLayer(nn.Module):
    '''
        This module is simply an intermediate that will print out the shape of the input
    '''
    def __init__(self) -> None:
        super(DebugLayer, self).__init__()

    def forward(self, x: torch.Tensor, verbose: bool = False) -> torch.Tensor:
        if verbose:
            print(x.shape)

        return x


In [None]:
class TrafficDecoderHead(nn.Module):
    '''
        This is the head of the decoder.

        params:
            input_size: the size of the last dimension of the input
            categorical_cols: a dictionary of column names and the number of categories in each column
            column_groups: a dictionary of column groups, each with a group name a list of column names
            

        For each categorical column, we create a classification trunk of layers that will end in a softmax layer of size num_categories
        For each column group, we create a regression trunk of layers that will end in a linear layer that is the length of the column group
    '''

    def __init__(
        self, 
        input_size: int, 
        categorical_cols: Dict[str, int],
        column_groups: Dict[str, List[str]],
        device: torch.device,
    ) -> None:
    
        super(TrafficDecoderHead, self).__init__()

        self.input_size = input_size
        self.categorical_cols = categorical_cols
        self.column_groups = column_groups
        self.device = device

        self.categorical_head = nn.ModuleDict()

        for column, num_categories in categorical_cols.items():
            hidden_size = (input_size + num_categories) // 2
            self.categorical_head[column] = nn.Sequential(
                nn.Linear(input_size, hidden_size, device = device),
                nn.ReLU(),
                nn.Linear(hidden_size, num_categories, device = device),
                nn.Softmax(dim=-1)
            )

        self.continuous_head = nn.ModuleDict()

        for column_group, columns in column_groups.items():
            hidden_size = (input_size + len(columns)) // 2
            self.continuous_head[column_group] = nn.Sequential(
                nn.Linear(input_size, hidden_size, device = device),
                nn.ReLU(),
                nn.Linear(hidden_size, len(columns), device = device),
            )

        self.inspector = DebugLayer()


    def forward(self, x):

        x = self.inspector(x)

        cont_output = torch.cat(
            [head(x) for head in self.continuous_head.values()],
            dim= -1
        )

        cat_output = torch.cat(
            [head(x) for head in self.categorical_head.values()],
            dim= -1
        )

        return cont_output, cat_output


batch_size: int = 3
samples: int    = 4
variables: int  = 5

column_groups = {
    'sample_group_1':[
        'col_1',
        'col_2',
    ],
    'sample_group_2': [
        'col_3',
        'col_4',
        'col_5',
    ],
}

categorical_columns = {
    'cat_1': 2,
    'cat_2': 3,
}

test_vals = torch.rand(batch_size, samples, variables)


model = TrafficDecoderHead(variables, categorical_columns, column_groups)
# torch.onnx.export(model, test_vals, 'test.onnx', input_names=['decoder hidden state'], output_names=['decoder output'])
model(test_vals)

In [None]:
import torch.nn as nn
class AirModel(nn.Module):
    def __init__(self, input_size, lookback_size, device, embedding_size=200):
        super().__init__()
        # self.lstm = nn.LSTM(input_size=input_size, hidden_size=embedding_size, num_layers=1, batch_first=True)
        self.embedding_size = embedding_size
        self.flat = nn.Flatten()
        self.unflat = nn.Unflatten(0, (-1, 1))
        self.cont_embed = nn.Sequential(
            nn.Flatten(0, 1),
            nn.Linear(input_size, embedding_size, device = device),
            # nn.ReLU(),
            # nn.Linear(embedding_size, embedding_size),
            nn.Unflatten(0, (-1, lookback_size))
        )

        self.device = device

        # we need a regression head to predict the next item in the sequence of shape 
        self.head = nn.Sequential(
            nn.Linear(lookback*embedding_size, lookback*embedding_size, device = device),
            # nn.BatchNorm1d(lookback_size*embedding_size),
            # nn.LayerNorm(lookback*embedding_size),
            nn.ReLU(),
            nn.Linear(lookback*embedding_size, lookback*embedding_size, device = device),
            nn.ReLU(),
            nn.Linear(lookback*embedding_size, input_size, device = device),
            # nn.ReLU(),
        )
        self.norm1 = nn.LayerNorm((lookback_size, embedding_size), device = device)

        self.query = nn.Linear(embedding_size, embedding_size, device = device)
        self.key = nn.Linear(embedding_size, embedding_size, device = device)
        self.value = nn.Linear(embedding_size, embedding_size, device = device)

        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(0.1)
        self.norm2 = nn.LayerNorm(embedding_size, device = device)
        self.norm3 = nn.LayerNorm(embedding_size, device = device)

        

        self.pos_embed = nn.Embedding(lookback_size, embedding_size, device = device)
        # self.linear = nn.Linear(lookback*embedding_size, input_size)
    def forward(self, x):
        # print(x.shape)
        # x = self.cont_embed(x)
        x = self.cont_embed(x) + self.pos_embed(torch.arange(x.shape[1], device = self.device))
        x = self.norm1(x)

        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # print(query.shape, key.shape, value.shape)

        pre_attention = self.softmax(torch.matmul(query, key.transpose(-2, -1))/torch.sqrt(torch.tensor(self.embedding_size, dtype=torch.float32)))
        
        # print(pre_attention.shape)

        attention = torch.matmul(pre_attention, value)
        # print(attention.shape)

        x = self.norm2(x + attention)

        # print(x.shape)

        # we mix the signals together
        x = self.flat(x)
        x = self.head(x)
        x = self.unflat(x)

        # print(x.shape)

        # print('stopping')
        # raise Exception('stop')
        return x


# model = AirModel(input_size=62, lookback_size=lookback, device = device)
# y_pred = model(X_batch)


In [None]:
class MultiheadAttention(nn.Module):
    def __init__(
        self,
        embedding_size     : int,
        heads              : int,
        forward_expansion  : int,
        device             : torch.device,
    ) -> None:
    
        super(MultiheadAttention, self).__init__()

        self.embedding_size      = embedding_size
        self.heads               = heads

        if embedding_size % heads != 0:
            raise Exception(f'embedding_size must be divisible by heads. embedding_size: {embedding_size}, heads: {heads}')

        self.query = nn.Linear(embedding_size, embedding_size, device = device)
        self.key   = nn.Linear(embedding_size, embedding_size, device = device)
        self.value = nn.Linear(embedding_size, embedding_size, device = device)

        self.norm_1 = nn.LayerNorm(embedding_size, device = device)
        self.norm_2 = nn.LayerNorm(embedding_size, device = device)

        self.softmax = nn.Softmax(dim=-1)

        self.ffwd = nn.Sequential(
            nn.Linear(embedding_size, forward_expansion*embedding_size, device = device),
            nn.ReLU(),
            nn.Linear(forward_expansion*embedding_size, embedding_size, device = device),
        )

    def forward(
        self, 
        q: torch.Tensor, 
        v: torch.Tensor,
        k: torch.Tensor
    ):
        '''
            We perform multihead attention on the query embeddings using the encoder state as the key and value

            shape:
                x: (batch_size, total_num_queries, embedding_size)
                encoder_state: (batch_size, lookback_size, embedding_size)
                output: (batch_size, total_num_queries, embedding_size)
        '''

        key  : torch.Tensor = self.key(k)
        value: torch.Tensor = self.value(v)
        query: torch.Tensor = self.query(q)

        # we split the query, key, and value into heads
        query = query.reshape(
            query.shape[0],
            query.shape[1],
            self.heads,
            self.embedding_size // self.heads
        ).transpose(1, 2)

        key = key.reshape(
            key.shape[0],
            key.shape[1],
            self.heads,
            self.embedding_size // self.heads
        ).transpose(1, 2)

        value = value.reshape(
            value.shape[0],
            value.shape[1],
            self.heads,
            self.embedding_size // self.heads
        ).transpose(1, 2)

        # we perform the attention
        pre_attention = torch.matmul(query, key.transpose(-2, -1))/torch.sqrt(torch.tensor(self.embedding_size, dtype=torch.float32))
        attention = torch.matmul(self.softmax(pre_attention), value)

        # we concatenate the heads together
        attention = attention.transpose(1, 2).reshape(
            attention.shape[0],
            attention.shape[1],
            self.embedding_size
        )

        # we pass the attention through a normalization layer
        x = self.norm_1(attention + q)

        # we pass the attention through a feed forward layer
        x = self.norm_2(self.ffwd(x) + x)

        return x

        





In [None]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()


In [None]:
class TrafficForecaster(nn.Module):
    '''
    
    '''

    def __init__(self, *args, **kwargs) -> None:
        super(TrafficForecaster, self).__init__(*args, **kwargs)


    def forward(self, x):
        return x

In [None]:
time_cols= [
    'Timestamp',
]

speed_cols = [
    'Flow Bytes/s',

    'Bwd Packets/s',
    'Fwd Packets/s',
    'Flow Packets/s',
    
    'Down/Up Ratio',
]

port_cols = [
    'Destination Port',
    'Source Port',
]

length_cols = [
    'ACK Flag Count',
    'CWE Flag Count',
    'ECE Flag Count',
    'FIN Flag Count',
    'PSH Flag Count',
    'RST Flag Count',
    'SYN Flag Count',
    'URG Flag Count',
    
    'Flow Duration',

    'Bwd PSH Flags',
    'Fwd PSH Flags',
    'Bwd URG Flags',
    'Fwd URG Flags',

    'Bwd Header Length',
    
    'Total Length of Fwd Packets',
    'Total Length of Bwd Packets',
    'Fwd Header Length',
    
    'Total Backward Packets',
    'Total Fwd Packets',
    'Subflow Bwd Packets',
    'Subflow Fwd Packets',
    
    'Subflow Fwd Bytes',
    'Subflow Bwd Bytes',

    'Bwd IAT Total',
    'Fwd IAT Total',

    'act_data_pkt_fwd',
    'Init_Win_bytes_backward',
    'Init_Win_bytes_forward',
]

categorical_cols = [
    'Protocol',
    'Inbound',
]

min_cols = [
    'Active Min',
    'Idle Min',

    'Bwd IAT Min',
    'Fwd IAT Min',
    'Flow IAT Min',
    
    'Bwd Packet Length Min',
    'Fwd Packet Length Min',
    'Min Packet Length',

    'min_seg_size_forward',
]

max_cols = [
    'Active Max',
    'Idle Max',

    'Bwd IAT Max',
    'Fwd IAT Max',
    'Flow IAT Max',
    
    'Max Packet Length',
    'Bwd Packet Length Max',
    'Fwd Packet Length Max',
]

mean_cols = [
    'Active Mean',
    'Idle Mean',

    'Bwd Avg Bulk Rate',
    'Fwd Avg Bulk Rate',
    
    'Bwd Avg Bytes/Bulk',
    'Fwd Avg Bytes/Bulk',
    
    'Bwd IAT Mean',
    'Fwd IAT Mean',
    'Flow IAT Mean',
    
    'Packet Length Mean',
    'Bwd Packet Length Mean',
    'Fwd Packet Length Mean',
    
    'Average Packet Size',
    'Bwd Avg Packets/Bulk',
    'Fwd Avg Packets/Bulk',
    
    'Avg Bwd Segment Size',
    'Avg Fwd Segment Size',
]

std_cols = [
    'Active Std',
    'Idle Std',
    
    'Bwd IAT Std',
    'Fwd IAT Std',
    'Flow IAT Std',

    'Packet Length Std',
    'Packet Length Variance',
    'Bwd Packet Length Std',
    'Fwd Packet Length Std',
]

unused_cols = [
    'Unnamed: 0',
    'Flow ID',
    'Source IP',
    'Destination IP',
    'SimillarHTTP',
    'Label',
    'Fwd Header Length.1'
] + port_cols

In [None]:
ddos_clean = ddos.copy().drop(columns=unused_cols)

In [None]:
ddos[ddos["Label"] != 'Syn']

In [None]:
benign = ddos[ddos["Label"] == 'BENIGN']
syn = ddos[ddos["Label"] == 'Syn']

In [None]:
max_port = max(ddos['Destination Port'].max(), ddos['Source Port'].max())
max_port

In [None]:
syn.head()