# TabTransformer

## Setting device and seed

In [None]:

import itertools
import pickle
import random

import matplotlib.pyplot as plt
import pandas as pd

import torch
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight


In [None]:
import numpy as np
SEED = 42  # Seed handed to fix_random() below to seed the NumPy, random and torch generators.

def fix_random(seed: int) -> None:
    """Seed every random number generator used in this notebook.

    Seeds NumPy, the standard-library ``random`` module and PyTorch (CPU and
    the current CUDA device), then switches cuDNN to its deterministic mode.

    Args:
        seed: Value passed unchanged to each seeding function.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels with autotuning disabled (slower).
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


# Seed all generators before anything random happens in the notebook.
fix_random(SEED)

# Pick the compute backend: CUDA first, then Apple MPS, otherwise the CPU.
# The MPS check is only evaluated when CUDA is unavailable.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
# device = torch.device('cpu')

print('Using device:', device)


## Model definition

In [None]:
import torch.nn as nn


class TabTransformer(nn.Module):
    """Transformer encoder over categorical embeddings followed by an MLP classifier.

    Each categorical column gets its own embedding table. The embedded columns
    are treated as a sequence and passed through a Transformer encoder; the
    result is flattened, concatenated with the layer-normalised numerical
    features and fed to a three-layer classifier.
    """

    def __init__(self, cat_dims, num_numerical, num_classes, dim_embedding=8, num_heads=2, num_layers=2, dropout=0.1, hidden_size=None):
        """
        Args:
            cat_dims: List of integers; each entry is the number of rows of the
                embedding table of one categorical column (i.e. the number of
                distinct index values that column can take).
            num_numerical: Number of numerical features.
            num_classes: Number of output classes.
            dim_embedding: Size of every embedding vector (also the Transformer
                ``d_model``).
            num_heads: Number of heads in the multi-head attention.
            num_layers: Number of Transformer encoder layers.
            dropout: Dropout probability used in the encoder and the classifier.
            hidden_size: Width of the two hidden classifier layers. When
                ``None`` the classifier input width is used.
        """
        super().__init__()

        # One embedding table per categorical feature.
        self.embeddings = nn.ModuleList([
            nn.Embedding(cat_dim, dim_embedding) for cat_dim in cat_dims
        ])

        # Normalisation layer for the numerical features (absent when there are none).
        self.numerical_norm = nn.LayerNorm(num_numerical) if num_numerical > 0 else None

        # Transformer encoder applied to the sequence of column embeddings.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_embedding,
            nhead=num_heads,
            dim_feedforward=dim_embedding * 4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Flattened embeddings plus the raw numerical columns feed the classifier.
        classifier_in = len(cat_dims) * dim_embedding + max(num_numerical, 0)
        if hidden_size is None:
            # The signature default used to reach nn.Linear as None and crash.
            hidden_size = classifier_in

        # Final classifier.
        self.classifier = nn.Sequential(
            nn.Linear(classifier_in, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x_cat, x_num):
        """
        Args:
            x_cat: Tensor (batch_size, num_categorical_features) holding the
                embedding indices of the categorical features.
            x_num: Tensor (batch_size, num_numerical_features) with the
                numerical values, or ``None`` when there are none.
        Returns:
            Logits of shape (batch_size, num_classes).
        """
        # Embed every categorical column with its own table.
        x_cat = x_cat.long()
        cat_embeddings = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        cat_embeddings = torch.stack(cat_embeddings, dim=1)  # (batch_size, num_categorical_features, dim_embedding)

        # Contextualise the column embeddings with the Transformer.
        transformed_cat = self.transformer(cat_embeddings)  # (batch_size, num_categorical_features, dim_embedding)
        # flatten() also works when the encoder output is not contiguous, unlike view().
        transformed_cat = transformed_cat.flatten(start_dim=1)

        # Normalise the numerical features.
        if x_num is not None and self.numerical_norm is not None:
            x_num = self.numerical_norm(x_num)

        # Concatenate both feature groups.
        if x_num is not None:
            x = torch.cat([transformed_cat, x_num], dim=1)
        else:
            x = transformed_cat

        # Classifier head.
        logits = self.classifier(x)
        return logits


import torch
import numpy as np


class PyTorchTabTransformer:
    """Thin wrapper exposing a ``predict(X)`` method around a two-input PyTorch model.

    The wrapped model is called as ``model(x_cat, x_num)``; this class splits a
    single 2-D feature matrix into those two inputs using the stored column
    indices.
    """

    def __init__(self, model, cat_idx, num_idx, device='cpu'):
        """
        Args:
            model: Module called as ``model(x_cat, x_num)`` and returning logits.
            cat_idx: Column indices of the categorical features in ``X``.
            num_idx: Column indices of the numerical features in ``X``.
            device: Device the model is moved to and inputs are sent to.
        """
        self.model = model
        self.device = device
        self.model.to(self.device)
        self.cat_idx = cat_idx
        self.num_idx = num_idx

    def predict(self, X):
        """Return the predicted class index for every row of ``X``.

        Args:
            X: 2-D feature matrix. A ``torch.Tensor`` is used as is; anything
                else (NumPy array, nested list, ...) is converted through
                ``np.asarray``.

        Returns:
            1-D NumPy array with the argmax of the model output for each row.
        """
        self.model.eval()  # evaluation mode (disables dropout)
        with torch.no_grad():
            if not isinstance(X, torch.Tensor):
                X = torch.as_tensor(np.asarray(X, dtype=np.float32))
            # Always land on the model's device as float32, whatever the caller passed.
            X = X.to(device=self.device, dtype=torch.float32)

            # Split the matrix into its categorical and numerical columns.
            y_pred = self.model(X[:, self.cat_idx].long(),
                                X[:, self.num_idx])
            return torch.argmax(y_pred, dim=1).cpu().numpy()

## Training and test utilities

In [None]:
from sklearn.metrics import balanced_accuracy_score, f1_score
import time


def train_model(model, criterion, optimizer, epochs, data_loader, val_loader, device, scheduler, patience, cat_idxs, num_idxs):
    """Train ``model`` with early stopping on the validation loss.

    After every epoch the model is evaluated with ``test_model``. Whenever the
    validation loss improves on the best one by more than a fixed threshold,
    the model is saved to ``{filepath}/models/model_best_tf.save`` and a
    pickled ``PyTorchTabTransformer`` wrapper is written next to it
    (``filepath`` is a module-level variable). Training stops when no such
    improvement happened for ``patience`` consecutive epochs.

    Args:
        model: Module called as ``model(x_cat, x_num)``.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer updating the parameters of ``model``.
        epochs: Maximum number of epochs.
        data_loader: Iterable of ``(x_cat, x_num, targets)`` training batches.
        val_loader: Iterable of validation batches with the same layout.
        device: Device the training batches are moved to.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        patience: Number of non-improving epochs tolerated before stopping.
        cat_idxs: Categorical column indices stored in the pickled wrapper.
        num_idxs: Numerical column indices stored in the pickled wrapper.

    Returns:
        Tuple ``(model, loss_history, val_loss_history, best_val_loss)`` where
        ``model`` carries the weights of the best checkpoint.
    """
    import copy  # local import so the block does not depend on a file-level one

    # Minimum decrease of the validation loss that counts as an improvement.
    threshold = 0.01

    best_state = None
    best_val_loss = float('inf')
    epochs_since_last_improvement = 0

    start = time.time()

    loss_history = []
    val_loss_history = []

    for epoch in range(epochs):
        model.train()

        start_epoch = time.time()

        loss_train = 0.0

        for x_cat, x_num, targets in data_loader:
            x_cat, x_num, targets = x_cat.to(device), x_num.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(x_cat, x_num)
            loss = criterion(outputs, targets.long())
            loss.backward()
            optimizer.step()
            loss_train += loss.item()

        scheduler.step()
        loss_train /= len(data_loader)

        # Validation loss and predictions for this epoch.
        val_loss, y_pred, y_true = test_model(model, criterion, val_loader)
        loss_history.append(loss_train)
        val_loss_history.append(val_loss)  # was never recorded, so callers got an empty list

        # Early stopping with threshold.
        if val_loss < best_val_loss - threshold:
            best_val_loss = val_loss
            # Keep the best weights in memory so they can be restored without
            # unpickling a full module from disk.
            best_state = copy.deepcopy(model.state_dict())
            torch.save(model, f"{filepath}/models/model_best_tf.save")
            with open(f"{filepath}/models/model_best_tf_custom.save", "wb") as f:
                wrapped_model = PyTorchTabTransformer(model, cat_idxs, num_idxs, device)
                pickle.dump(wrapped_model, f)

            epochs_since_last_improvement = 0
        elif epochs_since_last_improvement >= patience:
            break
        else:
            epochs_since_last_improvement += 1

        balanced_accuracy = balanced_accuracy_score(
            y_true.detach().cpu().numpy(),
            y_pred.detach().cpu().numpy()
        )
        print(f'Epoch [{epoch}/{epochs}] - {time.time() - start_epoch:.2f} seconds - Train Loss: {loss_train:.6f} - Val Loss: {val_loss:.6f} - Val Balanced Accuracy: {balanced_accuracy:.6f}')

    print('\nTraining ended after {:.2f} seconds - Best val_loss: {:.6f}'.format(time.time() - start, best_val_loss))

    # Restore the best checkpoint. If no epoch ever improved (e.g. the loss was
    # NaN from the start) the model is returned as trained instead of loading a
    # stale file left over from a previous run.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model, loss_history, val_loss_history, best_val_loss


def test_model(model, criterion, loader):
    model.eval()
    y_pred = []
    y_true = []

    total_loss = 0.0

    with torch.no_grad():  # Turn off gradient tracking
        for x_cat, x_num, targets in loader:
            x_cat, x_num, targets = x_cat.to(device), x_num.to(device), targets.to(device)
            preds = model(x_cat, x_num)  # Outputs logits or probabilities

            loss = criterion(preds, targets.long())
            total_loss += loss.item()

            # Convert predictions to class labels
            predicted_classes = torch.argmax(preds, dim=1)

            # Accumulate predictions and targets
            y_pred.append(predicted_classes.cpu())
            y_true.append(targets.cpu())

    # Concatenate tensors only after the loop to minimize memory usage
    y_pred = torch.cat(y_pred, dim=0)
    y_true = torch.cat(y_true, dim=0)

    avg_loss = total_loss / len(loader)
    return avg_loss, y_pred, y_true


def evaluate_model(y_test, y_pred):
    """Score predictions against the ground truth.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels.

    Returns:
        Dict with the accuracy (``"acc"``), the balanced accuracy (``"bacc"``)
        and the weighted F1 score (``"f1"``).
    """
    return {
        "acc": accuracy_score(y_test, y_pred),
        "bacc": balanced_accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred, average="weighted"),
    }


## Define train, validation and test sets

In [None]:
# Root folder under which the fitted encoders and models are written.
save_in_test_folder = True
filepath = "../TestModule" if save_in_test_folder else ".."

seed = 42
FILENAME = "dataset/train_dataset.csv"

# Prepare train data
df1 = pd.read_csv(FILENAME, sep=",", low_memory=False)

# Keep every column except the excluded ones as a feature; "type" is the target.
# features_to_remove = ["label", "ts", "src_ip", "dst_ip", "dns_query", "ssl_subject", "ssl_issuer", "http_uri", "type", "http_referrer", "http_user_agent"]
features_to_remove = ["label", "type", "ts", "http_referrer"]
features = [feature for feature in df1.columns if feature not in features_to_remove]
# .copy() detaches the selection from the frame read from disk, so the column
# assignment below does not write into a slice.
df1 = df1[features + ["type"]].copy()

# Convert src_bytes to numbers, turning invalid values into NaN ...
df1["src_bytes"] = pd.to_numeric(df1["src_bytes"], errors='coerce')
# ... drop the rows that could not be converted and store the rest as integers.
# An explicit int64 keeps the column inside the numeric dtypes selected later.
df1 = df1.dropna(subset=["src_bytes"]).astype({"src_bytes": "int64"})

print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))
df1 = df1.dropna()
print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))

X = df1[features]
y = df1["type"]

# Encode the target labels as integers and persist the encoder.
le = preprocessing.LabelEncoder()
le.fit(y)
with open(f"{filepath}/transformer/target_encoder.save", "wb") as f:
    pickle.dump(le, f)

y = le.transform(y)
num_classes = len(le.classes_)

# Stratified 80/20 hold-out split, expressed as positional indices.
indices = np.arange(X.shape[0])
train_idx, val_idx = train_test_split(indices, test_size=0.2, stratify=y, random_state=seed)

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

fold = np.full(len(y), -1)  # start with -1 everywhere (default: train)

# Label every sample with the fold in which it is used for validation.
# The loop variable must not be called val_idx: that would overwrite the
# hold-out indices computed above and make X_val overlap X_train.
for fold_number, (_, fold_val_idx) in enumerate(skf.split(X, y)):
    fold[fold_val_idx] = fold_number

ps = PredefinedSplit(fold)

# Materialise the hold-out split.
X_val = X.iloc[val_idx]
y_val = y[val_idx]
X_train = X.iloc[train_idx]
y_train = y[train_idx]

## Preprocessing

In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import pickle

class CustomOrdinalEncoder(OrdinalEncoder):
    """Ordinal encoder whose codes are shifted by one so that 0 marks unknown values.

    The comparisons against -1 below mean the encoder is expected to be
    configured with ``unknown_value=-1``.
    """

    def transform(self, X):
        """Encode ``X``; codes equal to -1 become 0, every other code is increased by 1."""
        codes = super().transform(X)
        is_unknown = codes == -1
        return np.where(is_unknown, 0, codes + 1)

    def inverse_transform(self, X):
        """Undo the +1 shift (0 goes back to -1) and delegate to the parent decoder."""
        is_reserved = X == 0
        unshifted = np.where(is_reserved, -1, X - 1)
        return super().inverse_transform(unshifted)

def preprocess(X_train,X_val):
    """Fit the feature transformer on the training set and apply it to both sets.

    Object columns are ordinal-encoded with ``CustomOrdinalEncoder``, int64 and
    float64 columns are standardised, and any other column is passed through.
    The fitted transformer is pickled to
    ``{filepath}/transformer/transformer_tf.save`` (``filepath`` is a
    module-level variable).

    Args:
        X_train: Training features as a DataFrame; the transformer is fitted on it.
        X_val: Validation features as a DataFrame; only transformed.

    Returns:
        Tuple ``(X_train, X_val, cat_idxs, cat_dims, num_idxs,
        numeric_columns_number)``: the two transformed sets as NumPy arrays,
        the positions of the categorical columns, the embedding size needed
        for each of them, the positions of the scaled numerical columns and
        their count.
    """
    categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
    numeric_columns = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()

    ct = ColumnTransformer(
        [
            ("cat", CustomOrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns),  # encode the categorical columns
            ("scale", StandardScaler(), numeric_columns)  # standardise the numerical columns
        ],
        remainder="passthrough"  # keep the remaining columns unchanged
    )
    ct.set_output(transform="pandas")

    ct = ct.fit(X_train)
    with open(f"{filepath}/transformer/transformer_tf.save", "wb") as f:
        pickle.dump(ct, f)

    # train set
    X_train = ct.transform(X_train)

    # Output columns are named "<transformer>__<original name>". Match on the
    # prefix only, so an original column name that merely contains "cat__" or
    # "scale__" cannot be put in the wrong group.
    cat_idxs, cat_dims, num_idxs = [], [], []
    for position, column in enumerate(X_train.columns):
        if column.startswith("cat__"):
            cat_idxs.append(position)
            # Distinct training codes plus one extra slot for code 0 (unknown).
            cat_dims.append(len(X_train[column].unique()) + 1)
        elif column.startswith("scale__"):
            num_idxs.append(position)
    numeric_columns_number = len(num_idxs)

    X_train = X_train.to_numpy()

    # validation set
    X_val = ct.transform(X_val).to_numpy()
    return X_train, X_val, cat_idxs, cat_dims, num_idxs, numeric_columns_number

## Define weights for imbalanced classes

In [None]:
# "balanced" class weights of the training labels, keyed by their position in
# the sorted array of classes present in y_train.
class_weights = {
    position: weight
    for position, weight in enumerate(
        compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
    )
}
# Matching per-sample weights for the training set.
sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
print(class_weights)

## Create DataLoader

In [None]:
class TabDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(categorical, numerical, target)`` triples by index."""

    def __init__(self, x_cat, x_num, y):
        """Store the three indexable containers without copying them."""
        self.x_cat, self.x_num, self.y = x_cat, x_num, y

    def __len__(self):
        """Number of samples, taken from the targets."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the categorical features, numerical features and target at ``idx``."""
        return tuple(part[idx] for part in (self.x_cat, self.x_num, self.y))


# X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train, dtype=torch.long)
#
# X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
# y_val_tensor = torch.tensor(y_val, dtype=torch.long)
#
# # filter the categorical and numerical features
# X_cat_train = X_train_tensor[:, cat_idxs]
# X_num_train = X_train_tensor[:, num_idxs]
#
# X_cat_val = X_val_tensor[:, cat_idxs]
# X_num_val = X_val_tensor[:, num_idxs]
#
# # X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
# # y_test_tensor = torch.tensor(y_test, dtype=torch.long)
#
# train_dataset = TabDataset(X_cat_train, X_num_train, y_train_tensor)
# val_dataset = TabDataset(X_cat_val, X_num_val, y_val_tensor)
#
# # val_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_val_tensor, y_val_tensor), batch_size=y_val.shape[0], shuffle=False)
# # test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor), batch_size=y_test.shape[0], shuffle=False)

## Hyperparameters configuration

In [11]:
# Candidate values for every hyperparameter explored by the grid search.
nums_epochs = [1000]
batch_sizes = [512, 1024]
patience = [20]
dim_embedding = [8, 16]
num_heads = [4, 8]
num_layers = [2, 4]
hidden_sizes = [64, 128]
learning_rate = [0.01, 0.001]
dropout = [0, 0.3]

# Axis order defines the layout of each combination tuple:
# (epochs, batch_size, hidden_size, patience, dim_embedding, num_heads,
#  num_layers, learning_rate, dropout).
_grid_axes = (
    nums_epochs,
    batch_sizes,
    hidden_sizes,
    patience,
    dim_embedding,
    num_heads,
    num_layers,
    learning_rate,
    dropout,
)
hyperparameters = list(itertools.product(*_grid_axes))
n_comb = len(hyperparameters)
print(f'Number of hyperparameter combinations: {n_comb}')

Number of hyperparameter combinations: 128


## Training

In [None]:
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

kf = StratifiedKFold(n_splits=5)

# Grid search: every hyperparameter combination is scored with stratified
# 5-fold cross-validation and its averaged metrics are logged to TensorBoard.
best_loss = float('inf')  # not updated in this cell; presumably kept for later cells
current_iter = 0
# Loop variables carry a trailing underscore / distinct name so they do not
# overwrite the module-level `dropout` grid list or the `fold` array.
for epochs, batch_size, hidden_size, patience_, dim_embedding_, num_heads_, num_layers_, lr, dropout_ in hyperparameters:
    fix_random(seed)

    print(f'Iteration {current_iter + 1}/{n_comb} - Hyperparameters: epochs={epochs}, batch_size={batch_size}, hidden_size={hidden_size}, patience={patience_}, dim_embedding={dim_embedding_}, num_heads={num_heads_}, num_layers={num_layers_}, lr={lr}, dropout={dropout_}')

    log_name = f"B{batch_size}-hidden{hidden_size}-pat{patience_}-dim{dim_embedding_}-heads{num_heads_}-layers{num_layers_}-lr{lr}-drop{dropout_}"

    writer = SummaryWriter('runs/tab_transformer/' + log_name)
    accuracy_per_fold = []
    balanced_accuracy_score_per_fold = []
    f1_score_per_fold = []
    best_loss_per_fold = []

    for fold_id, (train_index, val_index) in enumerate(kf.split(X, y), start=1):
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        # The transformer is fitted on the training part of the fold only.
        X_train_fold, X_val_fold, cat_idxs_fold, cat_dims_fold, num_idxs_fold, numeric_columns_number_fold = preprocess(X_train_fold, X_val_fold)

        X_train_tensor_fold = torch.tensor(X_train_fold, dtype=torch.float32)
        y_train_tensor_fold = torch.tensor(y_train_fold, dtype=torch.long)

        X_val_tensor_fold = torch.tensor(X_val_fold, dtype=torch.float32)
        y_val_tensor_fold = torch.tensor(y_val_fold, dtype=torch.long)

        # Split the matrices into categorical (integer codes) and numerical columns.
        X_cat_train_fold = X_train_tensor_fold[:, cat_idxs_fold].long()
        X_num_train_fold = X_train_tensor_fold[:, num_idxs_fold]

        X_cat_val_fold = X_val_tensor_fold[:, cat_idxs_fold].long()
        X_num_val_fold = X_val_tensor_fold[:, num_idxs_fold]

        train_dataset_fold = TabDataset(X_cat_train_fold, X_num_train_fold, y_train_tensor_fold)
        val_dataset_fold = TabDataset(X_cat_val_fold, X_num_val_fold, y_val_tensor_fold)

        train_loader_fold = DataLoader(train_dataset_fold, batch_size=batch_size, shuffle=True)
        val_loader_fold = DataLoader(val_dataset_fold, batch_size=batch_size)

        # TabTransformer model for this fold.
        model = TabTransformer(cat_dims_fold, numeric_columns_number_fold, num_classes, dim_embedding=dim_embedding_, num_heads=num_heads_, num_layers=num_layers_, dropout=dropout_, hidden_size=hidden_size).to(device)

        # Class weights computed on the training part of the fold.
        class_weights_fold = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_fold), y=y_train_fold)

        criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights_fold, dtype=torch.float32).to(device))
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)

        # Training. The fold's best validation loss gets its own name: binding
        # it to `best_loss_per_fold` replaced the list with a float and made
        # the append below fail.
        model, loss_history, val_loss_history, best_val_loss_fold = train_model(
            model, criterion, optimizer, epochs, train_loader_fold, val_loader_fold, device, scheduler, patience_, cat_idxs_fold, num_idxs_fold
        )

        # Validation
        val_loss, y_pred, y_true = test_model(model, criterion, val_loader_fold)
        perf = evaluate_model(y_true.detach().cpu().numpy(), y_pred.detach().cpu().numpy())

        # The metrics are fractions in [0, 1]; scale them to match the "%" sign.
        print(f"Fold {fold_id} - Accuracy: {perf['acc'] * 100:.2f}%")
        print(f"Fold {fold_id} - Balanced Accuracy: {perf['bacc'] * 100:.2f}%")
        print(f"Fold {fold_id} - F1 Score: {perf['f1'] * 100:.2f}%")

        accuracy_per_fold.append(perf["acc"])
        balanced_accuracy_score_per_fold.append(perf["bacc"])
        f1_score_per_fold.append(perf["f1"])
        best_loss_per_fold.append(best_val_loss_fold)

    # Summary of the cross-validation results for this combination.
    print("Cross-Validation Results:")
    print(f"Average Accuracy: {np.mean(accuracy_per_fold) * 100:.2f}%")
    print(f"Standard Deviation of Accuracy: {np.std(accuracy_per_fold) * 100:.2f}%")
    print(f"Average Balanced Accuracy: {np.mean(balanced_accuracy_score_per_fold) * 100:.2f}%")
    print(f"Standard Deviation of Balanced Accuracy: {np.std(balanced_accuracy_score_per_fold) * 100:.2f}%")
    print(f"Average F1 Score: {np.mean(f1_score_per_fold) * 100:.2f}%")
    print(f"Standard Deviation of F1 Score: {np.std(f1_score_per_fold) * 100:.2f}%")

    # Log hyperparameters and averaged metrics to TensorBoard, then close the writer.
    writer.add_hparams(
        {
            'hparam/bsize': batch_size,
            'hparam/hidden_size': hidden_size,
            'hparam/epochs': epochs,
            'hparam/patience': patience_,
            'hparam/dim_embedding': dim_embedding_,
            'hparam/num_heads': num_heads_,
            'hparam/num_layers': num_layers_,
            'hparam/lr': lr,
            'hparam/dropout': dropout_
        },
        {
            'Best Loss': np.mean(best_loss_per_fold),
            'Avg Accuracy': np.mean(accuracy_per_fold),
            'Std Accuracy': np.std(accuracy_per_fold),
            'Avg Balanced Accuracy': np.mean(balanced_accuracy_score_per_fold),
            'Std Balanced Accuracy': np.std(balanced_accuracy_score_per_fold),
            'Avg F1 score': np.mean(f1_score_per_fold),
            'Std F1 score': np.std(f1_score_per_fold)
        }
    )
    writer.flush()
    current_iter += 1
    writer.close()

## Test

In [None]:
# best_model = torch.load(f"{filepath}/model/model_tf.save")
# # test_loss, y_pred, y_true = test_model(best_model, criterion, test_loader)
# test_loss, y_pred, y_true = test_model(best_model, criterion, val_loader)
# y_pred = torch.argmax(y_pred, dim=1)
# print(f'Best hyperparameters: {best_hyperparameters}')
# print(f'Test Loss: {test_loss}')
# print(f'Test Accuracy: {accuracy_score(y_true.detach().numpy(), y_pred.detach().numpy())}')
# plt.plot(loss_history, label='train_loss')
# plt.plot(val_loss_history, label='val_loss')
# plt.legend()
# plt.show()

# Hyperparameter tuning