# TabTransformer

## Setting device and seed

In [1]:
import copy
import itertools
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

In [2]:
SEED = 42


def fix_random(seed: int) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower


fix_random(SEED)

if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
# device = torch.device('cpu')

print('Using device:', device)


Using device: cuda


## Model definition

In [3]:
# class TabTransformer(torch.nn.Module):
#     def __init__(self, num_features, num_classes, dim_embedding=8, num_heads=2, num_layers=2):
#         super(TabTransformer, self).__init__()
#         self.embedding = torch.nn.Linear(num_features, dim_embedding)
#         encoder_layer = torch.nn.TransformerEncoderLayer(d_model=dim_embedding, nhead=num_heads, batch_first=True)
#         self.transformer = torch.nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
#         self.classifier = torch.nn.Linear(dim_embedding, num_classes)
#
#     def forward(self, x):
#         x = self.embedding(x)
#         x = x.unsqueeze(1)  # Adding a sequence length dimension
#         x = self.transformer(x)
#         x = torch.mean(x, dim=1)  # Pooling
#         x = self.classifier(x)
#         return x

import torch
import torch.nn as nn


class TabTransformer(nn.Module):
    def __init__(self, cat_dims, num_numerical, num_classes, dim_embedding=8, num_heads=2, num_layers=2, dropout=0.1):
        """
        Args:
            cat_dims: List of integers, dove ogni elemento rappresenta i valori unici di una colonna categoriale.
            num_numerical: Numero di caratteristiche numeriche.
            num_classes: Numero di classi per output.
            dim_embedding: Dimensione degli embeddings.
            num_heads: Numero di "head" nel Multi-Head Attention.
            num_layers: Numero di livelli Transformer.
            dropout: Dropout per prevenire overfitting.
        """
        super(TabTransformer, self).__init__()

        # Embeddings per features categoriali
        self.embeddings = nn.ModuleList([
            nn.Embedding(cat_dim, dim_embedding) for cat_dim in cat_dims
        ])

        # Layer per le features numeriche
        self.numerical_norm = nn.LayerNorm(num_numerical) if num_numerical > 0 else None

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_embedding,
            nhead=num_heads,
            dim_feedforward=dim_embedding * 4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classificatore finale
        self.classifier = nn.Sequential(
            nn.Linear(len(cat_dims) * dim_embedding + (num_numerical if num_numerical > 0 else 0), 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )

    def forward(self, x_cat, x_num):
        """
        Args:
            x_cat: Tensore (batch_size, num_categorical_features), indici per features categoriali.
            x_num: Tensore (batch_size, num_numerical_features), valori numerici.
        Returns:
            Logits (batch_size, num_classes).
        """
        # Embedding per features categoriali
        x_cat = x_cat.long()
        cat_embeddings = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        cat_embeddings = torch.stack(cat_embeddings, dim=1)  # (batch_size, num_categorical_features, dim_embedding)

        # Passa attraverso il Transformer
        transformed_cat = self.transformer(cat_embeddings)  # (batch_size, num_categorical_features, dim_embedding)
        transformed_cat = transformed_cat.view(transformed_cat.size(0), -1)  # Flatten per concatenare

        # Normalizzazione delle features numeriche
        if x_num is not None and self.numerical_norm is not None:
            x_num = self.numerical_norm(x_num)

        # Concatenazione
        if x_num is not None:
            x = torch.cat([transformed_cat, x_num], dim=1)
        else:
            x = transformed_cat

        # Classificatore
        logits = self.classifier(x)
        return logits



## Training and test utilities

In [4]:
from sklearn.metrics import balanced_accuracy_score
import time


def train_model(model, criterion, optimizer, epochs, data_loader, val_loader, device, scheduler, patience):
    n_iter = 0

    best_model = None
    best_val_loss = float('inf')
    epochs_since_last_improvement = 0

    start = time.time()

    loss_history = []
    val_loss_history = []

    for epoch in range(epochs):
        model.train()

        start_epoch = time.time()

        loss_train = 0

        for x_cat, x_num, targets in data_loader:
            print(f'Epoch [{epoch}/{epochs}] - {time.time() - start_epoch:.2f} seconds - Train Loss: {loss_train:.6f}', end='\r')
            x_cat, x_num, targets = x_cat.to(device), x_num.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(x_cat, x_num)  # Passa entrambe le componenti
            loss = criterion(outputs, targets.long())
            loss.backward()
            optimizer.step()
            n_iter += 1
            loss_train += loss.item()

        scheduler.step()
        loss_train /= len(data_loader)

        # Compute Val Loss
        val_loss, y_pred, y_true = test_model(model, criterion, val_loader)
        # y_true, y_pred, y_pred_probs = test_model(model, val_loader, device)
        # val_loss = criterion(y_pred_probs, y_true)


        loss_history.append(loss_train)
        val_loss_history.append(val_loss)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # print('Starting deep copy')
            # best_model = copy.deepcopy(model)
            epochs_since_last_improvement = 0
        elif epochs_since_last_improvement >= patience:
            break
        else:
            epochs_since_last_improvement += 1

        # print('Epoch [{}/{}] - {:.2f} seconds - train_loss: {:.6f} - val_loss: {:.6f} - patience: {}'.format(epoch ,
        #                                                                                                      epochs, time.time() - start_epoch, loss_train, val_loss, epochs_since_last_improvement), end='\r')
        # calculate balanced accuracy
        balanced_accuracy = balanced_accuracy_score(
            y_true.detach().cpu().numpy(),
            y_pred.detach().cpu().numpy()
        )
        print(f'Epoch [{epoch}/{epochs}] - {time.time() - start_epoch:.2f} seconds - Train Loss: {loss_train:.6f} - Val Loss: {val_loss:.6f} - Val Balanced Accuracy: {balanced_accuracy:.6f}')

    print('\nTraining ended after {:.2f} seconds - Best val_loss: {:.6f}'.format(time.time() - start, best_val_loss))

    return best_model, loss_history, val_loss_history


def test_model(model, criterion, loader):
    model.eval()
    y_pred = []
    y_true = []

    total_loss = 0.0

    with torch.no_grad():  # Turn off gradient tracking
        for x_cat, x_num, targets in loader:
            x_cat, x_num, targets = x_cat.to(device), x_num.to(device), targets.to(device)
            preds = model(x_cat, x_num)  # Outputs logits or probabilities

            loss = criterion(preds, targets.long())
            total_loss += loss.item()

            # Convert predictions to class labels
            predicted_classes = torch.argmax(preds, dim=1)

            # Accumulate predictions and targets
            y_pred.append(predicted_classes.cpu())
            y_true.append(targets.cpu())

    # Concatenate tensors only after the loop to minimize memory usage
    y_pred = torch.cat(y_pred, dim=0)
    y_true = torch.cat(y_true, dim=0)

    avg_loss = total_loss / len(loader)
    return avg_loss, y_pred, y_true

# def test_model(model, loader, device):
#     model.eval()
#     # y_pred = torch.tensor([], requires_grad=True).to(device)
#     # y_true = torch.tensor([], requires_grad=True).to(device)
#     y_pred = []
#     y_test = []
#
#     total_loss = 0.0
#
#
#     for x_cat, x_num, targets in loader:
#         x_cat, x_num, targets = x_cat.to(device), x_num.to(device), targets.to(device)
#         y_pred += model(x_cat, x_num)
#         y_test += targets
#
#     y_pred = torch.stack(y_pred).squeeze()
#     y_test = torch.stack(y_test).squeeze()
#     # y_pred_c is the class with the highest probability
#     y_pred_c = y_pred.argmax(dim=1, keepdim=True).squeeze()
#
#
#     # loss = criterion(preds, targets.long())
#     # total_loss += loss.item()
#     # y_pred = torch.cat((y_pred, preds.squeeze()))
#     # y_true = torch.cat((y_true, targets.detach()))
#     # avg_loss = total_loss / len(loader)
#     return y_test, y_pred_c, y_pred

# def test_model(model, criterion, loader):
#     model.eval()
#     # y_pred = torch.tensor([], requires_grad=True).to(device)
#     # y_true = torch.tensor([], requires_grad=True).to(device)
#     y_pred = torch.tensor([]).to(device)
#     y_true = torch.tensor([]).to(device)
#
#     total_loss = 0.0
#
#     for x_cat, x_num, targets in loader:
#         x_cat, x_num, targets = x_cat.to(device), x_num.to(device), targets.to(device)
#         preds = model(x_cat, x_num)
#         loss = criterion(preds, targets.long())
#         total_loss += loss.item()
#         y_pred = torch.cat((y_pred, preds.squeeze()))
#         y_true = torch.cat((y_true, targets.detach()))
#
#     avg_loss = total_loss / len(loader)
#     return avg_loss, y_pred.squeeze(), y_true.squeeze()

## Define train, validation and test sets

In [5]:
save_in_test_folder = True
if save_in_test_folder:
    filepath = "../TestModule"
else:
    filepath = ".."

seed = 42
FILENAME = "dataset/train_dataset.csv"

#Prepare train data
df1 = pd.read_csv(FILENAME, sep=",", low_memory=False)

# get features names
features = list(df1.columns)
# features_to_remove = ["label", "ts", "src_ip", "dst_ip", "dns_query", "ssl_subject", "ssl_issuer", "http_uri", "type", "http_referrer", "http_user_agent"]
features_to_remove = ["label", "type", "ts", "http_referrer"]
features = [feature for feature in features if feature not in features_to_remove]
df1 = df1[features + ["type"]]

# Converte i valori in numeri, sostituendo quelli non validi con NaN
df1["src_bytes"] = pd.to_numeric(df1["src_bytes"], errors='coerce')
# Filtra le righe con NaN (valori non convertibili)
df1 = df1.dropna(subset=["src_bytes"])
# Converte i valori rimasti in interi
df1.loc[:, "src_bytes"] = df1["src_bytes"].astype(int)

print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))
df1 = df1.dropna()
print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))

X = df1[features]
y = df1["type"]

le = preprocessing.LabelEncoder()
le.fit(y)
with open(f"{filepath}/transformer/target_encoder.save", "wb") as f:
    pickle.dump(le, f)

y = le.transform(y)
num_classes = len(le.classes_)

indices = np.arange(X.shape[0])
train_idx, val_idx = train_test_split(indices, test_size=0.2, stratify=y, random_state=seed)

# fold = np.zeros(X.shape[0])
# fold[train_idx] = -1

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold = np.full(len(y), -1)  # Inizializza tutto con -1 (default: train)

# Assegna i fold ai campioni
for fold_number, (_, val_idx) in enumerate(skf.split(X, y)):
    fold[val_idx] = fold_number  # Assegna il numero del fold ai campioni di validazione

ps = PredefinedSplit(fold)
ps.get_n_splits()

# for i, (train_index, test_index) in enumerate(ps.split()):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}")
#     print(f"  Test:  index={test_index}")

# take only x with index in val_idx
X_val = X.iloc[val_idx]
y_val = y[val_idx]
X_train = X.iloc[train_idx]
y_train = y[train_idx]

#Righe: 616983 #Colonne: 43
#Righe: 616983 #Colonne: 43


## Preprocessing

In [6]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import pickle

categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
numeric_columns = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
# boolean_columns = X_train.select_dtypes(include=["bool"]).columns.tolist()

class CustomOrdinalEncoder(OrdinalEncoder):
    def transform(self, X):
        encoded = super().transform(X)
        # Shift all values by +1 and replace unknown_value (-1) with 0
        return np.where(encoded == -1, 0, encoded + 1)

    def inverse_transform(self, X):
        # Handle the inverse transform to account for the +1 offset
        X = np.where(X == 0, -1, X - 1)
        return super().inverse_transform(X)


ct = ColumnTransformer(
    [
        ("cat", CustomOrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns),  # Trasforma le colonne categoriche
        # ("ordinal", OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False), categorical_columns),  # Trasforma le colonne categoriche
        ("scale", StandardScaler(), numeric_columns)  # Normalizza le colonne numeriche
    ],
    remainder="passthrough"  # Mantieni le altre colonne invariate
)
ct.set_output(transform="pandas")

ct = ct.fit(X_train)
with open(f"{filepath}/transformer/transformer_tf.save", "wb") as f:
    pickle.dump(ct, f)

# train set
X_train = ct.transform(X_train)

cat_idxs = [i for i, f in enumerate(X_train.columns) if "cat__" in f]
cat_dims = [len(X_train[f].unique()) + 1 for i, f in enumerate(X_train.columns) if "cat__" in f]
num_idxs = [i for i, f in enumerate(X_train.columns) if "scale__" in f]
numeric_columns_number = len(num_idxs)

X_train = X_train.to_numpy()
# validation set
X_val = ct.transform(X_val).to_numpy()

# X
X = ct.transform(X).to_numpy()

## Define weights for unbalanced classes

In [7]:
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights))
sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
print(class_weights)

{0: 4.069469865611345, 1: 0.3381003918130257, 2: 1.132545546326465, 3: 4.543735616312253, 4: 98.7172, 5: 2.9863625363020327, 6: 1.1966881637007225, 7: 63.85329883570505, 8: 0.28803534018428717, 9: 0.9751965859248429}


## Create DataLoader

In [8]:
class TabDataset(torch.utils.data.Dataset):
    def __init__(self, x_cat, x_num, y):
        self.x_cat = x_cat
        self.x_num = x_num
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x_cat[idx], self.x_num[idx], self.y[idx]



X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# filter the categorical and numerical features
X_cat_train = X_train_tensor[:, cat_idxs]
X_num_train = X_train_tensor[:, num_idxs]


X_cat_val = X_val_tensor[:, cat_idxs]
X_num_val = X_val_tensor[:, num_idxs]

# X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TabDataset(X_cat_train, X_num_train, y_train_tensor)
val_dataset = TabDataset(X_cat_val, X_num_val, y_val_tensor)

# val_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_val_tensor, y_val_tensor), batch_size=y_val.shape[0], shuffle=False)
# test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor), batch_size=y_test.shape[0], shuffle=False)

## Hyperparameters configuration

In [9]:
nums_epochs = [1000]
batch_sizes = [1024]
patience = [20]
dim_embedding = [8]
num_heads = [8]
num_layers = [2]
learning_rate = [0.001]
hyperparameters = list(itertools.product(nums_epochs, batch_sizes, patience, dim_embedding, num_heads, num_layers, learning_rate))
n_comb = len(hyperparameters)
print(f'Number of hyperparameter combinations: {n_comb}')

Number of hyperparameter combinations: 1


## Training

In [10]:
# Initialize the model, loss, and optimizer
best_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(list(class_weights.values()), dtype=torch.float32).to(device))
current_iter = 0
for epochs, batch_size, patience_, dim_embedding_, num_heads_, num_layers_, lr in hyperparameters:

    print(f'Iteration {current_iter + 1}/{n_comb} - Hyperparameters: epochs={epochs}, batch_size={batch_size}, patience={patience_}, dim_embedding={dim_embedding_}, num_heads={num_heads_}, num_layers={num_layers_}, lr={lr}')

    # Modello TabTransformer
    model = TabTransformer(cat_dims, numeric_columns_number, num_classes, dim_embedding=dim_embedding_, num_heads=num_heads_, num_layers=num_layers_).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    # Training
    model, loss_history, val_loss_history = train_model(
        model, criterion, optimizer, epochs, train_loader, val_loader, device, scheduler, patience_
    )

    # Validation
    val_loss, y_pred, y_true = test_model(model, criterion, val_loader)
    # y_true, _, y_pred = test_model(model, val_loader, device)
    # val_loss = criterion(y_pred, y_true)

    if val_loss < best_loss:
        best_loss = val_loss
        with open(f"{filepath}/model/model_tf.save", "wb") as f:
            torch.save(model, f)
        # best_model = copy.deepcopy(model)
        best_hyperparameters = f"epochs={epochs}, batch_size={batch_size}, patience={patience_}, dim_embedding={dim_embedding_}, num_heads={num_heads_}, num_layers={num_layers_}, lr={lr}"

    print(f'Hyperparameters: epochs={epochs}, batch_size={batch_size}, patience={patience_}, dim_embedding={dim_embedding_}, num_heads={num_heads_}, num_layers={num_layers_}, lr={lr}')
    print(f'Validation Loss: {val_loss}')

    current_iter += 1

Iteration 1/1 - Hyperparameters: epochs=1000, batch_size=1024, patience=20, dim_embedding=8, num_heads=8, num_layers=2, lr=0.001
Epoch [0/1000] - 23.49 seconds - Train Loss: 0.465393 - Val Loss: 0.170987 - Val Balanced Accuracy: 0.942187
Epoch [1/1000] - 23.10 seconds - Train Loss: 0.167314 - Val Loss: 0.112947 - Val Balanced Accuracy: 0.951410
Epoch [2/1000] - 23.77 seconds - Train Loss: 0.132189 - Val Loss: 0.109431 - Val Balanced Accuracy: 0.960977
Epoch [3/1000] - 23.74 seconds - Train Loss: 0.110198 - Val Loss: 0.136625 - Val Balanced Accuracy: 0.959941
Epoch [4/1000] - 23.31 seconds - Train Loss: 0.102455 - Val Loss: 0.083260 - Val Balanced Accuracy: 0.966621
Epoch [5/1000] - 23.39 seconds - Train Loss: 0.091284 - Val Loss: 0.077487 - Val Balanced Accuracy: 0.973264
Epoch [6/1000] - 23.28 seconds - Train Loss: 0.086682 - Val Loss: 0.068761 - Val Balanced Accuracy: 0.973750
Epoch [7/1000] - 23.14 seconds - Train Loss: 0.078860 - Val Loss: 0.062875 - Val Balanced Accuracy: 0.975266

AttributeError: 'NoneType' object has no attribute 'eval'

## Test

In [None]:
best_model = torch.load(f"{filepath}/model/model_tf.save")
# test_loss, y_pred, y_true = test_model(best_model, criterion, test_loader)
test_loss, y_pred, y_true = test_model(best_model, criterion, val_loader)
y_pred = torch.argmax(y_pred, dim=1)
print(f'Best hyperparameters: {best_hyperparameters}')
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {accuracy_score(y_true.detach().numpy(), y_pred.detach().numpy())}')
plt.plot(loss_history, label='train_loss')
plt.plot(val_loss_history, label='val_loss')
plt.legend()
plt.show()