In [1]:
# ===============================
# Transformer NIDS for CICIDS-2017 (Multiclass, Rare-class aware)
# Stable Kaggle-ready single cell (with extra metrics)
# ===============================
import os, gc, math, json, warnings, time
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    classification_report, f1_score, confusion_matrix,
    balanced_accuracy_score, matthews_corrcoef, cohen_kappa_score,
    precision_recall_fscore_support
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# ----------------- 0) Repro -----------------
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# ----------------- 1) Config -----------------
DATA_DIR = "/kaggle/input/cicids-2017"
OUT_DIR  = "/kaggle/working"

EPOCHS   = 8                # feel free to set 5
BATCH    = 1024
LR       = 3e-4             # a touch lower for stability
WD       = 1e-4
D_MODEL  = 96
N_HEADS  = 4
N_LAYERS = 4
FFW      = 256
DROPOUT  = 0.20
GAMMA    = 1.5              # slightly softer focal
CB_BETA  = 0.999            # less extreme class weights
WEIGHT_CLIP = 10.0          # cap very large class weights
GRAD_CLIP   = 1.0           # clip global grad-norm
PATIENCE    = 2             # early stop on macro-F1

MERGE_ATTEMPTED = False     # keep full label space by default

# ----------------- GPU Info -----------------
print("CUDA available:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}:", torch.cuda.get_device_name(i))

# ----------------- 2) Load Data -----------------
csv_files = [f for f in os.listdir(DATA_DIR) if f.endswith(".csv")]
assert len(csv_files) > 0, "No CSVs found in DATA_DIR"

dfs = []
for f in sorted(csv_files):
    print("Loading", f)
    df_part = pd.read_csv(os.path.join(DATA_DIR, f), low_memory=False)
    dfs.append(df_part)

df = pd.concat(dfs, ignore_index=True)
print("Full shape:", df.shape)

# ----------------- 3) Labels -----------------
df["Label"] = df["Label"].replace("BENIGN", "Normal")
if MERGE_ATTEMPTED:
    df["Label"] = df["Label"].str.replace(" - Attempted", "", regex=False)

y_le = LabelEncoder()
y = y_le.fit_transform(df["Label"])
classes = list(y_le.classes_)
num_classes = len(classes)
print("Classes:", classes)

# ----------------- 4) Feature Cleaning -----------------
drop_cols = [c for c in ["Flow ID","Src IP","Dst IP","Timestamp","Label"] if c in df.columns]
df = df.drop(columns=drop_cols)

cat_candidates = [c for c in df.columns if c.lower() in ("protocol", "proto", "protocolname")]
cat_cols = [c for c in cat_candidates if c in df.columns]

df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

nunq = df.nunique()
zero_var = nunq[nunq <= 1].index.tolist()
if zero_var:
    print("Dropping zero-variance cols:", zero_var)
    df = df.drop(columns=zero_var)

for c in cat_cols:
    df[c] = df[c].astype(str)
    le_c = LabelEncoder()
    df[c] = le_c.fit_transform(df[c])

num_cols = [c for c in df.columns if c not in cat_cols]

X_cat = df[cat_cols].astype(np.int32).values if cat_cols else None
X_num = df[num_cols].astype(np.float32).values if num_cols else None

del dfs, df
gc.collect()

# ----------------- 5) Split + Scale -----------------
X_cat_train, X_cat_temp, X_num_train, X_num_temp, y_train, y_temp = train_test_split(
    X_cat, X_num, y, test_size=0.30, random_state=SEED, stratify=y
)

X_cat_val, X_cat_test, X_num_val, X_num_test, y_val, y_test = train_test_split(
    X_cat_temp, X_num_temp, y_temp, test_size=0.50, random_state=SEED, stratify=y_temp
)

scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_num_train)
X_num_val   = scaler.transform(X_num_val)
X_num_test  = scaler.transform(X_num_test)

print("Train shape:", X_num_train.shape)

# ----------------- 6) Dataset -----------------
class TabularSet(Dataset):
    def __init__(self, X_num, X_cat, y):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long) if X_cat is not None else None
    def __len__(self): return len(self.y)
    def __getitem__(self, idx):
        return self.X_num[idx], (self.X_cat[idx] if self.X_cat is not None else None), self.y[idx]

train_ds = TabularSet(X_num_train, X_cat_train, y_train)
val_ds   = TabularSet(X_num_val,   X_cat_val,   y_val)
test_ds  = TabularSet(X_num_test,  X_cat_test,  y_test)

# ----------------- 7) Sampler for Imbalance -----------------
class_counts = np.bincount(y_train, minlength=num_classes)
inv_freq = 1.0 / np.maximum(class_counts, 1)
sample_weights = inv_freq[y_train]
train_sampler = WeightedRandomSampler(
    weights=torch.tensor(sample_weights, dtype=torch.double),
    num_samples=len(sample_weights),
    replacement=True
)

# ----------------- 8) Loaders -----------------
pin = torch.cuda.is_available()
num_workers = 4 if pin else 2
dloader_kwargs = dict(num_workers=num_workers, pin_memory=pin, persistent_workers=True, prefetch_factor=4)
train_loader = DataLoader(train_ds, batch_size=BATCH, sampler=train_sampler, **dloader_kwargs)
val_loader   = DataLoader(val_ds,   batch_size=BATCH, shuffle=False, **dloader_kwargs)
test_loader  = DataLoader(test_ds,  batch_size=BATCH, shuffle=False, **dloader_kwargs)

# ----------------- 9) Model -----------------
class CLSPooling(nn.Module):
    def forward(self, x): return x[:, 0, :]

def get_cat_cardinalities(arr, cols):
    if arr is None or not cols: return []
    return [int(arr[:, i].max()) + 1 for i in range(arr.shape[1])]

cat_cards = get_cat_cardinalities(X_cat_train, cat_cols)

class TabularTransformer(nn.Module):
    def __init__(self, n_num, cat_cards, num_classes,
                 d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS, ffw=FFW, dropout=DROPOUT):
        super().__init__()
        self.num_tokenizers = nn.ModuleList([nn.Linear(1, d_model) for _ in range(n_num)])
        self.cat_embeds = nn.ModuleList([nn.Embedding(c, d_model) for c in cat_cards])
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.pos = nn.Parameter(torch.randn(1, 1+n_num+len(cat_cards), d_model) * 0.02)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads, dim_feedforward=ffw,
            dropout=dropout, batch_first=True, activation="gelu"
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.pool = CLSPooling()
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes)
        )

    def forward(self, x_num, x_cat):
        B = x_num.size(0)
        num_tokens = torch.stack([tok(x_num[:, i:i+1]) for i, tok in enumerate(self.num_tokenizers)], dim=1)
        if self.cat_embeds:
            cat_tokens = torch.stack([emb(x_cat[:, i]) for i, emb in enumerate(self.cat_embeds)], dim=1)
        else:
            cat_tokens = num_tokens.new_zeros((B, 0, num_tokens.size(-1)))
        cls = self.cls.expand(B, 1, -1)
        tokens = torch.cat([cls, num_tokens, cat_tokens], dim=1)
        tokens = tokens + self.pos[:, :tokens.size(1), :]
        x = self.encoder(tokens)
        x = self.pool(x)
        x = self.norm(x)
        return self.head(x)

# ----------------- 10) Loss: Class-Balanced Focal (device-safe + clipped) -----------------
class ClassBalancedFocalLoss(nn.Module):
    def __init__(self, samples_per_class, num_classes, beta=CB_BETA, gamma=GAMMA, weight_clip=WEIGHT_CLIP):
        super().__init__()
        samples = torch.tensor(samples_per_class, dtype=torch.float32)
        eff_num = 1.0 - torch.pow(torch.tensor(beta, dtype=torch.float32), samples)
        weights = (1.0 - beta) / torch.clamp(eff_num, min=1e-6)
        # normalize then clip extreme weights
        weights = weights / weights.mean()
        if weight_clip is not None:
            weights = torch.clamp(weights, max=weight_clip)
        self.register_buffer("class_weights", weights)
        self.gamma = gamma

    def forward(self, logits, target):
        w = self.class_weights.to(logits.device, dtype=logits.dtype)
        ce = nn.functional.cross_entropy(logits, target, weight=w, reduction='none')
        with torch.no_grad():
            pt = torch.softmax(logits, dim=1).gather(1, target.view(-1,1)).squeeze(1)
        focal = ((1.0 - pt).clamp_(min=1e-6) ** self.gamma) * ce
        return focal.mean()

# ----------------- 11) Train/Eval -----------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TabularTransformer(len(num_cols), cat_cards, num_classes).to(device)

# Optional multi-GPU (simple)
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs with DataParallel")
    model = nn.DataParallel(model)

criterion = ClassBalancedFocalLoss(class_counts, num_classes).to(device)
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)

scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

# ---- Warmup -> Cosine scheduler (batch-wise) ----
total_steps = EPOCHS * len(train_loader)
warmup_steps = max(1000, int(0.05 * total_steps))
cosine_steps = max(1, total_steps - warmup_steps)

warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.2, total_iters=warmup_steps)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cosine_steps, eta_min=LR * 0.1)
sched = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_steps])

best_val_macro = -1.0
best_path = os.path.join(OUT_DIR, "best_transformer_cbfl.pt")
no_improve = 0

def evaluate(loader):
    model.eval()
    preds, labels = [], []
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
        for xb_num, xb_cat, yb in loader:
            xb_num, yb = xb_num.to(device), yb.to(device)
            xb_cat = xb_cat.to(device) if xb_cat is not None else None
            logits = model(xb_num, xb_cat)
            preds.append(torch.argmax(logits, 1).cpu().numpy())
            labels.append(yb.cpu().numpy())
    preds = np.concatenate(preds)
    labels = np.concatenate(labels)
    return f1_score(labels, preds, average="macro"), preds, labels

# ----------------- 12) Fit -----------------
global_step = 0
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    t0 = time.time()
    for i, (xb_num, xb_cat, yb) in enumerate(train_loader, 1):
        xb_num, yb = xb_num.to(device), yb.to(device)
        xb_cat = xb_cat.to(device) if xb_cat is not None else None

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(xb_num, xb_cat)
            loss = criterion(logits, yb)

        # NaN/Inf guard
        if not torch.isfinite(loss):
            print(f"Non-finite loss at step {global_step}: {loss.item()}, skipping step.")
            optimizer.zero_grad(set_to_none=True)
            global_step += 1
            sched.step()
            continue

        scaler.scale(loss).backward()

        # clip after unscale
        scaler.unscale_(optimizer)
        if GRAD_CLIP is not None:
            params = model.module.parameters() if isinstance(model, nn.DataParallel) else model.parameters()
            torch.nn.utils.clip_grad_norm_(params, GRAD_CLIP)

        scaler.step(optimizer)
        scaler.update()
        sched.step()
        total_loss += loss.item()
        global_step += 1

        if i % 200 == 0:
            elapsed = time.time() - t0
            steps_left = len(train_loader) - i
            est = elapsed / i * steps_left
            current_lr = optimizer.param_groups[0]["lr"]
            print(f"  step {i}/{len(train_loader)}  loss={total_loss/i:.4f}  lr={current_lr:.2e}  ETA/epoch ~ {est/60:.1f} min")

    val_macro, _, _ = evaluate(val_loader)
    print(f"Epoch {epoch}/{EPOCHS}  TrainLoss: {total_loss/len(train_loader):.4f}  ValMacroF1: {val_macro:.4f}")

    if val_macro > best_val_macro + 1e-4:
        best_val_macro = val_macro
        to_save = model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict()
        torch.save({
            "model": to_save,
            "classes": classes,
            "num_cols": num_cols,
            "cat_cols": cat_cols,
        }, best_path)
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print("Early stopping.")
            break

# ----------------- 13) Test -----------------
ckpt = torch.load(best_path, map_location=device)
state = ckpt["model"]
model.load_state_dict(state, strict=False)

test_macro, test_preds, test_labels = evaluate(test_loader)
print("Test Macro-F1:", test_macro)
print(classification_report(test_labels, test_preds, target_names=classes, zero_division=0))

# ---- Extra aggregation metrics (imbalance-robust) ----
bal_acc = balanced_accuracy_score(test_labels, test_preds)      # mean recall across classes
mcc     = matthews_corrcoef(test_labels, test_preds)            # chance-corrected correlation
kappa   = cohen_kappa_score(test_labels, test_preds)            # chance-corrected agreement
mp, mr, mf1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='macro', zero_division=0
)

print(f"\nBalanced Accuracy: {bal_acc:.6f}")
print(f"Matthews Corrcoef (MCC): {mcc:.6f}")
print(f"Cohen's Kappa: {kappa:.6f}")
print(f"Macro Precision: {mp:.6f} | Macro Recall: {mr:.6f} | Macro F1: {mf1:.6f}")

# Save artifacts
pd.DataFrame({
    "true": [classes[i] for i in test_labels],
    "pred": [classes[i] for i in test_preds]
}).to_csv(os.path.join(OUT_DIR, "test_preds.csv"), index=False)

rep = classification_report(test_labels, test_preds, target_names=classes, zero_division=0, output_dict=True)
pd.DataFrame(rep).to_csv(os.path.join(OUT_DIR, "classification_report.csv"))

cm = confusion_matrix(test_labels, test_preds, labels=list(range(num_classes)))
np.save(os.path.join(OUT_DIR, "confusion_matrix.npy"), cm)
print("Saved: test_preds.csv, classification_report.csv, confusion_matrix.npy")

CUDA available: True
GPU count: 2
GPU 0: Tesla T4
GPU 1: Tesla T4
Loading friday.csv
Loading friday_plus.csv
Loading monday.csv
Loading monday_plus.csv
Loading thursday.csv
Loading thursday_plus.csv
Loading tuesday.csv
Loading tuesday_plus.csv
Loading wednesday.csv
Loading wednesday_plus.csv
Full shape: (4199942, 105)
Classes: ['Botnet', 'Botnet - Attempted', 'DDoS', 'DoS GoldenEye', 'DoS GoldenEye - Attempted', 'DoS Hulk', 'DoS Hulk - Attempted', 'DoS Slowhttptest', 'DoS Slowhttptest - Attempted', 'DoS Slowloris', 'DoS Slowloris - Attempted', 'FTP-Patator', 'FTP-Patator - Attempted', 'Heartbleed', 'Infiltration', 'Infiltration - Attempted', 'Infiltration - Portscan', 'Normal', 'Portscan', 'SSH-Patator', 'SSH-Patator - Attempted', 'Web Attack - Brute Force', 'Web Attack - Brute Force - Attempted', 'Web Attack - SQL Injection', 'Web Attack - SQL Injection - Attempted', 'Web Attack - XSS', 'Web Attack - XSS - Attempted']
Train shape: (2939959, 101)
Using 2 GPUs with DataParallel
  step 2

In [2]:
# ===============================
# Imports & global setup
# ===============================
import os, gc, math, time, json, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# plotting (optional)
import matplotlib.pyplot as plt
%matplotlib inline

# sklearn bits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    classification_report, f1_score, confusion_matrix,
    balanced_accuracy_score, matthews_corrcoef, cohen_kappa_score,
    precision_recall_fscore_support
)

# torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# Repro
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
print("CUDA available:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

CUDA available: True
GPU count: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


In [4]:
# ===============================
# Config & paths (Kaggle)
# ===============================
# If you uploaded a custom dataset, adjust DATA_DIR below to match its folder name under /kaggle/input
CANDIDATE_DIRS = [
    "/kaggle/input/cicids-2017",
    "/kaggle/input/cicids2017",
    "/kaggle/input/cicids-2017-dataset",
    "/kaggle/input",  # last-resort scan
]

def find_data_dir():
    target_names = {"monday.csv","tuesday.csv","wednesday.csv","thursday.csv","friday.csv"}
    for base in CANDIDATE_DIRS:
        if not os.path.exists(base): 
            continue
        # exact folder
        if any(f.lower().endswith(".csv") for f in os.listdir(base)):
            names = {f.lower() for f in os.listdir(base)}
            if any(n in names for n in target_names):
                return base
        # scan one level down
        for sub in os.listdir(base):
            p = os.path.join(base, sub)
            if os.path.isdir(p) and any(f.lower().endswith(".csv") for f in os.listdir(p)):
                names = {f.lower() for f in os.listdir(p)}
                if any(n in names for n in target_names):
                    return p
    raise FileNotFoundError("Couldn't find CICIDS-2017 CSVs under /kaggle/input. Please check dataset path.")

DATA_DIR = find_data_dir()
OUT_DIR  = "/kaggle/working"

# Training knobs
EPOCHS   = 8
BATCH    = 1024
LR       = 3e-4
WD       = 1e-4
D_MODEL  = 96
N_HEADS  = 4
N_LAYERS = 4
FFW      = 256
DROPOUT  = 0.20

# Class-imbalance loss knobs
GAMMA      = 1.5
CB_BETA    = 0.999
WEIGHT_CLIP= 10.0
GRAD_CLIP  = 1.0

# Early stop on Val Macro-F1
PATIENCE   = 2

# Keep full label space (Attempted vs. Non-attempted)
MERGE_ATTEMPTED = False

print("Using DATA_DIR:", DATA_DIR)

Using DATA_DIR: /kaggle/input/cicids-2017


In [5]:
# ===============================
# Load CSVs
# ===============================
def load_all_csvs(data_dir):
    # usual CICIDS names, but we’ll just take every .csv to be safe
    csvs = sorted([f for f in os.listdir(data_dir) if f.lower().endswith(".csv")])
    if not csvs:
        raise FileNotFoundError("No CSV files found in DATA_DIR.")

    dfs = []
    for f in csvs:
        print("Loading", f)
        dfp = pd.read_csv(os.path.join(data_dir, f), low_memory=False)
        dfs.append(dfp)
    return pd.concat(dfs, ignore_index=True)

df = load_all_csvs(DATA_DIR)
print("Full shape:", df.shape)

# unifying labels
df["Label"] = df["Label"].replace("BENIGN", "Normal")
if MERGE_ATTEMPTED:
    df["Label"] = df["Label"].str.replace(" - Attempted", "", regex=False)

# Encode labels
y_le = LabelEncoder()
y = y_le.fit_transform(df["Label"])
classes = list(y_le.classes_)
num_classes = len(classes)
print("Classes:", classes)

Loading friday.csv
Loading friday_plus.csv
Loading monday.csv
Loading monday_plus.csv
Loading thursday.csv
Loading thursday_plus.csv
Loading tuesday.csv
Loading tuesday_plus.csv
Loading wednesday.csv
Loading wednesday_plus.csv
Full shape: (4199942, 105)
Classes: ['Botnet', 'Botnet - Attempted', 'DDoS', 'DoS GoldenEye', 'DoS GoldenEye - Attempted', 'DoS Hulk', 'DoS Hulk - Attempted', 'DoS Slowhttptest', 'DoS Slowhttptest - Attempted', 'DoS Slowloris', 'DoS Slowloris - Attempted', 'FTP-Patator', 'FTP-Patator - Attempted', 'Heartbleed', 'Infiltration', 'Infiltration - Attempted', 'Infiltration - Portscan', 'Normal', 'Portscan', 'SSH-Patator', 'SSH-Patator - Attempted', 'Web Attack - Brute Force', 'Web Attack - Brute Force - Attempted', 'Web Attack - SQL Injection', 'Web Attack - SQL Injection - Attempted', 'Web Attack - XSS', 'Web Attack - XSS - Attempted']


In [6]:
display(df.head())
print(df.dtypes.head(12))
print("Unique labels:", df["Label"].nunique())

Unnamed: 0,Src IP dec,Src Port,Dst IP dec,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,...,Local_5,Local_6,Local_7,Local_8,Local_9,Local_10,Local_11,Local_12,Local_13,Local_14
0,3232238130,56108,3232238083,3268,6,59:50.3,112740690,32,16,6448,...,,,,,,,,,,
1,3232238130,42144,3232238083,389,6,59:50.3,112740560,32,16,6448,...,,,,,,,,,,
2,134610945,0,134219268,0,0,00:31.4,113757377,545,0,0,...,,,,,,,,,,
3,3232238105,5353,3758096635,5353,17,00:42.9,91997219,388,0,37151,...,,,,,,,,,,
4,3232238105,123,301796989,123,17,00:42.4,66966070,6,6,288,...,,,,,,,,,,


Src IP dec                     int64
Src Port                       int64
Dst IP dec                     int64
Dst Port                       int64
Protocol                       int64
Timestamp                     object
Flow Duration                  int64
Total Fwd Packet               int64
Total Bwd packets              int64
Total Length of Fwd Packet     int64
Total Length of Bwd Packet     int64
Fwd Packet Length Max          int64
dtype: object
Unique labels: 27


In [7]:
# ===============================
# Feature cleaning & encoding
# ===============================
# Drop obvious non-features if present
drop_cols = [c for c in ["Flow ID", "Src IP", "Dst IP", "Timestamp", "Label"] if c in df.columns]
df = df.drop(columns=drop_cols)

# Identify categorical columns (very few in CICIDS — often Protocol)
cat_candidates = [c for c in df.columns if c.lower() in ("protocol", "proto", "protocolname")]
cat_cols = [c for c in cat_candidates if c in df.columns]

# Fix NaN/Inf, drop zero-variance cols
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)
nunq = df.nunique()
zero_var = nunq[nunq <= 1].index.tolist()
if zero_var:
    print("Dropping zero-variance cols:", zero_var)
    df = df.drop(columns=zero_var)

# Encode categoricals
for c in cat_cols:
    df[c] = df[c].astype(str)
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c])

num_cols = [c for c in df.columns if c not in cat_cols]

# Extract arrays
X_cat = df[cat_cols].astype(np.int32).values if cat_cols else None
X_num = df[num_cols].astype(np.float32).values if num_cols else None

# Free memory
del nunq, df
gc.collect()

4

In [8]:
# ===============================
# Split & scale
# ===============================
X_cat_train, X_cat_temp, X_num_train, X_num_temp, y_train, y_temp = train_test_split(
    X_cat, X_num, y, test_size=0.30, random_state=SEED, stratify=y
)
X_cat_val, X_cat_test, X_num_val, X_num_test, y_val, y_test = train_test_split(
    X_cat_temp, X_num_temp, y_temp, test_size=0.50, random_state=SEED, stratify=y_temp
)

scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_num_train)
X_num_val   = scaler.transform(X_num_val)
X_num_test  = scaler.transform(X_num_test)

print("Train numerical shape:", X_num_train.shape)
if X_cat_train is not None:
    print("Train categorical shape:", X_cat_train.shape)
print("Classes:", len(classes))

Train numerical shape: (2939959, 101)
Train categorical shape: (2939959, 1)
Classes: 27


In [9]:
# ===============================
# Dataset & DataLoaders
# ===============================
class TabularSet(Dataset):
    def __init__(self, X_num, X_cat, y):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.y     = torch.tensor(y, dtype=torch.long)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long) if X_cat is not None else None
    def __len__(self): 
        return len(self.y)
    def __getitem__(self, i):
        return self.X_num[i], (self.X_cat[i] if self.X_cat is not None else None), self.y[i]

train_ds = TabularSet(X_num_train, X_cat_train, y_train)
val_ds   = TabularSet(X_num_val,   X_cat_val,   y_val)
test_ds  = TabularSet(X_num_test,  X_cat_test,  y_test)

# Weighted sampler for imbalance
class_counts = np.bincount(y_train, minlength=num_classes)
inv_freq = 1.0 / np.maximum(class_counts, 1)
sample_weights = inv_freq[y_train]
train_sampler = WeightedRandomSampler(
    weights=torch.tensor(sample_weights, dtype=torch.double),
    num_samples=len(sample_weights),
    replacement=True
)

# DataLoaders
pin = torch.cuda.is_available()
num_workers = 4 if pin else 2
dl_args = dict(num_workers=num_workers, pin_memory=pin, persistent_workers=True, prefetch_factor=4)

train_loader = DataLoader(train_ds, batch_size=BATCH, sampler=train_sampler, **dl_args)
val_loader   = DataLoader(val_ds,   batch_size=BATCH, shuffle=False, **dl_args)
test_loader  = DataLoader(test_ds,  batch_size=BATCH, shuffle=False, **dl_args)

In [10]:
# ===============================
# Model & Loss
# ===============================
def get_cat_cardinalities(arr):
    if arr is None: return []
    return [int(arr[:, i].max()) + 1 for i in range(arr.shape[1])]

cat_cards = get_cat_cardinalities(X_cat_train)

class CLSPooling(nn.Module):
    def forward(self, x):  # x: [B, 1+T, D]
        return x[:, 0, :]

class TabularTransformer(nn.Module):
    def __init__(self, n_num, cat_cards, num_classes,
                 d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS, ffw=FFW, dropout=DROPOUT):
        super().__init__()
        self.num_tokenizers = nn.ModuleList([nn.Linear(1, d_model) for _ in range(n_num)])
        self.cat_embeds = nn.ModuleList([nn.Embedding(c, d_model) for c in cat_cards])
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.pos = nn.Parameter(torch.randn(1, 1+n_num+len(cat_cards), d_model) * 0.02)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads, dim_feedforward=ffw,
            dropout=dropout, batch_first=True, activation="gelu"
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)
        self.pool = CLSPooling()
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes)
        )

    def forward(self, x_num, x_cat):
        B = x_num.size(0)
        num_tokens = torch.stack([tok(x_num[:, i:i+1]) for i, tok in enumerate(self.num_tokenizers)], dim=1)
        if self.cat_embeds:
            cat_tokens = torch.stack([emb(x_cat[:, i]) for i, emb in enumerate(self.cat_embeds)], dim=1)
        else:
            cat_tokens = num_tokens.new_zeros((B, 0, num_tokens.size(-1)))
        cls = self.cls.expand(B, 1, -1)
        x = torch.cat([cls, num_tokens, cat_tokens], dim=1)
        x = x + self.pos[:, :x.size(1), :]
        x = self.encoder(x)
        x = self.pool(x)
        x = self.norm(x)
        return self.head(x)

class ClassBalancedFocalLoss(nn.Module):
    def __init__(self, samples_per_class, num_classes, beta=CB_BETA, gamma=GAMMA, weight_clip=WEIGHT_CLIP):
        super().__init__()
        samples = torch.tensor(samples_per_class, dtype=torch.float32)
        eff_num = 1.0 - torch.pow(torch.tensor(beta, dtype=torch.float32), samples)
        weights = (1.0 - beta) / torch.clamp(eff_num, min=1e-6)
        weights = weights / weights.mean()
        if weight_clip is not None:
            weights = torch.clamp(weights, max=weight_clip)
        self.register_buffer("class_weights", weights)
        self.gamma = gamma

    def forward(self, logits, target):
        w = self.class_weights.to(logits.device, dtype=logits.dtype)
        ce = nn.functional.cross_entropy(logits, target, weight=w, reduction='none')
        with torch.no_grad():
            pt = torch.softmax(logits, dim=1).gather(1, target.view(-1,1)).squeeze(1)
        focal = ((1.0 - pt).clamp_(min=1e-6) ** self.gamma) * ce
        return focal.mean()

In [11]:
# ===============================
# Training
# ===============================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TabularTransformer(len(num_cols), cat_cards, num_classes).to(device)

# Optional: DataParallel if multiple GPUs
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
    model = nn.DataParallel(model)

criterion = ClassBalancedFocalLoss(class_counts, num_classes).to(device)
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

# Scheduler: warmup then cosine
total_steps = EPOCHS * len(train_loader)
warmup_steps = max(1000, int(0.05 * total_steps))
cosine_steps = max(1, total_steps - warmup_steps)
warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.2, total_iters=warmup_steps)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cosine_steps, eta_min=LR * 0.1)
sched = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_steps])

best_val_macro = -1.0
no_improve = 0
best_path = os.path.join(OUT_DIR, "best_transformer_cbfl.pt")

def evaluate(loader):
    model.eval()
    preds, labels = [], []
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
        for xb_num, xb_cat, yb in loader:
            xb_num, yb = xb_num.to(device), yb.to(device)
            xb_cat = xb_cat.to(device) if xb_cat is not None else None
            logits = model(xb_num, xb_cat)
            preds.append(torch.argmax(logits, 1).cpu().numpy())
            labels.append(yb.cpu().numpy())
    preds = np.concatenate(preds)
    labels = np.concatenate(labels)
    return f1_score(labels, preds, average="macro"), preds, labels

global_step = 0
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    t0 = time.time()
    for i, (xb_num, xb_cat, yb) in enumerate(train_loader, 1):
        xb_num, yb = xb_num.to(device), yb.to(device)
        xb_cat = xb_cat.to(device) if xb_cat is not None else None

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(xb_num, xb_cat)
            loss = criterion(logits, yb)

        # Guard against NaN/Inf
        if not torch.isfinite(loss):
            print(f"Non-finite loss at step {global_step}: {loss.item()}, skipping.")
            optimizer.zero_grad(set_to_none=True)
            global_step += 1
            sched.step()
            continue

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        if GRAD_CLIP is not None:
            params = model.module.parameters() if isinstance(model, nn.DataParallel) else model.parameters()
            torch.nn.utils.clip_grad_norm_(params, GRAD_CLIP)

        scaler.step(optimizer)
        scaler.update()
        sched.step()
        total_loss += loss.item()
        global_step += 1

        if i % 200 == 0:
            elapsed = time.time() - t0
            steps_left = len(train_loader) - i
            est = elapsed / i * steps_left
            current_lr = optimizer.param_groups[0]["lr"]
            print(f"  step {i}/{len(train_loader)}  loss={total_loss/i:.4f}  lr={current_lr:.2e}  ETA/epoch ~ {est/60:.1f} min")

    val_macro, _, _ = evaluate(val_loader)
    print(f"Epoch {epoch}/{EPOCHS}  TrainLoss: {total_loss/len(train_loader):.4f}  ValMacroF1: {val_macro:.4f}")

    if val_macro > best_val_macro + 1e-4:
        best_val_macro = val_macro
        to_save = model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict()
        torch.save({
            "model": to_save,
            "classes": classes,
            "num_cols": num_cols,
            "cat_cols": cat_cols,
        }, best_path)
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print("Early stopping.")
            break

Using 2 GPUs with DataParallel
  step 200/2872  loss=1.4058  lr=1.02e-04  ETA/epoch ~ 4.7 min
  step 400/2872  loss=0.8366  lr=1.44e-04  ETA/epoch ~ 4.2 min
  step 600/2872  loss=0.5890  lr=1.85e-04  ETA/epoch ~ 3.8 min
  step 800/2872  loss=0.4525  lr=2.27e-04  ETA/epoch ~ 3.4 min
  step 1000/2872  loss=0.3667  lr=2.69e-04  ETA/epoch ~ 3.1 min
  step 1200/2872  loss=0.3079  lr=3.00e-04  ETA/epoch ~ 2.7 min
  step 1400/2872  loss=0.2654  lr=3.00e-04  ETA/epoch ~ 2.4 min
  step 1600/2872  loss=0.2332  lr=3.00e-04  ETA/epoch ~ 2.1 min
  step 1800/2872  loss=0.2079  lr=2.99e-04  ETA/epoch ~ 1.7 min
  step 2000/2872  loss=0.1879  lr=2.99e-04  ETA/epoch ~ 1.4 min
  step 2200/2872  loss=0.1713  lr=2.98e-04  ETA/epoch ~ 1.1 min
  step 2400/2872  loss=0.1573  lr=2.98e-04  ETA/epoch ~ 0.8 min
  step 2600/2872  loss=0.1455  lr=2.97e-04  ETA/epoch ~ 0.4 min
  step 2800/2872  loss=0.1353  lr=2.96e-04  ETA/epoch ~ 0.1 min
Epoch 1/8  TrainLoss: 0.1320  ValMacroF1: 0.6833
  step 200/2872  loss=0.0032

In [12]:
# ===============================
# Evaluation on test set
# ===============================
ckpt = torch.load(best_path, map_location=device)
state = ckpt["model"]
model.load_state_dict(state, strict=False)

test_macro, test_preds, test_labels = evaluate(test_loader)
print("Test Macro-F1:", test_macro)

print(classification_report(test_labels, test_preds, target_names=classes, zero_division=0))

# Extra imbalance-robust metrics
bal_acc = balanced_accuracy_score(test_labels, test_preds)
mcc     = matthews_corrcoef(test_labels, test_preds)
kappa   = cohen_kappa_score(test_labels, test_preds)
mp, mr, mf1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average="macro", zero_division=0
)

print(f"\nBalanced Accuracy: {bal_acc:.6f}")
print(f"Matthews Corrcoef (MCC): {mcc:.6f}")
print(f"Cohen's Kappa: {kappa:.6f}")
print(f"Macro Precision: {mp:.6f} | Macro Recall: {mr:.6f} | Macro F1: {mf1:.6f}")

Test Macro-F1: 0.8812472839149691
                                        precision    recall  f1-score   support

                                Botnet       0.77      1.00      0.87       221
                    Botnet - Attempted       1.00      1.00      1.00      1220
                                  DDoS       1.00      1.00      1.00     28543
                         DoS GoldenEye       1.00      1.00      1.00      2270
             DoS GoldenEye - Attempted       1.00      0.92      0.96        24
                              DoS Hulk       1.00      1.00      1.00     47541
                  DoS Hulk - Attempted       0.98      0.99      0.99       174
                      DoS Slowhttptest       0.98      1.00      0.99       522
          DoS Slowhttptest - Attempted       1.00      1.00      1.00      1011
                         DoS Slowloris       1.00      1.00      1.00      1158
             DoS Slowloris - Attempted       0.99      1.00      0.99       554
     

In [13]:
# ===============================
# Save artifacts to /kaggle/working
# ===============================
pred_df = pd.DataFrame({
    "true": [classes[i] for i in test_labels],
    "pred": [classes[i] for i in test_preds]
})
pred_df.to_csv(os.path.join(OUT_DIR, "test_preds.csv"), index=False)

report = classification_report(test_labels, test_preds, target_names=classes, zero_division=0, output_dict=True)
pd.DataFrame(report).to_csv(os.path.join(OUT_DIR, "classification_report.csv"))

cm = confusion_matrix(test_labels, test_preds, labels=list(range(len(classes))))
np.save(os.path.join(OUT_DIR, "confusion_matrix.npy"), cm)

print("Saved: test_preds.csv, classification_report.csv, confusion_matrix.npy")

Saved: test_preds.csv, classification_report.csv, confusion_matrix.npy
