# TabNet Pipeline for Intrusion Detection

In [7]:
# Import Required Libraries
# Ensure you have the following installed: pytorch-tabnet, sklearn, pandas, numpy, matplotlib, seaborn
# Install with: pip install pytorch-tabnet numpy pandas scikit-learn matplotlib seaborn

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from matplotlib import pyplot as plt
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import seaborn as sns

# Create a folder for visualizations
import os
os.makedirs("visualization", exist_ok=True)

## Step 1: Load Data

In [8]:
# Column names for NSL-KDD dataset
c_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "labels"
]

# Load training and testing datasets
train = pd.read_csv("data/KDDTrain+.txt", names=c_names)
test = pd.read_csv("data/KDDTest+.txt", names=c_names)

## Step 2: Data Preprocessing

In [12]:
# Convert categorical features to numerical
categorical_features = ["protocol_type", "service", "flag"]

for col in categorical_features:
    train[col] = train[col].astype("category").cat.codes
    test[col] = test[col].astype("category").cat.codes

# Map 'labels' column to binary classes (1 for 'normal', 0 for 'attack')
train["labels"] = train["labels"].apply(lambda x: 1 if x == "normal" else 0)
test["labels"] = test["labels"].apply(lambda x: 1 if x == "normal" else 0)

# Separate features and labels
# Ensure only numeric columns are used for scaling
numerical_columns = train.select_dtypes(include=["int64", "float64"]).columns
X_train = train[numerical_columns].drop("labels", axis=1).values
X_test = test[numerical_columns].drop("labels", axis=1).values
y_train = train["labels"].values
y_test = test["labels"].values

# Normalize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Split training data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

## Step 3: Build TabNet Model

In [13]:
clf = TabNetClassifier(
    n_d=32, n_a=32, n_steps=5,
    gamma=1.5, lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type="entmax"
)



## Step 4: Train the TabNet Model

In [14]:
print("Training TabNet Model...")
clf.fit(
    x_train, y_train,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    eval_name=["train", "valid"],
    eval_metric=["accuracy"],
    max_epochs=50,
    patience=10,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

Training TabNet Model...
epoch 0  | loss: 0.00015 | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:00:10s
epoch 1  | loss: 2e-05   | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:00:21s
epoch 2  | loss: 1e-05   | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:00:32s
epoch 3  | loss: 1e-05   | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:00:43s
epoch 4  | loss: 1e-05   | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:00:53s
epoch 5  | loss: 1e-05   | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:01:04s
epoch 6  | loss: 0.0     | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:01:15s
epoch 7  | loss: 0.0     | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:01:26s
epoch 8  | loss: 0.0     | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:01:37s
epoch 9  | loss: 1e-05   | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:01:48s
epoch 10 | loss: 0.0     | train_accuracy: 1.0     | valid_accuracy: 1.0     |  0:01:



## Step 5: Evaluate the Model

In [15]:
# Evaluate on the test set
test_preds = clf.predict(X_test)
test_proba = clf.predict_proba(X_test)[:, 1]

# Confusion Matrix
cm = confusion_matrix(y_test, test_preds)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Attack", "Normal"], yticklabels=["Attack", "Normal"])
plt.title("Confusion Matrix", fontsize=14)
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.tight_layout()
plt.savefig("visualization/tabnet_confusion_matrix.png")
plt.close()

# Classification Report
report = classification_report(y_test, test_preds, target_names=["Attack", "Normal"], output_dict=True)
print("Classification Report:\n", classification_report(y_test, test_preds, target_names=["Attack", "Normal"]))

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, test_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.title("Receiver Operating Characteristic", fontsize=14)
plt.xlabel("False Positive Rate", fontsize=12)
plt.ylabel("True Positive Rate", fontsize=12)
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.savefig("visualization/tabnet_roc_curve.png")
plt.close()

IndexError: index 1 is out of bounds for axis 1 with size 1

## Step 6: Feature Importance

In [16]:
# Plot Feature Importance
feature_importances = clf.feature_importances_
plt.figure(figsize=(10, 6))
plt.bar(range(len(feature_importances)), feature_importances)
plt.title("TabNet Feature Importances", fontsize=14)
plt.xlabel("Feature Index", fontsize=12)
plt.ylabel("Importance", fontsize=12)
plt.tight_layout()
plt.savefig("visualization/tabnet_feature_importance.png")
plt.close()

print("Pipeline complete. All visualizations saved to the 'visualization' folder.")

Pipeline complete. All visualizations saved to the 'visualization' folder.
