# GPT for Tabular Classification with SHAP Analysis

In [10]:
# Import Required Libraries
# Ensure TensorFlow and Transformers libraries are installed in your environment:
# pip install tensorflow numpy pandas scikit-learn matplotlib seaborn transformers shap
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import shap

## Step 1: Data Loading

In [11]:
# Column names for NSL-KDD dataset
c_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "labels", "difficulty_degree"
]

# Load training and testing datasets
train = pd.read_csv("data/KDDTrain+.txt", names=c_names)
test = pd.read_csv("data/KDDTest+.txt", names=c_names)

## Step 2: Data Preprocessing

In [16]:
# Drop 'difficulty_degree' column as it does not add value
# Remove 'difficulty_degree' column if it exists
if "difficulty_degree" in train.columns:
    del train["difficulty_degree"]

if "difficulty_degree" in test.columns:
    del test["difficulty_degree"]


# Convert categorical features to numerical
categorical_features = ["protocol_type", "service", "flag"]

for col in categorical_features:
    train[col] = train[col].astype("category").cat.codes
    test[col] = test[col].astype("category").cat.codes

# Map 'labels' column to binary classes (1 for 'normal', 0 for 'attack')
train["labels"] = train["labels"].apply(lambda x: 1 if x == "normal" else 0)
test["labels"] = test["labels"].apply(lambda x: 1 if x == "normal" else 0)

# Separate features and labels
X_train = train.drop("labels", axis=1)
y_train = train["labels"]
X_test = test.drop("labels", axis=1)
y_test = test["labels"]

# Normalize numerical features
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert each row into a string format for tokenization
train_sequences = [" ".join(map(str, row)) for row in X_train]
test_sequences = [" ".join(map(str, row)) for row in X_test]



In [17]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_sequences, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_sequences, truncation=True, padding=True, max_length=128)

# Ensure labels are NumPy arrays and have the correct dtype
y_train = y_train.to_numpy().astype("float32")
y_test = y_test.to_numpy().astype("float32")

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings), y_train
)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings), y_test
)).batch(32)



## Step 3: Define the GPT Model

In [None]:
# Define the GPT Model
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Compile the model with correct loss function
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])

# Train the model
print("Training GPT Model...")
history = model.fit(train_dataset, epochs=3,    batch_size=64, validation_data=test_dataset)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Training GPT Model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
 355/3937 [=>............................] - ETA: 1:49:38 - loss: 1.9907e-08 - accuracy: 1.0000

KeyboardInterrupt: 

## Step 4: Train the Model

In [15]:
# Train the Model
print("Training GPT Model...")
history = model.fit(train_dataset, epochs=3, validation_data=test_dataset)

Training GPT Model...
Epoch 1/3


AttributeError: in user code:

    File "/home/wakili/anaconda3/envs/mlenv/lib/python3.8/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/home/wakili/anaconda3/envs/mlenv/lib/python3.8/site-packages/transformers/modeling_tf_utils.py", line 1588, in compute_loss  *
        return super().compute_loss(*args, **kwargs)
    File "/home/wakili/anaconda3/envs/mlenv/lib/python3.8/site-packages/keras/src/engine/training.py", line 1139, in compute_loss  **
        return self.compiled_loss(
    File "/home/wakili/anaconda3/envs/mlenv/lib/python3.8/site-packages/keras/src/engine/compile_utils.py", line 263, in __call__
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "/home/wakili/anaconda3/envs/mlenv/lib/python3.8/site-packages/keras/src/engine/compile_utils.py", line 840, in match_dtype_and_rank
        if (y_t.dtype.is_floating and y_p.dtype.is_floating) or (

    AttributeError: 'NoneType' object has no attribute 'dtype'


## Step 5: Evaluate the Model

In [None]:
# Evaluate on the test set
print("Evaluating GPT Model...")
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

## Step 6: Visualizations

In [None]:
# Plot training history
plt.figure(figsize=(12, 5))

# Accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history["accuracy"], label="Train Accuracy")
plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
plt.title("Model Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()

# Loss
plt.subplot(1, 2, 2)
plt.plot(history.history["loss"], label="Train Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.title("Model Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()

plt.show()

In [None]:
# Confusion Matrix
y_pred = tf.argmax(model.predict(test_dataset).logits, axis=1).numpy()
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Attack", "Normal"], yticklabels=["Attack", "Normal"])
plt.title("Confusion Matrix")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.show()

# Classification Report
print(classification_report(y_test, y_pred, target_names=["Attack", "Normal"]))

In [None]:
# ROC Curve
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

y_test_binarized = label_binarize(y_test, classes=[0, 1])
y_pred_proba = tf.nn.softmax(model.predict(test_dataset).logits)[:, 1].numpy()

fpr, tpr, _ = roc_curve(y_test_binarized, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.title("Receiver Operating Characteristic")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

## Step 7: SHAP Analysis

In [None]:
# SHAP Analysis
explainer = shap.Explainer(model, dict(test_encodings))
shap_values = explainer(dict(test_encodings))

# Summary Plot
shap.summary_plot(shap_values, feature_names=train.columns[:-1])

# Force Plot (First Instance)
shap.force_plot(explainer.expected_value[1], shap_values[1][0].values, feature_names=train.columns[:-1])

# Feature Importance Plot
shap.summary_plot(shap_values, plot_type="bar", feature_names=train.columns[:-1])