In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

print("TensorFlow version:", tf.__version__)

# --- Configuration ---
TRAIN_PATH = "UNSW_NB15_APT_features_train.csv"
TEST_PATH  = "UNSW_NB15_APT_features_test.csv"

# Helper function for sparse matrices
def to_dense(X):
    return X.toarray() if hasattr(X, "toarray") else X

TensorFlow version: 2.11.0


In [3]:
# --- Load Data ---
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

print("Original Train Shape:", train_df.shape)
print("Original Test Shape:", test_df.shape)

# --- Map Attacks to APT Stages ---
stage_mapping = {
    "Reconnaissance": "Recon", 
    "Fuzzers": "Recon", 
    "Analysis": "Recon",
    "Exploits": "Initial", 
    "DoS": "Exploit", 
    "Generic": "Exploit",
    "Backdoor": "Install", 
    "Shellcode": "Install", 
    "Worms": "Install",
    "Normal": "Normal"
}

train_df["APT_stage"] = train_df["attack_cat"].map(stage_mapping)
test_df["APT_stage"]  = test_df["attack_cat"].map(stage_mapping)

print("\nTRAIN stage distribution:\n", train_df["APT_stage"].value_counts())
print("\nTEST stage distribution:\n", test_df["APT_stage"].value_counts())

Original Train Shape: (175341, 37)
Original Test Shape: (82332, 37)

TRAIN stage distribution:
 APT_stage
Normal     56000
Exploit    52264
Initial    33393
Recon      30675
Install     3009
Name: count, dtype: int64

TEST stage distribution:
 APT_stage
Normal     37000
Exploit    22960
Initial    11132
Recon      10235
Install     1005
Name: count, dtype: int64


In [4]:
# --- Feature Engineering ---
meta_cols = ["attack_cat", "label", "APT_stage"]
feature_cols = [c for c in train_df.columns if c not in meta_cols]

X_full_train = train_df[feature_cols]

numeric_features = X_full_train.select_dtypes(exclude=["object"]).columns.tolist()
categorical_features = X_full_train.select_dtypes(include=["object"]).columns.tolist()

# Define Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Fit on full training feature space
preprocessor.fit(X_full_train)
print("Global preprocessor fitted successfully.")

Global preprocessor fitted successfully.


In [7]:
def create_stagewise_datasets(train_df, test_df):
    datasets = {}
    
    # Logic: (Key Name, Target Class, Class to Remove from Dataset)
    definitions = [
        ("stage0_attack", "Normal", None),              # Attack vs Normal (Target is NOT Normal)
        ("stage1_recon", "Recon", None),                # Recon vs Others
        ("stage2_initial", "Initial", "Recon"),         # Initial vs Others (Remove Recon)
        ("stage3_exploit", "Exploit", "Initial"),       # Exploit vs Others (Remove Initial)
        ("stage4_install", "Install", "Exploit")        # Install vs Others (Remove Exploit)
    ]

    for key, target_class, class_to_remove in definitions:
        current_train = train_df.copy()
        current_test = test_df.copy()

        if class_to_remove:
            current_train = current_train[current_train["APT_stage"] != class_to_remove]
            current_test  = current_test[current_test["APT_stage"] != class_to_remove]
            
        # Create Binary Labels
        if key == "stage0_attack":
            # For Stage 0, we want 1 if it is an ATTACK (not Normal)
            current_train["stage_label"] = (current_train["APT_stage"] != "Normal").astype(int)
            current_test["stage_label"]  = (current_test["APT_stage"] != "Normal").astype(int)
        else:
            # For other stages, 1 if it matches the specific stage
            current_train["stage_label"] = (current_train["APT_stage"] == target_class).astype(int)
            current_test["stage_label"]  = (current_test["APT_stage"] == target_class).astype(int)

        datasets[key] = {"train": current_train, "test": current_test}

    return datasets

print("Dataset splitting function defined.")

Dataset splitting function defined.


In [9]:
def make_balanced_stage(stage_key, datasets, preprocessor, feature_cols):
    data = datasets[stage_key]
    
    # Transform features
    X_train = to_dense(preprocessor.transform(data["train"][feature_cols]))
    y_train = data["train"]["stage_label"].values
    
    X_test  = to_dense(preprocessor.transform(data["test"][feature_cols]))
    y_test  = data["test"]["stage_label"].values

    print(f"Balancing {stage_key}...")
    
    # Apply SMOTE
    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
    
    print(f"  -> Original shape: {X_train.shape}, Balanced shape: {X_train_bal.shape}")
    
    return {
        "X_train_bal": X_train_bal, 
        "y_train_bal": y_train_bal,
        "X_test": X_test, 
        "y_test": y_test
    }

# Execute creation and balancing
datasets = create_stagewise_datasets(train_df, test_df)
balanced_sets = {}
stage_keys = ["stage0_attack", "stage1_recon", "stage2_initial", "stage3_exploit", "stage4_install"]

for key in stage_keys:
    balanced_sets[key] = make_balanced_stage(key, datasets, preprocessor, feature_cols)
    
print("\nAll datasets balanced and ready.")

Balancing stage0_attack...
  -> Original shape: (175341, 187), Balanced shape: (238682, 187)
Balancing stage1_recon...
  -> Original shape: (175341, 187), Balanced shape: (289332, 187)
Balancing stage2_initial...
  -> Original shape: (144666, 187), Balanced shape: (222546, 187)
Balancing stage3_exploit...
  -> Original shape: (141948, 187), Balanced shape: (179368, 187)
Balancing stage4_install...
  -> Original shape: (123077, 187), Balanced shape: (240136, 187)

All datasets balanced and ready.


In [11]:
def build_stage_model(input_dim, l2_reg=1e-4, dropout_rate=0.3):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(256, activation="relu", kernel_regularizer=regularizers.l2(l2_reg)),
        layers.Dropout(dropout_rate),
        layers.Dense(128, activation="relu", kernel_regularizer=regularizers.l2(l2_reg)),
        layers.Dropout(dropout_rate),
        layers.Dense(64, activation="relu", kernel_regularizer=regularizers.l2(l2_reg)),
        layers.Dense(1, activation="sigmoid")
    ])
    return model

def train_stage(X_train, y_train, X_test, y_test, stage_name, 
                base_weights_path=None, save_weights_path=None, 
                frozen_epochs=15, finetune_epochs=5, class_weights=None):
    
    input_dim = X_train.shape[1]
    model = build_stage_model(input_dim)

    # --- Transfer Learning Logic ---
    if base_weights_path:
        print(f"\n[{stage_name}] Loading base weights from {base_weights_path}...")
        model.load_weights(base_weights_path)
        
        # 1. Freeze layers
        for layer in model.layers[:-1]: layer.trainable = False
        
        model.compile(optimizer=keras.optimizers.Adam(5e-4), loss="binary_crossentropy", 
                      metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()])
        
        print(f"[{stage_name}] Training frozen layers...")
        model.fit(X_train, y_train, validation_split=0.1, epochs=frozen_epochs, 
                  batch_size=256, verbose=1, class_weight=class_weights)
        
        # 2. Unfreeze
        for layer in model.layers: layer.trainable = True
        print(f"[{stage_name}] Fine-tuning all layers...")
        
    model.compile(optimizer=keras.optimizers.Adam(1e-4 if base_weights_path else 1e-3),
                  loss="binary_crossentropy",
                  metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()])

    early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
    
    model.fit(X_train, y_train, validation_split=0.1, 
              epochs=finetune_epochs if base_weights_path else 20, 
              batch_size=256, callbacks=[early_stop], verbose=1, class_weight=class_weights)

    if save_weights_path:
        model.save_weights(save_weights_path)
        print(f"[{stage_name}] Weights saved to {save_weights_path}")

    return model

print("Model architecture and training loop defined.")

Model architecture and training loop defined.


In [13]:
# Stage 0: Attack vs Normal (From Scratch)
s0 = balanced_sets["stage0_attack"]
model0 = train_stage(s0["X_train_bal"], s0["y_train_bal"], s0["X_test"], s0["y_test"],
                     stage_name="Stage 0", save_weights_path="stage0.h5")

# Stage 1: Recon vs Others (Transfer from Stage 0)
s1 = balanced_sets["stage1_recon"]
model1 = train_stage(s1["X_train_bal"], s1["y_train_bal"], s1["X_test"], s1["y_test"],
                     stage_name="Stage 1", base_weights_path="stage0.h5", save_weights_path="stage1.h5")

# Stage 2: Initial vs Others (Transfer from Stage 1)
s2 = balanced_sets["stage2_initial"]
model2 = train_stage(s2["X_train_bal"], s2["y_train_bal"], s2["X_test"], s2["y_test"],
                     stage_name="Stage 2", base_weights_path="stage1.h5", save_weights_path="stage2.h5")

# Stage 3: Exploit vs Others (Transfer from Stage 2)
s3 = balanced_sets["stage3_exploit"]
model3 = train_stage(s3["X_train_bal"], s3["y_train_bal"], s3["X_test"], s3["y_test"],
                     stage_name="Stage 3", base_weights_path="stage2.h5", save_weights_path="stage3.h5")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
[Stage 0] Weights saved to stage0.h5

[Stage 1] Loading base weights from stage0.h5...
[Stage 1] Training frozen layers...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[Stage 1] Fine-tuning all layers...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[Stage 1] Weights saved to stage1.h5

[Stage 2] Loading base weights from stage1.h5...
[Stage 2] Training frozen layers...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[Stage 2] Fine-tuning all layers...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[Stage 2] Weights saved to stage2.h5

[Stage 3] Loading base weights from stage2.h5...
[Stage 3] Training frozen layers...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6

In [21]:
# Stage 4: Install vs Others (Transfer from Stage 3 + Class Weights)
s4 = balanced_sets["stage4_install"]

cw_vals = compute_class_weight(class_weight="balanced", classes=np.array([0, 1]), y=s4["y_train_bal"])
class_weights4 = {0: cw_vals[0], 1: cw_vals[1]}

model4 = train_stage(s4["X_train_bal"], s4["y_train_bal"], s4["X_test"], s4["y_test"],
                     stage_name="Stage 4", base_weights_path="stage3.h5", save_weights_path="stage4.h5",
                     class_weights=class_weights4)

# --- Refinement Model: Install vs Exploit Binary ---
# Filter data containing ONLY Install or Exploit
ie_train = train_df[train_df["APT_stage"].isin(["Install", "Exploit"])].copy()
ie_test  = test_df[test_df["APT_stage"].isin(["Install", "Exploit"])].copy()

# Label: Install=1, Exploit=0
ie_train["stage_label"] = (ie_train["APT_stage"] == "Install").astype(int)
ie_test["stage_label"]  = (ie_test["APT_stage"] == "Install").astype(int)

# Preprocess & Balance
X_ie_train = to_dense(preprocessor.transform(ie_train[feature_cols]))
y_ie_train = ie_train["stage_label"].values

sm_ie = SMOTE(random_state=42)
X_ie_bal, y_ie_bal = sm_ie.fit_resample(X_ie_train, y_ie_train)

# Calculate Weights
cw_ie = compute_class_weight("balanced", classes=np.array([0, 1]), y=y_ie_bal)
class_weights_ie = {0: cw_ie[0], 1: cw_ie[1]}

# Train Refinement Model
model_ie = train_stage(X_ie_bal, y_ie_bal, to_dense(preprocessor.transform(ie_test[feature_cols])), ie_test["stage_label"].values,
                       stage_name="Refinement (Install/Exploit)", 
                       base_weights_path="stage3.h5",
                       save_weights_path="stage_ie.h5",
                       class_weights=class_weights_ie)


[Stage 4] Loading base weights from stage3.h5...
[Stage 4] Training frozen layers...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[Stage 4] Fine-tuning all layers...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[Stage 4] Weights saved to stage4.h5

[Refinement (Install/Exploit)] Loading base weights from stage3.h5...
[Refinement (Install/Exploit)] Training frozen layers...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[Refinement (Install/Exploit)] Fine-tuning all layers...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[Refinement (Install/Exploit)] Weights saved to stage_ie.h5


In [23]:
# --- Final Inference Logic ---
X_all = test_df[feature_cols]
X_all_proc = to_dense(preprocessor.transform(X_all))
n_samples = len(test_df)

# Default Prediction is "Normal"
y_pred_final = np.array(["Normal"] * n_samples, dtype=object)

# Thresholds (Tuned manually based on validation)
THR = {"Attack": 0.55, "Recon": 0.50, "Initial": 0.50, "Exploit": 0.60, "Install_Refined": 0.15}

print("Running cascade inference...")

# Step 1: Detect Attacks
p0 = model0.predict(X_all_proc).ravel()
mask_attack = p0 >= THR["Attack"]
attack_indices = np.where(mask_attack)[0]

if len(attack_indices) > 0:
    X_attack = X_all_proc[attack_indices]
    
    # Step 2: Detect Recon
    p1 = model1.predict(X_attack).ravel()
    recon_mask = p1 >= THR["Recon"]
    y_pred_final[attack_indices[recon_mask]] = "Recon"
    
    remaining_idx = attack_indices[~recon_mask]
    
    if len(remaining_idx) > 0:
        X_rem = X_all_proc[remaining_idx]
        
        # Step 3: Detect Initial
        p2 = model2.predict(X_rem).ravel()
        initial_mask = p2 >= THR["Initial"]
        y_pred_final[remaining_idx[initial_mask]] = "Initial"
        
        # Remaining indices (Not Recon, Not Initial)
        deep_idx = remaining_idx[~initial_mask]
        
        if len(deep_idx) > 0:
            X_deep = X_all_proc[deep_idx]
            
            # Step 4: Detect Exploit (High confidence)
            p3 = model3.predict(X_deep).ravel()
            exploit_mask = p3 >= THR["Exploit"]
            y_pred_final[deep_idx[exploit_mask]] = "Exploit"
            
            # Step 5: Refine Exploit/Install
            exploit_candidates = deep_idx[exploit_mask] 
            if len(exploit_candidates) > 0:
                 p_ie = model_ie.predict(X_all_proc[exploit_candidates]).ravel()
                 # If the refinement model says it's Install (>= 0.15), overwrite prediction
                 install_mask = p_ie >= THR["Install_Refined"]
                 y_pred_final[exploit_candidates[install_mask]] = "Install"

print("Inference complete.")

Running cascade inference...
Inference complete.


In [25]:
# --- Evaluation ---
labels_order = ["Normal", "Recon", "Initial", "Exploit", "Install"]

print("\n===== FINAL CLASSIFICATION REPORT =====\n")
print(classification_report(test_df["APT_stage"], y_pred_final, labels=labels_order, digits=4))

print("\nConfusion Matrix (Rows=True, Cols=Predicted):")
print(confusion_matrix(test_df["APT_stage"], y_pred_final, labels=labels_order))


===== FINAL CLASSIFICATION REPORT =====

              precision    recall  f1-score   support

      Normal     0.9050    0.8220    0.8615     37000
       Recon     0.3722    0.6014    0.4598     10235
     Initial     0.6070    0.7598    0.6749     11132
     Exploit     0.9982    0.7920    0.8833     22960
     Install     0.0000    0.0000    0.0000      1005

    accuracy                         0.7678     82332
   macro avg     0.5765    0.5950    0.5759     82332
weighted avg     0.8134    0.7678    0.7819     82332


Confusion Matrix (Rows=True, Cols=Predicted):
[[30415  6052   531     2     0]
 [ 2856  6155  1216     7     1]
 [  216  2412  8458    23    23]
 [   92  1179  3494 18185    10]
 [   30   740   235     0     0]]
