<a href="https://colab.research.google.com/github/BillBs-13/IRICT-Conference/blob/main/Attention_Enhanced_CNN%E2%80%93ResNet_with_XGBoost_Ensem_ble_and_DAMCE_Loss_for_Intrusion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NSL-KDD

In [None]:
# =========================
# NSL-KDD (kdd_train.csv / kdd_test.csv)
# =========================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import shuffle
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import (classification_report, accuracy_score, precision_score,
                             recall_score, f1_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay)
from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.layers import (Dense, Dropout, BatchNormalization, Input, Conv1D,
                                     MaxPooling1D, Flatten, Add, Layer, Multiply)
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical

import xgboost as xgb
import matplotlib.pyplot as plt

# --------------------
# CONFIG
# --------------------
# Input CSVs for the NSL-KDD experiment (Colab filesystem paths).
TRAIN_PATH   = "/content/kdd_train.csv"
TEST_PATH    = "/content/kdd_test.csv"
# Seed passed to mutual_info_classif, SMOTE and sklearn's shuffle below.
RANDOM_STATE = 42
# Number of highest-MI features kept for both models.
TOP_K        = 30

# Key looked up by get_loss(); also decides the output activation in build_model().
LOSS_CHOICE  = "damce"   # "cce","bce","mse","mae_ce","mse_ce","focal","dice","tversky","poly1","damce"
BATCH_SIZE   = 1024
EPOCHS       = 100
# EarlyStopping patience (epochs without val_accuracy improvement).
VAL_PATIENCE = 12

# Weights of the linear blend of NN outputs and XGBoost probabilities.
ENSEMBLE_W_NN  = 0.7
ENSEMBLE_W_XGB = 0.3

# --------------------
# Losses
# --------------------
def loss_cce():
    """Return a new Keras categorical cross-entropy loss object."""
    return tf.keras.losses.CategoricalCrossentropy()


def loss_bce():
    """Return a new Keras binary cross-entropy loss object."""
    return tf.keras.losses.BinaryCrossentropy()


def loss_mse():
    """Return a new Keras mean-squared-error loss object."""
    return tf.keras.losses.MeanSquaredError()

def mae_ce_loss(alpha=0.5):
    """Build a blended loss ``alpha * MAE + (1 - alpha) * BCE``.

    Both component losses are Keras loss objects created once, when the
    factory is called; the returned callable takes ``(y_true, y_pred)``.
    """
    abs_err = tf.keras.losses.MeanAbsoluteError()
    xent = tf.keras.losses.BinaryCrossentropy()

    def fn(y_true, y_pred):
        mae_term = alpha * abs_err(y_true, y_pred)
        ce_term = (1.0 - alpha) * xent(y_true, y_pred)
        return mae_term + ce_term

    return fn

def mse_ce_loss(alpha=0.5):
    """Build a blended loss ``alpha * MSE + (1 - alpha) * BCE``.

    Both component losses are Keras loss objects created once, when the
    factory is called; the returned callable takes ``(y_true, y_pred)``.
    """
    sq_err = tf.keras.losses.MeanSquaredError()
    xent = tf.keras.losses.BinaryCrossentropy()

    def fn(y_true, y_pred):
        mse_term = alpha * sq_err(y_true, y_pred)
        ce_term = (1.0 - alpha) * xent(y_true, y_pred)
        return mse_term + ce_term

    return fn

def focal_loss(gamma=2.0, alpha=0.25):
    """Build a binary focal loss: ``mean(alpha * (1 - p_t)^gamma * CE)``.

    Args:
        gamma: Focusing exponent applied to ``1 - p_t``.
        alpha: Constant scale factor applied to every element.

    Returns:
        A callable ``fn(y_true, y_pred)`` returning a scalar tensor.
    """
    def fn(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
        # Element-wise cross-entropy. tf.keras.losses.binary_crossentropy
        # averages over the last axis and returns shape (B,), which cannot be
        # multiplied with the (B, C) modulating factor below.
        ce = -(y_true * tf.math.log(y_pred)
               + (1. - y_true) * tf.math.log(1. - y_pred))
        p_t = y_true * y_pred + (1. - y_true) * (1. - y_pred)
        return tf.reduce_mean(alpha * tf.pow(1. - p_t, gamma) * ce)
    return fn

def dice_loss(eps=1e-6):
    """Build a soft Dice loss computed over the whole batch.

    ``eps`` is added to numerator and denominator to avoid division by zero.
    """
    def fn(y_true, y_pred):
        target = tf.cast(y_true, tf.float32)
        overlap = tf.reduce_sum(target * y_pred)
        total = tf.reduce_sum(target) + tf.reduce_sum(y_pred)
        dice = (2. * overlap + eps) / (total + eps)
        return 1. - dice
    return fn

def tversky_loss(alpha=0.7, beta=0.3, eps=1e-6):
    """Build a Tversky loss computed over the whole batch.

    ``alpha`` weights the soft false negatives and ``beta`` the soft false
    positives; ``eps`` smooths the ratio.
    """
    def fn(y_true, y_pred):
        target = tf.cast(y_true, tf.float32)
        true_pos = tf.reduce_sum(target * y_pred)
        false_pos = tf.reduce_sum((1 - target) * y_pred)
        false_neg = tf.reduce_sum(target * (1 - y_pred))
        index = (true_pos + eps) / (true_pos + alpha * false_neg + beta * false_pos + eps)
        return 1. - index
    return fn

def poly1_bce(epsilon=1.0):
    """Build a Poly-1 binary cross-entropy: ``mean(CE + epsilon * (1 - p_t))``.

    Args:
        epsilon: Weight of the first-order polynomial term.

    Returns:
        A callable ``fn(y_true, y_pred)`` returning a scalar tensor.
    """
    def fn(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
        # Element-wise cross-entropy so it has the same (B, C) shape as p_t;
        # the Keras helper returns (B,), which does not broadcast with (B, C).
        bce = -(y_true * tf.math.log(y_pred)
                + (1.0 - y_true) * tf.math.log(1.0 - y_pred))
        p_t = y_true * y_pred + (1.0 - y_true) * (1.0 - y_pred)
        return tf.reduce_mean(bce + epsilon * (1.0 - p_t))
    return fn

def damce_loss(alpha=0.9, gamma=2.0):
    """Build the DAMCE loss: ``alpha * w * CE + (1 - alpha) * (1 - w) * MSE``.

    ``w = (1 - p_t) ** gamma`` where ``p_t`` is the per-sample mean over the
    last axis of ``y * p + (1 - y) * (1 - p)``. CE and MSE are per-sample
    values (reduced over the last axis) and the result is the batch mean.
    """
    def fn(y_true, y_pred):
        target = tf.cast(y_true, tf.float32)
        tiny = tf.keras.backend.epsilon()
        prob = tf.clip_by_value(y_pred, tiny, 1. - tiny)

        ce_per_sample = tf.keras.losses.binary_crossentropy(target, prob)   # (B,)
        sq_per_sample = tf.reduce_mean(tf.square(target - prob), axis=-1)   # (B,)

        agreement = target * prob + (1.0 - target) * (1.0 - prob)           # (B, C)
        p_t = tf.reduce_mean(agreement, axis=-1)                            # (B,)
        w = tf.pow(1.0 - p_t, gamma)

        ce_part = alpha * w * ce_per_sample
        mse_part = (1.0 - alpha) * (1.0 - w) * sq_per_sample
        return tf.reduce_mean(ce_part + mse_part)
    return fn

def get_loss(name):
    """Return the loss selected by ``name`` (case-insensitive).

    Only the requested loss is constructed. An unrecognised name still falls
    back to binary cross-entropy, but now emits a warning so that a typo in
    the configuration does not silently change the experiment.

    Args:
        name: One of "cce", "bce", "mse", "mae_ce", "mse_ce", "focal",
            "dice", "tversky", "poly1", "damce".

    Returns:
        A Keras loss object or a callable ``fn(y_true, y_pred)``.
    """
    import warnings

    factories = {
        "cce":     loss_cce,
        "bce":     loss_bce,
        "mse":     loss_mse,
        "mae_ce":  lambda: mae_ce_loss(alpha=0.5),
        "mse_ce":  lambda: mse_ce_loss(alpha=0.5),
        "focal":   focal_loss,
        "dice":    dice_loss,
        "tversky": tversky_loss,
        "poly1":   lambda: poly1_bce(epsilon=1.0),
        "damce":   lambda: damce_loss(alpha=0.9, gamma=2.0),
    }
    factory = factories.get(name.lower())
    if factory is None:
        warnings.warn(
            f"Unknown loss {name!r}; falling back to binary cross-entropy.",
            stacklevel=2,
        )
        factory = loss_bce
    return factory()

# --------------------
# Model
# --------------------
class Attention(Layer):
    """Gating layer: scales its input by a learned sigmoid score.

    The score is ``sigmoid(Dense(1)(tanh(Dense(units)(inputs))))`` and has a
    trailing dimension of 1, so it is broadcast over the input's last axis.

    Args:
        units: Width of the hidden tanh projection.
        **kwargs: Standard Keras ``Layer`` arguments (e.g. ``name``).
    """
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.dense = Dense(units, activation='tanh')
        self.score = Dense(1, activation='sigmoid')

    def call(self, inputs):
        scores = self.score(self.dense(inputs))
        # Plain broadcasting multiply; avoids instantiating a new Multiply
        # layer on every call.
        return inputs * scores

    def get_config(self):
        # Needed so a saved model can re-create the layer with its `units`.
        config = super().get_config()
        config.update({"units": self.units})
        return config

def residual_block(x, units, dropout_rate=0.3):
    """Dense residual block: Dense-BN-Dropout-Dense-BN plus a skip connection.

    If the input width differs from ``units`` the skip path is projected with
    a linear Dense layer before the addition. ReLU is applied to the sum.
    """
    shortcut = x
    h = Dense(units, activation='relu')(x)
    h = BatchNormalization()(h)
    h = Dropout(dropout_rate)(h)
    h = Dense(units, activation=None)(h)
    h = BatchNormalization()(h)
    if shortcut.shape[-1] != units:
        shortcut = Dense(units, activation=None)(shortcut)
    merged = Add()([shortcut, h])
    return tf.keras.activations.relu(merged)

def build_model(input_shape, num_classes, loss_choice):
    """Assemble the Conv1D -> attention -> residual -> dense classifier.

    The output layer uses softmax only when ``loss_choice`` is "cce"; every
    other loss gets independent sigmoid outputs.
    """
    head_activation = "softmax" if loss_choice.lower() == "cce" else "sigmoid"

    inputs = Input(shape=input_shape)

    # Convolutional feature extractor
    h = Conv1D(filters=32, kernel_size=3, activation='relu')(inputs)
    h = MaxPooling1D(pool_size=2)(h)
    h = BatchNormalization()(h)
    h = Dropout(0.3)(h)
    h = Flatten()(h)
    h = Dense(64, activation='relu')(h)

    # Attention gate followed by two residual blocks
    h = Attention(64)(h)
    for _ in range(2):
        h = residual_block(h, 64)

    # Classification head
    h = Dense(32, activation='relu')(h)
    h = BatchNormalization()(h)
    h = Dropout(0.3)(h)
    outputs = Dense(num_classes, activation=head_activation, dtype='float32')(h)
    return Model(inputs=inputs, outputs=outputs)

# --------------------
# Load NSL-KDD
# --------------------
# Read the predefined train/test splits as-is (no extra parsing options).
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

# Name of the column holding the class label in both CSVs.
label_col = "labels"

# Binary mapping: normal -> 0, attack -> 1
def make_binary(y):
    """Map a label Series to a 0/1 NumPy array (0 = "normal", 1 = anything else).

    The comparison is case-insensitive and is done on the string form of
    each value.
    """
    lowered = y.astype(str).str.lower()
    is_attack = lowered.ne("normal")
    return is_attack.astype(int).values

# Binary targets: 0 = normal, 1 = attack.
y_train_raw = make_binary(train_df[label_col])
y_test_raw  = make_binary(test_df[label_col])

# Feature dataframe (drop label)
X_train_df = train_df.drop(columns=[label_col], errors="ignore")
X_test_df  = test_df.drop(columns=[label_col],  errors="ignore")

# Categorical columns in NSL-KDD
cat_cols = ["protocol_type", "service", "flag"]
for c in cat_cols:
    if c not in X_train_df.columns:
        raise ValueError(f"Expected categorical column '{c}' not found in data.")

# Encode categoricals jointly (train+test) to keep vocab consistent
full = pd.concat([X_train_df, X_test_df], axis=0, ignore_index=True)
for c in cat_cols:
    le = LabelEncoder()
    full[c] = le.fit_transform(full[c].astype(str))
# Split back by row position; the train slice keeps its original length, so
# the second len(X_train_df) still marks the train/test boundary.
X_train_df = full.iloc[:len(X_train_df)].copy()
X_test_df  = full.iloc[len(X_train_df):].copy()

# --------------------
# MI feature selection on TRAIN ONLY
# --------------------
mi_scores = mutual_info_classif(X_train_df.values, y_train_raw, discrete_features='auto',
                                random_state=RANDOM_STATE)
mi_ranking = pd.Series(mi_scores, index=X_train_df.columns).sort_values(ascending=False)
top_features = mi_ranking.head(TOP_K).index.tolist()
print("Top-{} MI features:".format(TOP_K), top_features)

X_train_raw = X_train_df[top_features].values
X_test_raw  = X_test_df[top_features].values

# --------------------
# SMOTE, scale, reshape
# --------------------
# Oversampling is applied to the training split only, on unscaled features.
sm = SMOTE(random_state=RANDOM_STATE)
X_train_bal, y_train_bal = sm.fit_resample(X_train_raw, y_train_raw)
X_train_bal, y_train_bal = shuffle(X_train_bal, y_train_bal, random_state=RANDOM_STATE)

# Scaler statistics come from the balanced training data and are reused for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_bal)
X_test  = scaler.transform(X_test_raw)

# One-hot targets for the two-unit NN output.
y_train = to_categorical(y_train_bal, num_classes=2)
y_test  = to_categorical(y_test_raw,  num_classes=2)
num_classes = 2

# Conv1D expects (samples, steps, channels): each feature becomes one step.
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_cnn  = X_test.reshape((X_test.shape[0],  X_test.shape[1],  1))

# --------------------
# Train NN
# --------------------
tf.keras.backend.clear_session()
model = build_model((X_train.shape[1], 1), num_classes, LOSS_CHOICE)
loss_fn = get_loss(LOSS_CHOICE)
# NOTE(review): decay_steps is a fixed 2000 optimizer steps regardless of
# EPOCHS and dataset size — confirm this is the intended schedule length.
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate=1e-3, decay_steps=2000)
optimizer   = tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=1.0)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Stop on stalled validation accuracy and roll back to the best weights.
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=VAL_PATIENCE, restore_best_weights=True)]
# validation_split carves the validation set out of the SMOTE-balanced training data.
history = model.fit(
    X_train_cnn, y_train,
    epochs=EPOCHS, batch_size=BATCH_SIZE,
    validation_split=0.1, callbacks=callbacks, verbose=2
)

# --------------------
# Train XGBoost
# --------------------
# Trained on the same scaled, balanced features (2-D) with integer labels.
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False, eval_metric='logloss', n_jobs=-1,
    tree_method="hist", max_depth=6, n_estimators=500, learning_rate=0.08,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0
)
xgb_model.fit(X_train, y_train_bal)

# --------------------
# Inference + metrics
# --------------------
# Blend the two models' per-class scores; both arrays are (n_samples, 2).
preds_nn  = model.predict(X_test_cnn, verbose=0)
preds_xgb = xgb_model.predict_proba(X_test)
final_preds = ENSEMBLE_W_NN * preds_nn + ENSEMBLE_W_XGB * preds_xgb

final_class = np.argmax(final_preds, axis=1)
true_class  = np.argmax(y_test, axis=1)

print("\nClassification Report (Weighted Voting Ensemble):")
print(classification_report(true_class, final_class, digits=4))

acc       = accuracy_score(true_class, final_class)
precision = precision_score(true_class, final_class, average='weighted', zero_division=0)
recall    = recall_score(true_class, final_class, average='weighted', zero_division=0)
f1        = f1_score(true_class, final_class, average='weighted', zero_division=0)

# Binary ROC: rank samples by a single attack-vs-normal score instead of
# ravelling the one-hot matrix (which mixes both columns into one curve).
# The margin is > 0 exactly when argmax picks the attack class.
attack_score = final_preds[:, 1] - final_preds[:, 0]
fpr, tpr, _ = roc_curve(true_class, attack_score)
roc_auc   = auc(fpr, tpr)

print(f"\nAccuracy:  {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"AUC:       {roc_auc:.4f}")

# Pass an explicit Axes: ConfusionMatrixDisplay.plot() otherwise creates its
# own figure and a preceding plt.figure() would be left empty.
_, ax_cm = plt.subplots()
ConfusionMatrixDisplay(confusion_matrix(true_class, final_class)).plot(cmap='Blues', ax=ax_cm)
ax_cm.set_title("Confusion Matrix")

plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.title("ROC Curve"); plt.legend()

plt.figure()
plt.plot(history.history['accuracy'], label='Train Acc')
plt.plot(history.history['val_accuracy'], label='Val Acc')
plt.xlabel("Epochs"); plt.ylabel("Accuracy"); plt.title("Accuracy over Epochs"); plt.legend()

plt.figure()
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel("Epochs"); plt.ylabel("Loss"); plt.title("Loss over Epochs"); plt.legend()
plt.show()


UNSW-NB15

In [None]:
# === Imports ===
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import (classification_report, accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay)
import tensorflow as tf
from tensorflow.keras.layers import (Dense, Dropout, BatchNormalization, Input, Conv1D,
                                     MaxPooling1D, Flatten, Add, Layer, Multiply)
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical
import xgboost as xgb
import matplotlib.pyplot as plt


# =========================
# Key looked up by get_loss(); also the default loss_choice of build_model(),
# where "cce" selects a softmax output and anything else a sigmoid output.
LOSS_CHOICE = "damce"  # one of: "cce","bce","mse","mae_ce","mse_ce","focal","dice","tversky","poly1","damce"

# === Custom Layers ===
class Attention(Layer):
    """Gating layer that rescales its input by a learned sigmoid score.

    ``score = sigmoid(Dense(1)(tanh(Dense(units)(inputs))))``; the score has a
    trailing dimension of 1 and is broadcast over the input's last axis.

    Args:
        units: Width of the hidden tanh projection.
        **kwargs: Standard Keras ``Layer`` arguments (e.g. ``name``).
    """
    def __init__(self, units, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.units = units
        self.dense = Dense(units, activation='tanh')
        self.score = Dense(1, activation='sigmoid')

    def call(self, inputs):
        scores = self.score(self.dense(inputs))
        # Broadcasting multiply; no new Multiply layer is built per call.
        return inputs * scores

    def get_config(self):
        # Lets a saved model rebuild this layer with the same `units`.
        config = super(Attention, self).get_config()
        config.update({"units": self.units})
        return config

def residual_block(x, units, dropout_rate=0.3):
    """Two-layer dense residual block with batch norm and dropout.

    The skip path is linearly projected to ``units`` when its width differs;
    the sum of both paths goes through ReLU.
    """
    skip = x

    # Main path: Dense(relu) -> BN -> Dropout -> Dense(linear) -> BN
    main = Dropout(dropout_rate)(
        BatchNormalization()(Dense(units, activation='relu')(x))
    )
    main = BatchNormalization()(Dense(units, activation=None)(main))

    needs_projection = skip.shape[-1] != units
    if needs_projection:
        skip = Dense(units, activation=None)(skip)

    summed = Add()([skip, main])
    return tf.keras.activations.relu(summed)

# =========================
# Loss functions (full set)
# =========================
def loss_cce():
    """Create a Keras categorical cross-entropy loss."""
    cce = tf.keras.losses.CategoricalCrossentropy()
    return cce

def loss_bce():
    """Create a Keras binary cross-entropy loss."""
    bce = tf.keras.losses.BinaryCrossentropy()
    return bce

def loss_mse():
    """Create a Keras mean-squared-error loss."""
    mse = tf.keras.losses.MeanSquaredError()
    return mse

def mae_ce_loss(alpha=0.5):
    """Return ``fn(y_true, y_pred) = alpha * MAE + (1 - alpha) * BCE``."""
    l1 = tf.keras.losses.MeanAbsoluteError()
    ce = tf.keras.losses.BinaryCrossentropy()

    def fn(y_true, y_pred):
        weighted_l1 = alpha * l1(y_true, y_pred)
        weighted_ce = (1.0 - alpha) * ce(y_true, y_pred)
        return weighted_l1 + weighted_ce

    return fn

def mse_ce_loss(alpha=0.5):
    """Return ``fn(y_true, y_pred) = alpha * MSE + (1 - alpha) * BCE``."""
    l2 = tf.keras.losses.MeanSquaredError()
    ce = tf.keras.losses.BinaryCrossentropy()

    def fn(y_true, y_pred):
        weighted_l2 = alpha * l2(y_true, y_pred)
        weighted_ce = (1.0 - alpha) * ce(y_true, y_pred)
        return weighted_l2 + weighted_ce

    return fn

def focal_loss(gamma=2.0, alpha=0.25):
    """Build a binary focal loss: ``mean(alpha * (1 - p_t)^gamma * CE)``.

    Args:
        gamma: Focusing exponent applied to ``1 - p_t``.
        alpha: Constant scale factor applied to every element.

    Returns:
        A callable ``fn(y_true, y_pred)`` returning a scalar tensor.
    """
    def fn(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
        # Element-wise BCE, shape (B, C). The Keras helper
        # binary_crossentropy() returns per-sample values of shape (B,),
        # which do not broadcast against the (B, C) focal weight.
        ce = -(y_true * tf.math.log(y_pred)
               + (1. - y_true) * tf.math.log(1. - y_pred))
        p_t = y_true * y_pred + (1. - y_true) * (1. - y_pred)
        return tf.reduce_mean(alpha * tf.pow(1. - p_t, gamma) * ce)
    return fn

def dice_loss(eps=1e-6):
    """Return a soft Dice loss over all elements of the batch (``eps``-smoothed)."""
    def fn(y_true, y_pred):
        truth = tf.cast(y_true, tf.float32)
        numerator = 2. * tf.reduce_sum(truth * y_pred) + eps
        denominator = tf.reduce_sum(truth) + tf.reduce_sum(y_pred) + eps
        return 1. - numerator / denominator
    return fn

def tversky_loss(alpha=0.7, beta=0.3, eps=1e-6):
    """Return a Tversky loss over the whole batch.

    ``alpha`` penalises soft false negatives, ``beta`` soft false positives.
    """
    def fn(y_true, y_pred):
        truth = tf.cast(y_true, tf.float32)
        hits = tf.reduce_sum(truth * y_pred)
        false_alarms = tf.reduce_sum((1 - truth) * y_pred)
        misses = tf.reduce_sum(truth * (1 - y_pred))
        score = (hits + eps) / (hits + alpha * misses + beta * false_alarms + eps)
        return 1. - score
    return fn

def poly1_bce(epsilon=1.0):
    """Build a Poly-1 binary cross-entropy: ``mean(CE + epsilon * (1 - p_t))``.

    Args:
        epsilon: Weight of the first-order polynomial term.

    Returns:
        A callable ``fn(y_true, y_pred)`` returning a scalar tensor.
    """
    def fn(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
        # Element-wise BCE keeps the (B, C) shape of p_t; the Keras helper
        # returns (B,), and (B,) + (B, C) is not a valid broadcast.
        bce = -(y_true * tf.math.log(y_pred)
                + (1.0 - y_true) * tf.math.log(1.0 - y_pred))
        p_t = y_true * y_pred + (1.0 - y_true) * (1.0 - y_pred)
        return tf.reduce_mean(bce + epsilon * (1.0 - p_t))
    return fn

def damce_loss(alpha=0.1, gamma=2.0):
    """
    Difficulty-Aware MSE+CE (DAMCE).

    loss = mean(alpha * w * CE + (1 - alpha) * (1 - w) * MSE), where CE and
    MSE are per-sample values, w = (1 - p_t)^gamma and p_t is the mean over
    the last axis of y*p + (1-y)*(1-p).
    """
    def fn(y_true, y_pred):
        labels = tf.cast(y_true, tf.float32)
        floor = tf.keras.backend.epsilon()
        probs = tf.clip_by_value(y_pred, floor, 1. - floor)

        per_sample_ce = tf.keras.losses.binary_crossentropy(labels, probs)     # (B,)
        per_sample_mse = tf.reduce_mean(tf.square(labels - probs), axis=-1)    # (B,)

        match = labels * probs + (1.0 - labels) * (1.0 - probs)                # (B, C)
        confidence = tf.reduce_mean(match, axis=-1)                            # (B,)
        difficulty = tf.pow(1.0 - confidence, gamma)                           # (B,)

        combined = (alpha * difficulty * per_sample_ce
                    + (1.0 - alpha) * (1.0 - difficulty) * per_sample_mse)
        return tf.reduce_mean(combined)
    return fn

def get_loss(name):
    """Return the loss selected by ``name`` (case-insensitive).

    Only the requested loss is built. Unknown names still fall back to
    binary cross-entropy, but a warning is issued so a misspelled
    LOSS_CHOICE does not go unnoticed.

    Args:
        name: One of "cce", "bce", "mse", "mae_ce", "mse_ce", "focal",
            "dice", "tversky", "poly1", "damce".

    Returns:
        A Keras loss object or a callable ``fn(y_true, y_pred)``.
    """
    import warnings

    table = {
        "cce":     loss_cce,
        "bce":     loss_bce,
        "mse":     loss_mse,
        "mae_ce":  lambda: mae_ce_loss(alpha=0.5),
        "mse_ce":  lambda: mse_ce_loss(alpha=0.5),
        "focal":   focal_loss,
        "dice":    dice_loss,
        "tversky": tversky_loss,
        "poly1":   lambda: poly1_bce(epsilon=1.0),
        "damce":   lambda: damce_loss(alpha=0.9, gamma=2.0),
    }
    make = table.get(name.lower())
    if make is None:
        warnings.warn(
            f"Unknown loss {name!r}; falling back to binary cross-entropy.",
            stacklevel=2,
        )
        make = loss_bce
    return make()

def build_model(input_shape, num_classes, loss_choice=LOSS_CHOICE):
    """Build the Conv1D + attention + residual classifier.

    The last layer is softmax when ``loss_choice`` is "cce" and sigmoid for
    every other loss name.
    """
    if loss_choice.lower() == "cce":
        final_activation = "softmax"
    else:
        final_activation = "sigmoid"

    model_input = Input(shape=input_shape)

    # Conv front-end
    feat = Conv1D(filters=32, kernel_size=3, activation='relu')(model_input)
    feat = MaxPooling1D(pool_size=2)(feat)
    feat = BatchNormalization()(feat)
    feat = Dropout(0.3)(feat)
    feat = Flatten()(feat)
    feat = Dense(64, activation='relu')(feat)

    # Attention gate, then two residual blocks of width 64
    feat = Attention(64)(feat)
    feat = residual_block(feat, 64)
    feat = residual_block(feat, 64)

    # Dense head
    feat = Dense(32, activation='relu')(feat)
    feat = BatchNormalization()(feat)
    feat = Dropout(0.3)(feat)
    model_output = Dense(num_classes, activation=final_activation, dtype='float32')(feat)
    return Model(inputs=model_input, outputs=model_output)

# === Load and preprocess data ===
train_df = pd.read_csv('/content/UNSW_NB15_training-set.csv')
test_df  = pd.read_csv('/content/UNSW_NB15_testing-set.csv')
# 'id' and 'attack_cat' are removed; only the 'label' column is used as target.
train_df = train_df.drop(columns=['id', 'attack_cat'])
test_df  = test_df.drop(columns=['id', 'attack_cat'])

# === Encode categorical columns ===
categorical_cols = ['proto', 'service', 'state']
# One encoder object is re-fitted per column on train+test values combined,
# so categories that appear only in the test split still get a code.
le = LabelEncoder()
for col in categorical_cols:
    combined = pd.concat([train_df[col], test_df[col]], ignore_index=True)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col])
    test_df[col]  = le.transform(test_df[col])

# === Prepare raw features and labels ===
X_train_raw = train_df.drop(columns=['label']).values
y_train_raw = train_df['label'].values
X_test_raw  = test_df.drop(columns=['label']).values
y_test_raw  = test_df['label'].values

# === Feature Selection using Mutual Information ===
# Scores are computed on the training split only.
feature_names = train_df.drop(columns=['label']).columns
mi_scores  = mutual_info_classif(X_train_raw, y_train_raw, discrete_features='auto', random_state=42)
mi_ranking = pd.Series(mi_scores, index=feature_names).sort_values(ascending=False)
top_features = mi_ranking.head(30).index.tolist()
print("Top 30 selected features:\n", top_features)

# === Reduce data to selected features ===
X_train_raw = train_df[top_features].values
X_test_raw  = test_df[top_features].values

# === Apply SMOTE and shuffle ===
# Oversampling touches the training split only, on unscaled features.
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train_raw, y_train_raw)
X_train_bal, y_train_bal = shuffle(X_train_bal, y_train_bal, random_state=42)

# === Scaling ===
# Fitted on the balanced training data, then applied unchanged to test.
scaler  = StandardScaler()
X_train = scaler.fit_transform(X_train_bal)
X_test  = scaler.transform(X_test_raw)

# === One-hot encode labels ===
# The number of classes is inferred from the labels present in each array.
y_train = to_categorical(y_train_bal)
y_test  = to_categorical(y_test_raw)
num_classes = y_train.shape[1]

# === Reshape for CNN ===
# Conv1D input layout: (samples, steps, channels) with one channel per feature step.
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_cnn  = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# === Train CNN model ===
tf.keras.backend.clear_session()
model = build_model((X_train.shape[1], 1), num_classes, loss_choice=LOSS_CHOICE)
loss_fn = get_loss(LOSS_CHOICE)

# safer schedule
# NOTE(review): decay_steps is a fixed 2000 optimizer steps, independent of
# the epoch count and dataset size — confirm this is the intended length.
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate=1e-3, decay_steps=2000)
optimizer   = tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=1.0)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])
# Stop when val_accuracy stalls for 12 epochs and restore the best weights.
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=12, restore_best_weights=True)]
# validation_split takes the validation set from the SMOTE-balanced training data.
history = model.fit(X_train_cnn, y_train, epochs=100, batch_size=1024,
                    validation_split=0.1, callbacks=callbacks, verbose=2)

# === Train XGBoost ===
# Binary target, so use the binary 'logloss' metric ('mlogloss' is the
# multi-class variant); this also matches the NSL-KDD cell.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
xgb_model.fit(X_train, y_train_bal)

# === Ensemble Inference (Weighted Voting) ===
# Both prediction arrays are (n_samples, num_classes); blend them 70/30.
preds_nn  = model.predict(X_test_cnn, verbose=0)
preds_xgb = xgb_model.predict_proba(X_test)
final_preds = 0.7 * preds_nn + 0.3 * preds_xgb
final_class = np.argmax(final_preds, axis=1)
true_class  = np.argmax(y_test, axis=1)

print("\nClassification Report (Weighted Voting Ensemble):")
print(classification_report(true_class, final_class))

# === Metrics ===
acc       = accuracy_score(true_class, final_class)
precision = precision_score(true_class, final_class, average='weighted', zero_division=0)
recall    = recall_score(true_class, final_class, average='weighted', zero_division=0)
f1        = f1_score(true_class, final_class, average='weighted', zero_division=0)

# Binary ROC on one attack-vs-normal score rather than the ravelled one-hot
# matrix (which mixes both output columns into a single curve). The margin
# is positive exactly when argmax selects class 1.
attack_score = final_preds[:, 1] - final_preds[:, 0]
fpr, tpr, _ = roc_curve(true_class, attack_score)
roc_auc   = auc(fpr, tpr)

print(f"\nAccuracy:  {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"AUC:       {roc_auc:.4f}")

# === Confusion Matrix ===
# Draw into an explicit Axes; ConfusionMatrixDisplay.plot() would otherwise
# open its own figure and leave a preceding plt.figure() blank.
_, ax_cm = plt.subplots()
ConfusionMatrixDisplay(confusion_matrix(true_class, final_class)).plot(cmap='Blues', ax=ax_cm)
ax_cm.set_title("Confusion Matrix")

# === ROC Curve ===
plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()

# === Accuracy over Epochs ===
plt.figure()
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("Accuracy over Epochs")
plt.legend()

# === Loss over Epochs ===
plt.figure()
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Loss over Epochs")
plt.legend()

plt.show()


CICIDS2017

In [None]:
# ==================================================
# BLOCK 1: MI Feature Selection + Stats Cache
# ==================================================
import os, gc, math, json, numpy as np, pandas as pd
from sklearn.feature_selection import mutual_info_classif
from pandas.errors import ParserError

# --------------------
# CONFIG (static per dataset)
# --------------------
# One CSV per capture day; all files are streamed in this order.
FILES = {
    "Mon": "/content/monday.csv",
    "Tue": "/content/tuesday.csv",
    "Wed": "/content/wednesday.csv",
    "Thu": "/content/thursday.csv",
    "Fri": "/content/friday.csv",
}
# Rows per chunk when streaming the CSVs.
CHUNKSIZE      = 250_000
MI_CAP_PER_CLS = 150_000   # sample cap PER CLASS for MI
# Number of highest-MI features written to the cache.
TOP_K          = 30
# Seed for DataFrame.sample and mutual_info_classif.
RANDOM_STATE   = 42

# Columns removed before numeric conversion (identifiers, timestamp, label fields).
DROP_COLS = ["id","Flow ID","Src IP","Dst IP","Timestamp","Label","Attempted Category"]
# JSON file where the selected features and running statistics are stored.
MI_CACHE  = "/content/mi_cache.json"

# --------------------
# Helpers
# --------------------
def read_csv_smart(path, chunksize):
    try:
        it = pd.read_csv(path, chunksize=chunksize, low_memory=False,
                         na_values=["NaN","Infinity","-Infinity"])
        yield next(it)
        for c in it: yield c
    except ParserError:
        # If default separator fails, try semicolon
        for c in pd.read_csv(path, chunksize=chunksize, low_memory=False,
                             sep=';', na_values=["NaN","Infinity","-Infinity"]):
            yield c
    except Exception as e:
        print(f"An unexpected error occurred while reading {path}: {e}")
        raise

def map_binary_labels(df):
    """Return a 0/1 array for a chunk, or None if it has no "Label" column.

    A row is 0 when its label is exactly "BENIGN" or when it is flagged as
    attempted; every other row is 1. "Attempted" is read from the
    "Attempted Category" column when present (any value other than -1 or
    missing), otherwise from a " - Attempted" substring in the label.
    """
    if "Label" not in df.columns:
        return None

    labels = df["Label"].astype(str)

    if "Attempted Category" in df.columns:
        category = df["Attempted Category"].fillna(-1).astype("int32")
        attempted = category.ne(-1).values
    else:
        attempted = labels.str.contains(" - Attempted", regex=False).values

    benign = labels.eq("BENIGN").values
    return np.where(benign | attempted, 0, 1)

# Welford running mean/var (NaN-safe per column)
class RunningMoments:
    """Streaming per-column mean and standard deviation that ignores NaNs.

    Batches are merged with the pairwise (Chan et al.) update. Each column
    keeps its own observation count: columns can have different numbers of
    non-NaN values, and a single shared counter would corrupt every column
    after the first.
    """
    def __init__(self):
        self.n = 0          # becomes a per-column int64 array on first update
        self.mean = None    # per-column running mean (float64)
        self.M2 = None      # per-column sum of squared deviations (float64)

    def update(self, X_np):
        """Fold a 2-D array (rows x columns) into the running statistics."""
        if self.mean is None:
            width = X_np.shape[1]
            self.n    = np.zeros(width, dtype=np.int64)
            self.mean = np.zeros(width, dtype=np.float64)
            self.M2   = np.zeros(width, dtype=np.float64)
        valid = ~np.isnan(X_np)
        counts = valid.sum(axis=0).astype(np.int64)
        for j in np.flatnonzero(counts):
            cnt = int(counts[j])
            # Accumulate in float64 even when the chunk is float32.
            xj = X_np[valid[:, j], j].astype(np.float64)
            batch_mean = float(xj.mean())
            prev_n = int(self.n[j])
            tot_n = prev_n + cnt
            delta = batch_mean - self.mean[j]
            self.mean[j] += delta * (cnt / tot_n)
            self.M2[j] += xj.var(ddof=0) * cnt + (delta ** 2) * (prev_n * cnt / tot_n)
            self.n[j] = tot_n

    def finalize(self):
        """Return ``(mean, std)`` as float32 arrays (population std, floored at 1e-6).

        Raises:
            ValueError: If ``update`` was never called.
        """
        if self.mean is None:
            raise ValueError("RunningMoments.finalize() called before any update()")
        var = self.M2 / np.maximum(1, self.n)
        std = np.sqrt(np.maximum(var, 1e-12))
        return self.mean.astype(np.float32), std.astype(np.float32)

# --------------------
# PASS 1: schema + stats + MI
# --------------------
# Single streaming pass over all day files that (a) fixes the feature column
# order, (b) accumulates per-column mean/std, (c) counts classes and
# (d) collects a size-capped sample per class for mutual information.
print("[PASS1] Streaming for schema, running stats, MI sample…")
keep_cols = None                 # column order taken from the first labelled chunk
moments = RunningMoments()
mi_parts0, mi_parts1 = [], []    # sampled rows for class 0 / class 1
n0 = n1 = 0                      # rows collected so far per class
total_rows = 0
global_counts = {0:0, 1:0}

for _, path in FILES.items():
    for chunk in read_csv_smart(path, CHUNKSIZE):
        y = map_binary_labels(chunk)
        # Chunks without a "Label" column are skipped entirely.
        if y is None: continue

        total_rows += len(y)
        global_counts[0] += int((y==0).sum())
        global_counts[1] += int((y==1).sum())

        # Drop identifier/label columns, coerce the rest to numeric and turn
        # infinities into NaN so they are ignored by the running statistics.
        X = chunk.drop(columns=[c for c in DROP_COLS if c in chunk.columns], errors="ignore")
        X = X.apply(pd.to_numeric, errors="coerce").replace([np.inf,-np.inf], np.nan)

        if keep_cols is None:
            keep_cols = list(X.columns)
        # Align every chunk to the first chunk's columns (missing ones become NaN).
        X = X.reindex(columns=keep_cols).astype(np.float32)

        # update running moments
        moments.update(X.values)

        # bounded MI sample (stratified)
        # Rows are taken from chunks in stream order until each class cap is
        # reached, so later chunks contribute nothing once a cap is full.
        need0 = max(0, MI_CAP_PER_CLS - n0)
        need1 = max(0, MI_CAP_PER_CLS - n1)
        if need0 > 0:
            X0 = X[y==0]
            if len(X0) > 0:
                take0 = X0.sample(n=min(need0, len(X0)), random_state=RANDOM_STATE)
                mi_parts0.append(take0); n0 += len(take0)
        if need1 > 0:
            X1 = X[y==1]
            if len(X1) > 0:
                take1 = X1.sample(n=min(need1, len(X1)), random_state=RANDOM_STATE)
                mi_parts1.append(take1); n1 += len(take1)

        # Free the chunk before reading the next one.
        del chunk, X
        gc.collect()

print(f"[PASS1] Rows={total_rows:,} | global class counts: {global_counts}")
col_means, col_stds = moments.finalize()

# MI on sample
if mi_parts0 or mi_parts1:
    # Class-0 parts come first, then class-1 parts; y_mi is built in the same order.
    X_mi = pd.concat(mi_parts0 + mi_parts1, ignore_index=True)
    y_mi = np.array([0]*n0 + [1]*n1, dtype=np.int8)
    # Impute remaining NaNs with the streamed column means before scoring.
    mi_means = pd.Series(col_means, index=keep_cols)
    X_mi = X_mi.fillna(mi_means)
    mi_scores = mutual_info_classif(X_mi.values, y_mi, discrete_features='auto', random_state=RANDOM_STATE)
    top_features = pd.Series(mi_scores, index=keep_cols).sort_values(ascending=False).head(TOP_K).index.tolist()
else:
    # No sample collected: fall back to the first TOP_K columns in file order.
    top_features = keep_cols[:TOP_K]
print(f"[PASS1] Top-{TOP_K} features: {top_features[:10]} …")

# class weights / xgb spw
# Both use the negative/positive ratio from the global class counts.
neg, pos = global_counts[0], global_counts[1]
class_weight = {0: 1.0, 1: (neg / max(1, pos))}
xgb_spw = neg / max(1, pos)

# save cache
# JSON turns the integer class_weight keys into strings; the loader in the
# next cell converts them back with int().
with open(MI_CACHE, "w") as f:
    json.dump({
        "keep_cols": keep_cols,
        "top_features": top_features,
        "col_means": col_means.tolist(),
        "col_stds": col_stds.tolist(),
        "class_weight": class_weight,
        "xgb_spw": float(xgb_spw),
        "total_rows": int(total_rows)
    }, f)
print(f"[PASS1] Cached MI & stats -> {MI_CACHE}")

In [None]:
# ==================================================
# BLOCK 2 (CNS 5-day): Train + Evaluate (loads MI cache; fast experiments)
# ==================================================
import os, gc, math, json, numpy as np, pandas as pd, tensorflow as tf, xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Conv1D, MaxPooling1D, Flatten, Add, Layer, Multiply
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical

# --------------------
# CONFIG
# --------------------
# Same day files as the feature-selection cell.
FILES = {
    "Mon": "/content/monday.csv",
    "Tue": "/content/tuesday.csv",
    "Wed": "/content/wednesday.csv",
    "Thu": "/content/thursday.csv",
    "Fri": "/content/friday.csv",
}
CHUNKSIZE      = 250_000
BATCH_SIZE_NN  = 1024
EPOCHS         = 10
LOSS_CHOICE    = "damce"        # "bce","cce","mse","mae_ce","mse_ce","focal","dice","tversky","poly1","bhfdl","damce"
ENSEMBLE_W_NN  = 0.7
ENSEMBLE_W_XGB = 0.3
XGB_MAX_DEPTH  = 6
XGB_ROUNDS_PER_BATCH = 30
RANDOM_STATE   = 42

DROP_COLS = ["id","Flow ID","Src IP","Dst IP","Timestamp","Label","Attempted Category"]
MI_CACHE  = "/content/mi_cache.json"

# --------------------
# Load MI cache
# --------------------
# The cache is the JSON written by the feature-selection cell.
with open(MI_CACHE, "r") as f:
    _c = json.load(f)
keep_cols    = _c["keep_cols"]
top_features = _c["top_features"]
col_means    = np.array(_c["col_means"], dtype=np.float32)
col_stds     = np.array(_c["col_stds"],  dtype=np.float32)
# Per-column lookup tables; near-zero stds are replaced by 1.0.
mean_dict    = {c: col_means[i] for i, c in enumerate(keep_cols)}
std_dict     = {c: (col_stds[i] if col_stds[i] > 1e-12 else 1.0) for i, c in enumerate(keep_cols)}
# JSON object keys are strings, so convert class ids back to int.
class_weight = {int(k): float(v) for k,v in _c["class_weight"].items()}
xgb_spw      = float(_c["xgb_spw"])
total_rows   = int(_c["total_rows"])
feat_dim     = len(top_features)
print(f"[CACHE] Loaded. feat_dim={feat_dim}, total_rows≈{total_rows:,}")

# --------------------
# Losses
# --------------------
def loss_cce():
    """Instantiate Keras categorical cross-entropy."""
    return tf.keras.losses.CategoricalCrossentropy()

def loss_bce():
    """Instantiate Keras binary cross-entropy."""
    return tf.keras.losses.BinaryCrossentropy()

def loss_mse():
    """Instantiate Keras mean squared error (Brier-like; stable)."""
    return tf.keras.losses.MeanSquaredError()
def damce_loss(alpha=0.5, gamma=2.0):
    """
    Difficulty-Aware MSE+CE (DAMCE).

    - CE weight  w = (1 - p_t)^gamma   (lower p_t -> larger CE weight)
    - MSE weight 1 - w                 (higher p_t -> larger MSE weight)
    - p_t is the mean over the last axis of y*p + (1-y)*(1-p)

    Final = mean(alpha * w * CE + (1 - alpha) * (1 - w) * MSE)
    """
    def fn(y_true, y_pred):
        y = tf.cast(y_true, tf.float32)
        small = tf.keras.backend.epsilon()
        p = tf.clip_by_value(y_pred, small, 1. - small)

        # Per-sample error terms, both of shape (B,)
        ce_term = tf.keras.losses.binary_crossentropy(y, p)
        mse_term = tf.reduce_mean(tf.square(y - p), axis=-1)

        # Per-sample confidence and the difficulty weight derived from it
        per_class = y * p + (1.0 - y) * (1.0 - p)       # (B, C)
        p_t = tf.reduce_mean(per_class, axis=-1)        # (B,)
        w = tf.pow(1.0 - p_t, gamma)                    # (B,)

        return tf.reduce_mean(alpha * w * ce_term + (1.0 - alpha) * (1.0 - w) * mse_term)
    return fn

def focal_loss(gamma=2.0, alpha=0.25):
    """Build a binary focal loss: ``mean(alpha * (1 - p_t)^gamma * CE)``.

    Args:
        gamma: Focusing exponent applied to ``1 - p_t``.
        alpha: Constant scale factor applied to every element.

    Returns:
        A callable ``fn(y_true, y_pred)`` returning a scalar tensor.
    """
    def fn(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
        # Element-wise BCE with the same (B, C) shape as p_t. The Keras
        # helper binary_crossentropy() reduces to (B,), which cannot be
        # broadcast against the (B, C) modulating factor.
        ce = -(y_true * tf.math.log(y_pred)
               + (1. - y_true) * tf.math.log(1. - y_pred))
        p_t = y_true * y_pred + (1. - y_true) * (1. - y_pred)
        return tf.reduce_mean(alpha * tf.pow(1. - p_t, gamma) * ce)
    return fn

def dice_loss(eps=1e-6):
    """Return a batch-level soft Dice loss; ``eps`` smooths the ratio."""
    def fn(y_true, y_pred):
        gt = tf.cast(y_true, tf.float32)
        intersection = tf.reduce_sum(gt * y_pred)
        mass = tf.reduce_sum(gt) + tf.reduce_sum(y_pred)
        coefficient = (2. * intersection + eps) / (mass + eps)
        return 1. - coefficient
    return fn

def tversky_loss(alpha=0.8,beta=0.2,eps=1e-6):
    """
    Build a Tversky loss computed over the whole batch at once.

    alpha: weight on the soft false-negative mass.
    beta:  weight on the soft false-positive mass.
    eps:   smoothing constant added to numerator and denominator.
    """
    def fn(y_true,y_pred):
        targets = tf.cast(y_true, tf.float32)
        # Soft confusion counts summed over every element of the batch.
        true_pos = tf.reduce_sum(targets * y_pred)
        false_pos = tf.reduce_sum((1 - targets) * y_pred)
        false_neg = tf.reduce_sum(targets * (1 - y_pred))
        index = (true_pos + eps) / (true_pos + alpha * false_neg + beta * false_pos + eps)
        return 1. - index
    return fn


def poly1_bce(epsilon=1.0):
    """
    Build a Poly-1 binary cross-entropy loss: mean(CE + epsilon * (1 - p_t)).

    epsilon: coefficient of the first-order polynomial term.

    The returned callable takes (y_true, y_pred) of identical shape, with
    y_pred holding probabilities, and returns a scalar.
    """
    def fn(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
        # Element-wise cross-entropy keeps the same shape as p_t.
        # tf.keras.losses.binary_crossentropy would average over the last
        # axis, and adding that reduced result to the element-wise polynomial
        # term fails (or mis-broadcasts) for 2-D targets.
        bce = -(y_true * tf.math.log(y_pred)
                + (1.0 - y_true) * tf.math.log(1.0 - y_pred))
        # Probability assigned to the correct outcome of each element.
        p_t = y_true * y_pred + (1.0 - y_true) * (1.0 - y_pred)
        poly = bce + epsilon * (1.0 - p_t)
        return tf.reduce_mean(poly)
    return fn
def mae_ce_loss(alpha=0.5):
    """
    Build a blended loss: alpha * MAE + (1 - alpha) * BCE.

    alpha: share of the mean-absolute-error term in the blend.
    """
    abs_err = tf.keras.losses.MeanAbsoluteError()
    cross_ent = tf.keras.losses.BinaryCrossentropy()

    def fn(y_true, y_pred):
        mae_part = alpha * abs_err(y_true, y_pred)
        ce_part = (1.0 - alpha) * cross_ent(y_true, y_pred)
        return mae_part + ce_part
    return fn

def mse_ce_loss(alpha=0.5):
    """
    Build a blended loss: alpha * MSE + (1 - alpha) * BCE.

    alpha: share of the mean-squared-error term in the blend.
    """
    sq_err = tf.keras.losses.MeanSquaredError()
    cross_ent = tf.keras.losses.BinaryCrossentropy()

    def fn(y_true, y_pred):
        mse_part = alpha * sq_err(y_true, y_pred)
        ce_part = (1.0 - alpha) * cross_ent(y_true, y_pred)
        return mse_part + ce_part
    return fn


def bhfdl(lambda_=0.5,gamma=2.0,alpha=0.25):
    """
    Build a hybrid loss: lambda_ * focal + (1 - lambda_) * dice.

    lambda_: share of the focal term in the blend.
    gamma, alpha: forwarded to focal_loss.
    """
    focal_fn = focal_loss(gamma=gamma, alpha=alpha)
    dice_fn = dice_loss()

    def fn(y_true,y_pred):
        focal_part = lambda_ * focal_fn(y_true, y_pred)
        dice_part = (1. - lambda_) * dice_fn(y_true, y_pred)
        return focal_part + dice_part
    return fn

def get_loss(name):
    """
    Look up a loss by its (case-insensitive) short name.

    name: one of "cce", "bce", "mse", "focal", "dice", "tversky", "poly1",
          "bhfdl", "mae_ce", "mse_ce", "damce".

    Returns the corresponding loss object / callable. An unrecognised name
    still falls back to binary cross-entropy, but a warning is emitted so a
    typo in the configuration does not silently change the training objective.
    """
    import warnings

    # Factories instead of instances: only the requested loss gets built.
    factories = {
        "cce":     loss_cce,
        "bce":     loss_bce,
        "mse":     loss_mse,
        "focal":   focal_loss,
        "dice":    dice_loss,
        "tversky": tversky_loss,
        "poly1":   lambda: poly1_bce(epsilon=1.0),
        "bhfdl":   bhfdl,
        "mae_ce":  lambda: mae_ce_loss(alpha=0.5),
        "mse_ce":  lambda: mse_ce_loss(alpha=0.5),
        "damce":   lambda: damce_loss(alpha=0.5, gamma=2.0),
    }
    n = name.lower()
    factory = factories.get(n)
    if factory is None:
        warnings.warn(
            f"Unknown loss name {name!r}; falling back to binary cross-entropy. "
            f"Known names: {sorted(factories)}",
            stacklevel=2,
        )
        factory = loss_bce
    return factory()


# --------------------
# Model
# --------------------
class Attention(Layer):
    """
    Gating layer: scales its input by a learned score in (0, 1).

    The input goes through Dense(units, tanh) and then Dense(1, sigmoid); the
    resulting score is multiplied back onto the input (broadcast over the
    last axis).

    units: width of the hidden tanh projection.
    **kwargs: standard Keras Layer arguments (name, dtype, trainable, ...).
    """

    def __init__(self, units, **kwargs):
        # Forward the standard Layer kwargs so the layer can be named and can
        # be rebuilt from its config when a saved model is reloaded.
        super().__init__(**kwargs)
        self.units = units
        self.dense = Dense(units, activation='tanh')
        self.score = Dense(1, activation='sigmoid')
        # Created once here rather than on every call().
        self.gate = Multiply()

    def call(self, inputs):
        scores = self.score(self.dense(inputs))
        return self.gate([inputs, scores])

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update({"units": self.units})
        return config

def residual_block(x, units, dropout_rate=0.3):
    """
    Two-layer dense residual block.

    Main path: Dense(relu) -> BatchNorm -> Dropout -> Dense(linear) -> BatchNorm.
    The skip path is the input itself, passed through a linear Dense layer
    only when its last dimension differs from `units`. The two paths are
    added and passed through ReLU.
    """
    shortcut = x

    branch = Dense(units, activation='relu')(x)
    branch = BatchNormalization()(branch)
    branch = Dropout(dropout_rate)(branch)
    branch = Dense(units, activation=None)(branch)
    branch = BatchNormalization()(branch)

    # Project the skip connection when the widths do not match.
    if shortcut.shape[-1] != units:
        shortcut = Dense(units, activation=None)(shortcut)

    merged = Add()([shortcut, branch])
    return tf.keras.activations.relu(merged)

def build_model(input_shape, num_classes=2):
    """
    Assemble the CNN -> attention gate -> residual dense network.

    input_shape: per-sample input shape handed to keras Input.
    num_classes: number of sigmoid output units.

    Returns an uncompiled keras Model.
    """
    net_in = Input(shape=input_shape)

    # Convolutional front-end, flattened to a feature vector.
    h = net_in
    for layer in (
        Conv1D(filters=32, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        BatchNormalization(),
        Dropout(0.3),
        Flatten(),
    ):
        h = layer(h)

    # 64-d feature vector, gated by the attention layer.
    features = Dense(64, activation='relu')(h)
    gated = Attention(64)(features)

    # Two residual dense blocks of width 64.
    h = residual_block(gated, 64)
    h = residual_block(h, 64)

    # Classification head.
    h = Dense(32, activation='relu')(h)
    h = BatchNormalization()(h)
    h = Dropout(0.3)(h)
    probs = Dense(num_classes, activation='sigmoid', dtype='float32')(h)

    return Model(inputs=net_in, outputs=probs)

# --------------------
# Streaming readers
# --------------------
def read_csv_smart(path, chunksize):
    try:
        it = pd.read_csv(path, chunksize=chunksize, low_memory=False,
                         na_values=["NaN","Infinity","-Infinity"])
        first = next(it)
        if first.shape[1] == 1: raise ValueError("Bad sep, retry ;")
        yield first
        for c in it: yield c
    except Exception:
        for c in pd.read_csv(path, chunksize=chunksize, low_memory=False, sep=';',
                             na_values=["NaN","Infinity","-Infinity"]):
            yield c

def map_binary_labels(df):
    """
    Derive 0/1 labels from a chunk's "Label" column.

    A row is labelled 0 when its label is exactly "BENIGN" or when it is
    flagged as attempted; every other row is labelled 1. "Attempted" is read
    from the "Attempted Category" column when present (any value other than
    -1, with missing values treated as -1), otherwise from the substring
    " - Attempted" in the label text.

    Returns a numpy array of 0/1, or None when the frame has no "Label" column.
    """
    if "Label" not in df.columns:
        return None

    labels = df["Label"].astype(str)

    if "Attempted Category" in df.columns:
        category = df["Attempted Category"].fillna(-1).astype("int32")
        is_attempted = category.ne(-1).values
    else:
        is_attempted = labels.str.contains(" - Attempted", regex=False).values

    is_benign = labels.eq("BENIGN").values
    is_negative = is_benign | is_attempted
    return np.where(is_negative, 0, 1)

def stream_batches(files_map, batch_size, selected_cols, means, stds, y_onehot=True, repeat_epochs=1):
    """
    Stream standardized (features, labels) batches from CSV files.

    files_map:     mapping whose values are CSV paths (the keys are ignored).
    batch_size:    number of rows per yielded batch.
    selected_cols: feature columns to keep, in output order.
    means, stds:   per-column statistics indexed by column name; used for
                   NaN filling (means) and for (x - mean) / std scaling.
    y_onehot:      if True labels are 2-class one-hot float32, otherwise a
                   flat int8 vector.
    repeat_epochs: number of full passes over all files.

    Yields (Xb, yb) tuples of numpy arrays. Within a file every batch holds
    exactly `batch_size` rows except the last one, which carries whatever is
    left. Buffers are reset per file, so batches never mix rows from two files.

    Relies on the module-level names CHUNKSIZE, DROP_COLS and keep_cols, which
    are defined outside this function.
    """
    cols = selected_cols
    for _ in range(repeat_epochs):
        for _, path in files_map.items():
            # Pending rows not yet emitted, kept as a list of array pieces.
            X_buf, y_buf = [], []
            for chunk in read_csv_smart(path, CHUNKSIZE):
                y = map_binary_labels(chunk)
                # Chunks without a usable label column are skipped entirely.
                if y is None: continue
                X = chunk.drop(columns=[c for c in DROP_COLS if c in chunk.columns], errors="ignore")
                # Align to the global column list; columns absent from the chunk become NaN.
                X = X.reindex(columns=keep_cols).astype(np.float32)
                # Fill missing values with the per-column mean.
                X = X.fillna(pd.Series(means))
                X = X[cols]
                # Standardize the selected columns.
                # NOTE(review): a zero entry in `stds` is not guarded against here — confirm upstream.
                X = (X - pd.Series({c: means[c] for c in cols})) / pd.Series({c: stds[c] for c in cols})

                Xv = X.values.astype(np.float32)
                yv = y.astype(np.int8)
                if y_onehot: yv = to_categorical(yv, num_classes=2).astype(np.float32)

                X_buf.append(Xv); y_buf.append(yv)

                # Emit full batches for as long as enough rows are buffered.
                while sum(len(b) for b in X_buf) >= batch_size:
                    need = batch_size; xb_parts, yb_parts = [], []
                    # Take rows from the front of the buffer until the batch is full.
                    while need > 0 and X_buf:
                        take = min(need, len(X_buf[0]))
                        xb_parts.append(X_buf[0][:take]); yb_parts.append(y_buf[0][:take])
                        # Keep the unconsumed tail of the front piece; drop the piece once empty.
                        X_buf[0] = X_buf[0][take:]; y_buf[0] = y_buf[0][take:]
                        if len(X_buf[0]) == 0: X_buf.pop(0); y_buf.pop(0)
                        need -= take
                    Xb = np.vstack(xb_parts)
                    yb = np.vstack(yb_parts) if y_onehot else np.concatenate(yb_parts).astype(np.int8)
                    yield Xb, yb

                # Drop per-chunk temporaries before reading the next chunk.
                del chunk, X, Xv, yv; gc.collect()

            # End of file: flush what is left. The loop above leaves fewer than
            # batch_size rows buffered, so this emits at most one short batch.
            if X_buf:
                Xb = np.vstack(X_buf)
                yb = np.vstack(y_buf) if y_onehot else np.concatenate(y_buf).astype(np.int8)
                for i in range(0, len(Xb), batch_size):
                    yield Xb[i:i+batch_size], yb[i:i+batch_size]
            gc.collect()

def steps_per_epoch(total_rows, batch_size):
    """Return the number of batches needed to cover `total_rows` rows (rounded up)."""
    batches = total_rows / batch_size
    return math.ceil(batches)

# --------------------
# Train NN (streamed over ALL rows)
# --------------------
# Build, compile and fit the neural network on batches streamed from FILES.
# feat_dim, total_rows, class_weight, FILES, BATCH_SIZE_NN, top_features,
# mean_dict and std_dict are defined outside this section.
print("[NN] Training…")
# Reset Keras global state before building a fresh model.
tf.keras.backend.clear_session()
nn = build_model((feat_dim, 1), num_classes=2)
loss_fn = get_loss(LOSS_CHOICE)
# Cosine learning-rate schedule with warm restarts: initial LR 0.001, first period 1000 steps.
lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(0.001, first_decay_steps=1000)
nn.compile(optimizer=tf.keras.optimizers.Adam(lr_schedule), loss=loss_fn, metrics=['accuracy'])

def gen():
    """Yield (features, one-hot labels) batches, with features reshaped to (rows, feat, 1)."""
    # The generator itself makes EPOCHS passes over the data; fit() below
    # slices that single stream into epochs of `spe` batches each.
    for xb, yb in stream_batches(FILES, BATCH_SIZE_NN, top_features, mean_dict, std_dict, y_onehot=True, repeat_epochs=EPOCHS):
        # Add a trailing channel axis to match the model's (feat_dim, 1) input.
        yield (xb.reshape((xb.shape[0], xb.shape[1], 1)), yb)

# Batch dimension is left as None because the last batch of a file can be short.
output_signature = (
    tf.TensorSpec(shape=(None, feat_dim, 1), dtype=tf.float32),
    tf.TensorSpec(shape=(None, 2), dtype=tf.float32),
)
ds = tf.data.Dataset.from_generator(gen, output_signature=output_signature).prefetch(tf.data.AUTOTUNE)
# NOTE(review): stream_batches emits a short batch at the end of every file, so one
# pass over several files may contain more than `spe` batches and epoch boundaries
# may drift relative to data passes — confirm this is acceptable.
spe = steps_per_epoch(total_rows, BATCH_SIZE_NN)
# No validation data is passed to fit(), so early stopping watches the training loss.
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2, restore_best_weights=True)]
nn.fit(ds, epochs=EPOCHS, steps_per_epoch=spe, verbose=2, class_weight=class_weight, callbacks=callbacks)

# --------------------
# Train XGBoost incrementally over ALL rows
# --------------------
# Fit an XGBoost booster batch by batch over a single pass of FILES.
# XGB_MAX_DEPTH, xgb_spw, XGB_ROUNDS_PER_BATCH, FILES, top_features, mean_dict
# and std_dict are defined outside this section.
print("[XGB] Training incrementally…")
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
    "max_depth": XGB_MAX_DEPTH,
    "learning_rate": 0.08,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "max_bin": 256,
    "reg_lambda": 1.0,
    "nthread": -1,
    "scale_pos_weight": xgb_spw,
    "random_state": RANDOM_STATE,
}
# Stays None until the first batch; after that each call continues from the previous booster.
booster = None
for xb, yb_1hot in stream_batches(FILES, batch_size=50_000, selected_cols=top_features,
                                  means=mean_dict, stds=std_dict, y_onehot=True, repeat_epochs=1):
    # Collapse the one-hot labels back to a flat 0/1 vector for XGBoost.
    yflat = np.argmax(yb_1hot, axis=1).astype(np.float32).reshape(-1)
    dtrain = xgb.DMatrix(xb, label=yflat)
    # Add XGB_ROUNDS_PER_BATCH boosting rounds, fitted on this batch only,
    # on top of the model accumulated so far.
    booster = xgb.train(params=xgb_params, dtrain=dtrain,
                        num_boost_round=XGB_ROUNDS_PER_BATCH, xgb_model=booster)
    gc.collect()

# --------------------
# Evaluate (stream once over ALL rows)
# --------------------
# Score the NN + XGBoost ensemble over one streamed pass of FILES and print metrics.
# NOTE(review): this streams the same FILES used for training above, so the
# reported numbers look like in-sample metrics — confirm whether a held-out
# split is intended.
print("[EVAL] Evaluating on ALL rows…")
y_true_all, y_pred_all = [], []
# Running 2x2 confusion matrix, accumulated batch by batch.
cm = np.array([[0,0],[0,0]], dtype=np.int64)

for xb, yb_1hot in stream_batches(FILES, batch_size=60_000, selected_cols=top_features,
                                  means=mean_dict, stds=std_dict, y_onehot=True, repeat_epochs=1):
    # Recover integer class ids from the one-hot labels.
    ytrue = np.argmax(yb_1hot, axis=1).astype(np.int64)

    # NN probs
    # (two independent sigmoid outputs per row, so a row need not sum to 1)
    nn_probs = nn.predict(xb.reshape((xb.shape[0], xb.shape[1], 1)), verbose=0)
    # XGB probs
    dx = xgb.DMatrix(xb)
    xgb_probs1 = booster.predict(dx)
    # Expand the positive-class probability into [P(class 0), P(class 1)] columns.
    xgb_probs  = np.stack([1.0 - xgb_probs1, xgb_probs1], axis=1)

    # Ensemble
    # Weighted sum of both score matrices; the predicted class is the larger column.
    probs = ENSEMBLE_W_NN * nn_probs + ENSEMBLE_W_XGB * xgb_probs
    preds = np.argmax(probs, axis=1)

    y_true_all.append(ytrue); y_pred_all.append(preds)
    # labels=[0,1] keeps the matrix 2x2 even if a batch contains a single class.
    cm += confusion_matrix(ytrue, preds, labels=[0,1])
    gc.collect()

y_true_all = np.concatenate(y_true_all)
y_pred_all = np.concatenate(y_pred_all)

# Precision/recall/F1 are support-weighted averages over both classes.
acc  = accuracy_score(y_true_all, y_pred_all)
prec = precision_score(y_true_all, y_pred_all, average='weighted', zero_division=0)
rec  = recall_score(y_true_all, y_pred_all, average='weighted', zero_division=0)
f1   = f1_score(y_true_all, y_pred_all, average='weighted', zero_division=0)

print("\n=== Metrics (ALL ROWS, merged days) ===")
print(classification_report(y_true_all, y_pred_all, digits=4))
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("Confusion Matrix [[TN, FP],[FN, TP]]:\n", cm)
