In [1]:
from google.colab import drive

import pandas as pd

# Expose Google Drive under /content/drive so the dataset CSV is readable.
drive.mount('/content/drive')

# low_memory=False makes pandas infer each column's dtype from the whole
# file rather than chunk by chunk. The chunked default can assign different
# dtypes to different parts of the same column and emits a DtypeWarning
# (the warning echoed in this cell's output appears to be that one).
df = pd.read_csv(
    '/content/drive/MyDrive/DNN-EdgeIIoT-dataset.csv',
    low_memory=False,
)

print(f"Dataset loaded: {df.shape}")


Mounted at /content/drive


  df = pd.read_csv('/content/drive/MyDrive/DNN-EdgeIIoT-dataset.csv')


Dataset loaded: (2219201, 63)


In [2]:
# Schema overview: the label columns are shipped inside the dataset itself.
print("Dataset columns:", df.columns.tolist(), sep="\n")

# The label is taken by position: whatever column comes last in the file.
print("\nLabel column name:", df.columns[-1])
print("Label distribution:", df.iloc[:, -1].value_counts(), sep="\n")

# Work on an independent copy so the loaded frame stays untouched.
df_with_labels = df.copy()
print("\nDataset with labels shape:", df_with_labels.shape)

Dataset columns:
['frame.time', 'ip.src_host', 'ip.dst_host', 'arp.dst.proto_ipv4', 'arp.opcode', 'arp.hw.size', 'arp.src.proto_ipv4', 'icmp.checksum', 'icmp.seq_le', 'icmp.transmit_timestamp', 'icmp.unused', 'http.file_data', 'http.content_length', 'http.request.uri.query', 'http.request.method', 'http.referer', 'http.request.full_uri', 'http.request.version', 'http.response', 'http.tls_port', 'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.options', 'tcp.payload', 'tcp.seq', 'tcp.srcport', 'udp.port', 'udp.stream', 'udp.time_delta', 'dns.qry.name', 'dns.qry.name.len', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conack.flags', 'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mqtt.msg', 'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.pr

In [4]:
# Attack classes that are separated from the rest of the dataset.
attack_types_to_move = ["MITM", "Fingerprinting", "Ransomware", "XSS"]


In [5]:
# Build the membership mask once and reuse it for both halves of the split,
# instead of scanning the Attack_type column twice.
is_moved_attack = df["Attack_type"].isin(attack_types_to_move)

# Rows that match the chosen attack types
df_moved = df[is_moved_attack]

# Remaining dataset (everything else)
df_remaining = df[~is_moved_attack]

# Sanity check: the two subsets must partition the original frame exactly.
assert len(df_moved) + len(df_remaining) == len(df), "split lost or duplicated rows"

print("Moved subset shape:", df_moved.shape)
print("Remaining dataset shape:", df_remaining.shape)


Moved subset shape: (29055, 63)
Remaining dataset shape: (2190146, 63)


In [6]:
# Show the per-class row counts of each subset, remaining rows first.
for heading, subset in (
    ("\nRemaining label distribution:", df_remaining),
    ("\nMoved subset label distribution:", df_moved),
):
    print(heading)
    print(subset["Attack_type"].value_counts())



Remaining label distribution:
Attack_type
Normal                   1615643
DDoS_UDP                  121568
DDoS_ICMP                 116436
SQL_injection              51203
Password                   50153
Vulnerability_scanner      50110
DDoS_TCP                   50062
DDoS_HTTP                  49911
Uploading                  37634
Backdoor                   24862
Port_Scanning              22564
Name: count, dtype: int64

Moved subset label distribution:
Attack_type
XSS               15915
Ransomware        10925
MITM               1214
Fingerprinting     1001
Name: count, dtype: int64


In [7]:
# Destination files on Drive for the two subsets.
output_remaining_path = "/content/drive/MyDrive/DNN-EdgeIIoT-main_filtered.csv"
output_moved_path = "/content/drive/MyDrive/DNN-EdgeIIoT-moved_attacks.csv"

# Write each subset without the pandas index column.
for frame, destination in (
    (df_remaining, output_remaining_path),
    (df_moved, output_moved_path),
):
    frame.to_csv(destination, index=False)

print("Saved:")
print(" → Remaining:", output_remaining_path)
print(" → Moved:", output_moved_path)


Saved:
 → Remaining: /content/drive/MyDrive/DNN-EdgeIIoT-main_filtered.csv
 → Moved: /content/drive/MyDrive/DNN-EdgeIIoT-moved_attacks.csv


In [None]:
# ============================================================
# 0. Mount Google Drive
# ============================================================
from google.colab import drive
drive.mount('/content/drive')

# ============================================================
# 1. Imports
# ============================================================
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score
)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this suppresses every Python warning for the rest of the
# session, not only noisy library ones, so data-loading or deprecation
# warnings raised by later code in this cell will not be shown.
warnings.filterwarnings('ignore')

# Log the framework versions used for this run.
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

# ============================================================
# 2. Paths (EDIT THESE IF NEEDED)
# ============================================================

# Input: the filtered dataset CSV (the one without the 4 removed attack types).
DATA_PATH = "/content/drive/MyDrive/DNN-EdgeIIoT-main_filtered.csv"

# Output: Drive folder that receives the model and its artifacts.
# It is created here if it does not exist yet.
OUTPUT_DIR = "/content/drive/MyDrive/EdgeIIoT_filtered_model"
os.makedirs(OUTPUT_DIR, exist_ok=True)

for setting_name, setting_value in (
    ("DATA_PATH:", DATA_PATH),
    ("OUTPUT_DIR:", OUTPUT_DIR),
):
    print(setting_name, setting_value)

# ============================================================
# 3. Load and basic preprocess
# ============================================================
print("\nLoading filtered dataset...")
df = pd.read_csv(DATA_PATH)
print("Full filtered dataset shape:", df.shape)
print("\nAttack_type distribution:")
print(df["Attack_type"].value_counts())

# --------- Separate features and label ----------
label_col = "Attack_type"

# Target: the attack-type names as a plain array.
y = df[label_col].values

# Features: drop both label columns (a missing "Attack_label" is tolerated),
# then keep only the numeric columns; non-numeric columns are discarded.
X = (
    df.drop(columns=[label_col, "Attack_label"], errors="ignore")
    .select_dtypes(include=[np.number])
)

print("\nNumeric feature matrix shape:", X.shape)
print("Columns used as features:")
print(list(X.columns))

# ============================================================
# 4. Train / Val / Test split
# ============================================================
print("\nSplitting into train / val / test (60 / 20 / 20)...")


def _stratified_split(features, labels, holdout_fraction):
    """Stratified split with the fixed seed (42) used by both stages below."""
    return train_test_split(
        features, labels,
        test_size=holdout_fraction, random_state=42, stratify=labels
    )


# Stage 1: hold out 20% of all rows as the test set.
X_train_full, X_test, y_train_full, y_test = _stratified_split(X, y, 0.2)

# Stage 2: 0.25 of the remaining 0.8 = 0.2 of the total → 60/20/20 overall.
X_train, X_val, y_train, y_val = _stratified_split(X_train_full, y_train_full, 0.25)

print(f"Train set:      {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set:       {X_test.shape}")

# ============================================================
# 5. Prepare data (scaling, label encoding, reshape)
# ============================================================
def prepare_data(X_train, X_val, X_test, y_train, y_val, y_test):
    """Scale features, integer-encode labels and add a length-1 time axis.

    The StandardScaler and the LabelEncoder are fitted on the training split
    only and then applied unchanged to the validation and test splits.

    Returns:
        A 9-tuple: the reshaped train/val/test feature arrays with shape
        (samples, 1, n_features), the encoded train/val/test labels, the
        fitted scaler, the fitted label encoder and the number of classes.
    """
    print("\nPreparing data for CNN-LSTM...")

    # Feature scaling: statistics come from the training split alone.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    val_scaled = scaler.transform(X_val)
    test_scaled = scaler.transform(X_test)

    # Label encoding: class vocabulary comes from the training split alone.
    label_encoder = LabelEncoder()
    train_labels = label_encoder.fit_transform(y_train)
    val_labels = label_encoder.transform(y_val)
    test_labels = label_encoder.transform(y_test)

    num_classes = len(label_encoder.classes_)
    print(f"Number of classes: {num_classes}")
    print("Classes:", label_encoder.classes_)

    # Each row becomes a sequence of a single timestep:
    # (samples, n_features) -> (samples, timesteps, n_features).
    timesteps = 1
    n_features = train_scaled.shape[1]

    def _add_time_axis(matrix):
        return matrix.reshape(matrix.shape[0], timesteps, n_features)

    train_seq = _add_time_axis(train_scaled)
    val_seq = _add_time_axis(val_scaled)
    test_seq = _add_time_axis(test_scaled)

    print("\nReshaped data:")
    print("X_train:", train_seq.shape)
    print("X_val:  ", val_seq.shape)
    print("X_test: ", test_seq.shape)

    return (
        train_seq, val_seq, test_seq,
        train_labels, val_labels, test_labels,
        scaler, label_encoder, num_classes
    )

# Run the preprocessing once and unpack its nine results.
(
    X_train_prep, X_val_prep, X_test_prep,
    y_train_enc, y_val_enc, y_test_enc,
    scaler, label_encoder, num_classes,
) = prepare_data(X_train, X_val, X_test, y_train, y_val, y_test)

# ============================================================
# 6. Model definitions (same as your original)
# ============================================================
def build_multiclass_cnn_lstm(input_shape, num_classes):
    """Build the (uncompiled) CNN-LSTM model for multiclass classification.

    Architecture, in order: three Conv1D blocks (64, 128, 256 filters; each
    followed by BatchNormalization, MaxPooling1D and Dropout), two LSTM
    layers (128 then 64 units), two Dense blocks (128 then 64 units) and a
    softmax output layer.

    Args:
        input_shape: Shape of one sample without the batch axis. The caller
            in this notebook passes (timesteps, n_features).
        num_classes: Number of units in the softmax output layer.

    Returns:
        A keras Sequential model; compiling is left to the caller.
    """
    # Declare the input with an explicit Input layer. Passing `input_shape=`
    # to the first layer of a Sequential model is discouraged by Keras 3.
    stack = [layers.Input(shape=input_shape)]

    # CNN layers for feature extraction (layer names conv1d_1..conv1d_3).
    for block_index, n_filters in enumerate((64, 128, 256), start=1):
        stack += [
            layers.Conv1D(filters=n_filters, kernel_size=3, activation='relu',
                          padding='same', name=f'conv1d_{block_index}'),
            layers.BatchNormalization(),
            # pool_size=1 keeps the sequence length unchanged.
            layers.MaxPooling1D(pool_size=1),
            layers.Dropout(0.2),
        ]

    stack += [
        # LSTM layers for temporal dependencies; only the second one
        # collapses the sequence to a single vector.
        layers.LSTM(128, return_sequences=True, name='lstm_1'),
        layers.Dropout(0.2),

        layers.LSTM(64, return_sequences=False, name='lstm_2'),
        layers.Dropout(0.2),

        # Dense layers for classification
        layers.Dense(128, activation='relu', name='dense_1'),
        layers.BatchNormalization(),
        layers.Dropout(0.2),

        layers.Dense(64, activation='relu', name='dense_2'),
        layers.BatchNormalization(),
        layers.Dropout(0.2),

        # Output layer for multiclass classification
        layers.Dense(num_classes, activation='softmax', name='output'),
    ]

    return models.Sequential(stack)


def compile_and_train_multiclass(model, X_train, y_train, X_val, y_val,
                                 epochs=50, batch_size=128, ckpt_path=None):
    """Compile `model` for sparse multiclass training and fit it.

    Uses Adam (learning rate 0.001) with sparse categorical cross-entropy
    and tracks accuracy. Three callbacks are attached: early stopping on
    val_loss (restoring the best weights), a best-only checkpoint on
    val_accuracy, and learning-rate halving when val_loss plateaus.

    Args:
        model: Uncompiled keras model.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        ckpt_path: Checkpoint file; when None, a default file inside the
            module-level OUTPUT_DIR is used.

    Returns:
        (model, history) where history is the object returned by model.fit.
    """
    print("\nCompiling multiclass classification model...")

    adam = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(
        optimizer=adam,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    print("\nModel Summary:")
    model.summary()

    # Fall back to the default checkpoint location when none was given.
    checkpoint_target = (
        ckpt_path if ckpt_path is not None
        else os.path.join(OUTPUT_DIR, "best_multiclass_cnn_lstm_model.h5")
    )

    # NOTE(review): early stopping follows val_loss while the checkpoint
    # follows val_accuracy, so the two may select different epochs.
    training_callbacks = [
        EarlyStopping(monitor='val_loss', patience=10,
                      restore_best_weights=True, verbose=1),
        ModelCheckpoint(checkpoint_target, monitor='val_accuracy',
                        save_best_only=True, verbose=1),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5,
                          min_lr=1e-7, verbose=1),
    ]

    print("\nTraining multiclass classification model...")
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=training_callbacks,
        verbose=1
    )

    return model, history


def evaluate_model(model, X_test, y_test, label_encoder):
    """Evaluate the model on the test set and print a metrics summary.

    Args:
        model: Trained model whose predict() returns per-class scores.
        X_test: Test features in the shape the model expects.
        y_test: True labels. They are compared directly with argmax class
            indices, so they must be the integer ids produced by
            `label_encoder`.
        label_encoder: Fitted encoder; its classes_ supply the class names.

    Returns:
        (accuracy, precision, recall, f1, cm, y_pred), where precision,
        recall and f1 are macro averages, cm is the confusion matrix with
        one row and column per encoder class, and y_pred holds the predicted
        class indices.
    """
    print("\n" + "="*70)
    print("MODEL EVALUATION")
    print("="*70)

    # Predictions: pick the highest-scoring class for each sample.
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = np.argmax(y_pred_proba, axis=1)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    print(f"\nAccuracy:  {accuracy*100:.2f}%")
    print(f"Precision: {precision*100:.2f}%")
    print(f"Recall:    {recall*100:.2f}%")
    print(f"F1-Score:  {f1*100:.2f}%")

    # Pin the full label set explicitly. Without `labels`, the report raises
    # a ValueError when a class is missing from both y_test and y_pred
    # (target_names length mismatch), and the confusion matrix would shrink
    # and no longer line up with the encoder's class names.
    class_names = label_encoder.classes_
    class_ids = np.arange(len(class_names))

    print("\n" + "="*70)
    print("CLASSIFICATION REPORT")
    print("="*70)
    print(classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0
    ))

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    return accuracy, precision, recall, f1, cm, y_pred


def plot_training_history(history, save_path):
    """Draw loss and accuracy curves side by side, save them, then show them.

    Args:
        history: Object whose `.history` dict holds the 'loss', 'val_loss',
            'accuracy' and 'val_accuracy' series.
        save_path: File the figure is written to (300 dpi).
    """
    # One entry per panel:
    # (train key, val key, train legend, val legend, y-axis label, title)
    panels = (
        ('loss', 'val_loss',
         'Training Loss', 'Validation Loss', 'Loss', 'Model Loss'),
        ('accuracy', 'val_accuracy',
         'Training Accuracy', 'Validation Accuracy', 'Accuracy', 'Model Accuracy'),
    )

    _, axes = plt.subplots(1, 2, figsize=(15, 5))

    for ax, panel in zip(axes, panels):
        train_key, val_key, train_legend, val_legend, y_label, title = panel
        ax.plot(history.history[train_key], label=train_legend, linewidth=2)
        ax.plot(history.history[val_key], label=val_legend, linewidth=2)
        ax.set_xlabel('Epoch', fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.legend()
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"\nTraining history plot saved: {save_path}")
    plt.show()


def plot_confusion_matrix(cm, labels, save_path):
    """Render the confusion matrix as an annotated heatmap, save and show it.

    Args:
        cm: Confusion matrix of integer counts.
        labels: Class names used for both axes' tick labels.
        save_path: File the figure is written to (300 dpi).
    """
    _, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        cm,
        annot=True, fmt='d', cmap='Blues',
        xticklabels=labels, yticklabels=labels,
        cbar_kws={'label': 'Count'},
        ax=ax,
    )
    ax.set_xlabel('Predicted Label', fontsize=12)
    ax.set_ylabel('True Label', fontsize=12)
    ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Confusion matrix plot saved: {save_path}")
    plt.show()

# ============================================================
# 7. Main: train on MAIN FILTERED DATASET and save to Drive
# ============================================================

print("\n" + "="*70)
print("MULTICLASS CLASSIFICATION ON MAIN FILTERED ATTACKS")
print("="*70)

# Per-sample input shape: every axis of the prepared array except the batch axis.
input_shape = X_train_prep.shape[1:]
model = build_multiclass_cnn_lstm(input_shape, num_classes)

# Best-epoch checkpoint goes next to the other artifacts in OUTPUT_DIR.
ckpt_file = os.path.join(OUTPUT_DIR, "best_multiclass_cnn_lstm_model.h5")

model, history = compile_and_train_multiclass(
    model,
    X_train=X_train_prep, y_train=y_train_enc,
    X_val=X_val_prep, y_val=y_val_enc,
    epochs=50,
    batch_size=128,
    ckpt_path=ckpt_file,
)

# Evaluation on the held-out test split
(
    accuracy, precision, recall, f1,
    cm, y_pred,
) = evaluate_model(model, X_test_prep, y_test_enc, label_encoder)

# Plots: both figures are written into OUTPUT_DIR and displayed.
history_plot_path = os.path.join(OUTPUT_DIR, "multiclass_training_history.png")
plot_training_history(history, history_plot_path)

cm_plot_path = os.path.join(OUTPUT_DIR, "multiclass_confusion_matrix.png")
plot_confusion_matrix(cm, label_encoder.classes_, cm_plot_path)

# ============================================================
# 8. Save model + artifacts to Drive
# ============================================================
print("\n" + "="*70)
print("SAVING MODEL AND RESULTS TO GOOGLE DRIVE")
print("="*70)

# Final model, written in the native Keras format.
final_model_path = os.path.join(OUTPUT_DIR, "cnn_lstm_filtered_final_model.keras")
model.save(final_model_path)
print("Model saved:", final_model_path)

# Metrics and metadata collected from the evaluation step.
results = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'num_classes': num_classes,
    'classes': label_encoder.classes_,
    'confusion_matrix': cm
}

le_path = os.path.join(OUTPUT_DIR, "label_encoder_filtered.pkl")
scaler_path = os.path.join(OUTPUT_DIR, "scaler_filtered.pkl")
results_path = os.path.join(OUTPUT_DIR, "model_results_filtered.pkl")

# Pickle the preprocessing objects and the results, announcing each file
# right after it has been written.
for announcement, target_path, payload in (
    ("Label encoder saved:", le_path, label_encoder),
    ("Scaler saved:", scaler_path, scaler),
    ("Results saved:", results_path, results),
):
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)
    print(announcement, target_path)

print("\n" + "="*70)
print("TRAINING ON FILTERED DATASET COMPLETE!")
print("="*70)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
TensorFlow version: 2.19.0
Keras version: 3.10.0
DATA_PATH: /content/drive/MyDrive/DNN-EdgeIIoT-main_filtered.csv
OUTPUT_DIR: /content/drive/MyDrive/EdgeIIoT_filtered_model

Loading filtered dataset...
Full filtered dataset shape: (2190146, 63)

Attack_type distribution:
Attack_type
Normal                   1615643
DDoS_UDP                  121568
DDoS_ICMP                 116436
SQL_injection              51203
Password                   50153
Vulnerability_scanner      50110
DDoS_TCP                   50062
DDoS_HTTP                  49911
Uploading                  37634
Backdoor                   24862
Port_Scanning              22564
Name: count, dtype: int64

Numeric feature matrix shape: (2190146, 43)
Columns used as features:
['arp.opcode', 'arp.hw.size', 'icmp.checksum', 'icmp.seq_le', 'icmp.transmit_timestamp', 'icmp.unused', 'http.content_length', 


Training multiclass classification model...
Epoch 1/50
[1m 7365/10267[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m44s[0m 15ms/step - accuracy: 0.8911 - loss: 0.3898