<a href="https://colab.research.google.com/github/Anton3090/nslkdd-robust-ids/blob/main/NSL_KDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
hassan06_nslkdd_path = kagglehub.dataset_download('hassan06/nslkdd')

print('Data source import complete.')


Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nslkdd/KDDTest+.arff
/kaggle/input/nslkdd/KDDTest-21.arff
/kaggle/input/nslkdd/KDDTest1.jpg
/kaggle/input/nslkdd/KDDTrain+.txt
/kaggle/input/nslkdd/KDDTrain+_20Percent.txt
/kaggle/input/nslkdd/KDDTest-21.txt
/kaggle/input/nslkdd/KDDTest+.txt
/kaggle/input/nslkdd/KDDTrain+.arff
/kaggle/input/nslkdd/index.html
/kaggle/input/nslkdd/KDDTrain+_20Percent.arff
/kaggle/input/nslkdd/KDDTrain1.jpg
/kaggle/input/nslkdd/nsl-kdd/KDDTest+.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTest-21.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTest1.jpg
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+_20Percent.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTest-21.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTest+.txt
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+.arff
/kaggle/input/nslkdd/nsl-kdd/index.html
/kaggle/input/nslkdd/nsl-kdd/KDDTrain+_20Percent.arff
/kaggle/input/nslkdd/nsl-kdd/KDDTrain1.jpg


**Dataset Import**

In [3]:
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Download and load the dataset
path = kagglehub.dataset_download("hassan06/nslkdd")
print("Path to dataset files:", path)

# Load train and test datasets
train_df = pd.read_csv(f"{path}/KDDTrain+.txt", header=None)
test_df = pd.read_csv(f"{path}/KDDTest+.txt", header=None)

# NSL-KDD has 41 features + 1 label column
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'label', 'difficulty_level'  # added last column
]

train_df.columns = test_df.columns = columns





Path to dataset files: /kaggle/input/nslkdd


**Preprocessing**

In [4]:
# Binary classification: normal vs attack
train_df['label'] = train_df['label'].apply(lambda x: 'normal' if x == 'normal' else 'attack')
test_df['label'] = test_df['label'].apply(lambda x: 'normal' if x == 'normal' else 'attack')

# Encode categorical features
cat_cols = ['protocol_type', 'service', 'flag']
encoder = LabelEncoder()
for col in cat_cols:
    train_df[col] = encoder.fit_transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

# Separate features and labels
X_train = train_df.drop('label', axis=1)
y_train = LabelEncoder().fit_transform(train_df['label'])

X_test = test_df.drop('label', axis=1)
y_test = LabelEncoder().fit_transform(test_df['label'])

# Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


**Build & Train Deep Learning Model**

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoader
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=128, shuffle=True)

# Define model
class IDSModel(nn.Module):
    def __init__(self, input_dim):
        super(IDSModel, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )
    def forward(self, x):
        return self.layers(x)

model = IDSModel(X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training
for epoch in range(10):
    for xb, yb in train_loader:
        out = model(xb)
        loss = criterion(out, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} - Loss: {loss.item():.4f}")


Epoch 1 - Loss: 0.0100
Epoch 2 - Loss: 0.0014
Epoch 3 - Loss: 0.0079
Epoch 4 - Loss: 0.0000
Epoch 5 - Loss: 0.0253
Epoch 6 - Loss: 0.0007
Epoch 7 - Loss: 0.0000
Epoch 8 - Loss: 0.0172
Epoch 9 - Loss: 0.0006
Epoch 10 - Loss: 0.0003


**Evaluation**

In [6]:
model.eval()
with torch.no_grad():
    preds = torch.argmax(model(X_test_tensor), dim=1)
    acc = (preds == y_test_tensor).float().mean()
print("Test Accuracy:", acc.item())


Test Accuracy: 0.8662615418434143


**Real-Time Packet Detection with Scapy**

In [7]:
# Save model
torch.save(model.state_dict(), 'ids_model.pth')


In [8]:
pip install scapy torch numpy joblib


Collecting scapy
  Downloading scapy-2.6.1-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidi

In [9]:
from sklearn.preprocessing import StandardScaler
import joblib

# Example: Fit scaler to your training features
scaler = StandardScaler()
scaler.fit(X_train)  # X_train should be your training features (without labels)

# Save to a file
joblib.dump(scaler, "scaler.save")


['scaler.save']

In [10]:
from scapy.all import sniff, IP, TCP, UDP
import torch
import torch.nn as nn
import numpy as np
import joblib
from datetime import datetime

# Define the same model architecture
class IDSModel(nn.Module):
    def __init__(self, input_dim):
        super(IDSModel, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )
    def forward(self, x):
        return self.layers(x)

# Load model and scaler
model = IDSModel(input_dim=42)  # NSL-KDD has 41 features
model.load_state_dict(torch.load("ids_model.pth"))
model.eval()

scaler = joblib.load("scaler.save")

# Example feature extractor (customize to match training features)
def extract_features(pkt):
    try:
        # Very basic feature simulation
        length = len(pkt)
        ttl = pkt[IP].ttl if IP in pkt else 0
        dport = pkt[TCP].dport if TCP in pkt else (pkt[UDP].dport if UDP in pkt else 0)
        features = [length, ttl, dport]
        features += [0] * (42 - len(features))  # Pad to 41 features
        return np.array(features).reshape(1, -1)
    except:
        return np.zeros((1, 41))  # Return dummy data on failure

# Packet classification and logging
def classify_packet(pkt):
    features = extract_features(pkt)
    scaled = scaler.transform(features)
    tensor = torch.tensor(scaled, dtype=torch.float32)
    output = model(tensor)
    pred = torch.argmax(output).item()
    label = "attack" if pred == 1 else "normal"

    # Print & log
    print(f"[{datetime.now()}] Packet classified as: {label}")
    with open("log.txt", "a") as f:
        f.write(f"{datetime.now()} | {pkt.summary()} | Result: {label}\n")

# Start sniffing
print("Sniffing... Press Ctrl+C to stop.")
sniff(prn=classify_packet, count=200)


Sniffing... Press Ctrl+C to stop.
[2025-05-20 20:38:03.457974] Packet classified as: normal
[2025-05-20 20:38:03.460795] Packet classified as: normal
[2025-05-20 20:38:03.670040] Packet classified as: normal
[2025-05-20 20:38:03.672376] Packet classified as: normal
[2025-05-20 20:38:03.685652] Packet classified as: normal
[2025-05-20 20:38:03.687751] Packet classified as: normal
[2025-05-20 20:38:03.689522] Packet classified as: normal
[2025-05-20 20:38:03.700061] Packet classified as: normal
[2025-05-20 20:38:03.702143] Packet classified as: normal
[2025-05-20 20:38:03.704390] Packet classified as: normal
[2025-05-20 20:38:03.708900] Packet classified as: normal
[2025-05-20 20:38:03.710808] Packet classified as: normal
[2025-05-20 20:38:03.712649] Packet classified as: normal
[2025-05-20 20:38:03.714329] Packet classified as: normal
[2025-05-20 20:38:03.800954] Packet classified as: normal
[2025-05-20 20:38:03.804132] Packet classified as: normal
[2025-05-20 20:38:03.877905] Packet cl

<Sniffed: TCP:200 UDP:0 ICMP:0 Other:0>

**Logging to a Text File**

In [11]:
import datetime

def log_packet(pkt, result):
    with open("log.txt", "a") as f:
        f.write(f"{datetime.datetime.now()} | {pkt.summary()} | Result: {result}\n")


**Secure Model with Adversarial Robustness Toolbox (ART)**

In [12]:
!pip install adversarial-robustness-toolbox

from art.estimators.classification import PyTorchClassifier
from art.attacks.evasion import FastGradientMethod
import numpy as np
import torch

# Convert X_test to float32 numpy array before generating adversarial examples
X_test_float32 = X_test.astype(np.float32)

classifier = PyTorchClassifier(
    model=model,
    loss=criterion,
    optimizer=optimizer,
    input_shape=(X_train.shape[1],),
    nb_classes=2,
)

# Generate adversarial examples using float32 inputs
fgsm = FastGradientMethod(estimator=classifier, eps=0.1)
X_test_adv = fgsm.generate(X_test_float32)

# Predict on adversarial examples
preds = np.argmax(classifier.predict(X_test_adv), axis=1)

accuracy = np.mean(preds == y_test)
print("Robust Accuracy under FGSM attack:", accuracy)


Collecting adversarial-robustness-toolbox
  Downloading adversarial_robustness_toolbox-1.19.1-py3-none-any.whl.metadata (11 kB)
Downloading adversarial_robustness_toolbox-1.19.1-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: adversarial-robustness-toolbox
Successfully installed adversarial-robustness-toolbox-1.19.1
Robust Accuracy under FGSM attack: 0.8226135557132718
