In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt
import seaborn as sns
import time
from pathlib import Path
import shap
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf

In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [None]:
def load_cicids2017_data(file_path):
    print("Loading and preparing data...")
    df = pd.read_csv(file_path)
    df = df.dropna()
    print(f"Dataset dimensions: {df.shape}")
    print(f"Class distribution:\n{df['Label'].value_counts()}")
    label_col = 'Label'
    if label_col not in df.columns:
        raise ValueError(f"Column '{label_col}' not found in CSV file")
    
    non_feature_cols = [label_col,'Destination Port']
    features = df.drop(columns=[col for col in non_feature_cols if col in df.columns],errors='ignore')
    labels = df[label_col]
    features = features.select_dtypes(include=[np.number])
    scaler = RobustScaler()
    features_scaled = scaler.fit_transform(features)
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels)
    labels_one_hot = to_categorical(labels_encoded)
    
    return features_scaled, labels_encoded, labels_one_hot, label_encoder.classes_, features.columns

In [None]:
def create_sequences(X, y, time_steps=5, stride=1):
    Xs, ys = [], []
    for i in range(0, len(X) - time_steps + 1, stride):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y[i + time_steps - 1])
    return np.array(Xs), np.array(ys)

In [None]:
file_path = "C:/Users/ayana/Desktop/IDS project/Data/balanced_data.csv" 
X, y_encoded, y_one_hot, class_names, feature_names = load_cicids2017_data(file_path)
print(f"Detected classes: {class_names}")
print(f"Total samples: {len(X)}")
unique_classes, class_counts = np.unique(y_encoded, return_counts=True)
class_distribution = dict(zip(unique_classes, class_counts))
print("Class distribution:")
for class_idx, count in class_distribution.items():
    print(f"  Class {class_names[class_idx]}: {count} samples")

time_steps = 5  
stride = 2
X_seq, y_seq_one_hot = create_sequences(X, y_one_hot, time_steps, stride)
y_seq_encoded = np.argmax(y_seq_one_hot, axis=1)
unique_seq_classes, seq_class_counts = np.unique(y_seq_encoded, return_counts=True)
seq_class_distribution = dict(zip(unique_seq_classes, seq_class_counts))
print("\nClass distribution after sequencing:")
for class_idx, count in seq_class_distribution.items():
    print(f"  Class {class_names[class_idx]}: {count} samples")

Loading and preparing data...
Dataset dimensions: (35000, 79)
Class distribution:
Label
BENIGN          5000
Attaques web    5000
Bot             5000
Infiltration    5000
PortScan        5000
DoS             5000
Brute Force     5000
Name: count, dtype: int64
Detected classes: ['Attaques web' 'BENIGN' 'Bot' 'Brute Force' 'DoS' 'Infiltration'
 'PortScan']
Total samples: 35000
Class distribution:
  Class Attaques web: 5000 samples
  Class BENIGN: 5000 samples
  Class Bot: 5000 samples
  Class Brute Force: 5000 samples
  Class DoS: 5000 samples
  Class Infiltration: 5000 samples
  Class PortScan: 5000 samples

Class distribution after sequencing:
  Class Attaques web: 2467 samples
  Class BENIGN: 2465 samples
  Class Bot: 2560 samples
  Class Brute Force: 2502 samples
  Class DoS: 2524 samples
  Class Infiltration: 2510 samples
  Class PortScan: 2470 samples


In [None]:
print("\nSplitting data: 80% train, 10% validation, 10% test")
X_train, X_temp, y_train, y_temp, y_train_one_hot, y_temp_one_hot = train_test_split( X_seq, y_seq_encoded, y_seq_one_hot, test_size=0.2, random_state=RANDOM_SEED, stratify=y_seq_encoded)
X_val, X_test, y_val, y_test, y_val_one_hot, y_test_one_hot = train_test_split(X_temp, y_temp, y_temp_one_hot, test_size=0.5, random_state=RANDOM_SEED, stratify=y_temp)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Validation set size: {X_val.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"Input data shape: {X_train.shape}")



Splitting data: 80% train, 10% validation, 10% test
Training set size: 13998 samples
Validation set size: 1750 samples
Test set size: 1750 samples
Input data shape: (13998, 5, 77)


In [None]:
output_dir = Path('plots_lstm_improved')
output_dir.mkdir(exist_ok=True)
MODEL_DIR = Path('models')
MODEL_DIR.mkdir(exist_ok=True)

## 

In [None]:
input_shape = (time_steps, X.shape[1])  
num_classes = len(class_names)

model = build_model_with_attention(input_shape=input_shape, num_classes=num_classes)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from pathlib import Path
MODEL_DIR = Path("models")  
MODEL_DIR.mkdir(parents=True, exist_ok=True)

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5, verbose=1),
    ModelCheckpoint(filepath=MODEL_DIR / 'best_lstm_model.keras', save_best_only=True, verbose=1)
]
history = model.fit(
    X_train, y_train_one_hot,
    validation_data=(X_val, y_val_one_hot),
    epochs=250,
    batch_size=128,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/250
[1m109/110[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 24ms/step - accuracy: 0.1779 - loss: 5.3427
Epoch 1: val_loss improved from inf to 2.45880, saving model to models\best_lstm_model.keras
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 35ms/step - accuracy: 0.1787 - loss: 5.3162 - val_accuracy: 0.4126 - val_loss: 2.4588 - learning_rate: 0.0030
Epoch 2/250
[1m108/110[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 26ms/step - accuracy: 0.3829 - loss: 2.1753
Epoch 2: val_loss improved from 2.45880 to 1.64598, saving model to models\best_lstm_model.keras
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.3845 - loss: 2.1689 - val_accuracy: 0.6451 - val_loss: 1.6460 - learning_rate: 0.0030
Epoch 3/250
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.5764 - loss: 1.4533
Epoch 3: val_loss improved from 1.64598 to 1.30583, saving model to models\best_lstm_model.

In [None]:
ModelCheckpoint(filepath=MODEL_DIR/'best_lstm_model.keras',save_best_only=True, verbose=1)


<keras.src.callbacks.model_checkpoint.ModelCheckpoint at 0x242dacdfa70>

In [None]:
loss, accuracy = best_model.evaluate(X_test, y_test_one_hot, verbose=1)
print(f"Test Loss: {loss:.4f} | Test Accuracy: {accuracy:.4f}")


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8146 - loss: 0.4744
Test Loss: 0.4690 | Test Accuracy: 0.8189
