In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import tensorflow as tf
from tensorflow.keras import layers, models


In [23]:
# ---------------------------
# Step 1: Load dataset
# ---------------------------
train_file = '/Users/pheonix/Documents/SRM/IDS/KDDTrain+.txt'

columns = [
    'duration','protocol_type','service','flag','src_bytes','dst_bytes','land',
    'wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised',
    'root_shell','su_attempted','num_root','num_file_creations','num_shells','num_access_files',
    'num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate',
    'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate',
    'srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate',
    'dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',
    'dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate',
    'attack','level'
]

df = pd.read_csv(train_file, header=None, names=columns)

# ---------------------------
# Step 2: One-Hot Encode categorical features
# ---------------------------
cat_cols = ['protocol_type','service','flag']
df = pd.get_dummies(df, columns=cat_cols)

# ---------------------------
# Step 3: Separate features & labels
# ---------------------------
X = df.drop(['attack','level'], axis=1)  # Features
y_attack = df['attack']                  # For CNN+LSTM (known attack labels)


# ---------------------------
# Step 4: Scale numerical features
# ---------------------------
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

print("Shape of input features after one-hot encoding:", X_scaled.shape)



Shape of input features after one-hot encoding: (125973, 122)


In [24]:

# ---------------------------
# Step 5: Encode attack labels
# ---------------------------
le_attack = LabelEncoder()
y_encoded = le_attack.fit_transform(y_attack)  # CNN+LSTM expects numeric labels

print("Number of classes:", len(le_attack.classes_))

# Optional: check mapping
attack_mapping = dict(zip(le_attack.classes_, range(len(le_attack.classes_))))
print("Attack label mapping:", attack_mapping)

# ---------------------------
# Step 6: Split data into train and test
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("Training features shape:", X_train.shape)
print("Test features shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Test labels shape:", y_test.shape)

Number of classes: 23
Attack label mapping: {'back': 0, 'buffer_overflow': 1, 'ftp_write': 2, 'guess_passwd': 3, 'imap': 4, 'ipsweep': 5, 'land': 6, 'loadmodule': 7, 'multihop': 8, 'neptune': 9, 'nmap': 10, 'normal': 11, 'perl': 12, 'phf': 13, 'pod': 14, 'portsweep': 15, 'rootkit': 16, 'satan': 17, 'smurf': 18, 'spy': 19, 'teardrop': 20, 'warezclient': 21, 'warezmaster': 22}
Training features shape: (100778, 122)
Test features shape: (25195, 122)
Training labels shape: (100778,)
Test labels shape: (25195,)


In [25]:
import tensorflow as tf
from tensorflow.keras import layers, models

input_dim = X_train.shape[1]

# ---------------------------
# Build AE
# ---------------------------
ae_input = layers.Input(shape=(input_dim,))
# Encoder
encoded = layers.Dense(64, activation='relu')(ae_input)
encoded = layers.Dense(32, activation='relu')(encoded)
# Bottleneck
bottleneck = layers.Dense(16, activation='relu')(encoded)
# Decoder
decoded = layers.Dense(32, activation='relu')(bottleneck)
decoded = layers.Dense(64, activation='relu')(decoded)
decoded = layers.Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = models.Model(ae_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ---------------------------
# Train AE (use X_train only)
# ---------------------------
history = autoencoder.fit(
    X_train, X_train,
    epochs=50,
    batch_size=256,
    validation_split=0.1,
    shuffle=True
)

# Save AE model
autoencoder.save('ae_model.h5')


Epoch 1/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.0356 - val_loss: 0.0093
Epoch 2/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0072 - val_loss: 0.0059
Epoch 3/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0054 - val_loss: 0.0050
Epoch 4/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0048 - val_loss: 0.0046
Epoch 5/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0046 - val_loss: 0.0044
Epoch 6/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0045 - val_loss: 0.0043
Epoch 7/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0044 - val_loss: 0.0043
Epoch 8/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0043 - val_loss: 0.0042
Epoch 9/50
[1m355/355[0m [32m━━━━━━━━



In [26]:
from tensorflow.keras.utils import to_categorical

# Encode labels as one-hot for classification
num_classes = len(le_attack.classes_)
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat = to_categorical(y_test, num_classes)

# Reshape input for CNN+LSTM: (samples, timesteps, features_per_step)
# Here, we treat the whole feature vector as a "sequence of 1 step" (can adjust)
X_train_seq = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test_seq = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

# ---------------------------
# Build CNN+LSTM
# ---------------------------
cnn_lstm_input = layers.Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2]))
x = layers.Conv1D(filters=64, kernel_size=1, activation='relu')(cnn_lstm_input)
x = layers.MaxPooling1D(pool_size=1)(x)
x = layers.LSTM(64, return_sequences=False)(x)
x = layers.Dense(64, activation='relu')(x)
output = layers.Dense(num_classes, activation='softmax')(x)

cnn_lstm_model = models.Model(cnn_lstm_input, output)
cnn_lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# ---------------------------
# Train CNN+LSTM
# ---------------------------
history_cnn_lstm = cnn_lstm_model.fit(
    X_train_seq, y_train_cat,
    epochs=50,
    batch_size=256,
    validation_split=0.1,
    shuffle=True
)

# Save CNN+LSTM model
cnn_lstm_model.save('cnn_lstm_model.h5')


Epoch 1/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9122 - loss: 0.5100 - val_accuracy: 0.9632 - val_loss: 0.1276
Epoch 2/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9733 - loss: 0.0902 - val_accuracy: 0.9764 - val_loss: 0.0718
Epoch 3/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9842 - loss: 0.0564 - val_accuracy: 0.9857 - val_loss: 0.0532
Epoch 4/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9879 - loss: 0.0427 - val_accuracy: 0.9864 - val_loss: 0.0435
Epoch 5/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9894 - loss: 0.0366 - val_accuracy: 0.9879 - val_loss: 0.0384
Epoch 6/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9907 - loss: 0.0319 - val_accuracy: 0.9886 - val_loss: 0.0369
Epoch 7/50
[1m355/355[0m 



# Model Evalution

In [28]:
import numpy as np

# Predict on test set
X_test_recon = autoencoder.predict(X_test)

# Compute reconstruction error (MSE per sample)
recon_error = np.mean(np.power(X_test - X_test_recon, 2), axis=1)

print("Reconstruction error stats:")
print("Min:", np.min(recon_error))
print("Max:", np.max(recon_error))
print("Mean:", np.mean(recon_error))

# Optional: set threshold for unknown attack
threshold = np.percentile(recon_error, 95)  # e.g., top 5% as unknown
print("Threshold for unknown attack:", threshold)

# Flag unknown attacks
unknown_flags = recon_error > threshold
print("Number of unknown samples detected:", np.sum(unknown_flags))


# -------------------------------------------------------------

from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Predict on test sequences
y_pred_prob = cnn_lstm_model.predict(X_test_seq)
y_pred = np.argmax(y_pred_prob, axis=1)

# Accuracy
accuracy = np.mean(y_pred == y_test)
print("CNN+LSTM Test Accuracy:", accuracy)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Generate classification report for only labels present in y_test
unique_labels = np.unique(y_test)
report = classification_report(
    y_test, y_pred,
    labels=unique_labels,
    target_names=le_attack.inverse_transform(unique_labels)
)
print("Classification Report:\n", report)


[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230us/step
Reconstruction error stats:
Min: 1.6785914952903164e-06
Max: 0.04153541515780681
Mean: 0.002877896018685787
Threshold for unknown attack: 0.008459876365030237
Number of unknown samples detected: 1260
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285us/step
CNN+LSTM Test Accuracy: 0.9948402460805715
Confusion Matrix:
 [[  184     0     0     0     0     0     0     0     0     0     0     7
      0     0     0     0     0     0     0     0     0]
 [    0     3     0     0     0     0     0     0     0     0     0     3
      0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     2
      0     0     0     0     0     0     0     0     0]
 [    0     0     0    11     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     2     0     0     0     0     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [32]:
import joblib

# Save MinMaxScaler
joblib.dump(scaler, 'scaler.save')

# Save LabelEncoder for attack classes
joblib.dump(le_attack, 'le_attack.save')


['le_attack.save']

In [31]:
# Example: single test sample
sample = X_test[3].reshape(1, -1)

# 1. AE reconstruction error
recon_err = np.mean(np.power(sample - autoencoder.predict(sample), 2))

if recon_err > threshold:
    print("Predicted: Unknown Attack")
else:
    # 2. CNN+LSTM prediction
    sample_seq = sample.reshape(1, 1, input_dim)
    pred_class = np.argmax(cnn_lstm_model.predict(sample_seq), axis=1)[0]
    print("Predicted Known Attack Class:", le_attack.inverse_transform([pred_class])[0])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Predicted Known Attack Class: normal
