In [1]:
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report,
                             roc_curve, auc)
warnings.filterwarnings('ignore')

In [2]:
# Verify TensorFlow: report the installed version, then confirm that the Keras
# classes used later in the notebook can be imported.
print("TensorFlow Version:", tf.__version__)
try:
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
except ImportError as import_error:
    # Show a reinstall hint, then re-raise so execution stops at this cell.
    print("Keras import error:", import_error)
    print("Please reinstall TensorFlow: pip uninstall tensorflow -y; pip install tensorflow==2.18.0")
    raise
else:
    print("Keras imports successful!")

TensorFlow Version: 2.18.0
Keras imports successful!


In [3]:
# Set random seed for reproducibility
SEED = 42
# A single call seeds Python's `random` module, NumPy and TensorFlow together,
# so every source of randomness used during model building and training starts
# from the same state on each run.
tf.keras.utils.set_random_seed(SEED)

In [4]:
# Load the preprocessed train / validation / test splits (read in this order).
split_paths = (
    '../../../data/processed/train.csv',
    '../../../data/processed/val.csv',
    '../../../data/processed/test.csv',
)
try:
    train_df, val_df, test_df = map(pd.read_csv, split_paths)
except FileNotFoundError:
    # Print a short hint, then re-raise so the traceback names the missing file.
    print("Error: Preprocessed data files not found. Ensure 'train.csv', 'val.csv', 'test.csv' exist.")
    raise


In [5]:
# Separate features and labels: every column except the label becomes a
# feature matrix, the label column becomes the target vector.
label_column = 'binary_label'

X_train, y_train = train_df.drop(columns=label_column).values, train_df[label_column].values
X_val, y_val = val_df.drop(columns=label_column).values, val_df[label_column].values
X_test, y_test = test_df.drop(columns=label_column).values, test_df[label_column].values


In [6]:

# Reshape data for LSTM: [samples, timesteps, features]
def pad_and_reshape(X, timesteps):
    """Right-pad the columns of a 2-D array with zeros and fold each row into `timesteps` chunks.

    Parameters
    ----------
    X : np.ndarray
        2-D array of shape (n_samples, n_columns).
    timesteps : int
        Number of equally sized chunks each row is split into.

    Returns
    -------
    np.ndarray
        Array of shape (n_samples, timesteps, n_columns_padded // timesteps),
        where n_columns_padded is n_columns rounded up to the next multiple
        of `timesteps`. Padding values are zeros appended on the right.

    Raises
    ------
    ValueError
        If `X` is not 2-D (for example because it has already been reshaped).
    """
    if X.ndim != 2:
        raise ValueError(
            f"Expected a 2-D feature matrix, got shape {X.shape}; "
            "re-run the feature/label split before reshaping again."
        )
    # Number of zero columns needed to reach the next multiple of `timesteps`
    # (0 when the column count is already a multiple).
    pad_cols = (-X.shape[1]) % timesteps
    padded = np.pad(X, ((0, 0), (0, pad_cols)), mode='constant')
    return padded.reshape((X.shape[0], timesteps, padded.shape[1] // timesteps))


# Number of chunks each flat feature vector is split into. The rows are single
# feature vectors, so these are consecutive slices of the feature columns.
timesteps = 5
n_features_total = X_train.shape[1]  # Flat feature columns before padding

X_train = pad_and_reshape(X_train, timesteps)
X_val = pad_and_reshape(X_val, timesteps)
X_test = pad_and_reshape(X_test, timesteps)

n_features = X_train.shape[2]  # Features per time step after padding
target_features = timesteps * n_features  # Padded column count (multiple of timesteps)

# All three splits must share the same (timesteps, n_features) layout,
# otherwise the model built for the training shape cannot consume them.
if not (X_val.shape[1:] == X_test.shape[1:] == X_train.shape[1:]):
    raise ValueError(
        f"Inconsistent feature shapes: train {X_train.shape[1:]}, "
        f"val {X_val.shape[1:]}, test {X_test.shape[1:]}"
    )

print("Loaded and reshaped preprocessed data:")
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")

Loaded and reshaped preprocessed data:
Train shape: (2932930, 5, 8), Validation shape: (628485, 5, 8), Test shape: (628485, 5, 8)


In [8]:
# Evaluation function (same as original for consistency)
def evaluate_model(y_true, y_pred, y_prob, model_name, output_prefix='lstm'):
    """Print classification metrics, save diagnostic plots, and return the metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels, compared against `y_true` for accuracy,
        precision, recall, F1, the classification report and the confusion matrix.
    y_prob : array-like
        Predicted scores, used for ROC-AUC and the ROC curve.
    model_name : str
        Name shown in the printed header and plot titles and stored under
        the 'Model' key of the returned dict.
    output_prefix : str, optional
        Prefix for the saved figure file names. The default 'lstm' produces
        'lstm_confusion_matrix.png' and 'lstm_roc_curve.png'.

    Returns
    -------
    dict
        Keys 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'.
    """
    print(f"\n{model_name} Evaluation:")

    # Compute each score once; the same values are printed and returned.
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_prob)
    }
    for metric_name, value in metrics.items():
        if metric_name != 'Model':
            print(f"{metric_name}:", value)
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{model_name} Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(f'{output_prefix}_confusion_matrix.png')
    plt.close(fig)  # Close this specific figure so it is neither displayed nor leaked

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # Diagonal reference line
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'{model_name} ROC Curve')
    ax.legend(loc="lower right")
    fig.savefig(f'{output_prefix}_roc_curve.png')
    plt.close(fig)

    return metrics


In [9]:
# Define LSTM Model
def create_lstm_model(timesteps, n_features):
    """Build and compile the LSTM binary classifier.

    Architecture: LSTM(64) returning only its final output, followed by batch
    normalization, 30% dropout, a 32-unit ReLU dense layer and a single
    sigmoid output unit.

    Parameters
    ----------
    timesteps : int
        Length of the sequence axis of each input sample.
    n_features : int
        Number of features per time step.

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        an accuracy metric.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object instead of the
        # deprecated `input_shape=` keyword on the first layer.
        tf.keras.Input(shape=(timesteps, n_features)),
        LSTM(64, return_sequences=False),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [10]:

# Train LSTM
lstm_model = create_lstm_model(timesteps, n_features)

# Both callbacks watch the validation loss: one stops training after 5 epochs
# without improvement and restores the best weights, the other writes the best
# model seen so far to disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
checkpoint = ModelCheckpoint('lstm_best.h5', monitor='val_loss', save_best_only=True)

fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=128,
    callbacks=[early_stopping, checkpoint],
    verbose=1,
)
history = lstm_model.fit(X_train, y_train, **fit_options)


Epoch 1/50
[1m22912/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9658 - loss: 0.0835



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 13ms/step - accuracy: 0.9658 - loss: 0.0835 - val_accuracy: 0.9904 - val_loss: 0.0284
Epoch 2/50
[1m22912/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.9902 - loss: 0.0272



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 16ms/step - accuracy: 0.9902 - loss: 0.0272 - val_accuracy: 0.9935 - val_loss: 0.0182
Epoch 3/50
[1m22913/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.9924 - loss: 0.0214



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 16ms/step - accuracy: 0.9924 - loss: 0.0214 - val_accuracy: 0.9942 - val_loss: 0.0163
Epoch 4/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 14ms/step - accuracy: 0.9931 - loss: 0.0193 - val_accuracy: 0.9847 - val_loss: 0.0312
Epoch 5/50
[1m22912/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accuracy: 0.9935 - loss: 0.0178



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 12ms/step - accuracy: 0.9935 - loss: 0.0178 - val_accuracy: 0.9942 - val_loss: 0.0151
Epoch 6/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 13ms/step - accuracy: 0.9938 - loss: 0.0168 - val_accuracy: 0.9936 - val_loss: 0.0165
Epoch 7/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 8ms/step - accuracy: 0.9939 - loss: 0.0161 - val_accuracy: 0.9932 - val_loss: 0.0168
Epoch 8/50
[1m22910/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - accuracy: 0.9941 - loss: 0.0153



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 8ms/step - accuracy: 0.9941 - loss: 0.0153 - val_accuracy: 0.9945 - val_loss: 0.0145
Epoch 9/50
[1m22910/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.9943 - loss: 0.0148



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 7ms/step - accuracy: 0.9943 - loss: 0.0148 - val_accuracy: 0.9948 - val_loss: 0.0130
Epoch 10/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9945 - loss: 0.0142



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 11ms/step - accuracy: 0.9945 - loss: 0.0142 - val_accuracy: 0.9948 - val_loss: 0.0129
Epoch 11/50
[1m22913/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9947 - loss: 0.0135



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 14ms/step - accuracy: 0.9947 - loss: 0.0135 - val_accuracy: 0.9949 - val_loss: 0.0122
Epoch 12/50
[1m22913/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9949 - loss: 0.0131



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 14ms/step - accuracy: 0.9949 - loss: 0.0131 - val_accuracy: 0.9953 - val_loss: 0.0116
Epoch 13/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 12ms/step - accuracy: 0.9951 - loss: 0.0126 - val_accuracy: 0.9947 - val_loss: 0.0131
Epoch 14/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 9ms/step - accuracy: 0.9951 - loss: 0.0123 - val_accuracy: 0.9938 - val_loss: 0.0151
Epoch 15/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9814s[0m 428ms/step - accuracy: 0.9952 - loss: 0.0121 - val_accuracy: 0.9948 - val_loss: 0.0126
Epoch 16/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9952 - loss: 0.0120



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m301s[0m 11ms/step - accuracy: 0.9952 - loss: 0.0120 - val_accuracy: 0.9957 - val_loss: 0.0107
Epoch 17/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 8ms/step - accuracy: 0.9954 - loss: 0.0117 - val_accuracy: 0.9940 - val_loss: 0.0159
Epoch 18/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m342s[0m 15ms/step - accuracy: 0.9954 - loss: 0.0115 - val_accuracy: 0.9937 - val_loss: 0.0178
Epoch 19/50
[1m22907/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.9954 - loss: 0.0116



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 5ms/step - accuracy: 0.9954 - loss: 0.0116 - val_accuracy: 0.9957 - val_loss: 0.0106
Epoch 20/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 4ms/step - accuracy: 0.9954 - loss: 0.0115 - val_accuracy: 0.9955 - val_loss: 0.0118
Epoch 21/50
[1m22912/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.9954 - loss: 0.0114



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 7ms/step - accuracy: 0.9954 - loss: 0.0114 - val_accuracy: 0.9957 - val_loss: 0.0105
Epoch 22/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 5ms/step - accuracy: 0.9955 - loss: 0.0112 - val_accuracy: 0.9945 - val_loss: 0.0138
Epoch 23/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 5ms/step - accuracy: 0.9955 - loss: 0.0110 - val_accuracy: 0.9958 - val_loss: 0.0106
Epoch 24/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 5ms/step - accuracy: 0.9956 - loss: 0.0111 - val_accuracy: 0.9957 - val_loss: 0.0105
Epoch 25/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 6ms/step - accuracy: 0.9956 - loss: 0.0109 - val_accuracy: 0.9950 - val_loss: 0.0117
Epoch 26/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 6ms/step - accuracy: 0.9956 - loss: 0.0109 - val_accuracy: 0.9956 - val_loss: 0.011

In [11]:
# Save model: write the trained network to disk
final_model_path = 'lstm_final.h5'
lstm_model.save(final_model_path)
print("LSTM model trained and saved.")



LSTM model trained and saved.


In [12]:
# Evaluate on test set
# Run inference once and derive the hard labels from the same probabilities,
# rather than predicting over the full test set twice.
lstm_prob = lstm_model.predict(X_test, verbose=0).flatten()
lstm_pred = (lstm_prob > 0.5).astype(int)  # 0.5 decision threshold
lstm_metrics = evaluate_model(y_test, lstm_pred, lstm_prob, "LSTM")


LSTM Evaluation:
Accuracy: 0.9956816789581294
Precision: 0.9935456682784013
Recall: 0.9978456094347669
F1-Score: 0.9956909965007208
ROC-AUC: 0.9999108690564813

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    314243
           1       0.99      1.00      1.00    314242

    accuracy                           1.00    628485
   macro avg       1.00      1.00      1.00    628485
weighted avg       1.00      1.00      1.00    628485



In [13]:
# Plot training history: loss on the left panel, accuracy on the right
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Per-epoch training and validation loss
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('LSTM Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Per-epoch training and validation accuracy
acc_ax.plot(history.history['accuracy'], label='Train Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('LSTM Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
fig.savefig('lstm_training_history.png')
plt.close(fig)  # Figure is only written to disk, not displayed

In [14]:
# Save metrics: one-row table indexed by model name, written to CSV and echoed
metrics_table = pd.DataFrame([lstm_metrics])
metrics_df = metrics_table.set_index('Model')
metrics_df.to_csv('lstm_metrics.csv')
print("\nLSTM Metrics:", metrics_df, sep="\n")



LSTM Metrics:
       Accuracy  Precision    Recall  F1-Score   ROC-AUC
Model                                                   
LSTM   0.995682   0.993546  0.997846  0.995691  0.999911


In [15]:

# Real-time feasibility test: time one batched prediction over 100 test samples.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# intervals; unlike time.time() it is unaffected by system clock adjustments.
start_time = time.perf_counter()
lstm_model.predict(X_test[:100], verbose=0)
inference_time = time.perf_counter() - start_time
print(f"Inference time for 100 samples: {inference_time:.4f} seconds")

Inference time for 100 samples: 0.0917 seconds
