# Network Traffic Anomaly Detection - Offline Training and Evaluation


This notebook demonstrates the offline training and evaluation of machine learning models for network traffic anomaly detection. It leverages the feature engineering concepts from `src/features.py` and model architectures discussed in the original `NW_sec_package (1).ipynb`.


In [None]:
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Fix the NumPy and TensorFlow random seeds to the same value for reproducibility
np.random.seed(42)
tf.random.set_seed(42)


## 1. Data Loading and Preparation


For this demonstration, the next cell loads the UNSW-NB15 training and testing CSVs (or similarly pre-processed CSVs) from a `dataset/` subdirectory when they are available, and otherwise falls back to a synthetic dataset generated on the fly. In a real-world scenario, you would download and prepare your training data here.


In [None]:
# Synthetic data generation for demonstration if real dataset is not available
def generate_synthetic_data(num_samples=1000, seed=None):
    """Generate a random flow-record table for demonstration purposes.

    Every base column is drawn independently at random. Rows labelled DoS then
    get an elevated ``sload`` and rows labelled Reconnaissance get an elevated
    ``ct_dst_sport_ltm``. Eight ratio-style derived columns are appended last.

    Args:
        num_samples: Number of rows to generate.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used, so the result is
            reproducible and the global NumPy random state is left untouched.
            When ``None`` (the default), values are drawn from the global
            ``np.random`` state, in the same order as before.

    Returns:
        A ``pandas.DataFrame`` with ``num_samples`` rows: 22 base feature
        columns, an integer ``label`` column (0: Normal, 1: DoS,
        2: Reconnaissance) and 8 derived feature columns.
    """
    # The ``np.random`` module and ``RandomState`` expose the same sampling
    # methods (uniform / choice / randint), so either can be the source.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = {
        'dur': rng.uniform(0.0, 60.0, num_samples),
        'proto': rng.choice(['tcp', 'udp', 'icmp'], num_samples),
        'service': rng.choice(['-', 'http', 'dns', 'ftp'], num_samples),
        'state': rng.choice(['FIN', 'INT', 'REQ'], num_samples),
        'spkts': rng.randint(1, 100, num_samples),
        'dpkts': rng.randint(0, 100, num_samples),
        'sbytes': rng.randint(100, 10000, num_samples),
        'dbytes': rng.randint(0, 10000, num_samples),
        'rate': rng.uniform(0.0, 1000.0, num_samples),
        'sload': rng.uniform(1000.0, 1000000.0, num_samples),
        'dload': rng.uniform(0.0, 1000000.0, num_samples),
        'synack': rng.uniform(0.0, 0.1, num_samples),
        'ackdat': rng.uniform(0.0, 0.1, num_samples),
        'ct_srv_src': rng.randint(1, 20, num_samples),
        'ct_state_ttl': rng.randint(0, 255, num_samples),
        'ct_dst_ltm': rng.randint(1, 20, num_samples),
        'ct_src_dport_ltm': rng.randint(1, 10, num_samples),
        'ct_dst_sport_ltm': rng.randint(1, 10, num_samples),
        'ct_dst_src_ltm': rng.randint(1, 20, num_samples),
        'ct_src_ltm': rng.randint(1, 20, num_samples),
        'ct_srv_dst': rng.randint(1, 20, num_samples),
        'is_sm_ips_ports': rng.choice([0, 1], num_samples),
        # 0: Normal, 1: DoS, 2: Reconnaissance
        'label': rng.choice([0, 1, 2], num_samples, p=[0.7, 0.15, 0.15]),
    }
    df = pd.DataFrame(data)

    # Introduce some anomalies for specific labels. Each mask is computed once
    # and reused for both the row selection and the number of draws.
    dos_mask = df['label'] == 1
    df.loc[dos_mask, 'sload'] = rng.uniform(1e6, 1e7, int(dos_mask.sum()))  # High sload for DoS
    recon_mask = df['label'] == 2
    # High distinct ports for Reconnaissance
    df.loc[recon_mask, 'ct_dst_sport_ltm'] = rng.randint(5, 15, int(recon_mask.sum()))

    # Generate additional columns as if from `src/features.py` derived features.
    # Denominators are offset by 1 so a zero count never divides by zero.
    df["pkt_ratio"] = df["spkts"] / (df["dpkts"] + 1)
    df["byte_ratio"] = df["sbytes"] / (df["dbytes"] + 1)
    df["syn_rate"] = df["synack"] / (df["spkts"] + 1)
    df["ack_rate"] = df["ackdat"] / (df["spkts"] + 1)
    df["avg_pkt_size"] = df["sbytes"] / (df["spkts"] + 1)
    df['bytes_per_pkt'] = (df['sbytes'] + df['dbytes']) / (df['spkts'] + df['dpkts'] + 1)
    # NOTE: same formula as `byte_ratio`; both columns are kept because
    # downstream feature lists reference both names.
    df['pkt_size_ratio'] = df['sbytes'] / (df['dbytes'] + 1)
    df['port_diversity'] = (df['ct_src_dport_ltm'] + df['ct_dst_sport_ltm']) / (df['ct_srv_dst'] + 1)

    return df

# Prefer the real UNSW-NB15 CSVs; fall back to synthetic data when either file is absent.
# NOTE(review): CSVs loaded here do not pass through the derived-feature step that
# generate_synthetic_data applies — confirm the files already contain those columns.
try:
    # Adjust these paths if the UNSW-NB15 dataset CSVs live elsewhere
    train_df = pd.read_csv("dataset/UNSW_NB15_training-set.csv")
    test_df = pd.read_csv("dataset/UNSW_NB15_testing-set.csv")
except FileNotFoundError:
    print("UNSW-NB15 dataset not found. Generating synthetic data...")
    train_df = generate_synthetic_data(num_samples=10000)
    test_df = generate_synthetic_data(num_samples=2000)
    # Translate the numeric synthetic labels into attack_cat names so both
    # data sources expose the same target column.
    label_map = {0: 'Normal', 1: 'DoS', 2: 'Reconnaissance'}
    for frame in (train_df, test_df):
        frame['attack_cat'] = frame['label'].map(label_map)
else:
    # Reached only when both CSV reads succeeded
    print("Loaded UNSW-NB15 dataset.")


# Quick sanity check of what was loaded or generated
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print("Train head:\n", train_df.head())


## 2. Feature Engineering and Preprocessing


We select features based on their relevance as identified in the original notebook and `src/features.py`. We then apply Min-Max Scaling to numerical features and One-Hot Encoding to categorical features.


In [None]:
# Columns fed to the models: raw flow statistics, three categorical fields
# (state, proto, service) and the derived ratio features.
selected_features = [
    'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload',
    'synack', 'ackdat', 'state',
    'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm',
    'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_src_ltm', 'ct_srv_dst',
    'proto', 'service',
    'pkt_ratio', 'byte_ratio', 'syn_rate', 'ack_rate', 'avg_pkt_size',
    'bytes_per_pkt', 'pkt_size_ratio', 'port_diversity'
]

# Ensure all selected features exist in both train and test data. A missing
# column is filled with the constant 0 so the pipeline still runs, but the
# fallback is reported because a constant column carries no signal.
for split_name, frame in (("train", train_df), ("test", test_df)):
    missing_features = [f for f in selected_features if f not in frame.columns]
    if missing_features:
        print(f"Warning: {split_name} data is missing {missing_features}; filling with 0.")
    for feature in missing_features:
        frame[feature] = 0  # Default value

# Define target column
target_column = 'attack_cat' # Or 'label' if using synthetic numeric labels

# Work on explicit copies: the scaled values are written back into X_train /
# X_test below, and assigning into a slice of train_df / test_df is chained
# assignment (SettingWithCopyWarning) that could also touch the source frames.
X_train = train_df[selected_features].copy()
y_train = train_df[target_column]
X_test = test_df[selected_features].copy()
y_test = test_df[target_column]

# Identify numerical and categorical columns. Categorical columns are taken as
# the complement of the numeric ones, so every selected feature is either
# scaled or encoded and none is dropped because of how pandas stores strings.
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()

print(f"Numerical columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")

# Initialize preprocessors
scaler = MinMaxScaler()
# handle_unknown='ignore': categories unseen during fit encode as all zeros
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
label_encoder = LabelEncoder()

# Scale numerical features (fit on train only, then apply to test)
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])

X_train[numeric_cols] = X_train_scaled
X_test[numeric_cols] = X_test_scaled

# Encode categorical features (fit on train only, then apply to test)
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Get feature names for encoded categorical features
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

# Combine scaled numerical and encoded categorical features. The index is reset
# so both halves align positionally when concatenated column-wise.
X_train_final = pd.concat([X_train[numeric_cols].reset_index(drop=True),
                           pd.DataFrame(X_train_encoded, columns=encoded_feature_names)], axis=1)
X_test_final = pd.concat([X_test[numeric_cols].reset_index(drop=True),
                          pd.DataFrame(X_test_encoded, columns=encoded_feature_names)], axis=1)

# Encode target labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print(f"Final X_train shape: {X_train_final.shape}")
print(f"Final X_test shape: {X_test_final.shape}")
print(f"Encoded target classes: {list(label_encoder.classes_)}")


## 3. Model Training and Evaluation


### 3.1 Isolation Forest


In [None]:
print("\n--- Training Isolation Forest Model ---")
iso_forest = IsolationForest(random_state=42, contamination=0.05) # Adjust contamination as needed
iso_forest.fit(X_train_final)

# predict() returns hard labels rather than scores: -1 for anomaly, 1 for normal
isol_predictions = iso_forest.predict(X_test_final)

# Convert to binary labels for evaluation (1 = anomaly, 0 = normal), vectorised
# instead of looping element by element.
y_pred_iso = (isol_predictions == -1).astype(int)
# Assuming 'Normal' is the non-anomaly label; every other category counts as an anomaly
y_true_iso = (y_test != 'Normal').astype(int).to_numpy()

print("Isolation Forest Evaluation:")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_true_iso, y_pred_iso):.4f}")
print("Classification Report:\n", classification_report(y_true_iso, y_pred_iso))

# Confusion Matrix. labels=[0, 1] pins the matrix to 2x2 even when only one
# class occurs in the data, so it always matches the two tick labels below.
cm_iso = confusion_matrix(y_true_iso, y_pred_iso, labels=[0, 1])
plt.figure(figsize=(6,4))
sns.heatmap(cm_iso, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
plt.title('Isolation Forest Confusion Matrix')
plt.xlabel('Predicted'); plt.ylabel('True')
plt.show()


### 3.2 LSTM Model (from original notebook)


In [None]:
print("\n--- Training LSTM Model ---")

# Reshape data for LSTM (samples, timesteps, features)
n_features = X_train_final.shape[1]
timesteps = 1  # For a single-step input

X_train_lstm = np.array(X_train_final, dtype=np.float32).reshape((-1, timesteps, n_features))
X_test_lstm = np.array(X_test_final, dtype=np.float32).reshape((-1, timesteps, n_features))

num_classes = len(label_encoder.classes_)

# Hold out 20% of the training rows for validation with a shuffled split.
# Keras' validation_split takes the *last* rows before any shuffling, which
# gives an unrepresentative validation set when the source CSV is ordered.
X_fit_lstm, X_val_lstm, y_fit_encoded, y_val_encoded = train_test_split(
    X_train_lstm, y_train_encoded, test_size=0.2, shuffle=True, random_state=42
)

# Define the LSTM model architecture (from original notebook)
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(timesteps, n_features)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Sparse loss: targets are integer class ids from the LabelEncoder, not one-hot
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Callbacks: stop once val_loss stalls for 6 epochs (restoring the best weights)
# and halve the learning rate after 3 stalled epochs, down to 1e-5.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=6, restore_best_weights=True
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5
)

# Train the model
history = model.fit(
    X_fit_lstm, y_fit_encoded,
    validation_data=(X_val_lstm, y_val_encoded),
    epochs=50,
    batch_size=128,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# Evaluate LSTM Model
print("\nLSTM Model Evaluation:")
loss, accuracy = model.evaluate(X_test_lstm, y_test_encoded, verbose=0)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Predictions: take the most probable class, then map ids back to label names
predictions_lstm = np.argmax(model.predict(X_test_lstm), axis=1)
y_pred_lstm_labels = label_encoder.inverse_transform(predictions_lstm)
y_true_lstm_labels = label_encoder.inverse_transform(y_test_encoded)

print("Classification Report (LSTM):\n", classification_report(y_true_lstm_labels, y_pred_lstm_labels))

# Confusion Matrix. Passing labels= keeps one row/column per known class even
# if a class is absent from both the true and predicted test labels, so the
# matrix stays aligned with the tick labels.
cm_lstm = confusion_matrix(y_true_lstm_labels, y_pred_lstm_labels, labels=label_encoder.classes_)
plt.figure(figsize=(7,5))
sns.heatmap(cm_lstm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('LSTM Confusion Matrix')
plt.xlabel('Predicted'); plt.ylabel('True')
plt.show()


Save the trained models and preprocessors to the `data/models/` directory for use in the real-time detection system.


In [None]:
models_dir = "data/models"
os.makedirs(models_dir, exist_ok=True)

# Isolation Forest artifacts, written in this order. The scaler, encoder and
# label encoder are the same fitted objects that are saved again for the LSTM
# below (one shared preprocessing pipeline for now).
iso_artifacts = {
    "IsolationForest.pkl": iso_forest,
    "IsolationForest_scaler.pkl": scaler,
    "IsolationForest_encoder.pkl": encoder,
    "IsolationForest_label_encoder.pkl": label_encoder,
}
for filename, artifact in iso_artifacts.items():
    joblib.dump(artifact, os.path.join(models_dir, filename))
print("Saved IsolationForest model and preprocessors.")

# The Keras model is written with its own save(); its preprocessors go through joblib
model.save(os.path.join(models_dir, "lstm_model.h5"))
lstm_preprocessors = {
    "LSTM_scaler.pkl": scaler,
    "LSTM_encoder.pkl": encoder,
    "LSTM_label_encoder.pkl": label_encoder,
}
for filename, artifact in lstm_preprocessors.items():
    joblib.dump(artifact, os.path.join(models_dir, filename))
print("Saved LSTM model and preprocessors.")

print("All models and preprocessors saved to data/models/")
