In [1]:
############## AIS ###################

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt


In [3]:
# Load dataset: the train and test splits are separate parquet files in one directory
DATA_DIR = './data'
df_train = pd.read_parquet(f'{DATA_DIR}/cic_iomt_2024_wifi_mqtt_train.parquet')
df_test = pd.read_parquet(f'{DATA_DIR}/cic_iomt_2024_wifi_mqtt_test.parquet')

In [4]:
## Data Manipulation
# Seeded random number generator so that detector generation is reproducible
rng = np.random.default_rng(seed=42)

# Fraction of rows kept from each split. With a value of 1 every row is kept and
# .sample() only shuffles the row order; lower it for a faster, approximate run.
SAMPLE_FRAC = 1
df_train_sampled = df_train.sample(frac=SAMPLE_FRAC, random_state=42)
df_test_sampled = df_test.sample(frac=SAMPLE_FRAC, random_state=42)

# Every label-like column is removed from the features; 'class_label' is the target
LABEL_COLUMNS = ['label', 'class_label', 'category_label', 'attack_label']

# Define features and labels
X_train = df_train_sampled.drop(columns=LABEL_COLUMNS)
y_train = df_train_sampled['class_label']

X_test = df_test_sampled.drop(columns=LABEL_COLUMNS)
y_test = df_test_sampled['class_label']

# Manually encoding class labels: Benign -> 0, Attack -> 1
label_mapping = {'Benign': 0, 'Attack': 1}


def _encode_class_labels(labels, split_name):
    """Encode a 'class_label' Series with ``label_mapping`` and return its values.

    ``Series.map`` turns every value that is missing from the mapping into NaN
    without complaint, which would silently change the label array to float and
    corrupt both the self set and the evaluation metrics. This helper raises
    instead.

    Parameters
    ----------
    labels : pandas.Series
        Raw class labels of one split.
    split_name : str
        Name of the split, used only in the error message.

    Returns
    -------
    The ``.values`` of the mapped Series.

    Raises
    ------
    ValueError
        If any label is not a key of ``label_mapping``.
    """
    encoded = labels.map(label_mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(labels[unmapped].astype(str).unique())
        raise ValueError(
            f"Unexpected class_label value(s) in {split_name} split: {unexpected}; "
            f"expected one of {list(label_mapping)}"
        )
    return encoded.values


# Map labels using this mapping
y_train = _encode_class_labels(y_train, 'train')
y_test = _encode_class_labels(y_test, 'test')

# Define labels
benign_label = label_mapping['Benign']
attack_label = label_mapping['Attack']

# Extract indices of normal traffic in training data (Benign data only)
self_set_indices = np.where(y_train == benign_label)[0]
if self_set_indices.size == 0:
    # Without benign rows there is no "self" to train the detectors against
    raise ValueError("Training split contains no 'Benign' rows; the self set would be empty")
self_set = X_train.iloc[self_set_indices].values  # Only benign samples are used for training

In [5]:
### AIS Model Functions

def matches_self(detector, self_set, matching_threshold):
    """Report whether a candidate detector lies too close to any self sample.

    Parameters
    ----------
    detector : array-like
        Candidate point; it is subtracted from ``self_set`` with broadcasting.
    self_set : numpy.ndarray
        Self samples, expected as a 2-D array with one sample per row.
    matching_threshold : float
        Largest Euclidean distance that still counts as a match (inclusive).

    Returns
    -------
    numpy.bool_
        True when at least one row of ``self_set`` is within
        ``matching_threshold`` of ``detector``; False otherwise, including
        when ``self_set`` has no rows.
    """
    offsets = self_set - detector
    row_distances = np.linalg.norm(offsets, axis=1)
    within_threshold = row_distances <= matching_threshold
    return within_threshold.any()

# Per-feature value range of the self set in the original feature space (no PCA),
# widened by a fixed margin on each side; intended as the sampling range for
# candidate detectors.
# NOTE: derived from self_set (benign training rows only), not from all training data.
FEATURE_MARGIN = 5
feature_min = self_set.min(axis=0) - FEATURE_MARGIN
feature_max = self_set.max(axis=0) + FEATURE_MARGIN

def generate_detectors(self_set, num_detectors, matching_threshold, rng,
                       max_iterations=100000, feature_min=None, feature_max=None):
    """Generate detectors that do not match any self sample, by rejection sampling.

    Candidates are drawn uniformly, one per iteration, from the per-feature box
    ``[feature_min, feature_max]`` and kept only when ``matches_self`` reports
    that no self sample lies within ``matching_threshold`` of them.

    Parameters
    ----------
    self_set : numpy.ndarray
        Self samples, one per row.
    num_detectors : int
        Number of detectors to collect.
    matching_threshold : float
        Distance passed to ``matches_self`` to decide whether a candidate is rejected.
    rng : numpy.random.Generator
        Source of randomness; one ``rng.uniform`` call is made per iteration.
    max_iterations : int, optional
        Upper bound on the number of candidates drawn.
    feature_min, feature_max : array-like, optional
        Lower / upper sampling bounds per feature. When omitted, each defaults
        to the per-feature minimum / maximum of ``self_set`` padded by 5, so the
        bounds always correspond to the ``self_set`` actually passed in rather
        than to module-level state.

    Returns
    -------
    numpy.ndarray
        Accepted detectors stacked row-wise. It may hold fewer than
        ``num_detectors`` rows when ``max_iterations`` is exhausted. With no
        accepted detector the result is empty (``size == 0``) but keeps the
        trailing feature dimension.

    Raises
    ------
    ValueError
        If a bound has to be derived from ``self_set`` and ``self_set`` has no rows.

    Warns
    -----
    RuntimeWarning
        If fewer than ``num_detectors`` detectors were accepted.
    """
    import warnings  # local import: keeps the function usable without touching the import cell

    if feature_min is None or feature_max is None:
        if np.shape(self_set)[0] == 0:
            raise ValueError("self_set is empty; pass feature_min and feature_max explicitly")
        margin = 5  # same padding the notebook applies around the self set
        if feature_min is None:
            feature_min = np.min(self_set, axis=0) - margin
        if feature_max is None:
            feature_max = np.max(self_set, axis=0) + margin

    detectors = []
    iterations = 0
    while len(detectors) < num_detectors and iterations < max_iterations:
        detector = rng.uniform(low=feature_min, high=feature_max)
        if not matches_self(detector, self_set, matching_threshold):
            detectors.append(detector)
        iterations += 1

    if len(detectors) < num_detectors:
        # Make the shortfall visible instead of silently returning a smaller set
        warnings.warn(
            f"Only {len(detectors)} of {num_detectors} detectors were generated "
            f"within {max_iterations} iterations (matching_threshold={matching_threshold}).",
            RuntimeWarning,
            stacklevel=2,
        )

    if not detectors:
        # Keep the feature dimension so the empty result has a consistent shape
        return np.empty((0,) + np.broadcast(feature_min, feature_max).shape)
    return np.array(detectors)

def detect_intrusions(test_set, detectors, matching_threshold, batch_size=2048):
    """Label each sample as attack when it lies within range of any detector.

    The pairwise distances are computed in row batches, so peak memory is
    ``batch_size * n_detectors`` distances rather than the full
    ``n_samples * n_detectors`` matrix. Each row's distances are independent of
    the other rows, so the predictions do not depend on ``batch_size``.

    Parameters
    ----------
    test_set : array-like
        Samples to classify, one per row.
    detectors : array-like
        Detectors, one per row. An empty array means "no detectors".
    matching_threshold : float
        A sample is flagged when its distance to at least one detector
        is <= this value (``cdist`` default metric).
    batch_size : int, optional
        Number of samples processed per ``cdist`` call.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per sample: ``attack_label`` for flagged
        samples and ``benign_label`` otherwise. With no detectors every sample
        is labelled ``benign_label``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    test_set = np.asarray(test_set)
    detectors = np.asarray(detectors)
    n_samples = test_set.shape[0]

    # Start from "everything is benign"; flagged rows are overwritten batch by batch
    predictions = np.full(n_samples, benign_label, dtype=int)
    if detectors.size == 0:
        # If no detectors, classify all samples as benign
        return predictions

    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        distances = cdist(test_set[start:stop], detectors)
        is_intrusion = np.any(distances <= matching_threshold, axis=1)
        # Assign an 'Attack' label to detected intrusions
        predictions[start:stop] = np.where(is_intrusion, attack_label, benign_label)
    return predictions

def analyze_distances(test_set, detectors):
    """Print summary statistics of the sample-to-detector distances and return them.

    Parameters
    ----------
    test_set : array-like
        Samples, one per row.
    detectors : array-like
        Detectors, one per row.

    Returns
    -------
    numpy.ndarray
        The full pairwise distance matrix from ``cdist(test_set, detectors)``
        (rows follow ``test_set``, columns follow ``detectors``). The whole
        matrix is held in memory at once.
    """
    pairwise = cdist(test_set, detectors)

    # Statistics are taken over every entry of the matrix, not per row
    smallest = pairwise.min()
    largest = pairwise.max()
    average = pairwise.mean()
    middle = np.median(pairwise)

    print(f"Distance Statistics - Min: {smallest}, Max: {largest}, Mean: {average}, Median: {middle}")
    return pairwise

In [6]:
# AIS Model Training and Evaluation

print("\nAIS MODEL")
num_detectors = 10000  # impacts run time

# Matching thresholds to sweep; each one is used both to generate the detectors
# and to classify the test set.
## IMPROVEMENT IDEA: start with a matching threshold of 45,000,000, run analyze_distances,
## then set the threshold to the average distance between the test data and the
## randomly distributed detectors.
matching_thresholds = (35_000_000, 45_000_000, 55_000_000)

# Display names for the encoded labels 0 and 1 (same for every threshold)
target_names = ['Benign', 'Attack']

for matching_threshold in matching_thresholds:
    print(f"\nUsing matching threshold: {matching_threshold}")
    detectors = generate_detectors(self_set, num_detectors, matching_threshold, rng)
    print(f"Generated {len(detectors)} detectors.")

    # --- Optional diagnostic (disabled): 2-D scatter of the first two raw features ---
    # Visualize self set and test set (no PCA)
    #plt.figure(figsize=(12, 8))

    # Self set: only benign samples used in training (the self set)
    #plt.scatter(self_set[:, 0], self_set[:, 1], c='blue', label='Self Set (Benign Train)', alpha=0.5, s=1)

    # Test data: separate benign and attack samples
    #benign_test_indices = np.where(y_test == benign_label)[0]
    #attack_test_indices = np.where(y_test == attack_label)[0]
    #benign_test = X_test.iloc[benign_test_indices].values
    #attack_test = X_test.iloc[attack_test_indices].values

    # Plot test data (benign and attack)
    #plt.scatter(benign_test[:, 0], benign_test[:, 1], c='cyan', label='Benign Test', alpha=0.5, s=1)
    #plt.scatter(attack_test[:, 0], attack_test[:, 1], c='magenta', label='Attack Test', alpha=0.5, s=1)

    # Plot detectors
    #plt.scatter(detectors[:, 0], detectors[:, 1], c='green', label='Detectors', alpha=0.5, s=1)

    #plt.xlabel('Feature 1')
    #plt.ylabel('Feature 2')
    #plt.title(f'Self Set and Test Set Data (Matching Threshold: {matching_threshold})')
    #plt.legend()
    #plt.grid(True)
    #plt.show()

    # Optional diagnostic (disabled): distance statistics between test data and detectors
    #distances = analyze_distances(X_test.values, detectors)

    # Classify the test set with the detectors generated for this threshold
    predictions = detect_intrusions(X_test.values, detectors, matching_threshold)

    # Evaluate model
    report = classification_report(y_test, predictions, target_names=target_names)
    print("Classification Report:", report, sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, predictions), sep="\n")



AIS MODEL

Using matching threshold: 35000000
Generated 10000 detectors.
Classification Report:
              precision    recall  f1-score   support

      Benign       0.70      1.00      0.82     37607
      Attack       1.00      0.99      0.99   1576575

    accuracy                           0.99   1614182
   macro avg       0.85      0.99      0.91   1614182
weighted avg       0.99      0.99      0.99   1614182

Confusion Matrix:
[[  37607       0]
 [  16459 1560116]]

Using matching threshold: 45000000
Generated 10000 detectors.
Classification Report:
              precision    recall  f1-score   support

      Benign       0.70      1.00      0.82     37607
      Attack       1.00      0.99      0.99   1576575

    accuracy                           0.99   1614182
   macro avg       0.85      0.99      0.91   1614182
weighted avg       0.99      0.99      0.99   1614182

Confusion Matrix:
[[  37607       0]
 [  16446 1560129]]

Using matching threshold: 55000000
Generated 100