# 🚨 Cybersecurity Anomaly Detection Workflow

This notebook outlines a comprehensive workflow for detecting suspicious activities within logs using advanced machine learning techniques applied to the BETH dataset.

The dataset, as discussed in the article [<<article_name>>], serves as the foundation for preprocessing and performance comparison.

## 📚 Libraries

- Utilizing essential data science, machine learning, and natural language processing packages for a robust analysis.
- Downloading the punkt tokenization module to enhance natural language processing capabilities.


In [1]:
from pathlib import Path

import mlflow
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import SGDOneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score



In [2]:
# Point the MLflow client at the tracking server (loopback address, port 5000).
tracking_uri = "http://127.0.0.1:5000/"
mlflow.set_tracking_uri(tracking_uri)

# Every run started in this notebook is grouped under this experiment.
experiment_name = "beth_anomaly_detection"
mlflow.set_experiment(experiment_name=experiment_name)

# Enable scikit-learn autologging for the estimators fitted further down.
mlflow.sklearn.autolog()

In [3]:
input_directory = Path("data/np_arrays")


def _load_array(file_name: str) -> np.ndarray:
    """Load one saved ``.npy`` file from ``input_directory``.

    NOTE(review): ``allow_pickle=True`` lets ``np.load`` unpickle object
    arrays, which can execute arbitrary code. Only load files you trust.
    """
    return np.load(input_directory / file_name, allow_pickle=True)


# Training split (features and labels are both cast to float32).
X_train = _load_array("X_train.npy").astype(np.float32)
y_train = _load_array("y_train.npy").astype(np.float32)

# Feature names are kept as loaded; no dtype conversion is applied.
X_column_names = _load_array("X_column_names.npy")

# Validation split.
X_val = _load_array("X_val.npy").astype(np.float32)
y_val = _load_array("y_val.npy").astype(np.float32)

# Test split.
X_test = _load_array("X_test.npy").astype(np.float32)
y_test = _load_array("y_test.npy").astype(np.float32)

In [4]:
def anomaly_results_to_binary(y: np.ndarray) -> np.ndarray:
    """
    Convert sklearn outlier-detector labels to a binary anomaly indicator.

    Maps 1 (inlier) to 0 and -1 (outlier) to 1. Any other value is left
    unchanged. The input is never modified: a new array is returned, so the
    caller's predictions stay intact and the call is safe to repeat.

    Args:
        y (np.ndarray): Array (or array-like) of predictions.

    Returns:
        np.ndarray: New array of the same shape in which 1 marks an outlier
        and 0 marks an inlier.
    """
    labels = np.asarray(y)
    binary = labels.copy()
    # Both masks are computed from the untouched input, so the two
    # assignments cannot interfere with each other.
    binary[labels == 1] = 0  # inliers
    binary[labels == -1] = 1  # outliers
    return binary

## 🏋️ Model Training
### 🌲 Isolation Forest

In [5]:
# The scaler is fitted on the training features only; validation and test are
# transformed with those same training statistics.
# NOTE: the arrays are overwritten in place, so re-running this cell rescales
# data that has already been scaled. Re-run the loading cell first.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val, X_test = (scaler.transform(split) for split in (X_val, X_test))

In [6]:
with mlflow.start_run() as run:
    # Isolation Forest is fitted on features only: its ``fit`` ignores labels,
    # and the other detectors in this notebook are fitted the same way.
    iforest_model = IsolationForest(
        n_estimators=100,
        contamination=0.1,
        random_state=312,
    )
    iforest_model.fit(X_train)

    # NOTE(review): AUROC below is computed from hard 0/1 predictions, not
    # from continuous anomaly scores (e.g. ``score_samples``). Confirm this is
    # the intended metric before comparing against published numbers.

    # Validation split.
    y_hat = iforest_model.predict(X_val)
    y_hat = anomaly_results_to_binary(y_hat)
    auroc = roc_auc_score(y_val, y_hat)
    mlflow.log_metric("val_auroc", auroc)
    print("val auroc:", auroc)

    # Test split.
    y_hat = iforest_model.predict(X_test)
    y_hat = anomaly_results_to_binary(y_hat)
    auroc = roc_auc_score(y_test, y_hat)
    mlflow.log_metric("test_auroc", auroc)
    print("test_auroc", auroc)
    

val auroc: 0.8345678115405457
test_auroc 0.8613378831734406


### ↗️ One-Class Support Vector Machine (Unsupervised)

In [7]:
with mlflow.start_run() as run:
    # A fixed seed makes the SGD-based fit reproducible across kernel
    # restarts; 312 matches the seed used for the Isolation Forest run.
    one_class_svm_model = SGDOneClassSVM(random_state=312)
    one_class_svm_model.fit(X_train)

    # NOTE(review): AUROC below is computed from hard 0/1 predictions, not
    # from continuous anomaly scores (e.g. ``score_samples``). Confirm this is
    # the intended metric.

    # Validation split.
    y_hat = one_class_svm_model.predict(X_val)
    y_hat = anomaly_results_to_binary(y_hat)
    auroc = roc_auc_score(y_val, y_hat)
    mlflow.log_metric("val_auroc", auroc)
    print("val auroc:", auroc)

    # Test split.
    y_hat = one_class_svm_model.predict(X_test)
    y_hat = anomaly_results_to_binary(y_hat)
    auroc = roc_auc_score(y_test, y_hat)
    mlflow.log_metric("test_auroc", auroc)
    print("test_auroc", auroc)
    



val auroc: 0.8841538727271303
test_auroc 0.6883634471993398


In [8]:
class DoSE_SVM:
    """Outlier detector that chains a whitening PCA with an SGD one-class SVM.

    Both stages are fitted in the constructor. ``detect_outliers`` projects
    new data with the fitted PCA and flags the rows the SVM labels -1.
    """

    def __init__(self, X):
        """Fit the PCA on ``X``, then fit the one-class SVM on the projection.

        Args:
            X: Training data passed to ``PCA.fit`` and, after projection, to
                ``SGDOneClassSVM.fit``.
        """
        # Stage 1: whitening PCA (seeded).
        self.pca = PCA(whiten=True, random_state=12)
        self.pca.fit(X)

        # Stage 2: one-class SVM trained on the whitened training data.
        whitened_train = self.pca.transform(X)
        self.clf = SGDOneClassSVM(random_state=12)
        self.clf.fit(whitened_train)

    def detect_outliers(self, test_summary_stats):
        """Return a boolean mask that is True where the SVM predicts -1.

        Args:
            test_summary_stats: Data to score; it is projected with the
                fitted PCA before prediction.
        """
        projected = self.pca.transform(test_summary_stats)
        predicted_labels = self.clf.predict(projected)
        return predicted_labels == -1

In [18]:
with mlflow.start_run() as run:
    dose_svm_model = DoSE_SVM(X_train)

    # (MLflow metric name, console label, features, labels) for each split,
    # evaluated in order: validation first, then test.
    evaluation_splits = (
        ("val_auroc", "val auroc:", X_val, y_val),
        ("test_auroc", "test_auroc", X_test, y_test),
    )
    for metric_name, console_label, features, targets in evaluation_splits:
        # ``detect_outliers`` already returns a boolean outlier mask, so no
        # label conversion is needed before scoring.
        y_hat = dose_svm_model.detect_outliers(features)
        auroc = roc_auc_score(targets, y_hat)
        mlflow.log_metric(metric_name, auroc)
        print(console_label, auroc)
    



val auroc: 0.6868809633538215
test_auroc 0.5


In [16]:
with mlflow.start_run() as run:
    # A fixed seed makes the covariance fit reproducible across kernel
    # restarts; 312 matches the seed used for the Isolation Forest run.
    envelope = EllipticEnvelope(contamination=0.1, random_state=312)
    envelope.fit(X_train)

    # NOTE(review): AUROC below is computed from hard 0/1 predictions, not
    # from continuous anomaly scores (e.g. ``score_samples``). Confirm this is
    # the intended metric.

    # Validation split. The label matches the other model cells so the two
    # printed lines can be told apart.
    y_hat = envelope.predict(X_val)
    y_hat = anomaly_results_to_binary(y_hat)
    auroc = roc_auc_score(y_val, y_hat)
    mlflow.log_metric("val_auroc", auroc)
    print("val auroc:", auroc)

    # Test split.
    y_hat = envelope.predict(X_test)
    y_hat = anomaly_results_to_binary(y_hat)
    auroc = roc_auc_score(y_test, y_hat)
    mlflow.log_metric("test_auroc", auroc)
    print("test_auroc", auroc)



AUC Score: 0.5830658772529015
AUC Score: 0.8689093154036897


AUC Score: 0.5762426591809388


AUC Score: 0.8555474704873424
