In [2]:
import os

# os.getcwd() reports the kernel's current working directory. The relative
# file names used later in this notebook are resolved against this directory.
# NOTE(review): this is not necessarily the folder the .ipynb file lives in.
notebook_path = os.getcwd()
print(f"Notebook is located in: {notebook_path}")


Notebook is located in: C:\Users\Dell\AppData\Roaming\Python\Python312\Scripts


In [1]:
# === Step 1: Imports and Load Data ===
import numpy as np
import pandas as pd
from joblib import load
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load the log sequence matrix (already vectorized).
# NOTE(review): joblib.load is pickle-based, so only open artifacts that come
# from a trusted source.
X = load('log_sequence_matrix.joblib')

# Optional: standardize for models that need it. The scaler is fitted first
# and then applied to the same matrix, so `scaler` stays available for reuse.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
# === Step 2: Helper to Convert Model Output to Binary Labels ===
def convert_to_binary(predictions, anomaly_value=-1):
    """Map raw detector output to binary anomaly flags.

    Parameters
    ----------
    predictions : array-like
        Per-sample model output (in this notebook: the values returned by
        each model's ``fit_predict``).
    anomaly_value : scalar, default -1
        The value in ``predictions`` that denotes an anomaly.

    Returns
    -------
    numpy.ndarray
        Integer array shaped like ``predictions``: 1 where the prediction
        equals ``anomaly_value``, 0 everywhere else.
    """
    # Coerce to an array first: comparing a plain list/tuple with a scalar
    # yields a single bool, which would collapse the result to a 0-d array
    # instead of one flag per sample.
    predictions = np.asarray(predictions)
    return np.where(predictions == anomaly_value, 1, 0)  # 1 = anomaly
    

In [None]:
# === Step 3: Run 5 Unsupervised Anomaly Detection Models ===
# Every detector is fitted on the standardized matrix; its raw fit_predict
# output is then mapped to 0/1 flags (1 = anomaly) by convert_to_binary.

# Shared seed so that every detector accepting a random_state gives the same
# labels on each Restart & Run All.
RANDOM_STATE = 42

# 1. Isolation Forest
iso_model = IsolationForest(random_state=RANDOM_STATE)
iso_preds = iso_model.fit_predict(X_scaled)
iso_labels = convert_to_binary(iso_preds)

# 2. Local Outlier Factor
lof_model = LocalOutlierFactor(n_neighbors=20, contamination='auto')
lof_preds = lof_model.fit_predict(X_scaled)
lof_labels = convert_to_binary(lof_preds)

# 3. One-Class SVM
svm_model = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale')
svm_preds = svm_model.fit_predict(X_scaled)
svm_labels = convert_to_binary(svm_preds)

# 4. Elliptic Envelope
# Seeded like the Isolation Forest: EllipticEnvelope takes a random_state for
# its internal data shuffling, and leaving it unset lets labels vary per run.
ee_model = EllipticEnvelope(contamination=0.05, random_state=RANDOM_STATE)
ee_preds = ee_model.fit_predict(X_scaled)
ee_labels = convert_to_binary(ee_preds)

# 5. DBSCAN
dbscan_model = DBSCAN(eps=1.5, min_samples=5)
dbscan_preds = dbscan_model.fit_predict(X_scaled)
# Treat only noise (-1) as anomaly; every point assigned to a cluster is
# labelled 0.
dbscan_labels = convert_to_binary(dbscan_preds, anomaly_value=-1)


In [None]:
# === Step 4: Add Rule-Based Labels ===

# Original classified log lines; only the EventId column is used below.
df_classified = pd.read_csv('classified_logs.csv')

# Event IDs whose presence marks a window as anomalous.
anomalous_ids = ['E10', 'E11', 'E12']

# Sliding-window parameters.
# NOTE(review): presumably these must match the windowing used to build the
# matrix loaded in Step 1 so that rows line up - confirm.
window_size = 3
stride = 1

# Rebuild every full-length window over the event-id sequence.
event_ids = df_classified['EventId'].tolist()
window_starts = range(0, len(event_ids) - window_size + 1, stride)
windows = [event_ids[start:start + window_size] for start in window_starts]

# Rule-based label: 1 when the window contains any of E10/E11/E12, else 0.
rule_labels = [
    1 if any(event in anomalous_ids for event in window) else 0
    for window in windows
]


In [None]:
# === Step 5: Save All Labels to CSV ===

# One column per labelling source, in the order they appear in the file.
label_columns = {
    'rule_based': rule_labels,
    'isolation_forest': iso_labels,
    'local_outlier_factor': lof_labels,
    'one_class_svm': svm_labels,
    'elliptic_envelope': ee_labels,
    'dbscan': dbscan_labels,
}

# Each row is one window; naming the index keeps the window number traceable
# in the exported file.
labels_df = pd.DataFrame(label_columns).rename_axis('window_index')

# Write the combined labels (index included) to disk.
labels_df.to_csv('unsupervised_labels.csv')

print("✅ Unsupervised labelling complete. Saved to 'unsupervised_labels.csv'")
labels_df.head()
