In [20]:
import pandas as pd

In [21]:
# Read the two feature tables from ../data/processed in one pass.
# Judging by the file names, df_normal holds the normal samples and
# df_anomaly the anomalous ones (TODO confirm against the preprocessing step).
df_normal, df_anomaly = map(
    pd.read_parquet,
    (
        "../data/processed/df_normal.parquet",
        "../data/processed/df_anomaly.parquet",
    ),
)

# Report (rows, columns) of each frame as a quick sanity check.
print(f"Normal data shape: {df_normal.shape}")
print(f"Anomaly data shape: {df_anomaly.shape}")

Normal data shape: (316443, 13)
Anomaly data shape: (127391, 13)


#### Isolation Forest

In [22]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

# Baseline Isolation Forest: fit on df_normal only, then label every row of
# both frames. Labels follow the convention used below: 1 = normal, -1 = anomaly.
# NOTE(review): df_normal is used both for fitting and for scoring, so the
# class-1 metrics are in-sample — consider holding out part of the normal data.
model = IsolationForest(contamination=0.05, random_state=42).fit(df_normal)

y_pred_normal, y_pred_anomaly = (
    model.predict(frame) for frame in (df_normal, df_anomaly)
)

# Predictions for the normal rows come first, followed by the anomaly rows.
y_pred = [*y_pred_normal, *y_pred_anomaly]

# Ground truth in the same order: 1 for every df_normal row, -1 for every
# df_anomaly row.
y_true = [
    label
    for label, preds in ((1, y_pred_normal), (-1, y_pred_anomaly))
    for _ in preds
]

# Per-class precision / recall / F1 plus the averaged summaries.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.29      0.05      0.09    127391
           1       0.71      0.95      0.81    316443

    accuracy                           0.69    443834
   macro avg       0.50      0.50      0.45    443834
weighted avg       0.59      0.69      0.61    443834



In [23]:
# Tuning IsolationForest for better performance
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

# Isolation Forest with hand-picked hyper-parameters, fit on df_normal only.
# NOTE(review): max_samples=256 may be identical to what the default
# max_samples="auto" resolves to for a data set this large — confirm against
# the scikit-learn docs before treating it as a tuning knob.
model = IsolationForest(
    contamination=0.1,   # expected share of outliers; sets the decision threshold
    max_samples=256,     # rows drawn to build each tree
    n_estimators=200,    # number of trees in the ensemble
    random_state=42,     # fixed seed for reproducible results
)
model.fit(df_normal)

# Label both frames with the fitted forest (1 = normal, -1 = anomaly).
y_pred_normal, y_pred_anomaly = (
    model.predict(frame) for frame in (df_normal, df_anomaly)
)

# Stack predictions (normal rows first) and build the matching ground truth.
y_pred = list(y_pred_normal)
y_pred.extend(y_pred_anomaly)
y_true = [1 if i < len(y_pred_normal) else -1 for i in range(len(y_pred))]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.29      0.10      0.15    127391
           1       0.71      0.90      0.80    316443

    accuracy                           0.67    443834
   macro avg       0.50      0.50      0.47    443834
weighted avg       0.59      0.67      0.61    443834



#### SVM: OneClassSVM

In [24]:
from sklearn.svm import OneClassSVM

# One-class SVM trained on a slice of the normal data to keep fitting cheap.
# NOTE(review): the slice is the *first* n_data rows, not a random sample, and
# the features are unscaled here — both may matter for an RBF kernel.
n_data = 5000
model = OneClassSVM(kernel="rbf", gamma="auto", nu=0.1).fit(df_normal[:n_data])

# Score the full normal and anomaly frames (1 = normal, -1 = anomaly).
y_pred_normal, y_pred_anomaly = (
    model.predict(frame) for frame in (df_normal, df_anomaly)
)

# Normal rows first, anomaly rows second — in both label lists.
y_pred = [*y_pred_normal, *y_pred_anomaly]
y_true = [
    label
    for label, preds in ((1, y_pred_normal), (-1, y_pred_anomaly))
    for _ in preds
]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.29      0.11      0.16    127391
           1       0.71      0.89      0.79    316443

    accuracy                           0.67    443834
   macro avg       0.50      0.50      0.48    443834
weighted avg       0.59      0.67      0.61    443834



In [25]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the normal data only and
# the same fitted scaler is then applied to both frames, so the anomaly data
# never influences the scaling statistics.
scaler = StandardScaler().fit(df_normal)
df_normal_scaled, df_anomaly_scaled = (
    scaler.transform(frame) for frame in (df_normal, df_anomaly)
)

In [26]:
# OneClassSVM on scaled data
from sklearn.svm import OneClassSVM

# Same one-class SVM configuration as before, but on the standardised
# features; only the first n_data scaled normal rows are used for fitting.
n_data = 3000

model = OneClassSVM(kernel="rbf", gamma="auto", nu=0.1)
model.fit(df_normal_scaled[:n_data])

# Score every scaled row of both sets (1 = normal, -1 = anomaly).
y_pred_normal, y_pred_anomaly = (
    model.predict(matrix) for matrix in (df_normal_scaled, df_anomaly_scaled)
)

# Concatenate predictions (normal first) and derive the aligned ground truth.
y_pred = list(y_pred_normal)
y_pred.extend(y_pred_anomaly)
y_true = [1 if i < len(y_pred_normal) else -1 for i in range(len(y_pred))]

# Per-class precision / recall / F1.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.29      0.12      0.17    127391
           1       0.71      0.88      0.79    316443

    accuracy                           0.66    443834
   macro avg       0.50      0.50      0.48    443834
weighted avg       0.59      0.66      0.61    443834



In [27]:
# Tuned OneClassSVM
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report

# One-class SVM with a smaller nu and an explicit gamma, fitted on the first
# n_data rows of the *unscaled* normal frame.
# NOTE(review): unlike the other cells, evaluation here is restricted to the
# first n_data rows of each frame, and the normal rows being scored are exactly
# the rows the model was fitted on — the class-1 numbers are in-sample.
n_data = 30000

model = OneClassSVM(kernel="rbf", gamma=0.001, nu=0.05).fit(df_normal[:n_data])

# Predict on the leading n_data rows of each frame (1 = normal, -1 = anomaly).
y_pred_normal, y_pred_anomaly = (
    model.predict(frame[:n_data]) for frame in (df_normal, df_anomaly)
)

# Normal predictions first, then anomaly predictions; labels follow suit.
y_pred = [*y_pred_normal, *y_pred_anomaly]
y_true = [
    label
    for label, preds in ((1, y_pred_normal), (-1, y_pred_anomaly))
    for _ in preds
]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.48      0.05      0.08     30000
           1       0.50      0.95      0.65     30000

    accuracy                           0.50     60000
   macro avg       0.49      0.50      0.37     60000
weighted avg       0.49      0.50      0.37     60000



#### Gaussian Mixture Model (GMM)

In [33]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report
import numpy as np

# Two-component, full-covariance Gaussian mixture fitted on the normal data.
model = GaussianMixture(
    n_components=2, covariance_type="full", random_state=42
).fit(df_normal)

# Per-row log-likelihood under the fitted mixture, for both frames.
normal_scores, anomaly_scores = (
    model.score_samples(frame) for frame in (df_normal, df_anomaly)
)

# Decision threshold: the given percentile of the normal-data scores, so that
# roughly this share of the normal rows falls below it.
threshold_percentile = 5  # Adjust as needed
threshold = np.percentile(normal_scores, threshold_percentile)

# Flag rows whose log-likelihood is below the threshold (1 = flagged, 0 = not).
y_pred_normal = np.less(normal_scores, threshold).astype(int)
y_pred_anomaly = np.less(anomaly_scores, threshold).astype(int)

# Stack the flags (normal rows first) and map them to the -1 / 1 convention
# used elsewhere in the notebook: -1 = anomaly, 1 = normal.
flagged = np.concatenate([y_pred_normal, y_pred_anomaly])
y_pred = np.where(flagged == 1, -1, 1)

# Ground truth as floats: 1.0 for each df_normal row, -1.0 for each df_anomaly row.
y_true = np.repeat([1.0, -1.0], [len(df_normal), len(df_anomaly)])

# Sanity check: label values and their counts on both sides.
print("True labels:", np.unique(y_true, return_counts=True))
print("Predicted labels:", np.unique(y_pred, return_counts=True))

# Per-class precision / recall / F1; undefined ratios are reported as 0.
print(classification_report(y_true, y_pred, zero_division=0))


True labels: (array([-1.,  1.]), array([127391, 316443]))
Predicted labels: (array([-1,  1]), array([ 22635, 421199]))
              precision    recall  f1-score   support

        -1.0       0.30      0.05      0.09    127391
         1.0       0.71      0.95      0.82    316443

    accuracy                           0.69    443834
   macro avg       0.51      0.50      0.45    443834
weighted avg       0.60      0.69      0.61    443834



In [32]:
from scipy.spatial.distance import mahalanobis
import numpy as np
from sklearn.metrics import classification_report

def _mahalanobis_to_center(points, center, inv_cov):
    """Return the Mahalanobis distance of each row of ``points`` to ``center``.

    Args:
        points: 2-D float array, one observation per row.
        center: 1-D float array with one entry per column of ``points``.
        inv_cov: inverse covariance matrix (columns x columns).

    Returns:
        1-D float array holding one distance per row of ``points``.
    """
    delta = points - center
    # Row-wise quadratic form delta_i @ inv_cov @ delta_i for all rows at once,
    # replacing one Python-level distance call per row.
    squared = np.sum((delta @ inv_cov) * delta, axis=1)
    # Rounding can push a (near-)zero quadratic form slightly below zero;
    # clamp so the square root never produces NaN for such rows.
    return np.sqrt(np.maximum(squared, 0.0))


# Plain float matrices (rows = samples) for the vectorised computation.
normal_values = df_normal.to_numpy(dtype=float)
anomaly_values = df_anomaly.to_numpy(dtype=float)

# Mean and covariance are estimated from the normal data only.
cov_matrix = np.cov(normal_values, rowvar=False)
inv_cov = np.linalg.inv(cov_matrix)
mean_normal = df_normal.mean(axis=0)

# Mahalanobis distance of every normal / anomalous row to the normal mean.
center = mean_normal.to_numpy(dtype=float)
normal_distances = _mahalanobis_to_center(normal_values, center, inv_cov)
anomaly_distances = _mahalanobis_to_center(anomaly_values, center, inv_cov)

# Threshold at the 95th percentile of the normal-data distances.
threshold = np.percentile(normal_distances, 95)

# Rows farther from the mean than the threshold are labelled -1 (anomaly),
# all others 1 (normal).
y_pred_normal = np.where(normal_distances > threshold, -1, 1)
y_pred_anomaly = np.where(anomaly_distances > threshold, -1, 1)

# Combine predictions (normal rows first) with the matching true labels:
# 1 for every df_normal row, -1 for every df_anomaly row.
y_pred = np.concatenate([y_pred_normal, y_pred_anomaly])
y_true = np.concatenate(
    [np.ones(len(y_pred_normal)), -1 * np.ones(len(y_pred_anomaly))])

# Per-class precision / recall / F1; undefined ratios are reported as 0.
print(classification_report(y_true, y_pred, zero_division=0))


              precision    recall  f1-score   support

        -1.0       0.30      0.05      0.09    127391
         1.0       0.71      0.95      0.82    316443

    accuracy                           0.69    443834
   macro avg       0.51      0.50      0.45    443834
weighted avg       0.60      0.69      0.61    443834

