In [20]:
import pandas as pd

In [21]:
df_normal = pd.read_parquet("../data/processed/df_normal.parquet")
df_anomaly = pd.read_parquet("../data/processed/df_anomaly.parquet")

print(f"Normal data shape: {df_normal.shape}")
print(f"Anomaly data shape: {df_anomaly.shape}")

Normal data shape: (316443, 13)
Anomaly data shape: (127391, 13)


#### Isolation Forest

In [22]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

model = IsolationForest(contamination=0.05, random_state=42)
model.fit(df_normal)

y_pred_normal = model.predict(df_normal)
y_pred_anomaly = model.predict(df_anomaly)

y_true = [1] * len(y_pred_normal) + [-1] * len(y_pred_anomaly)
y_pred = list(y_pred_normal) + list(y_pred_anomaly)

# Compute evaluation metrics
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.29      0.05      0.09    127391
           1       0.71      0.95      0.81    316443

    accuracy                           0.69    443834
   macro avg       0.50      0.50      0.45    443834
weighted avg       0.59      0.69      0.61    443834



In [23]:
# Tuning IsolationForest for better performance
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

# Tune Isolation Forest
model = IsolationForest(
    n_estimators=200,        # Increase trees for better anomaly detection
    max_samples=256,         # Control the number of samples per tree
    contamination=0.1,       # Adjust based on expected anomaly proportion
    random_state=42
)
model.fit(df_normal)

y_pred_normal = model.predict(df_normal)
y_pred_anomaly = model.predict(df_anomaly)

y_true = [1] * len(y_pred_normal) + [-1] * len(y_pred_anomaly)
y_pred = list(y_pred_normal) + list(y_pred_anomaly)

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.29      0.10      0.15    127391
           1       0.71      0.90      0.80    316443

    accuracy                           0.67    443834
   macro avg       0.50      0.50      0.47    443834
weighted avg       0.59      0.67      0.61    443834



#### SVM: OnceClassSVM

In [None]:
from sklearn.svm import OneClassSVM

n_data =  5000
model = OneClassSVM(nu=0.1, kernel="rbf", gamma="auto")
model.fit(df_normal[:n_data])

y_pred_normal = model.predict(df_normal)
y_pred_anomaly = model.predict(df_anomaly)

y_true = [1] * len(y_pred_normal) + [-1] * len(y_pred_anomaly)
y_pred = list(y_pred_normal) + list(y_pred_anomaly)

print(classification_report(y_true, y_pred))

In [8]:
from sklearn.preprocessing import StandardScaler

# Apply feature scaling
scaler = StandardScaler()
df_normal_scaled = scaler.fit_transform(df_normal)
df_anomaly_scaled = scaler.transform(df_anomaly)

In [9]:
# OneClassSVM on scaled data
from sklearn.svm import OneClassSVM

n_data =  3000

model = OneClassSVM(nu=0.1, kernel="rbf", gamma="auto")
model.fit(df_normal_scaled[:n_data])

y_pred_normal = model.predict(df_normal_scaled)
y_pred_anomaly = model.predict(df_anomaly_scaled)

y_true = [1] * len(y_pred_normal) + [-1] * len(y_pred_anomaly)
y_pred = list(y_pred_normal) + list(y_pred_anomaly)

# Evaluate model
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.29      0.15      0.20    127391
           1       0.71      0.85      0.78    316443

    accuracy                           0.65    443834
   macro avg       0.50      0.50      0.49    443834
weighted avg       0.59      0.65      0.61    443834



In [17]:
# Tuned OneClassSVM
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report

n_data =  30000

model = OneClassSVM(nu=0.05, kernel="rbf", gamma=0.001)
model.fit(df_normal[:n_data])


y_pred_normal = model.predict(df_normal[:n_data])
y_pred_anomaly = model.predict(df_anomaly[:n_data])

y_true = [1] * len(y_pred_normal) + [-1] * len(y_pred_anomaly)
y_pred = list(y_pred_normal) + list(y_pred_anomaly)

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.48      0.05      0.08     30000
           1       0.50      0.95      0.65     30000

    accuracy                           0.50     60000
   macro avg       0.49      0.50      0.37     60000
weighted avg       0.49      0.50      0.37     60000

