### Generate Random Dataset

In [1]:
import pandas as pd
from sklearn.datasets import make_classification

In [2]:
# Reproducible synthetic binary dataset: 1000 rows and 10 numeric features,
# of which 6 are informative and 2 are redundant. Only one class weight is
# supplied (1 - 0.1), so the other class ends up as the small minority; that
# minority label (1) is what the detectors are scored against further down.
X, y = make_classification(
    n_samples=1000, n_features=10,
    n_informative=6, n_redundant=2,
    n_clusters_per_class=1,
    weights=[1 - 0.1],
    flip_y=0,
    random_state=42,
)

# Replace the raw array with a DataFrame whose columns are feature_0, feature_1, ...
X = pd.DataFrame(
    X,
    columns=["feature_{}".format(col) for col in range(X.shape[1])],
)
print(X.head(10))

   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.342106   1.881539   2.279653  -0.848153   1.917259   1.608363   
1   0.057864  -0.668963  -0.557796  -1.759497  -0.265593  -2.418103   
2  -0.715513  -0.217041  -0.658529  -0.894018   1.121499  -2.233489   
3  -0.143660   1.784894  -0.114805  -0.748890  -0.165081  -2.181948   
4   0.165240  -0.072079   0.988434  -1.454180   0.012522  -2.480565   
5  -0.432295  -0.179989  -3.422354  -1.282937   1.272715  -1.663167   
6  -0.929136  -0.587044  -1.387926  -1.147817  -1.134020  -1.517007   
7   1.083000   1.112442   1.189773  -1.494011  -1.076381  -1.109834   
8   2.337828   0.337494  -1.947356  -1.412170  -0.860757  -0.955628   
9   1.416412   1.016696   0.895345   0.449392   0.773240  -1.544849   

   feature_6  feature_7  feature_8  feature_9  
0  -2.430795   2.275756   0.269796   0.041784  
1  -1.352909   1.791205   2.842525   1.755104  
2   0.320325   0.203804   1.877039   1.512151  
3  -2.105435   0.091462   

### Comparing Isolation Forest Models

In [3]:
import os
import sys

from sklearn.ensemble import IsolationForest as skIsolationForest

# The custom implementation is imported from ../code, resolved against the
# current working directory. Only register the directory when it is not already
# on sys.path so that re-running this cell does not pile up duplicate entries.
_code_dir = os.path.abspath("../code")
if _code_dir not in sys.path:
    sys.path.append(_code_dir)

from IsolationForest import IsolationForest as myIsolationForest

In [4]:
# Instantiate both detectors (sklearn's first, then the custom one) from one
# shared set of hyperparameters so the comparison cannot drift out of sync.
sk_model, my_model = (
    detector_cls(contamination=0.1, random_state=42)
    for detector_cls in (skIsolationForest, myIsolationForest)
)

In [5]:
# Evaluate Function
import time
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc

def evaluate_model(model, X, y_true, model_name="Model"):
    """Fit ``model`` on ``X`` and print its fit time and detection metrics.

    Parameters
    ----------
    model : object
        Detector exposing ``fit(X)``, ``decision_function(X)`` and
        ``predict(X)``. ``predict`` is expected to mark anomalies with ``-1``.
    X : array-like or DataFrame
        Feature matrix; passed unchanged to all three model methods.
    y_true : array-like
        Ground-truth labels where ``1`` denotes the anomaly class.
    model_name : str, default "Model"
        Heading printed above the report.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # interval cannot be distorted by system clock adjustments. Only the
    # fit() call is timed; scoring and prediction are excluded.
    fit_started = time.perf_counter()
    model.fit(X)
    fit_seconds = time.perf_counter() - fit_started

    # Negate the decision function so that larger values mean "more anomalous",
    # which is the orientation the ranking metrics need for positive label 1.
    # NOTE(review): assumes the custom model follows sklearn's sign convention
    # (lower decision_function = more abnormal) -- confirm.
    scores = -model.decision_function(X)

    # Translate predict()'s -1 (anomaly) / other (normal) into 1 / 0 so the
    # predictions line up with the encoding of y_true.
    y_pred = np.where(model.predict(X) == -1, 1, 0)

    print(f"\n--- {model_name} ---")
    print(f"Execution Time: {fit_seconds:.4f} seconds")
    print(classification_report(y_true, y_pred, digits=4))

    roc = roc_auc_score(y_true, scores)
    # PR-AUC is the area under the precision-recall curve, integrated by auc().
    precision, recall, _ = precision_recall_curve(y_true, scores)
    pr_auc = auc(recall, precision)
    print(f"ROC-AUC: {roc:.4f}")
    print(f"PR-AUC: {pr_auc:.4f}")

In [6]:
# Run the same evaluation on both detectors against the same data and labels:
# the scikit-learn reference first, then the custom implementation.
evaluate_model(sk_model, X, y, model_name="Sklearn Isolation Forest")
evaluate_model(my_model, X, y, model_name="Custom Isolation Forest")


--- Sklearn Isolation Forest ---
Execution Time: 0.1262 seconds
              precision    recall  f1-score   support

           0     0.9511    0.9501    0.9506       901
           1     0.5500    0.5556    0.5528        99

    accuracy                         0.9110      1000
   macro avg     0.7506    0.7528    0.7517      1000
weighted avg     0.9114    0.9110    0.9112      1000

ROC-AUC: 0.8519
PR-AUC: 0.5435

--- Custom Isolation Forest ---
Execution Time: 0.3924 seconds
              precision    recall  f1-score   support

           0     0.9489    0.9478    0.9484       901
           1     0.5300    0.5354    0.5327        99

    accuracy                         0.9070      1000
   macro avg     0.7394    0.7416    0.7405      1000
weighted avg     0.9074    0.9070    0.9072      1000

ROC-AUC: 0.8535
PR-AUC: 0.5381
