In [4]:
import scipy.io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pprint

# Read the MATLAB file and pick the feature matrix, which may be stored
# under either the 'X' key or the 'data' key.
# NOTE(review): only the feature matrix is extracted in this cell; no label
# vector is read from the file here.
data = scipy.io.loadmat('shuttle.mat')
matrix_key = next((key for key in ('X', 'data') if key in data), None)
if matrix_key is None:
    raise ValueError('Could not find data matrix in .mat file')
X = data[matrix_key]

# Quick look at the raw matrix.
pprint.pprint(X)

# Hold out 40% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test = train_test_split(X, test_size=0.4, random_state=1)

# Standardise with statistics learned from the training rows only, then
# apply that same transformation to both portions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

print(f'Train shape: {X_train_norm.shape}, Test shape: {X_test_norm.shape}')

array([[ 50,  21,  77, ...,  27,  48,  22],
       [ 53,   0,  82, ...,  29,  30,   2],
       [ 37,   0,  76, ...,  40,  48,   8],
       ...,
       [ 49,   0,  87, ...,  38,  41,   2],
       [ 80,   0,  84, ...,   4, 120, 116],
       [ 37,   0, 103, ...,  66,  85,  20]], shape=(49097, 9), dtype=int16)
Train shape: (29458, 9), Test shape: (19639, 9)


In [None]:
import functools

import torch
from torch.utils import data

# Optional tweak, independent of the pinned-memory workaround below:
# switch off TF32 for CUDA matmuls.
torch.backends.cuda.matmul.allow_tf32 = False

# Monkeypatch DataLoader so that every loader is constructed with
# pin_memory=False, overriding whatever keyword value the caller supplies.
# NOTE(review): after this cell `data` refers to torch.utils.data; any
# variable of the same name bound earlier in the kernel is rebound.
#
# The marker attribute makes the cell safe to re-run. `old_init` is looked up
# in the notebook's global namespace at call time, so patching a second time
# would rebind it to the wrapper itself and every DataLoader(...) call would
# recurse until RecursionError.
if not getattr(data.DataLoader.__init__, "_pin_memory_disabled", False):
    old_init = data.DataLoader.__init__

    # functools.wraps keeps the original __init__'s name/docstring on the wrapper.
    @functools.wraps(old_init)
    def new_init(self, *args, **kwargs):
        # Force the keyword, then defer to the untouched constructor.
        kwargs["pin_memory"] = False
        old_init(self, *args, **kwargs)

    new_init._pin_memory_disabled = True
    data.DataLoader.__init__ = new_init



KeyboardInterrupt: 

In [11]:
import numpy as np
# Repeated hold-out evaluation of three detectors (IForest, LODA, DIF).
# `X` and `y` are not defined in this cell; they must already exist in the
# kernel, as must the estimator classes and metric functions used below.
n_splits = 10
test_size = 0.4        # fraction of rows held out for evaluation in every split
contamination = 0.02   # value passed as `contamination` to every detector

ba_scores = {'IForest': [], 'LODA': [], 'DIF': []}
roc_auc_scores = {'IForest': [], 'LODA': [], 'DIF': []}


def _record_scores(model, detector, X_fit, X_eval, y_eval):
    """Fit one detector and append its held-out metrics to the result dicts.

    Parameters
    ----------
    model : str
        Key under which results are stored in ``ba_scores`` and
        ``roc_auc_scores``.
    detector
        Unfitted estimator exposing ``fit``, ``decision_function`` and
        ``predict``.
    X_fit, X_eval
        Feature matrices used for fitting and for evaluation respectively.
    y_eval
        Ground-truth labels aligned with ``X_eval``.
    """
    detector.fit(X_fit)
    scores = detector.decision_function(X_eval)
    y_pred = detector.predict(X_eval)
    ba_scores[model].append(balanced_accuracy_score(y_eval, y_pred))
    roc_auc_scores[model].append(roc_auc_score(y_eval, scores))


for split in range(n_splits):
    print(f"Processing split {split+1}/{n_splits}...")
    # The split index doubles as the seed, so each split is a different but
    # repeatable partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=split
    )
    # Scaling statistics come from the training rows of this split only.
    scaler = StandardScaler()
    X_train_norm = scaler.fit_transform(X_train)
    X_test_norm = scaler.transform(X_test)

    # Each detector is constructed immediately before it is fitted, one after
    # another, so construction and fitting stay interleaved per model.
    clf = IForest(contamination=contamination, random_state=split)
    _record_scores('IForest', clf, X_train_norm, X_test_norm, y_test)

    # LODA is created without a random_state argument.
    loda = LODA(contamination=contamination)
    _record_scores('LODA', loda, X_train_norm, X_test_norm, y_test)

    dif = DIF(contamination=contamination, random_state=split)
    _record_scores('DIF', dif, X_train_norm, X_test_norm, y_test)

# Summary: per-model mean over however many splits completed.
for model in ['IForest', 'LODA', 'DIF']:
    print(f"{model}: Mean BA = {np.mean(ba_scores[model]):.4f}, Mean ROC AUC = {np.mean(roc_auc_scores[model]):.4f}")

Processing split 1/10...




KeyboardInterrupt: 