In [1]:
import numpy as np

from sklearn.datasets import fetch_kddcup99
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from src.cade_outliers import CADEOutliers

In [8]:
# Download the 10% sample of the KDD Cup 99 dataset (cyber-attack records), 'SA' subset.
# With shuffle=True and no seed, the row order (and the random selection of abnormal
# samples done for the 'SA' subset) differs between runs, so the score printed at the
# end could not be reproduced. Pin random_state to make the run repeatable.
X, y = fetch_kddcup99(
    subset='SA',
    return_X_y=True,
    percent10=True,
    shuffle=True,
    random_state=42,
)

In [9]:
# Binarize the target: 0 for b'normal.' records, 1 for every other label (anomaly)
y = (y != b'normal.').astype('int')

In [10]:
# Keep column 0 and columns 4..40 (columns 1-3 hold objects), then cast to float
numeric_cols = np.concatenate(([0], np.arange(4, 41)))
X_cut = X[:, numeric_cols].astype('float')

In [11]:
# Create the CADE outliers object: artificial anomalies follow a uniform distribution
# and their number is 50% of the size of the given dataset.
# The random forest is seeded so the underlying classifier is the same on every run.
# NOTE(review): CADEOutliers may sample the artificial anomalies from its own random
# source; if so, that needs seeding too for a fully reproducible score - confirm.
cade = CADEOutliers(
    classifier=RandomForestClassifier(max_depth=2, random_state=42),
    A_dist='uniform',
    A_size=0.5
)

In [12]:
# Score the rows of X_cut with the CADE model.
# NOTE(review): presumably higher values mean "more normal" (density-like), since the
# evaluation cell negates them before computing ROC AUC - confirm against CADEOutliers.
ranking_by_dens = cade.outliers_ranking(X_cut)

In [13]:
# y marks anomalies with 1, so the ranking is negated to make larger scores
# correspond to the positive (anomaly) class expected by roc_auc_score.
auc_cade = roc_auc_score(y, -ranking_by_dens)
print('ROC AUC of ranking anomalies by CADE: {}'.format(auc_cade))

ROC AUC of ranking anomalies by CADE: 0.7786877779701831
