In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import sklearn
import xgboost

import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import fetch_kddcup99

dataset_name = "http"
dataset = fetch_kddcup99(subset=dataset_name)

In [3]:
(X, y) = (dataset.data, dataset.target)

print(f"Features: {dataset.feature_names}")
print(f"Targets: {dataset.target_names}")

print(dataset.DESCR)

Features: ['duration', 'src_bytes', 'dst_bytes']
Targets: ['labels']
.. _kddcup99_dataset:

Kddcup 99 dataset
-----------------

The KDD Cup '99 dataset was created by processing the tcpdump portions
of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset,
created by MIT Lincoln Lab [2]_. The artificial data (described on the `dataset's
homepage <https://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html>`_) was
generated using a closed network and hand-injected attacks to produce a
large number of different types of attack with normal activity in the
background. As the initial goal was to produce a large training set for
supervised learning algorithms, there is a large proportion (80.1%) of
abnormal data which is unrealistic in real world, and inappropriate for
unsupervised anomaly detection which aims at detecting 'abnormal' data, i.e.:

* qualitatively different from normal data
* in large minority among the observations.

We thus transform the KDD Data set into two diff

In [4]:
np.unique(y)

array([b'back.', b'ipsweep.', b'normal.', b'phf.', b'satan.'],
      dtype=object)

In [5]:
X.shape

(58725, 3)

In [6]:
# preprocess y by one-hot encoding it
y_regular = y
y = pd.get_dummies(y).to_numpy()

y

array([[0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0]], dtype=uint8)

In [7]:
y_regular

array([b'normal.', b'normal.', b'normal.', ..., b'normal.', b'normal.',
       b'normal.'], dtype=object)

In [8]:
X

array([[-2.3025850929940455, 5.199049364889368, 8.60338923611281],
       [-2.3025850929940455, 5.476881874464279, 6.1864143640514095],
       [-2.3025850929940455, 5.460010955546024, 7.198258368620619],
       ...,
       [-2.3025850929940455, 5.313698468586339, 7.090160165637395],
       [-2.3025850929940455, 5.6736668507515775, 7.090160165637395],
       [-2.3025850929940455, 5.3895282466014205, 7.118097238459137]],
      dtype=object)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

train_ratio = 0.80
(X_train, X_ideal, y_train, y_ideal) = train_test_split(X, y, test_size=(1.0 - train_ratio))
(X_val, X_test, y_val, y_test) = train_test_split(X_ideal, y_ideal, test_size=0.5)

standard_preprocessor = Pipeline([
    ("standardizer", StandardScaler())
])

positive_preprocessor = Pipeline([
    ("normalizer", MinMaxScaler())
])

In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn_uniform = Pipeline([
    ("preprocessor", standard_preprocessor),
    ("knn", KNeighborsClassifier(n_neighbors=8, weights="uniform"))
])

knn_dist = Pipeline([
    ("preprocessor", standard_preprocessor),
    ("knn", KNeighborsClassifier(n_neighbors=8, weights="distance"))
])

knn_uniform.fit(X_train, y_train)
knn_dist.fit(X_train, y_train)

In [11]:
y_train_ordinal = np.apply_along_axis(np.argmax, 1, y_train)
print(len(y_train_ordinal))
y_train_ordinal

46980


array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [12]:
np.unique(y_train_ordinal)

array([0, 1, 2, 3, 4], dtype=int64)

In [13]:
from sklearn.naive_bayes import CategoricalNB

naive_bayes_clf = Pipeline([
    ("preprocessor", positive_preprocessor),
    ("naive bayes classifier", CategoricalNB(alpha=0.0001, fit_prior=True))
])

naive_bayes_clf.fit(X_train, y_train_ordinal)

In [14]:
from xgboost import XGBClassifier

xgb_clf = Pipeline([
    ("preprocessor", standard_preprocessor),
    ("xgboost classifier", XGBClassifier(n_estimators=256, learning_rate=0.02))
])

xgb_clf.fit(X_train, y_train)

In [21]:
from tensorflow.keras.metrics import categorical_crossentropy, categorical_accuracy

y_val_tensor = tf.constant(y_val)

print(f"XGB error: {categorical_crossentropy(y_val_tensor, tf.constant(xgb_clf.predict(X_val), dtype=tf.float32))}")
print(f"KNN Uniform error: {categorical_crossentropy(y_val_tensor, tf.constant(knn_uniform.predict(X_val), dtype=tf.float32))}")
print(f"KNN Distance error: {categorical_crossentropy(y_val_tensor, tf.constant(knn_dist.predict(X_val), dtype=tf.float32))}")

y_val_ordinal_tensor = tf.constant(np.apply_along_axis(np.argmax, 1, y_val))
print(f"Naive Bayes error: {categorical_crossentropy(y_val_ordinal_tensor, tf.constant(naive_bayes_clf.predict(X_val), dtype=tf.float32))}")

XGB error: [1.192093e-07 1.192093e-07 1.192093e-07 ... 1.192093e-07 1.192093e-07
 1.192093e-07]
KNN Uniform error: [1.192093e-07 1.192093e-07 1.192093e-07 ... 1.192093e-07 1.192093e-07
 1.192093e-07]
KNN Distance error: [1.192093e-07 1.192093e-07 1.192093e-07 ... 1.192093e-07 1.192093e-07
 1.192093e-07]
Naive Bayes error: 97673.734375


In [None]:
# Damn! Naive Bayes was ass!