In [38]:
from pyod.utils.data import generate_data, generate_data_categorical
from pyod.models.hbos import HBOS
from pyod.utils.utility import invert_order
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

**Generate some outlier data**

In [5]:
# Synthetic numeric part of the data set: 1000 rows with 3 features, training split only.
# random_state pins the random draw so a fresh Restart & Run All reproduces the same data.
X_num, y_num = generate_data(
    n_train=1000,
    n_test=0,
    n_features=3,
    train_only=True,
    random_state=42,
)

In [23]:
# Two synthetic categorical data sets with 2 features each; they differ only in how many
# categories the inlier / outlier values are drawn from (5 + 5 versus 3 + 3).
# Each call gets its own fixed random_state so the notebook is reproducible while the two
# data sets stay independent draws.
# NOTE(review): confirm the return order of generate_data_categorical. If it is
# (X_train, X_test, y_train, y_test), the names y_cat1 / y_cat2 actually hold the test
# features rather than training labels. They are not used further in this notebook.
X_cat1, y_cat1, _, _ = generate_data_categorical(
    n_train=1000,
    n_test=5,
    n_features=2,
    n_category_in=5,
    n_category_out=5,
    random_state=1,
)
X_cat2, y_cat2, _, _ = generate_data_categorical(
    n_train=1000,
    n_test=5,
    n_features=2,
    n_category_in=3,
    n_category_out=3,
    random_state=2,
)

In [24]:
# Sanity check: number of distinct category values in each of the two X_cat1 columns.
for feature_idx in (0, 1):
    distinct_values = set(X_cat1[:, feature_idx])
    print(len(distinct_values))


10
10


In [34]:
# Put the four categorical columns side by side and map every category value to an
# ordinal code so that the histogram-based detector can bin them numerically.
X_cat = np.hstack([X_cat1, X_cat2])
ord_encoder = OrdinalEncoder()
X_cat_encoded = ord_encoder.fit_transform(X_cat)

# Final feature matrix: numeric columns first, encoded categorical columns after them.
X_full = np.hstack([X_num, X_cat_encoded])

**Initialize and fit one HBOS model per feature**

In [37]:
# Fit one single-feature HBOS detector per column of X_full.
# Numeric columns let HBOS pick the bin count itself ("auto"); categorical columns get
# exactly as many bins as they have distinct encoded values.
number_of_columns = X_full.shape[1]

# X_full was built as [numeric columns | encoded categorical columns], so every column
# after the numeric block is categorical. Deriving the indices from the shapes keeps this
# cell correct if the number of numeric or categorical features changes.
categorical_indices = list(range(X_num.shape[1], number_of_columns))

classifiers = []
for col_index in range(number_of_columns):
    # HBOS expects a 2-D input, hence the reshape to a single-column matrix.
    column = X_full[:, col_index].reshape(-1, 1)
    if col_index in categorical_indices:
        n_bins = len(np.unique(column))
    else:
        n_bins = "auto"
    clf = HBOS(n_bins=n_bins)
    clf.fit(column)
    classifiers.append(clf)
classifiers

[HBOS(alpha=0.1, contamination=0.1, n_bins='auto', tol=0.5),
 HBOS(alpha=0.1, contamination=0.1, n_bins='auto', tol=0.5),
 HBOS(alpha=0.1, contamination=0.1, n_bins='auto', tol=0.5),
 HBOS(alpha=0.1, contamination=0.1, n_bins=10, tol=0.5),
 HBOS(alpha=0.1, contamination=0.1, n_bins=10, tol=0.5),
 HBOS(alpha=0.1, contamination=0.1, n_bins=6, tol=0.5),
 HBOS(alpha=0.1, contamination=0.1, n_bins=6, tol=0.5)]

**Get combined outlier scores**

In [40]:
# Combine the per-feature detectors into one score per sample by summing their
# training scores column-wise.
n_samples, n_features = X_full.shape

# One column of scores per fitted detector -> array of shape (n_samples, n_features).
outlier_scores = np.column_stack([clf.decision_scores_ for clf in classifiers])

# pyod's decision_scores_ already follow the "higher = more abnormal" convention, so the
# sum must NOT be passed through invert_order: doing so would flip the ranking and make
# the most normal samples receive the highest scores.
final_outlier_scores = np.sum(outlier_scores, axis=1)

In [43]:
# Sanity check: there should be exactly one combined score per sample.
np.shape(final_outlier_scores)

(1000,)

In [47]:
# Turn the continuous scores into binary labels: 1 for samples whose score is strictly
# greater than the score found at rank int(contamination * n_samples) of the descending
# ordering, 0 for everything else.
# NOTE(review): this flags the LARGEST scores, so it presumes that a larger
# final_outlier_scores value means "more anomalous". Confirm against how the scores
# were combined.
contamination = 0.1
cutoff_rank = int(contamination * n_samples)
scores_descending = np.sort(final_outlier_scores)[::-1]
threshold = scores_descending[cutoff_rank]
exceeds_threshold = final_outlier_scores > threshold
prediction = exceeds_threshold.astype('int').ravel()