[Reference](https://towardsdatascience.com/perform-outlier-detection-more-effectively-using-subsets-of-features-d984bde99981)

In [1]:
import pandas as pd
import numpy as np

def get_random_subspaces(features_arr, num_base_detectors,
                         num_feats_per_detector):
    """Generate feature subspaces that spread feature usage evenly.

    Each subspace is seeded with a feature that has been used in the
    fewest subspaces so far, then grown by repeatedly drawing (at random,
    via the global ``np.random`` state) one of the features whose
    accumulated pair counts with the current members are lowest.

    Parameters
    ----------
    features_arr : sequence
        The available features. Only its length is used here; the
        returned subspaces refer to features by position.
    num_base_detectors : int
        Number of subspaces to generate.
    num_feats_per_detector : int
        Number of distinct features in each subspace. Must not exceed
        ``len(features_arr)``.

    Returns
    -------
    list of list
        One list of feature indices (positions in ``features_arr``) per
        subspace, each sorted in ascending order.

    Raises
    ------
    ValueError
        If ``num_feats_per_detector`` is larger than the number of
        available features.

    Notes
    -----
    A draw may land on a feature that is already in the subspace. Such a
    draw is discarded by the de-duplication step but still increments the
    pair counts, which makes that feature less likely to be drawn again.
    """
    num_feats = len(features_arr)

    # A subspace can never hold more distinct features than exist; without
    # this check the inner `while` below would never terminate.
    if num_feats_per_detector > num_feats:
        raise ValueError(
            f"num_feats_per_detector ({num_feats_per_detector}) cannot "
            f"exceed the number of features ({num_feats})"
        )

    feat_sets_arr = []
    # How many subspaces each feature has been placed in so far
    ft_used_counts = np.zeros(num_feats)
    # Accumulated pair counts between features (see Notes in the docstring)
    ft_pair_mtx = np.zeros((num_feats, num_feats))

    # Each loop generates one subspace, which is one set of features
    for _ in range(num_base_detectors):
        # Seed the subspace with a random feature among the least used
        least_used = np.where(ft_used_counts == ft_used_counts.min())[0]
        feat_set = [np.random.choice(least_used)]

        # Grow the subspace until it has the requested number of features
        while len(feat_set) < num_feats_per_detector:
            # For every feature, total pair count with the current members
            sums = ft_pair_mtx[:, feat_set].sum(axis=1)
            candidates = np.where(sums == sums.min())[0]
            new_feat = np.random.choice(candidates)

            # De-duplicate; sorting makes the returned order explicit
            # instead of relying on set iteration order.
            feat_set.append(new_feat)
            feat_set = sorted(set(feat_set))

            # Update ft_pair_mtx symmetrically. feat_set holds unique
            # indices, so the fancy-indexed += touches each entry once per
            # statement (the diagonal entry for new_feat gets +2 overall).
            ft_pair_mtx[feat_set, new_feat] += 1
            ft_pair_mtx[new_feat, feat_set] += 1

        # Update ft_used_counts for every member of the finished subspace
        ft_used_counts[feat_set] += 1

        feat_sets_arr.append(feat_set)

    return feat_sets_arr

# Demo: draw 4 subspaces of 5 features each from 8 named features and
# print every subspace using the feature names instead of the indices.
np.random.seed(0)  # fixed seed so the printed subspaces are repeatable
features_arr = list('ABCDEFGH')
num_base_detectors, num_feats_per_detector = 4, 5

feat_sets_arr = get_random_subspaces(
    features_arr=features_arr,
    num_base_detectors=num_base_detectors,
    num_feats_per_detector=num_feats_per_detector,
)
for feat_set in feat_sets_arr:
    # Map each index in the subspace back to its feature name
    print([features_arr[idx] for idx in feat_set])

['A', 'E', 'F', 'G', 'H']
['B', 'C', 'D', 'F', 'H']
['A', 'B', 'C', 'D', 'E']
['B', 'D', 'E', 'F', 'G']


In [3]:
pip install pyod

Collecting pyod
  Downloading pyod-2.0.2.tar.gz (165 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 165.8/165.8 kB 10.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... done
  Created wheel for pyod: filename=pyod-2.0.2-py3-none-any.whl size=198469 sha256=1b691cc7f530b47645526451ad2631200048719196760d567144a31db843c321
  Stored in directory: /root/.cache/pip/wheels/77/c2/20/34d1f15b41b701ba69f42a32304825810d680754d509f91391
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-2.0.2


In [4]:
import pandas as pd
import numpy as np
from pyod.models.sod import SOD

# Synthetic data: 100 rows of 35 standard-normal features
np.random.seed(0)
d = np.random.randn(100, 35)
d = pd.DataFrame(d)

# Ensure features 8 and 9 are correlated, while all others are irrelevant
d[9] = d[9] + d[8]

# Insert a single outlier: row 99 pairs a high value in feature 8 with a
# low value in feature 9, going against the positive correlation built above
d.loc[99, 8] = 3.5
d.loc[99, 9] = -3.8

# Execute SOD, flagging only 1 outlier
clf = SOD(ref_set=3, contamination=0.01)
# fit() returns the estimator itself, so its return value must not be stored
# in the DataFrame; the per-row results are read from the fitted attributes.
clf.fit(d)
# NOTE: per the pyod docs, `labels_` holds binary labels (0 = inlier,
# 1 = outlier) rather than raw scores; the column name is kept as-is so any
# later cells that reference it keep working.
d['SOD Scores'] = clf.labels_