[Reference](https://github.com/navdeep-G/robust-random-cut-forest)

In [4]:
# Baseline data set: an (n, p) matrix of i.i.d. standard-normal draws.
# These rows play the role of the normal, non-anomalous observations.
n, p = 1000, 20
X = np.random.randn(n, p)

In [5]:
# Now add anomalies to the dataset.
# Each row is independently marked as an outlier with probability
# `outlier_prob`; the marked rows are then overwritten with draws from a
# uniform distribution on [0, 3) instead of the standard normal.
outlier_prob = 0.05
# Use the declared probability instead of a second hard-coded 0.95 so the two
# values cannot drift apart (1.0 - 0.05 evaluates to the same double as 0.95).
is_outlier = np.random.rand(n) > 1.0 - outlier_prob
n_outliers = int(np.sum(is_outlier))
X[is_outlier] = 3 * np.random.rand(n_outliers, p)

In [22]:
X.shape

(1000, 20)

In [24]:
X[is_outlier].shape

(55, 20)

In [19]:
print('is_outlier: ', is_outlier)
print('The number of outliers: ', n_outliers)

is_outlier:  [False False False  True False False False False False False False False
 False False False False False False False False False False  True False
 False  True False False False False False False False False False False
 False False False False False  True False False False False False False
 False False False False False False False  True False False False False
 False False False False  True False False False False False False False
 False False False False False False False False False False False False
 False  True False False  True False False False False False False False
 False False False False False False False False False False False False
 False False False False False False  True False False False False False
 False False False False False False False  True False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False F

In [18]:
X

array([[-2.28484191, -0.94059523,  1.58058859, ..., -0.32521655,
        -0.84772037,  0.51412626],
       [-0.11202373, -0.90934959,  1.45638293, ...,  0.48836255,
         0.29273815,  0.04639294],
       [-0.31604327, -0.33889243,  0.24353321, ..., -1.14079656,
        -0.56513728,  0.71190933],
       ...,
       [ 1.00306091, -0.18608432, -0.30426958, ...,  0.07699352,
         1.16478641, -0.40417652],
       [-0.53586386, -0.47283779,  0.86015287, ..., -2.46798289,
         0.95511284,  0.21806112],
       [-0.10452085,  0.62861124,  1.35956493, ...,  0.90939772,
         0.67034991,  0.39572598]])

In [None]:
from __future__ import division
import numpy as np
from scipy import stats
from warnings import warn
from sklearn import metrics

class RobustRandomCutForest(object):
    """Robust Random Cut Forest.

    Scores each sample with an ensemble of random cut trees.  A tree
    'isolates' observations by randomly selecting a feature (with probability
    proportional to its range, or uniformly among varying features when
    ``random_features`` is set) and then uniformly selecting a split between
    the minimum and maximum values of that feature.
    Since recursive partitioning can be represented by a tree structure, the
    number of splits required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.
    This path length, averaged over the forest, is the measure of abnormality:
    random partitioning produces noticeably shorter paths for anomalies.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of trees in the forest.

    max_samples : int or float, optional (default=256)
        The number of samples to draw from X to train each tree.
            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
        If an int larger than the number of samples provided is given,
        the number of samples provided is used instead.

    max_node_depth : int or None, optional (default=None)
        Forwarded to every tree.  When not None, a tree stops splitting once
        this depth is reached.

    threshold : float, optional (default=0.7)
        Cut-off applied by ``predict``: samples whose score is less than or
        equal to the threshold are flagged.  Lower means fewer points are
        flagged.  Transformed scores are always below 0.5, so with the
        default value every sample is flagged unless the threshold is changed
        or recalculated by calling ``fit(X, calculate_threshold=True)``.

    contamination_pct : float in (0., 0.5), optional (default=0.01)
        The expected proportion of outliers in the data set.  Only used by
        ``fit`` when ``calculate_threshold`` is True.

    bootstrap : boolean, optional (default=False)
        If True, individual trees are fit on random subsets of the training
        data sampled with replacement. If False, sampling without replacement
        is performed.

    random_features : boolean, optional (default=False)
        If True, the feature a tree splits on is chosen uniformly at random
        (among all features for which the range is nonzero) rather than
        proportional to its range.

    float_min : float in (0, infinity), optional (default=np.finfo(np.float64).eps*10000)
        The minimum range for a feature to be considered worth splitting. If
        all features have ranges smaller than this, no more splits will be made.
        This is put in place to deal with floating point errors where all points
        can land on the same side of a split.
    """

    def __init__(self, n_estimators=100, max_samples=256, max_node_depth=None, threshold=0.7, contamination_pct=.01, bootstrap=False, random_features=False, float_min=np.finfo(np.float64).eps * 10000):
        self._n_estimators = n_estimators
        self.max_samples = max_samples
        self.threshold = threshold
        self.contamination_pct = contamination_pct
        self.bootstrap = bootstrap
        self.random_features = random_features
        self.float_min = float_min
        self.max_node_depth = max_node_depth

    def fit(self, X, calculate_threshold=False):
        '''Fit the forest.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            The input samples.  Must expose ``shape`` and support indexing
            with an integer index array.

        calculate_threshold : boolean, optional (default=False)
            If True, the threshold used by ``predict`` is replaced by the
            score at which the ``contamination_pct`` proportion of the
            training samples have a score at or below it.  Otherwise, the
            threshold specified when initializing the forest is kept.

        Returns
        -------
        self : object
            Returns self.
        '''
        self._n = X.shape[0]
        self.max_samples_ = _get_max_samples(self.max_samples, self._n)
        # Depth credited to points that hit max_node_depth inside a tree.
        avg_depth = average_path_length(self.max_samples_)
        self.trees = [
            RandomCutTree(
                random_features=self.random_features,
                float_min=self.float_min,
                max_node_depth=self.max_node_depth,
                avg_depth=avg_depth
            ).fit(_sample(X, self.max_samples_, self.bootstrap))
            for _ in range(self._n_estimators)
        ]

        # calculate the threshold for outliers by evaluating the scores of the
        # training set
        if calculate_threshold:
            self.threshold = \
                -stats.scoreatpercentile(-self.decision_function(X),
                                         100. * (1. - self.contamination_pct))

        return self

    def predict(self, X):
        '''Flag the samples whose score is at or below the threshold.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        flagged : array of int, shape (n_samples,)
            1 where ``decision_function(X) <= self.threshold``, 0 elsewhere.
            Because lower scores are more abnormal, the samples marked 1 are
            the ones on the outlier side of the threshold.
        '''
        scores = self.decision_function(X)
        flagged = np.zeros(X.shape[0], dtype=int)
        flagged[scores <= self.threshold] = 1
        return flagged

    def decision_function(self, X, transformed=True):
        '''Average anomaly score of X over the trees of the forest.

        The score of a sample is a function of the mean of its depth across
        all trees.  The depth of a leaf is the number of splits required to
        isolate the point; when a leaf holds several points, the average path
        length required to separate that many points is added to the depth.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            The samples to score.

        transformed : boolean, optional (default=True)
            If True, return ``0.5 - 2 ** (-mean_depth / c)`` where ``c`` is
            the average path length for ``max_samples_`` points.  The result
            lies in [-0.5, 0.5); high scores represent normal points and low
            scores represent anomalies.
            If False, the mean tree depth itself is returned.

        Returns
        -------
        scores : array of shape (n_samples,)
            The anomaly score of the input samples.
            The lower, the more abnormal.
        '''
        depths = np.column_stack([tree.decision_function(X)
                                  for tree in self.trees])
        mean_depths = np.mean(depths, axis=1)

        if not transformed:
            # Raw depths are returned untouched; applying the "0.5 -" offset
            # here would flip their direction and contradict the contract.
            return mean_depths

        return 0.5 - np.power(2., -mean_depths /
                              average_path_length(self.max_samples_))

    def add_point(self, x):
        '''Stream a single point into the forest.

        Each tree independently accepts the point with probability
        ``max_samples_ / n`` (n = number of points seen so far, including this
        one).  A tree that accepts it first forgets one of its current points
        at random (if it holds more than one) and then inserts the new point.

        Parameters
        ----------
        x : array of shape (n_features,)
            An individual training sample
        '''
        self._n += 1
        # Builtin float: the np.float alias was deprecated in NumPy 1.20 and
        # removed in NumPy 1.24.
        add_point_probability = float(self.max_samples_) / self._n
        should_add_points = np.random.binomial(
            1, add_point_probability, size=self._n_estimators)
        for tree, should_add_point in zip(self.trees, should_add_points):
            if should_add_point == 1:
                if tree._X.shape[0] > 1:
                    tree.remove_point()
                tree.add_point(x)


class RandomCutTree(object):
    '''Random Cut Tree

    An individual tree in a random cut forest.  The tree keeps the points it
    was built from in ``_X`` and the root ``TreeNode`` in ``_root``.

    Parameters
    ----------
    avg_depth : float or None, optional (default=None)
        Depth reported for any point that reaches ``max_node_depth`` while
        being scored.

    max_node_depth : int or None, optional (default=None)
        When not None, nodes at this depth are not split any further.

    random_features : boolean, optional (default=False)
        If True, the feature a tree splits on is chosen uniformly at random
        (among all features for which the range is nonzero) rather than
        proportional to its range.

    float_min : float in (0, infinity), optional (default=np.finfo(np.float64).eps*10000)
        The minimum range for a feature to be considered worth splitting. If
        all features have ranges smaller than this, no more splits will be made.
        This is put in place to deal with floating point errors where all points
        can land on the same side of a split.
    '''

    def __init__(self, avg_depth=None, max_node_depth=None, random_features=False, float_min=np.finfo(np.float64).eps * 10000):
        self.random_features = random_features
        self.float_min = float_min
        self.max_node_depth = max_node_depth
        self._avg_depth = avg_depth

    def _update_vector(self, X):
        '''store X as the tree's point set and mark it read-only'''
        self._X = X
        self._X.setflags(write=False)  # Ensure the base array doesn't change

    def fit(self, X):
        '''fit a random cut tree on the rows of X and return self'''
        self._update_vector(X=X)

        feature_data = (np.min(self._X, axis=0), np.max(self._X, axis=0))

        self._root = TreeNode(
            n=self._X.shape[0],
            parent=None,
            feature_data=feature_data,
            random_features=self.random_features,
            float_min=self.float_min
        )
        self.split_node(node=self._root, X=X)
        return self

    def split_node(self, node, X, current_depth=0):
        '''recursively split `node`, which holds the rows of X

        Recursion stops at a leaf or at ``max_node_depth``; the rows that
        reached the node are then stored on it as ``node._X``.
        '''
        depth_exhausted = (self.max_node_depth is not None
                           and current_depth >= self.max_node_depth)
        if depth_exhausted or node.is_leaf:
            node._X = X
            return

        (child_left, child_right, X_left, X_right) = node._split(X)
        self.split_node(child_left, X_left, current_depth + 1)
        self.split_node(child_right, X_right, current_depth + 1)

    def decision_function(self, X):
        '''return the decision function (the depth) for each point in the tree'''
        return np.array([self._root.get_depth(x, self.max_node_depth, self._avg_depth) for x in X])

    def add_point(self, x):
        '''insert a new point into the tree'''
        # np.vstack is the function np.row_stack aliased; the alias is
        # deprecated as of NumPy 2.0.
        self._update_vector(np.vstack([self._X, x]))
        self._root = self.add_new_node(
            node=self._root, point=x, current_depth=0)

    def remove_point(self):
        '''forget a point at random from the tree'''
        choice = np.random.choice(self._root.num_points())
        point_to_forget = self._X[choice]

        # First stored row equal to the chosen point (duplicates are
        # interchangeable, so removing the first match is sufficient).
        current_point_index = np.argmax(
            np.all(self._X == point_to_forget, axis=1))
        # Go through _update_vector so the stored array stays read-only.
        self._update_vector(np.delete(self._X, current_point_index, 0))
        self._root = self.remove_node(node=self._root, point=point_to_forget)

    def add_new_node(self, node, point, current_depth=0):
        '''insert a new point into the subtree rooted at `node`

        Returns the node that must take `node`'s place in its parent: either
        `node` itself (updated in place) or a newly created internal node
        whose children are `node` and a fresh leaf holding `point`.
        '''
        if node is None:
            return TreeNode(n=1, parent=None, feature_data=(point, point))

        # pick a split for if the new point is included
        (feature_mins, feature_maxs) = node.get_feature_ranges(point)
        feature_ranges = feature_maxs - feature_mins

        (alone_left, alone_right, split_data) = \
            node.get_new_node_position(feature_ranges, feature_mins)

        if self.max_node_depth is not None and current_depth >= self.max_node_depth:
            # Depth limit reached: absorb the point into this node.
            node.feature_data = (feature_mins, feature_maxs)
            node.point_added()

        # Check if the new split separates the new point from the node
        # If it does we will use this new split
        elif alone_left or alone_right:
            child = node
            node = TreeNode(
                n=child.num_points() + 1,
                parent=child.parent,
                is_left_of_parent=child.is_left_of_parent,
                feature_data=(feature_mins, feature_maxs),
                split_data=split_data,
                random_features=child.random_features,
                float_min=child.float_min
            )
            new_leaf = TreeNode(
                n=1,
                parent=node,
                feature_data=(point, point),
                is_left_of_parent=bool(alone_left),
                float_min=node.float_min
            )

            # The displaced subtree now hangs below the new node; without
            # this its parent link would still point at the old parent.
            child.parent = node
            child.is_left_of_parent = not alone_left

            if alone_left:
                node.child_left = new_leaf
                node.child_right = child
            else:
                node.child_left = child
                node.child_right = new_leaf

        else:
            # update node statistics to include new point
            node.feature_data = (feature_mins, feature_maxs)
            node.point_added()
            # find the new point's leaf starting at the node below
            # note that we use the old splitting criterion

            if node.is_point_left(point):
                node.child_left = self.add_new_node(
                    node.child_left, point, current_depth + 1)
                node.child_left.parent = node
            else:
                node.child_right = self.add_new_node(
                    node.child_right, point, current_depth + 1)
                node.child_right.parent = node

        return node

    def remove_node(self, node, point):
        '''forget a point from the subtree rooted at `node`

        Returns the node that must take `node`'s place in its parent, or None
        when the subtree became empty.
        '''
        # NOTE(review): depth is not tracked here.  A node that split_node
        # left unsplit because of max_node_depth is not a leaf by range and
        # has no children, so the recursion below would receive None for it.
        if node.is_leaf:
            return self._remove_from_leaf(node, point)

        node.point_removed()
        prev_parent = node.parent
        if node.is_point_left(point):
            node.child_left = self.remove_node(node.child_left, point)
            if node.child_left is None:
                # Left side vanished: the right child replaces this node.
                node = node.child_right
                node.parent = prev_parent

        else:
            node.child_right = self.remove_node(node.child_right, point)

            if node.child_right is None:
                # Right side vanished: the left child replaces this node.
                node = node.child_left
                node.parent = prev_parent

        return node

    def _remove_from_leaf(self, node, point):
        '''drop one point from a leaf; return None if the leaf is now empty'''
        if node.num_points() > 1:  # array is duplicate points
            node.point_removed()
            return node

        return None


class TreeNode(object):
    '''Tree Node

    An individual node in a random cut tree.

    Parameters
    ----------
    n : int
        Number of points held in the subtree rooted at this node.

    parent : TreeNode or None
        The node above this one (None for a root).

    is_left_of_parent : boolean or None, optional (default=None)
        Whether this node is the left child of its parent.

    feature_data : tuple (feature_mins, feature_maxs)
        Per-feature minimum and maximum of the points in this node.

    split_data : tuple (split_feature, split_threshold) or None, optional
        A split to use for this node.  When omitted (or when it contains
        None), a split is sampled from ``feature_data``.

    child_left, child_right : TreeNode or None, optional
        The children of this node.

    initialize : boolean, optional (default=False)
        Accepted for backward compatibility; not used.

    random_features : boolean, optional (default=False)
        If True, the split feature is drawn uniformly among the features
        whose range is at least ``float_min`` instead of proportionally to
        the ranges.

    float_min : float, optional (default=np.finfo(np.float64).eps*10000)
        Ranges below this value are treated as zero.
    '''

    def __init__(self, n, parent, is_left_of_parent=None, feature_data=None, split_data=None, child_left=None, child_right=None, initialize=False, random_features=False, float_min=np.finfo(np.float64).eps * 10000):
        self._n = n
        self.parent = parent
        self.is_left_of_parent = is_left_of_parent
        self.random_features = random_features
        self.float_min = float_min

        self.feature_data = feature_data
        self.child_left = child_left
        self.child_right = child_right
        self._X = None

        if split_data is not None and split_data[0] is not None:
            # Honour the caller's split.  When a point is streamed in, this is
            # the cut that was found to separate it from the existing
            # subtree; resampling here could route it to the wrong child.
            (self.split_feature, self.split_threshold) = split_data
        else:
            self._set_split()

    @property
    def is_leaf(self):
        """ True when every feature range is below float_min. """
        return np.all(self.feature_ranges < self.float_min)

    @property
    def feature_ranges(self):
        return self.feature_maxs - self.feature_mins

    @property
    def feature_mins(self):
        return self.feature_data[0]

    @property
    def feature_maxs(self):
        return self.feature_data[1]

    @feature_mins.setter
    def feature_mins(self, val):
        self.feature_data = (val, self.feature_data[1])

    @feature_maxs.setter
    def feature_maxs(self, val):
        self.feature_data = (self.feature_data[0], val)

    def is_point_left(self, point):
        """ True when the point falls strictly below this node's split. """
        return bool(point[self.split_feature] < self.split_threshold)

    def get_new_node_position(self, feature_ranges, feature_mins):
        """ Sample a cut over the given (enlarged) bounding box.

        Returns ``(alone_left, alone_right, (split_feature, split_threshold))``
        where the flags tell whether the cut lies entirely below / above the
        node's current bounding box on the chosen feature.  For a degenerate
        box ``(False, False, (None, None))`` is returned.
        """
        if np.all(feature_ranges < self.float_min):
            # Node is a leaf and we don't want to continue down this path
            return (False, False, (None, None))

        split_feature = self._sample_split_feature(feature_ranges)
        split_threshold = self._sample_split_threshold(
            feature_ranges, feature_mins, split_feature)

        alone_left = self.feature_mins[split_feature] > split_threshold

        alone_right = self.feature_maxs[split_feature] < split_threshold

        return (alone_left, alone_right, (split_feature, split_threshold))

    def point_added(self):
        """ Wrapper so the number of points isn't directly exposed """
        self._n += 1

    def num_points(self):
        """ Wrapper to get the number of points stored in a node. """
        return self._n

    def point_removed(self):
        """ Wrapper to decrement the number of points """
        self._n -= 1

    def get_feature_ranges(self, point):
        """ Return the (mins, maxs) of this node's box enlarged by `point` """
        feature_mins = np.minimum(self.feature_mins, point)
        feature_maxs = np.maximum(self.feature_maxs, point)
        return (feature_mins, feature_maxs)

    def _set_split(self):
        '''sample and store the splitting feature and threshold'''
        split_feature = self._sample_split_feature(self.feature_ranges)
        split_threshold = self._sample_split_threshold(
            self.feature_ranges, self.feature_mins, split_feature)
        self.split_feature = split_feature
        self.split_threshold = split_threshold

    def _split(self, X):
        '''split the rows of X on this node's feature and threshold

        Creates both children and returns
        ``(child_left, child_right, X_left, X_right)``.
        '''
        is_left = X[:, self.split_feature] < self.split_threshold
        X_left = X[is_left]
        X_right = X[np.logical_not(is_left)]

        feature_data_left = (np.min(X_left, axis=0), np.max(X_left, axis=0))
        feature_data_right = (np.min(X_right, axis=0), np.max(X_right, axis=0))

        # random_features is handed down so the whole tree, not only the
        # root, follows the requested feature-sampling scheme.
        self.child_left = TreeNode(
            n=X_left.shape[0],
            parent=self,
            feature_data=feature_data_left,
            is_left_of_parent=True,
            random_features=self.random_features,
            float_min=self.float_min
        )

        self.child_right = TreeNode(
            n=X_right.shape[0],
            parent=self,
            feature_data=feature_data_right,
            is_left_of_parent=False,
            random_features=self.random_features,
            float_min=self.float_min
        )

        return (self.child_left, self.child_right, X_left, X_right)

    def _sample_split_feature(self, feature_ranges):
        '''sample the index of the feature to split on'''
        if self.random_features:
            # Uniform over the features that vary (same cut-off as is_leaf).
            weights = np.array(feature_ranges >= self.float_min, dtype=float)
        else:
            weights = feature_ranges

        total = np.sum(weights)
        if total != 0:
            weights = weights / total
        # With all-zero weights (degenerate node) the weights are passed
        # unnormalised, which avoids a 0/0 division.
        return np.flatnonzero(np.random.multinomial(1, weights))[0]

    def _sample_split_threshold(self, feature_ranges, feature_mins, split_feature):
        '''sample the splitting threshold uniformly within the feature's range'''
        return np.random.rand() * feature_ranges[split_feature] + feature_mins[split_feature]

    def get_depth(self, x, max_node_depth=None, avg_depth=None, current_depth=0):
        '''calculate number of nodes to isolate a point

        Returns `avg_depth` as soon as `max_node_depth` is reached; at a leaf
        returns the current depth plus the average path length for the number
        of points the leaf holds.
        '''
        if max_node_depth is not None and current_depth >= max_node_depth:
            return avg_depth
        if self.is_leaf:
            return current_depth + average_path_length(self._n)
        elif x[self.split_feature] < self.split_threshold:
            return self.child_left.get_depth(x, max_node_depth, avg_depth, current_depth + 1)
        else:
            return self.child_right.get_depth(x, max_node_depth, avg_depth, current_depth + 1)


def _sample(X, num_samples, replace=False):
    '''take a random sample of X'''
    n = X.shape[0]
    return X[np.random.choice(n, num_samples, replace)]


def _get_max_samples(max_samples, n):
    '''get the number of samples for each tree'''
    if isinstance(max_samples, int):
        if max_samples > n:
            warn("max_samples (%s) is greater than the "
                 "total number of samples (%s). max_samples "
                 "will be set to n_samples for estimation."
                 % (max_samples, n))
            max_samples_ = n
        else:
            max_samples_ = max_samples
    else:  # float
        if not (0. < max_samples <= 1.):
            raise ValueError("max_samples must be in (0, 1]")
        max_samples_ = int(max_samples * n)

    return max_samples_


def harmonic_approx(n):
    '''Approximate the n-th harmonic number.

    Evaluates gamma + ln(n) + 1/(2n) - 1/(12 n^2) + 1/(120 n^4), where gamma
    is the Euler-Mascheroni constant.
    '''
    euler_mascheroni = 0.57721566490153286060651209
    log_term = np.log(n)
    first_order = 0.5 / n
    second_order = 1. / (12 * np.square(n))
    fourth_order = 1. / (120 * np.power(n, 4))
    return euler_mascheroni + log_term + first_order - second_order + fourth_order


def average_path_length(n):
    '''Average path length of an unsuccessful search in a BST of n points.

    Computed as 2 * H(n - 1) - 2 * (n - 1) / n using the approximate harmonic
    number; 0 is returned whenever n is not greater than 1.
    '''
    if n > 1:
        return 2. * harmonic_approx(n - 1) - 2. * (n - 1.) / n
    return 0

In [7]:
# run a batch job to build a random cut forest to identify what the anomalies in the dataset are
forest_batch = RobustRandomCutForest(max_samples=128, random_features=False).fit(X)

In [8]:
scores_batch = forest_batch.decision_function(X)

In [10]:
print('Forest batch: ', forest_batch)
print('Scores batch: ', scores_batch)

Forest batch:  <__main__.RobustRandomCutForest object at 0x7ff868077710>
Scores batch:  [ 0.07648815  0.12715913  0.02193005 -0.03128418  0.08240607  0.07048636
  0.08759699  0.05684278  0.09144779  0.10833624  0.08044268  0.06779167
  0.09462931  0.0398972   0.10216423  0.01968232  0.09747148  0.05196583
  0.02267696  0.03953736  0.07149253  0.085658   -0.08582393  0.03519728
  0.06745365 -0.07358795  0.06947783  0.08273241  0.02713395  0.07516171
  0.05021109  0.09935519  0.08142553  0.0234227   0.06846692  0.1180129
  0.03953736  0.05371373  0.03845616  0.10741658  0.08338432 -0.07493478
  0.06779167  0.1095591   0.07648815  0.04312312  0.1095591   0.04490552
  0.03008213  0.02416727  0.04133375  0.10802993  0.02491068  0.02639402
  0.09399499 -0.07899429  0.11290233  0.06473993  0.0637179   0.01930667
  0.11681652  0.11199339  0.05091381  0.07582545 -0.06956635  0.0718274
  0.03845616  0.06063739  0.05196583  0.04133375  0.07879946  0.0637179
  0.06947783  0.04490552  0.07082201  0

In [12]:
# build a random cut forest with only a small sample of initial points
stream_init = 300
forest_stream = RobustRandomCutForest(max_samples=128, random_features=False).fit(X[:stream_init])

In [13]:
# now stream in the remaining points
for i in range(stream_init, n):
    forest_stream.add_point(X[i])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [15]:
scores_stream = forest_stream.decision_function(X)

In [16]:
scores_stream

array([ 0.04703521,  0.09112827,  0.0151551 , -0.04134659,  0.07781045,
        0.02491068,  0.05580221, -0.0113171 ,  0.05926131,  0.06235146,
        0.00597224,  0.04240821,  0.05891661,  0.0170466 ,  0.07549371,
        0.01020244,  0.06981427,  0.02897872, -0.0057513 ,  0.02005767,
        0.04915495,  0.04025676, -0.1030168 ,  0.00828413,  0.05788091,
       -0.09552097,  0.02565293,  0.06609891, -0.02632333,  0.0398972 ,
        0.04632643,  0.04880235,  0.07582545, -0.0077321 ,  0.04169218,
        0.06846692,  0.00789957,  0.00597224, -0.00102887,  0.06063739,
        0.04348016, -0.06116866,  0.02971461,  0.07216201,  0.04597161,
        0.04597161,  0.06063739,  0.02416727,  0.02005767, -0.01613651,
       -0.00812919,  0.09240486, -0.00496115,  0.01211327,  0.06132383,
       -0.08308246,  0.08109817,  0.02787272,  0.03737241, -0.00812919,
        0.06303522,  0.06981427, -0.01091752,  0.04880235, -0.07628478,
        0.03737241,  0.00674407,  0.04240821,  0.04169218, -0.01

In [17]:
# both random cut forests produced good results at identifying the anomalies
print('batch random cut forest roc auc: ', metrics.roc_auc_score(is_outlier, -scores_batch))
print('streaming random cut forest roc auc: ', metrics.roc_auc_score(is_outlier, -scores_stream))

batch random cut forest roc auc:  0.9985473785473785
streaming random cut forest roc auc:  0.9935257335257335


# Different Approach


In [25]:
forest = RobustRandomCutForest()
forest = forest.fit(X)

In [26]:
# Despite the name, `depths` holds the transformed anomaly scores
# 0.5 - 2 ** (-mean_depth / average_path_length(max_samples_)), not raw tree
# depths: lower values are more abnormal and every value is below 0.5.
depths = forest.decision_function(X)
# predict marks a row with 1 when its score is <= forest.threshold.  The
# forest above was built with the default threshold of 0.7, which no score can
# exceed, so every label is 1 here.  Pass calculate_threshold=True to fit (or
# set a threshold below 0.5) to get a meaningful separation.
labels = forest.predict(X)

In [28]:
print('Depths: ', depths)
print('Labels: ', labels)

Depths:  [ 0.12494216  0.17371971  0.05794785 -0.00368904  0.14107302  0.12111786
  0.10914516  0.06799691  0.13593885  0.1456558   0.06974641  0.09842699
  0.12620832  0.06181745  0.1344585   0.05132128  0.1236717   0.06682663
  0.07667386  0.05764877  0.08461503  0.10993739 -0.0593581   0.05192778
  0.09761138 -0.06697569  0.10409017  0.11151704  0.02765579  0.08685645
  0.11387454  0.10940941  0.11204217  0.03494651  0.11491771  0.14445551
  0.05645045  0.07466496  0.05374236  0.12822531  0.07895813 -0.02525897
  0.09102657  0.14541607  0.11725457  0.08236145  0.12948039  0.08123007
  0.05010582  0.02637622  0.03651649  0.1499434   0.03431702  0.04827646
  0.12922971 -0.05258981  0.14373338  0.09102657  0.08377136  0.05132128
  0.12290736  0.12747022  0.06594683  0.09788344 -0.05371215  0.11335189
  0.05794785  0.07495252  0.06033316  0.05944018  0.09350841  0.09460661
  0.10435785  0.05374236  0.11803037  0.06799691  0.12948039  0.05824672
  0.14155819  0.11751334  0.13667677  0.13

In [30]:
import pandas as pd 
df = pd.DataFrame(X)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-2.284842,-0.940595,1.580589,-0.9367,0.201533,0.562792,0.222527,-0.277234,-1.077819,-0.493098,-0.592255,-0.111492,-0.749112,0.273309,-1.04484,0.553838,0.287068,-0.325217,-0.84772,0.514126
1,-0.112024,-0.90935,1.456383,0.729539,-0.144048,-0.81178,-0.322083,0.177745,-0.561323,-0.232063,-0.2184,-0.530819,0.111559,-0.339777,-1.461972,0.264692,-0.127786,0.488363,0.292738,0.046393
2,-0.316043,-0.338892,0.243533,0.565637,1.499923,1.597672,1.665715,-0.087915,-1.744642,-0.987217,-1.913087,-0.8307,-1.235151,0.268027,0.434729,-0.65703,-1.566755,-1.140797,-0.565137,0.711909
3,1.243562,0.953326,0.91687,0.242212,2.861265,0.855922,1.382748,1.629768,1.737861,1.821615,0.664029,2.487674,0.674101,0.654034,1.018153,1.08298,2.56856,2.996957,2.049397,0.5136
4,-0.38276,-1.591869,1.212411,-0.544538,0.523952,-0.038625,0.74603,0.006171,-1.043119,-0.210697,-0.557552,-0.072771,-1.51138,0.867469,0.565315,0.426926,-0.684476,0.623326,-0.093837,-0.952544


In [39]:
df['depths'] = depths
df['labels'] = labels 

In [40]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,depths,labels
0,-2.284842,-0.940595,1.580589,-0.9367,0.201533,0.562792,0.222527,-0.277234,-1.077819,-0.493098,...,-0.749112,0.273309,-1.04484,0.553838,0.287068,-0.325217,-0.84772,0.514126,0.124942,1
1,-0.112024,-0.90935,1.456383,0.729539,-0.144048,-0.81178,-0.322083,0.177745,-0.561323,-0.232063,...,0.111559,-0.339777,-1.461972,0.264692,-0.127786,0.488363,0.292738,0.046393,0.17372,1
2,-0.316043,-0.338892,0.243533,0.565637,1.499923,1.597672,1.665715,-0.087915,-1.744642,-0.987217,...,-1.235151,0.268027,0.434729,-0.65703,-1.566755,-1.140797,-0.565137,0.711909,0.057948,1
3,1.243562,0.953326,0.91687,0.242212,2.861265,0.855922,1.382748,1.629768,1.737861,1.821615,...,0.674101,0.654034,1.018153,1.08298,2.56856,2.996957,2.049397,0.5136,-0.003689,1
4,-0.38276,-1.591869,1.212411,-0.544538,0.523952,-0.038625,0.74603,0.006171,-1.043119,-0.210697,...,-1.51138,0.867469,0.565315,0.426926,-0.684476,0.623326,-0.093837,-0.952544,0.141073,1


In [27]:
# # Given an array of points....
# for point in points:
#     forest.add_point(point)
# depths = forest.decision_function(points)
# labels = forest.predict(points)