# Developing an Isolation Forest:

### Imports:

In [1]:
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.ensemble import IsolationForest

### Defining Isolation Forest Class:

In [2]:
class UEBAIsolationForest:
    """
    Isolation Forest for anomaly detection on Autoencoder latent embeddings.

    Thin wrapper around ``sklearn.ensemble.IsolationForest`` that exposes training,
    scoring, label prediction, and persistence (via ``joblib``) of the fitted model.
    """

    def __init__(
        self,
        n_estimators: int = 200,
        max_samples: "int | float | str" = "auto",
        contamination: "float | str" = 0.05,
        random_state: "int | None" = 42,
    ) -> None:
        """
        Initializes the Isolation Forest.

        Args:
            n_estimators: The number of trees in the forest
            max_samples: The subsample size for each tree (an int, a float fraction,
                or "auto"); forwarded unchanged to the underlying estimator
            contamination: Expected proportion of anomalies; forwarded unchanged to
                the underlying estimator
            random_state: Random seed for reproducibility

        Returns:
            None:
        """
        # n_jobs=-1 asks scikit-learn to use all available processors.
        self.model = IsolationForest(
            n_estimators=n_estimators,
            max_samples=max_samples,
            contamination=contamination,
            random_state=random_state,
            n_jobs=-1
        )

    def train(self, latent_embeddings: np.ndarray) -> "IsolationForest":
        """
        Trains the Isolation Forest on latent embeddings.

        Args:
            latent_embeddings: Latent embedding matrix of shape: (n_samples, latent_emb_dim)

        Returns:
            The value returned by the underlying estimator's ``fit`` call. For
            scikit-learn estimators this is the fitted estimator itself, not a
            training-history object.
        """
        fitted_model = self.model.fit(latent_embeddings)
        return fitted_model

    def anomaly_score(self, latent_embeddings: np.ndarray) -> np.ndarray:
        """
        Computes the anomaly scores for latent embeddings, where a higher score signifies more
        anomalous activity.

        Args:
            latent_embeddings: The latent embeddings matrix of shape: (n_samples, latent_emb_dim)

        Returns:
            np.ndarray: Anomaly scores
        """
        # The sign of score_samples is flipped so that larger values mean "more
        # anomalous", as promised by this method's contract.
        scores = -self.model.score_samples(latent_embeddings)
        return scores

    def predict(self, latent_embeddings: np.ndarray) -> np.ndarray:
        """
        Predicts anomaly labels using model threshold, where -1 signifies anomalous activity and
        1 signifies normal behavior.

        Args:
            latent_embeddings: The latent embedding matrix.

        Returns:
            np.ndarray: Labels conveying normal (1) or anomalous (-1)
        """
        labels = self.model.predict(latent_embeddings)
        return labels

    def save(self, save_path: str) -> None:
        """
        Save the trained Isolation Forest model.

        The parent directory of ``save_path`` is created if it does not exist yet.

        Args:
            save_path: The path where the Isolation Forest will be saved

        Returns:
            None:
        """
        parent_dir = os.path.dirname(save_path)
        # An empty dirname means "current directory": nothing to create.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        joblib.dump(self.model, save_path)

    def load(self, load_path: str) -> None:
        """
        Loads a previously trained Isolation Forest model, replacing ``self.model``.

        Args:
            load_path: File path from where to load the pretrained model

        Returns:
            None:
        """
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load model files that come from a trusted source.
        self.model = joblib.load(load_path)

### Loading Latent Embeddings:

In [3]:
# Load the saved latent embeddings. The path is assembled with os.path.join so it
# resolves on every OS (a hard-coded backslash path only works on Windows).
latent_embeddings = np.load(os.path.join("encoders", "encoder_model_1", "latent_embeddings.npy"))

### Initializing and Training Isolation Forest:

In [4]:
# Build the wrapper with 200 trees and an expected 5% anomaly rate;
# every other setting keeps the class defaults.
iforest = UEBAIsolationForest(contamination=0.05, n_estimators=200)

In [5]:
# Fit the forest on the loaded embeddings. As the last expression of the cell,
# the value returned by train() is what gets displayed as the cell output.
iforest.train(latent_embeddings)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of base estimators in the ensemble.",200
,"max_samples  max_samples: ""auto"", int or float, default=""auto"" The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If ""auto"", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).",'auto'
,"contamination  contamination: 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the samples. - If 'auto', the threshold is determined as in the  original paper. - If float, the contamination should be in the range (0, 0.5]. .. versionchanged:: 0.22  The default value of ``contamination`` changed from 0.1  to ``'auto'``.",0.05
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features. Note: using a float number less than 1.0 or integer less than number of features will enable feature subsampling and leads to a longer runtime.",1.0
,"bootstrap  bootstrap: bool, default=False If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",-1
,"random_state  random_state: int, RandomState instance or None, default=None Controls the pseudo-randomness of the selection of the feature and split values for each branching step and each tree in the forest. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",42
,"verbose  verbose: int, default=0 Controls the verbosity of the tree building process.",0
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. .. versionadded:: 0.21",False


### Computing Anomaly Scores and Predictions:

In [6]:
# Score the same embeddings the forest was trained on; the wrapper negates
# score_samples, so a higher value means more anomalous.
scores = iforest.anomaly_score(latent_embeddings)

In [11]:
# Preview the first ten anomaly scores
scores[:10]

array([0.38017294, 0.35783825, 0.41736843, 0.3720013 , 0.38407974,
       0.36603173, 0.36543748, 0.43302496, 0.35434853, 0.43602313])

In [7]:
# Label every embedding with the fitted forest's threshold: -1 = anomalous, 1 = normal
predictions = iforest.predict(latent_embeddings)

In [27]:
# Preview the first ten anomaly labels
predictions[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### Saving the Isolation Forest, Anomaly Scores, and Anomaly Labels:

In [8]:
# Defining save path for current training iteration.
# Built with os.path.join so the nested directory is created correctly on every OS
# (a literal backslash path yields a single oddly named directory outside Windows).
save_path = os.path.join("isolation_forests", "iforest_model_1")
os.makedirs(save_path, exist_ok=True)

In [9]:
# Saving the isolation forest model
# NOTE(review): the file name is spelled "iforest_modle.pkl"; anything that loads the
# model must use the same spelling. Confirm no consumer depends on it before renaming.
iforest.save(os.path.join(save_path, "iforest_modle.pkl"))

In [10]:
# Persist the anomaly scores and the predicted labels next to the model,
# one .npy file per array.
for artifact_name, artifact in (
    ("anomaly_scores.npy", scores),
    ("anomaly_labels.npy", predictions),
):
    np.save(os.path.join(save_path, artifact_name), artifact)