# **Deep Isolation Forests**

This is an implementation of the following paper on Deep Isolation Forests (DIF): https://arxiv.org/pdf/2206.06602 . We implement the model for tabular data only and ignore the time series and graph implementations of the model, as we are unfamiliar with those kinds of data in general. The benchmarks used are the standard Isolation Forest (IF) and the Extended Isolation Forest (EIF, https://arxiv.org/pdf/1811.02141 ). The benchmark implementations use the h2o package for EIF and sklearn for IF, and the DeepOD implementation of the DIF model is the one being tested. Unlike the original paper, we use only 3 datasets: Fraud, Shuttle and Pageblocks. The datasets will be uploaded with the GitHub repository submission. The code for creating synthetic datasets has been taken from the official paper repository https://github.com/xuhongzuo/deep-iforest/blob/main/create_scal_data.py , the h2o model implementation takes inspiration from the repository https://github.com/h2oai/h2o-3/blob/master/h2o-py/demos/extended-isolation-forest-introduction.ipynb , and the dataset testing is inspired by the Kaggle Notebook https://www.kaggle.com/code/nilanml/fraud-detection-using-isolation-forest .

# **Importing necessary libraries**

In [1]:
!pip install h2o # For extended isolation forests
!pip install deepod # For deep isolation forests
# The sklearn Isolation Forest and the h2o EIF are the benchmarks for comparison

Collecting deepod
  Downloading deepod-0.4.1-py3-none-any.whl.metadata (10 kB)
Collecting torch<1.13.1,>=1.10.0 (from deepod)
  Downloading torch-1.13.0-cp310-cp310-manylinux1_x86_64.whl.metadata (23 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch<1.13.1,>=1.10.0->deepod)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch<1.13.1,>=1.10.0->deepod)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch<1.13.1,>=1.10.0->deepod)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch<1.13.1,>=1.10.0->deepod)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Downloading deepod-0.4.1-py3-none-any.whl (203 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import h2o
import math
import random
import time
from h2o.estimators import H2OIsolationForestEstimator, H2OExtendedIsolationForestEstimator
from deepod.models import DeepIsolationForest
from deepod.models import DeepIsolationForestTS
from sklearn.datasets import make_blobs
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import warnings

# Silence every Python warning for the rest of the session. Note that this is global:
# it also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

We will first convert all the object type columns to int or bool type

# **Single Blob Data**

We will observe the score maps and anomaly scores for a single blob first.

In [3]:
# blob_params = {'random_state': 42, 'n_samples': 2000, 'n_features': 2}
# data = make_blobs(centers=[[0, 0]], cluster_std=[1], **blob_params)[0]
# fig=plt.figure(figsize=(8,8))
# fig.add_subplot(111)
# plt.plot(data[:,0],data[:,1],'o')
# plt.grid("off")
# # plt.axis("equal")
# plt.xlabel("x")
# plt.ylabel("y")
# plt.xlim([-5,5])
# plt.ylim([-5,5])
# plt.tick_params(direction='out', length=6, width=2)
# plt.title("Blob")
# plt.show()

In [4]:
# X=data[:,0]
# y=data[:,1]

In [5]:
# h2o.init()
# training_frame = h2o.H2OFrame(data,column_names = ["x", "y"])

# EIF_h2o = H2OExtendedIsolationForestEstimator(
#                                               model_id = "extended_isolation_forest.hex",
#                                               ntrees = 300,
#                                               sample_size = 256,
#                                               extension_level = 1,
#                                               seed = 42
#                                              )
# EIF_h2o.train(training_frame = training_frame)

# IF_h2o = H2OExtendedIsolationForestEstimator(
#                                               model_id = "isolation_forest.hex",
#                                               ntrees = 300,
#                                               sample_size = 256,
#                                               extension_level = 0,
#                                               seed = 42
#                                              )
# IF_h2o.train(training_frame = training_frame)

# dif=DeepIsolationForest()
# dif.fit(data)

# print(EIF_h2o)
# print(IF_h2o)
# print(dif)

In [6]:
# granularity = 50

# # Heat map data
# xx, yy = np.meshgrid(np.linspace(-5, 5, granularity), np.linspace(-5, 5, granularity))

# # H2O
# hf_heatmap = h2o.H2OFrame(np.c_[xx.ravel(), yy.ravel()], column_names = ["x", "y"])

# #EIF
# h2o_anomaly_score_eif = EIF_h2o.predict(hf_heatmap)
# h2o_anomaly_score_df_eif = h2o_anomaly_score_eif.as_data_frame(use_pandas=True, header=True)
# heatmap_h2o_eif = np.array(h2o_anomaly_score_df_eif["anomaly_score"]).reshape(xx.shape)

# #IF
# h2o_anomaly_score_if = IF_h2o.predict(hf_heatmap)
# h2o_anomaly_score_df_if = h2o_anomaly_score_if.as_data_frame(use_pandas=True, header=True)
# heatmap_h2o_if = np.array(h2o_anomaly_score_df_if["anomaly_score"]).reshape(xx.shape)

# #DIF
# heatmap_data = np.c_[xx.ravel(), yy.ravel()]
# anomaly_scores = dif.decision_function(heatmap_data)
# heatmap_dif = np.array(anomaly_scores).reshape(xx.shape)

In [7]:
# f = plt.figure(figsize=(24, 9))

# def plot_heatmap(heatmap_data, subplot, title):
#     ax1 = f.add_subplot(subplot)
#     levels = np.linspace(0,1,10, endpoint=True)
#     v = np.linspace(0, 1, 12, endpoint=True)
#     v = np.around(v, decimals=1)
#     CS = ax1.contourf(xx, yy, heatmap_data, levels, cmap=plt.cm.YlOrRd)
#     cbar = plt.colorbar(CS, ticks=v)
#     cbar.ax.set_ylabel('Anomaly score', fontsize = 25)
#     cbar.ax.tick_params(labelsize=15)
#     ax1.set_xlabel("x", fontsize = 25)
#     ax1.set_ylabel("y", fontsize = 25)
#     plt.tick_params(labelsize=30)
#     plt.scatter(data[:,0],data[:,1],s=15,c='None',edgecolor='k')
#     plt.axis("equal")
#     plt.title(title,  fontsize=32)

# plot_heatmap(heatmap_h2o_if, 131, "IF")
# plot_heatmap(heatmap_h2o_eif, 132, "EIF")
# plot_heatmap(heatmap_dif,133,"DIF")

# plt.show()

In [8]:
# hf1010 = h2o.H2OFrame([[10,10],[0,0]], column_names = ["x", "y"])

# predictions1010_eif = EIF_h2o.predict(hf1010)
# predictions1010_if = IF_h2o.predict(hf1010)

# predictions1010_all = predictions1010_eif.cbind(predictions1010_if).cbind(hf1010)
# predictions1010_all.rename({"anomaly_score": "anomaly_score_eif", "mean_length": "mean_length_eif", "anomaly_score0": "anomaly_score_if", "mean_length0": "mean_length_if"})

In [9]:
# test_data = np.array(hf1010.as_data_frame())
# predictions_dif = dif.decision_function(test_data)

# predictions_all = np.column_stack((test_data, predictions_dif))

# #predictions_all.rename({"anomaly_score": "anomaly_score_dif"})
# predictions_all_df = pd.DataFrame(predictions_all, columns=hf1010.columns + ['anomaly_score_dif'])

# print(predictions_all_df)

In [10]:
# print('Mean Anomaly Socre of DIF: ',np.mean(dif.decision_function(data)))
# print('Mean anomaly score of IF: ',IF_h2o.predict(training_frame)[0].mean())
# print('Mean anomaly score of EIF: ',EIF_h2o.predict(training_frame)[0].mean())

# **2D Blob Data**

We will start working on 2d blobs

In [11]:
# blob_params = {'random_state': 42, 'n_samples': 2000, 'n_features': 2}
# data = make_blobs(centers=[[10, 0], [0, 10]], cluster_std=[1, 1], **blob_params)[0]
# fig=plt.figure(figsize=(8,8))
# fig.add_subplot(111)
# plt.plot(data[:,0],data[:,1],'o')
# plt.grid("off")
# # plt.axis("equal")
# plt.xlabel("x")
# plt.ylabel("y")
# plt.xlim([-5,15])
# plt.ylim([-5,15])
# plt.tick_params(direction='out', length=6, width=2)
# plt.title("Double blob")
# plt.show()

In [12]:
# h2o.init()

In [13]:
# training_frame = h2o.H2OFrame(data,column_names = ["x", "y"])

# EIF_h2o = H2OExtendedIsolationForestEstimator(
#                                               model_id = "extended_isolation_forest.hex",
#                                               ntrees = 500,
#                                               sample_size = 256,
#                                               extension_level = 1,
#                                               seed = 42
#                                              )
# EIF_h2o.train(training_frame = training_frame)

# IF_h2o = H2OExtendedIsolationForestEstimator(
#                                               model_id = "isolation_forest.hex",
#                                               ntrees = 500,
#                                               sample_size = 256,
#                                               extension_level = 0,
#                                               seed = 42
#                                              )
# IF_h2o.train(training_frame = training_frame)

# dif=DeepIsolationForest()
# dif.fit(data)

# print(EIF_h2o)
# print(IF_h2o)
# print(dif)

In [14]:
# granularity = 30

# # Heat map data
# xx, yy = np.meshgrid(np.linspace(-5, 15, granularity), np.linspace(-5, 15, granularity))

# # H2O
# hf_heatmap = h2o.H2OFrame(np.c_[xx.ravel(), yy.ravel()], column_names = ["x", "y"])

# #EIF
# h2o_anomaly_score_eif = EIF_h2o.predict(hf_heatmap)
# h2o_anomaly_score_df_eif = h2o_anomaly_score_eif.as_data_frame(use_pandas=True, header=True)
# heatmap_h2o_eif = np.array(h2o_anomaly_score_df_eif["anomaly_score"]).reshape(xx.shape)

# #IF
# h2o_anomaly_score_if = IF_h2o.predict(hf_heatmap)
# h2o_anomaly_score_df_if = h2o_anomaly_score_if.as_data_frame(use_pandas=True, header=True)
# heatmap_h2o_if = np.array(h2o_anomaly_score_df_if["anomaly_score"]).reshape(xx.shape)

# #DIF
# heatmap_data = np.c_[xx.ravel(), yy.ravel()]
# anomaly_scores = dif.decision_function(heatmap_data)
# heatmap_dif = np.array(anomaly_scores).reshape(xx.shape)

In [15]:
# f = plt.figure(figsize=(24, 9))

# def plot_heatmap(heatmap_data, subplot, title):
#     ax1 = f.add_subplot(subplot)
#     levels = np.linspace(0,1,10, endpoint=True)
#     v = np.linspace(0, 1, 12, endpoint=True)
#     v = np.around(v, decimals=1)
#     CS = ax1.contourf(xx, yy, heatmap_data, levels, cmap=plt.cm.YlOrRd)
#     cbar = plt.colorbar(CS, ticks=v)
#     cbar.ax.set_ylabel('Anomaly score', fontsize = 25)
#     cbar.ax.tick_params(labelsize=15)
#     ax1.set_xlabel("x", fontsize = 25)
#     ax1.set_ylabel("y", fontsize = 25)
#     plt.tick_params(labelsize=30)
#     plt.scatter(data[:,0],data[:,1],s=15,c='None',edgecolor='k')
#     plt.axis("equal")
#     plt.title(title,  fontsize=32)

# plot_heatmap(heatmap_h2o_if, 131, "IF")
# plot_heatmap(heatmap_h2o_eif, 132, "EIF")
# plot_heatmap(heatmap_dif,133,"DIF")

# plt.show()

In [16]:
# hf1010 = h2o.H2OFrame([[10,10],[0,0]], column_names = ["x", "y"])

# predictions1010_eif = EIF_h2o.predict(hf1010)
# predictions1010_if = IF_h2o.predict(hf1010)

# predictions1010_all = predictions1010_eif.cbind(predictions1010_if).cbind(hf1010)
# predictions1010_all.rename({"anomaly_score": "anomaly_score_eif", "mean_length": "mean_length_eif", "anomaly_score0": "anomaly_score_if", "mean_length0": "mean_length_if"})

In [17]:
# test_data = np.array(hf1010.as_data_frame())
# predictions_dif = dif.decision_function(test_data,return_confidence=True)

# predictions_all = np.column_stack((test_data, predictions_dif))

# #predictions_all.rename({"anomaly_score": "anomaly_score_dif"})
# predictions_all_df = pd.DataFrame(predictions_all, columns=hf1010.columns + ['anomaly_score_dif','confidence'])

# print(predictions_all_df)

In [18]:
# print('Mean Anomaly Socre of DIF: ',np.mean(dif.decision_function(data)))
# print('Mean anomaly score of IF: ',IF_h2o.predict(training_frame)[0].mean())
# print('Mean anomaly score of EIF: ',EIF_h2o.predict(training_frame)[0].mean())

# **Sinusoidal Data**

In [19]:
# np.random.seed(42)
# n_points = 2000

# x = np.linspace(0, 10, n_points)
# y_true = np.sin(x)
# noise = np.random.normal(0, 0.1, n_points)  # Mean=0, Std=0.1
# y = y_true + noise
# data = np.column_stack((x, y))

# plt.figure(figsize=(10, 6))
# plt.scatter(x, y, label='Noisy Sinusoidal Data')
# plt.title('Sinusoidal Data with Noise')
# plt.xlabel('X')
# plt.ylabel('Y')
# plt.legend()
# plt.grid(True)
# plt.show()

In [20]:
# h2o.init()

In [21]:
# training_frame = h2o.H2OFrame(data,column_names = ["x", "y"])

# EIF_h2o = H2OExtendedIsolationForestEstimator(
#                                               model_id = "extended_isolation_forest.hex",
#                                               ntrees = 500,
#                                               sample_size = 256,
#                                               extension_level = 1,
#                                               seed = 42
#                                              )
# EIF_h2o.train(training_frame = training_frame)

# IF_h2o = H2OExtendedIsolationForestEstimator(
#                                               model_id = "isolation_forest.hex",
#                                               ntrees = 500,
#                                               sample_size = 256,
#                                               extension_level = 0,
#                                               seed = 42
#                                              )
# IF_h2o.train(training_frame = training_frame)

# dif=DeepIsolationForest()
# dif.fit(data)

# print(EIF_h2o)
# print(IF_h2o)
# print(dif)

In [22]:
# granularity = 30

# # Heat map data
# xx, yy = np.meshgrid(np.linspace(-5, 15, granularity), np.linspace(-5, 15, granularity))

# # H2O
# hf_heatmap = h2o.H2OFrame(np.c_[xx.ravel(), yy.ravel()], column_names = ["x", "y"])

# #EIF
# h2o_anomaly_score_eif = EIF_h2o.predict(hf_heatmap)
# h2o_anomaly_score_df_eif = h2o_anomaly_score_eif.as_data_frame(use_pandas=True, header=True)
# heatmap_h2o_eif = np.array(h2o_anomaly_score_df_eif["anomaly_score"]).reshape(xx.shape)

# #IF
# h2o_anomaly_score_if = IF_h2o.predict(hf_heatmap)
# h2o_anomaly_score_df_if = h2o_anomaly_score_if.as_data_frame(use_pandas=True, header=True)
# heatmap_h2o_if = np.array(h2o_anomaly_score_df_if["anomaly_score"]).reshape(xx.shape)

# #DIF
# heatmap_data = np.c_[xx.ravel(), yy.ravel()]
# anomaly_scores = dif.decision_function(heatmap_data)
# heatmap_dif = np.array(anomaly_scores).reshape(xx.shape)

In [23]:
# f = plt.figure(figsize=(24, 9))

# def plot_heatmap(heatmap_data, subplot, title):
#     ax1 = f.add_subplot(subplot)
#     levels = np.linspace(0,1,10, endpoint=True)
#     v = np.linspace(0, 1, 12, endpoint=True)
#     v = np.around(v, decimals=1)
#     CS = ax1.contourf(xx, yy, heatmap_data, levels, cmap=plt.cm.YlOrRd)
#     cbar = plt.colorbar(CS, ticks=v)
#     cbar.ax.set_ylabel('Anomaly score', fontsize = 25)
#     cbar.ax.tick_params(labelsize=15)
#     ax1.set_xlabel("x", fontsize = 25)
#     ax1.set_ylabel("y", fontsize = 25)
#     plt.tick_params(labelsize=30)
#     plt.scatter(data[:,0],data[:,1],s=15,c='None',edgecolor='k')
#     plt.axis("equal")
#     plt.title(title,  fontsize=32)

# plot_heatmap(heatmap_h2o_if, 131, "IF")
# plot_heatmap(heatmap_h2o_eif, 132, "EIF")
# plot_heatmap(heatmap_dif,133,"DIF")

# plt.show()

In [24]:
# hf1010 = h2o.H2OFrame([[10,10],[0,0]], column_names = ["x", "y"])

# predictions1010_eif = EIF_h2o.predict(hf1010)
# predictions1010_if = IF_h2o.predict(hf1010)

# predictions1010_all = predictions1010_eif.cbind(predictions1010_if).cbind(hf1010)
# predictions1010_all.rename({"anomaly_score": "anomaly_score_eif", "mean_length": "mean_length_eif", "anomaly_score0": "anomaly_score_if", "mean_length0": "mean_length_if"})

In [25]:
# test_data = np.array(hf1010.as_data_frame())
# predictions_dif = dif.decision_function(test_data,return_confidence=True)

# predictions_all = np.column_stack((test_data, predictions_dif))

# #predictions_all.rename({"anomaly_score": "anomaly_score_dif"})
# predictions_all_df = pd.DataFrame(predictions_all, columns=hf1010.columns + ['anomaly_score_dif','confidence'])

# print(predictions_all_df)

In [26]:
# print('Mean Anomaly Socre of DIF: ',np.mean(dif.decision_function(data)))
# print('Mean anomaly score of IF: ',IF_h2o.predict(training_frame)[0].mean())
# print('Mean anomaly score of EIF: ',EIF_h2o.predict(training_frame)[0].mean())

# **Testing DIF performance on actual datasets**

We will be testing tabular and time series datasets, unlike the paper, which also included graph data. However, as mentioned previously, not all datasets used in the paper are being considered. Furthermore, the fraud dataset has a roughly even class distribution, so we will split it into an inlier and an outlier dataset and choose the number of outliers to be sampled based on our convenience. In the initial section, 0.17% of the total data is contaminated.

**1. Tabular Datasets**

Three datasets (fraud, pageblocks and shuttle) are loaded and used in this section; covtype, the fourth dataset originally planned, is not loaded in this notebook.

In [27]:
# Load the three tabular benchmark datasets from their Kaggle input mounts,
# reading the CSV files in the order fraud -> shuttle -> pageblocks.
fraud, shuttle, pageblocks = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/creditcardfraud/creditcard.csv',
        '/kaggle/input/shuttle/shuttle_16.csv',
        '/kaggle/input/pageblocks/pageblocks_16.csv',
    )
)

In [28]:
# Print the structure of the fraud frame: column names, non-null counts and dtypes.
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [29]:
# Remove, in place, every row that contains at least one missing value, then print the
# frame summary again so the row count can be compared with the summary printed before.
fraud.dropna(inplace=True)
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [30]:
# Print the structure of the pageblocks frame: column names, non-null counts and dtypes.
pageblocks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5393 entries, 0 to 5392
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   att1    5393 non-null   float64
 1   att2    5393 non-null   float64
 2   att3    5393 non-null   float64
 3   att4    5393 non-null   float64
 4   att5    5393 non-null   float64
 5   att6    5393 non-null   float64
 6   att7    5393 non-null   float64
 7   att8    5393 non-null   float64
 8   att9    5393 non-null   float64
 9   att10   5393 non-null   float64
 10  label   5393 non-null   int64  
dtypes: float64(10), int64(1)
memory usage: 463.6 KB


In [31]:
# Print the structure of the shuttle frame: column names, non-null counts and dtypes.
shuttle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1013 entries, 0 to 1012
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   att1    1013 non-null   float64
 1   att2    1013 non-null   float64
 2   att3    1013 non-null   float64
 3   att4    1013 non-null   float64
 4   att5    1013 non-null   float64
 5   att6    1013 non-null   float64
 6   att7    1013 non-null   float64
 7   att8    1013 non-null   float64
 8   att9    1013 non-null   float64
 9   label   1013 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 79.3 KB


All of the datasets have had their missing (NA) values dealt with and have all columns as int or float types. We will not standardize or normalize any columns. Also, except for cover (covtype), all datasets have the same dimensions as in the original paper, so we will not be adding or deleting any columns.

Our approach will be the following:

1. Use train test split to split the dataset into a training and test set.

2. Train the dataset using all 3 models, the sklearn if, h2o eif and deepod dif methods.

3. Calculate the AUC_ROC scores and AUC_PR scores

In [35]:
# X_train, X_test, y_train, y_test = train_test_split(fraud.drop(columns=['Class']), fraud['Class'], test_size=0.2, random_state=42)

# if_model_sklearn = IsolationForest(random_state=42,contamination=0.0946,n_estimators=300,max_samples=256)
# if_model_sklearn.fit(X_train,y_train)

# df_target = y_train.to_frame()
# h2o.init()
# hf_train =h2o.H2OFrame(X_train)
# hf_target = h2o.H2OFrame(df_target.astype('str'))
# hf_train['target'] = hf_target
# eif = H2OExtendedIsolationForestEstimator(
#                                               model_id = "extended_isolation_forest.hex",
#                                               ntrees = 300,
#                                               sample_size = 256,
#                                               extension_level = 1,
#                                               seed = 42
#                                              )
# # NOTE(review): x=hf_train.names is taken after the 'target' column was attached to
# # hf_train earlier in this cell, so the list of predictors contains the label column.
# # Verify that H2O does not feed the label to this unsupervised model (hf_test below has
# # no 'target' column) — TODO confirm, or pass x=list(X_train.columns) instead.
# eif.train(training_frame=hf_train, x=hf_train.names, y='target')

# # Train DeepOD DIF model
# dif=DeepIsolationForest()
# dif.fit(X_train.values,y_train)

# # Generate predictions on the test set
# # For sklearn Isolation Forest
# # NOTE(review): IsolationForest.predict returns hard labels (+1 inlier, -1 outlier), not a
# # continuous score, and its polarity is the reverse of a target where 1 marks the anomaly
# # (presumably the case for 'Class' — confirm). That would explain a ROC AUC far below 0.5.
# # Consider scoring with -if_model_sklearn.score_samples(X_test) instead.
# IF = if_model_sklearn.predict(X_test)
# # For H2O Extended Isolation Forest
# hf_test = h2o.H2OFrame(X_test)
# predictions_h2o = eif.predict(hf_test)
# # For DeepOD DIF model
# # NOTE(review): dif.predict presumably returns binary outlier labels as well; the blob
# # sections of this notebook use dif.decision_function for continuous anomaly scores —
# # confirm against the DeepOD API and prefer the scores for ROC/PR.
# predictions_dif = dif.predict(X_test.values)

# # Evaluate ROC AUC and PR scores
# # Both metrics rank samples by the second argument, so it should be a score that grows
# # with "more anomalous"; see the notes above about the values passed in here.
# roc_auc_sklearn = roc_auc_score(y_test, IF)
# pr_auc_sklearn = average_precision_score(y_test, IF)

# # predictions_h2o[0] selects the first column of the prediction frame (presumably
# # "anomaly_score", the column name used in the heatmap cells — confirm).
# roc_auc_h2o = roc_auc_score(y_test, np.array(predictions_h2o[0].as_data_frame().values))
# pr_auc_h2o = average_precision_score(y_test, predictions_h2o[0].as_data_frame().values)

# roc_auc_deepod = roc_auc_score(y_test, predictions_dif)
# pr_auc_deepod = average_precision_score(y_test, predictions_dif)

# # Print the scores
# print("ROC AUC Score (sklearn Isolation Forest):", roc_auc_sklearn)
# print("PR AUC Score (sklearn Isolation Forest):", pr_auc_sklearn)

# print("ROC AUC Score (H2O Extended Isolation Forest):", roc_auc_h2o)
# print("PR AUC Score (H2O Extended Isolation Forest):", pr_auc_h2o)

# print("ROC AUC Score (DeepOD DIF model):", roc_auc_deepod)
# print("PR AUC Score (DeepOD DIF model):", pr_auc_deepod)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,36 mins 43 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 28 days
H2O_cluster_name:,H2O_from_python_unknownUser_314m94
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.991 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
extendedisolationforest Model Build progress: |██████████████████████████████████| (done) 100%
Start Training...


100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


Start Inference on the training data...
Start Inference...


100%|██████████| 50/50 [09:28<00:00, 11.36s/it]


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
extendedisolationforest prediction progress: |███████████████████████████████████| (done) 100%
Start Inference...


100%|██████████| 50/50 [11:38<00:00, 13.98s/it]


ROC AUC Score (sklearn Isolation Forest): 0.0515551141389753
PR AUC Score (sklearn Isolation Forest): 0.0008762212586499051
ROC AUC Score (H2O Extended Isolation Forest): 0.9520325274740358
PR AUC Score (H2O Extended Isolation Forest): 0.1404023340128077
ROC AUC Score (DeepOD DIF model): 0.6204290328959248
PR AUC Score (DeepOD DIF model): 0.003977225397399574


In [46]:
# X_train, X_test, y_train, y_test = train_test_split(shuttle.drop(columns=['label']), shuttle['label'], test_size=0.2, random_state=42)

# if_model_sklearn = IsolationForest(random_state=42,contamination=0.0946,n_estimators=300,max_samples=256)
# if_model_sklearn.fit(X_train,y_train)

# df_target = y_train.to_frame()
# h2o.init()
# hf_train =h2o.H2OFrame(X_train)
# hf_target = h2o.H2OFrame(df_target.astype('str'))
# hf_train['target'] = hf_target
# eif = H2OExtendedIsolationForestEstimator(
#                                               model_id = "extended_isolation_forest.hex",
#                                               ntrees = 300,
#                                               sample_size = 256,
#                                               extension_level = 1,
#                                               seed = 42
#                                              )
# eif.train(training_frame=hf_train, x=hf_train.names, y='target')

# # Train DeepOD DIF model
# dif=DeepIsolationForest()
# dif.fit(X_train.values,y_train)

# # Generate predictions on the test set
# # NOTE(review): the predict() calls below yield hard labels rather than continuous scores
# # (sklearn: +1 inlier / -1 outlier), so the ROC/PR values are computed from labels.
# # Consider -if_model_sklearn.score_samples(X_test) and dif.decision_function(X_test.values),
# # and confirm which value of 'label' marks the anomaly class in this CSV.
# # For sklearn Isolation Forest
# IF = if_model_sklearn.predict(X_test)
# # For H2O Extended Isolation Forest
# hf_test = h2o.H2OFrame(X_test)
# predictions_h2o = eif.predict(hf_test)
# # For DeepOD DIF model
# predictions_dif = dif.predict(X_test.values)

# # Evaluate ROC AUC and PR scores
# # Both metrics rank samples by the second argument, which should grow with "more anomalous".
# roc_auc_sklearn = roc_auc_score(y_test, IF)
# pr_auc_sklearn = average_precision_score(y_test, IF)

# # predictions_h2o[0] selects the first column of the prediction frame (presumably
# # "anomaly_score" — confirm).
# roc_auc_h2o = roc_auc_score(y_test, np.array(predictions_h2o[0].as_data_frame().values))
# pr_auc_h2o = average_precision_score(y_test, predictions_h2o[0].as_data_frame().values)

# roc_auc_deepod = roc_auc_score(y_test, predictions_dif)
# pr_auc_deepod = average_precision_score(y_test, predictions_dif)

# # Print the scores
# print("ROC AUC Score (sklearn Isolation Forest):", roc_auc_sklearn)
# print("PR AUC Score (sklearn Isolation Forest):", pr_auc_sklearn)

# print("ROC AUC Score (H2O Extended Isolation Forest):", roc_auc_h2o)
# print("PR AUC Score (H2O Extended Isolation Forest):", pr_auc_h2o)

# print("ROC AUC Score (DeepOD DIF model):", roc_auc_deepod)
# print("PR AUC Score (DeepOD DIF model):", pr_auc_deepod)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,2 hours 23 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 28 days
H2O_cluster_name:,H2O_from_python_unknownUser_8ozyzj
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.665 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
extendedisolationforest Model Build progress: |██████████████████████████████████| (done) 100%
Start Training...


100%|██████████| 50/50 [00:01<00:00, 49.97it/s]


Start Inference on the training data...
Start Inference...


100%|██████████| 50/50 [00:01<00:00, 27.23it/s]


extendedisolationforest prediction progress: |███████████████████████████████████| (done) 100%
Start Inference...


100%|██████████| 50/50 [00:01<00:00, 27.59it/s]

ROC AUC Score (sklearn Isolation Forest): 0.5379615384615385
PR AUC Score (sklearn Isolation Forest): 0.9881296000656159
ROC AUC Score (H2O Extended Isolation Forest): 0.5
PR AUC Score (H2O Extended Isolation Forest): 0.9871668311944719
ROC AUC Score (DeepOD DIF model): 0.044499999999999984
PR AUC Score (DeepOD DIF model): 0.976965845963262





In [41]:
# X_train, X_test, y_train, y_test = train_test_split(pageblocks.drop(columns=['label']), pageblocks['label'], test_size=0.2, random_state=42)

# if_model_sklearn = IsolationForest(random_state=42,contamination=0.0946,n_estimators=300,max_samples=256)
# if_model_sklearn.fit(X_train,y_train)

# df_target = y_train.to_frame()
# h2o.init()
# hf_train =h2o.H2OFrame(X_train)
# hf_target = h2o.H2OFrame(df_target.astype('str'))
# hf_train['target'] = hf_target
# eif = H2OExtendedIsolationForestEstimator(
#                                               model_id = "extended_isolation_forest.hex",
#                                               ntrees = 300,
#                                               sample_size = 256,
#                                               extension_level = 1,
#                                               seed = 42
#                                              )
# eif.train(training_frame=hf_train, x=hf_train.names, y='target')

# # Train DeepOD DIF model
# dif=DeepIsolationForest()
# dif.fit(X_train.values,y_train)

# # Generate predictions on the test set
# # NOTE(review): the predict() calls below yield hard labels rather than continuous scores
# # (sklearn: +1 inlier / -1 outlier), so the ROC/PR values are computed from labels.
# # Consider -if_model_sklearn.score_samples(X_test) and dif.decision_function(X_test.values),
# # and confirm which value of 'label' marks the anomaly class in this CSV.
# # For sklearn Isolation Forest
# IF = if_model_sklearn.predict(X_test)
# # For H2O Extended Isolation Forest
# hf_test = h2o.H2OFrame(X_test)
# predictions_h2o = eif.predict(hf_test)
# # For DeepOD DIF model
# predictions_dif = dif.predict(X_test.values)

# # Evaluate ROC AUC and PR scores
# # Both metrics rank samples by the second argument, which should grow with "more anomalous".
# roc_auc_sklearn = roc_auc_score(y_test, IF)
# pr_auc_sklearn = average_precision_score(y_test, IF)

# # predictions_h2o[0] selects the first column of the prediction frame (presumably
# # "anomaly_score" — confirm).
# roc_auc_h2o = roc_auc_score(y_test, np.array(predictions_h2o[0].as_data_frame().values))
# pr_auc_h2o = average_precision_score(y_test, predictions_h2o[0].as_data_frame().values)

# roc_auc_deepod = roc_auc_score(y_test, predictions_dif)
# pr_auc_deepod = average_precision_score(y_test, predictions_dif)

# # Print the scores
# print("ROC AUC Score (sklearn Isolation Forest):", roc_auc_sklearn)
# print("PR AUC Score (sklearn Isolation Forest):", pr_auc_sklearn)

# print("ROC AUC Score (H2O Extended Isolation Forest):", roc_auc_h2o)
# print("PR AUC Score (H2O Extended Isolation Forest):", pr_auc_h2o)

# print("ROC AUC Score (DeepOD DIF model):", roc_auc_deepod)
# print("PR AUC Score (DeepOD DIF model):", pr_auc_deepod)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 30 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 26 days
H2O_cluster_name:,H2O_from_python_unknownUser_7s3wfw
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.968 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
extendedisolationforest Model Build progress: |██████████████████████████████████| (done) 100%
Start Training...


100%|██████████| 50/50 [00:01<00:00, 36.22it/s]


Start Inference on the training data...
Start Inference...


100%|██████████| 50/50 [00:06<00:00,  7.87it/s]


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
extendedisolationforest prediction progress: |███████████████████████████████████| (done) 100%
Start Inference...


100%|██████████| 50/50 [00:01<00:00, 28.96it/s]


ROC AUC Score (sklearn Isolation Forest): 0.34291950309603475
PR AUC Score (sklearn Isolation Forest): 0.07983718401174902
ROC AUC Score (H2O Extended Isolation Forest): 0.5
PR AUC Score (H2O Extended Isolation Forest): 0.09916589434661724
ROC AUC Score (DeepOD DIF model): 0.6949780777662398
PR AUC Score (DeepOD DIF model): 0.25975369074119214


# **Scalability to high dimensional, large scale data**

The code in the official GitHub repository will be used to generate synthetic data. Then, we will fit the DeepOD DIF, sklearn IF and H2O EIF models on these datasets. After that, we will plot two graphs: one showing the fitting time vs. the number of dimensions, and the other showing the fitting time vs. the data size. Naturally, we will train DIF on a TPU, as it is a deep learning model. The important thing we want from the graphs is the trend, not the exact time, so running on different hardware should not really be an issue. Also, if a model is unable to run for any particular dataset due to space limitations, it will be assumed to take infinite time (recorded as 10**10 seconds in the code).

In [None]:
# df1_dim=pd.read_csv('/kaggle/working/scal_dim-0@5000-16.csv')
# df2_dim=pd.read_csv('/kaggle/working/scal_dim-1@5000-32.csv')
# df3_dim=pd.read_csv('/kaggle/working/scal_dim-2@5000-64.csv')
# df4_dim=pd.read_csv('/kaggle/working/scal_dim-3@5000-128.csv')
# df5_dim=pd.read_csv('/kaggle/working/scal_dim-4@5000-256.csv')
# df6_dim=pd.read_csv('/kaggle/working/scal_dim-5@5000-512.csv')
# df7_dim=pd.read_csv('/kaggle/working/scal_dim-6@5000-1024.csv')
# df8_dim=pd.read_csv('/kaggle/working/scal_dim-7@5000-2048.csv')
# df9_dim=pd.read_csv('/kaggle/working/scal_dim-8@5000-4096.csv')

In [None]:
# def fit_sklearn_if(df, contamination=0.1):
#     start_time = time.time()
#     try:
#         if_model = IsolationForest(contamination=float(contamination), random_state=42)
#         X=df.drop(['class'],axis=1)
#         y=df['class']
#         if_model.fit(X,y)  # Fit on features (drop the target column)
#         end_time = time.time()
#         return end_time - start_time
#     except Exception as e:
#         print(f"Error in isolation forest: {e}")
#         return 10**10

# def fit_h2o_eif(df):
#     start_time = time.time()
#     try:
#         h2o.init()
#         X=df.drop(['class'],axis=1)
#         y=df['class'].to_frame()
#         predictors=df.columns
#         hf = h2o.H2OFrame(X)
#         hf_target = h2o.H2OFrame(y.astype('str'))
#         hf['class'] = hf_target
#         eif = H2OExtendedIsolationForestEstimator(model_id="extended_isolation_forest.hex",
#                                                   ntrees = 500,
#                                                   sample_size = 256,
#                                                   extension_level = len(predictors)-1,
#                                                   seed=42
#         )
#         eif.train(training_frame=hf,x=hf.names, y='class')
#         end_time = time.time()
#         return end_time - start_time
#     except Exception as e:
#         print(f"Error in extended isolation forest: {e}")
#         return 10**10  # Set time to infinity if an error occurs

# def fit_deepod_dif(df):
#     start_time = time.time()
#     try:
#         X=df.drop(['class'],axis=1)
#         y=df['class']
#         dif = DeepIsolationForest()
#         dif.fit(X.values,y)  # Fit on features (drop the target column)
#         end_time = time.time()
#         return end_time - start_time
#     except Exception as e:
#         print(f"Error in deep isolation forest: {e}")
#         return 10**10

# # Define datasets
# datasets = [df1_dim, df2_dim, df3_dim, df4_dim, df5_dim, df6_dim, df7_dim, df8_dim, df9_dim]
# dimensions = [2**i for i in range(4, 13)]  # Dimensions range from 16 to 4096

# # Fit models and record time taken
# if_times = []
# eif_times = []
# dif_times = []
# for df in datasets:
#     if_times.append(fit_sklearn_if(df,0.1))
#     eif_times.append(fit_h2o_eif(df))
#     dif_times.append(fit_deepod_dif(df))
#     print("Time estimation done for df of size: ",df.shape[1])

# markers = ['o', '+', '*']

# # Plot results with markers
# plt.figure(figsize=(10, 6))
# plt.plot(dimensions, if_times, marker=markers[0], color='blue', label='Isolation Forest (sklearn)')
# plt.plot(dimensions, eif_times, marker=markers[1], color='red', label='Extended Isolation Forest (H2O)')
# plt.plot(dimensions, dif_times, marker=markers[2], color='green', label='Deep Isolation Forest (DeepOD)')
# plt.yscale('log')
# plt.title('Model Fitting Time vs Number of Dimensions')
# plt.xlabel('Number of Dimensions')
# plt.ylabel('Time (seconds)')
# plt.legend()
# plt.grid(True)
# plt.show()

In [None]:
# size_range = [1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 256000]
# for ii, size in enumerate(size_range):
#     dim = 32
#     n_nor = int(size * 0.95)
#     n_ano = int(size * 0.05)
#     df = generate_data(n_nor=n_nor, n_ano=n_ano, dim=dim)
#     name = "scal_size-" + str(ii) + "@" + str(size) + "-" + str(dim) + ".csv"
#     df.to_csv(os.path.join(dataset_root, name), index=False)

In [None]:
# df1_size=pd.read_csv('/kaggle/working/scal_size-0@1000-32.csv')
# df2_size=pd.read_csv('/kaggle/working/scal_size-1@2000-32.csv')
# df3_size=pd.read_csv('/kaggle/working/scal_size-2@4000-32.csv')
# df4_size=pd.read_csv('/kaggle/working/scal_size-3@8000-32.csv')
# df5_size=pd.read_csv('/kaggle/working/scal_size-4@16000-32.csv')
# df6_size=pd.read_csv('/kaggle/working/scal_size-5@32000-32.csv')
# df7_size=pd.read_csv('/kaggle/working/scal_size-6@64000-32.csv')
# df8_size=pd.read_csv('/kaggle/working/scal_size-7@128000-32.csv')
# df9_size=pd.read_csv('/kaggle/working/scal_size-8@256000-32.csv')

In [None]:
# def fit_sklearn_if(df, contamination=0.1):
#     start_time = time.time()
#     try:
#         if_model = IsolationForest(contamination=float(contamination), random_state=42)
#         X=df.drop(['class'],axis=1)
#         y=df['class']
#         if_model.fit(X,y)  # Fit on features (drop the target column)
#         end_time = time.time()
#         return end_time - start_time
#     except Exception as e:
#         print(f"Error in isolation forest: {e}")
#         return 10**10

# def fit_h2o_eif(df):
#     start_time = time.time()
#     try:
#         X=df.drop(['class'],axis=1)
#         y=df['class'].to_frame()
#         predictors=df.columns
#         hf = h2o.H2OFrame(X)
#         hf_target = h2o.H2OFrame(y.astype('str'))
#         hf['class'] = hf_target
#         eif = H2OExtendedIsolationForestEstimator(model_id="extended_isolation_forest.hex",
#                                                   ntrees = 500,
#                                                   sample_size = 256,
#                                                   extension_level = len(predictors)-1,
#                                                   seed=42
#         )
#         eif.train(training_frame=hf,x=hf.names, y='class')
#         end_time = time.time()
#         return end_time - start_time
#     except Exception as e:
#         print(f"Error in extended isolation forest: {e}")
#         return 10**10  # Set time to infinity if an error occurs

# def fit_deepod_dif(df):
#     start_time = time.time()
#     try:
#         X=df.drop(['class'],axis=1)
#         y=df['class']
#         dif = DeepIsolationForest()
#         dif.fit(X.values,y)  # Fit on features (drop the target column)
#         end_time = time.time()
#         return end_time - start_time
#     except Exception as e:
#         print(f"Error in deep isolation forest: {e}")
#         return 10**10

# # Define datasets
# datasets = [df1_size, df2_size, df3_size, df4_size, df5_size, df6_size, df7_size, df8_size, df9_size]
# dimensions = [1000,2000,4000,8000,16000,32000,64000,128000,256000]  # Dataset sizes (number of rows) range from 1000 to 256000; name kept from the dimension experiment

# # Fit models and record time taken
# if_times = []
# eif_times = []
# dif_times = []
# for df in datasets:
#     if_times.append(fit_sklearn_if(df,0.1))
#     eif_times.append(fit_h2o_eif(df))
#     dif_times.append(fit_deepod_dif(df))
#     print("Time estimation done for df of size: ",df.shape[0])

# markers = ['o', '+', '*']

# # Plot results with markers
# plt.figure(figsize=(10, 6))
# plt.plot(dimensions, if_times, marker=markers[0], color='blue', label='Isolation Forest (sklearn)')
# plt.plot(dimensions, eif_times, marker=markers[1], color='red', label='Extended Isolation Forest (H2O)')
# plt.plot(dimensions, dif_times, marker=markers[2], color='green', label='Deep Isolation Forest (DeepOD)')
# plt.yscale('log')
# plt.title('Model Fitting Time vs Data Size')
# plt.xlabel('Size')
# plt.ylabel('Time (seconds)')
# plt.legend()
# plt.grid(True)
# plt.show()