In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from pyod.models.iforest import IForest

from itertools import product


import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from sklearn import __version__ as sklearn_version
print(pd.__version__)
print(sklearn_version)

from scipy import io as sio

import h5py

In [None]:
class UFIF():
    """
    Universal Filled Isolation Forest.

    Thin wrapper around pyod's ``IForest``. Before fitting, the training
    data is augmented with a regular grid of synthetic points spanning
    ``[0, 1]`` along every feature, and the wrapped forest is fitted on the
    combined set. Scoring is delegated unchanged to the wrapped forest.

    NOTE(review): the grid only covers the unit hypercube, so inputs are
    presumably expected to be min-max scaled per feature -- confirm.

    Parameters
    ----------
    n_estimators : int
        Forwarded to ``IForest``.
    contamination : float
        Forwarded to ``IForest``.
    random_state : int or None
        Forwarded to ``IForest``.
    max_depth : int
        Forwarded to ``IForest`` only when its constructor accepts such a
        parameter; otherwise it is ignored with a warning.
    """
    contamination=0.1

    # Placeholders kept for backward compatibility; no method of this class
    # assigns them.
    X=None
    X_fill=None
    X_filled=None

    # String annotation so that defining the class does not require
    # ``IForest`` to be resolvable at class-creation time.
    iforest:"IForest"=None

    # Grid sizes above this many rows trigger a warning in ``get_filling``.
    _FILL_WARN_ROWS=10_000_000

    def __init__(self, n_estimators=100, contamination=0.1, random_state=None, max_depth=8):
        import inspect
        import warnings

        self.contamination = contamination
        self.max_depth = max_depth

        forest_kwargs = dict(
            n_estimators=n_estimators,
            contamination=contamination,
            random_state=random_state,
        )

        # Only forward ``max_depth`` when the installed IForest can take it;
        # passing an unknown keyword would raise TypeError on construction.
        accepted = inspect.signature(IForest).parameters
        takes_max_depth = "max_depth" in accepted or any(
            p.kind is inspect.Parameter.VAR_KEYWORD for p in accepted.values()
        )
        if takes_max_depth:
            forest_kwargs["max_depth"] = max_depth
        else:
            warnings.warn(
                "IForest does not accept 'max_depth'; the value "
                f"max_depth={max_depth!r} is ignored."
            )

        self.iforest = IForest(**forest_kwargs)

    def get_filling(self, n:int):
        """
        Build the synthetic filling grid for ``n`` features.

        Returns an array with ``11 ** n`` rows and ``n`` columns holding
        every combination of the levels 0, 0.1, ..., 1, ordered with the
        last column varying fastest.
        """
        levels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1]

        # The grid grows exponentially with the number of features; tell the
        # user before attempting an allocation that is unlikely to succeed.
        n_rows = len(levels) ** n if n > 0 else 1
        if n_rows > self._FILL_WARN_ROWS:
            import warnings
            warnings.warn(
                f"UFIF filling grid for {n} features has {n_rows} rows; "
                "building it may exhaust memory."
            )

        return np.array(list(product(levels, repeat=n)))

    def fit(self, X, y=None):
        """
        Fit the wrapped forest on ``X`` stacked on top of the filling grid.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        n_features = X.shape[1]
        X_fill = self.get_filling(n_features)
        X_filled = np.concatenate([X, X_fill], axis=0)
        self.iforest.fit(X_filled)
        return self

    def predict_proba(self, X):
        """Return ``predict_proba(X)`` of the wrapped forest."""
        return self.iforest.predict_proba(X)

In [None]:
def get_X_y_73(filename):
    """
    Load ``X`` and ``y`` from an HDF5 file and min-max scale the features.

    Presumably meant for MATLAB v7.3 ``.mat`` files, which are HDF5-based
    (hence h5py) -- confirm against the data files.

    Every top-level key of the file is printed together with its object.
    ``X`` is read from dataset ``'X'`` and transposed; ``y`` is the first
    row of dataset ``'y'``. Each column of ``X`` is then rescaled to
    ``[0, 1]``; a constant column maps to all zeros.

    Returns
    -------
    (X, y) : tuple of numpy arrays
    """
    # Context manager guarantees the file handle is released; the data is
    # copied into memory with ``[()]`` (``Dataset.value`` no longer exists
    # in h5py >= 3.0) before the file closes.
    with h5py.File(filename, 'r') as f:
        for k, v in f.items():
            print(k, v)
        X = f['X'][()].T
        y = f['y'][()][0]

    col_min = np.min(X, axis=0)
    col_range = np.max(X, axis=0) - col_min
    # A constant column has zero range; divide by 1 instead so it becomes
    # all zeros rather than NaN.
    col_range[col_range == 0] = 1
    X = (X - col_min) / col_range

    return X, y

def get_X_y_50(filename):
    """
    Load ``X`` and ``y`` from a ``.mat`` file via ``scipy.io.loadmat`` and
    min-max scale the features.

    Each column of ``X`` is rescaled to ``[0, 1]``; a constant column maps
    to all zeros. ``y`` is returned exactly as loaded.

    NOTE(review): ``loadmat`` returns 2-D arrays by default, so ``y`` is
    likely shaped ``(n_samples, 1)`` -- flatten it where a 1-D label vector
    is required.

    Returns
    -------
    (X, y) : tuple of numpy arrays
    """
    dat = sio.loadmat(filename)
    X = dat['X']
    y = dat['y']

    col_min = np.min(X, axis=0)
    col_range = np.max(X, axis=0) - col_min
    # A constant column has zero range; divide by 1 instead so it becomes
    # all zeros rather than NaN.
    col_range[col_range == 0] = 1
    X = (X - col_min) / col_range

    return X, y

def compare(X_train, y_train, X_test, y_test,
            random_states=(11, 22, 33, 44, 55, 66, 77, 88, 99, 101)):
    """
    Compare plain ``IForest`` against ``UFIF`` by ROC AUC over several seeds.

    For every seed both detectors are fitted on ``X_train`` and scored on
    ``X_test`` using column 1 of ``predict_proba``. The contamination rate
    passed to both detectors is the fraction of non-zero labels in
    ``y_train``.

    Parameters
    ----------
    X_train, X_test : 2-D array-like
        Feature matrices.
    y_train, y_test : array-like
        Labels; flattened to 1-D, so ``(n, 1)`` column vectors are accepted.
        Assumed to be 0 for normal and 1 for anomalous samples -- the
        anomaly count is simply their sum.
    random_states : iterable of int
        Seeds to average over.

    Returns
    -------
    (n_samples, n_features, n_anomalies, if_auc, ufif_auc)
        Training-set size, feature count, training anomaly count, and the
        mean AUC of ``IForest`` and ``UFIF`` respectively.

    Raises
    ------
    ValueError
        If the training set is empty.
    """
    # Column-vector labels would otherwise turn the anomaly count (and the
    # contamination derived from it) into a 1-element array.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    n_features = X_train.shape[1]
    n_samples = len(y_train)
    if n_samples == 0:
        raise ValueError("compare() needs at least one training sample")
    n_anomalies = int(np.sum(y_train))
    print(f"n_samples:{n_samples}, n_anomalies:{n_anomalies}")

    # pyod only accepts contamination in (0, 0.5]; clamp so that a training
    # split with no (or mostly) anomalies does not abort the comparison.
    contamination = min(max(n_anomalies / n_samples, 1.0 / n_samples), 0.5)

    if_aucs=[]
    ufif_aucs=[]
    for random_state in random_states:
        print(f'random_state {random_state}')
        clf = IForest(contamination=contamination, random_state=random_state)
        clf.fit(X_train)
        y_proba=clf.predict_proba(X_test)
        if_auc=roc_auc_score(y_test ,y_proba[:,1])
        if_aucs.append(if_auc)

        ufif = UFIF(contamination=contamination, random_state=random_state)
        ufif.fit(X_train)
        y_proba=ufif.predict_proba(X_test)
        ufif_auc=roc_auc_score(y_test ,y_proba[:,1])
        ufif_aucs.append(ufif_auc)

        print(if_auc, ufif_auc)

    if_auc = np.mean(if_aucs)
    ufif_auc = np.mean(ufif_aucs)
    return n_samples, n_features, n_anomalies, if_auc, ufif_auc

# SMTP

In [None]:
# Load the SMTP data, hold out 90% for testing, and compare both detectors.
X, y = get_X_y_73('data/smtp.mat')
# Seeded so the cell reproduces on re-run; stratified so the small training
# split keeps the same anomaly ratio as the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.1, random_state=0, stratify=np.ravel(y)
)
compare(X_train, y_train, X_test, y_test)

# HTTP

In [None]:
# Load the HTTP data, hold out 90% for testing, and compare both detectors.
X, y = get_X_y_73('data/http.mat')
# Seeded so the cell reproduces on re-run; stratified so the small training
# split keeps the same anomaly ratio as the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.1, random_state=0, stratify=np.ravel(y)
)
compare(X_train, y_train, X_test, y_test)

In [None]:
# Load the pima data, split it in half, and compare both detectors.
# NOTE(review): UFIF adds an 11**n_features grid to the training data --
# check this dataset's feature count before running.
X, y = get_X_y_50('data/pima.mat')
# Seeded so the cell reproduces on re-run; stratified so both halves keep
# the same anomaly ratio. Labels are flattened for the stratification key.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=0, stratify=np.ravel(y)
)
compare(X_train, y_train, X_test, y_test)

In [None]:
# Load the cover data, split it, and compare both detectors.
# compare() takes four arrays and returns a 5-tuple, so the data must be
# split first and the two AUCs picked from the end of the result.
# NOTE(review): UFIF adds an 11**n_features grid to the training data --
# check this dataset's feature count before running.
X, y = get_X_y_50('data/cover.mat')
# NOTE(review): train_size=0.1 mirrors the SMTP/HTTP cells -- confirm the
# intended split for this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.1, random_state=0, stratify=np.ravel(y)
)
cover_result = compare(X_train, y_train, X_test, y_test)
ca, cb = cover_result[-2:]  # mean IForest AUC, mean UFIF AUC
cover_result