## Import Python Packages

In [3]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

## Import PyOD packages and methods

In [14]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

## Import Metrics Packages

In [5]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [6]:
#Define Data file and read X and y
mat_file_list = ['arrhythmia.mat',
'cardio.mat',
'glass.mat',
'ionosphere.mat',
'letter.mat',
'lympho.mat',
'mnist.mat',
'musk.mat',
'optdigits.mat',
'pendigits.mat',
'pima.mat',
'satellite.mat',
'satimage-2.mat',
'shuttle.mat',
'vertebral.mat',
'vowels.mat',
'wbc.mat']

In [10]:
data = loadmat('Anamoly_detect_data/cardio.mat')
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [11]:
len(data)
type(data)

dict

In [12]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [13]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

## Input(Independent) Feature Shape in Mat File format

In [16]:
type(data['X']), data['X'].shape

(numpy.ndarray, (1831, 21))

## Target(Dependent) Feature Shape in Mat File format

In [15]:
type(data['y']), data['y'].shape

(numpy.ndarray, (1831, 1))

In [17]:
df_columns = ['Data', '#Samples', '#Dimension', 'Outlier Perc',
              'PCA', 'MCD', 'OCSVM',
              'ABOD', 'CBLOF', 'KNN','HBOS',
              'LOF','IForest','FB']

## ROC Performance Evolution Table

In [18]:
roc_df = pd.DataFrame(columns= df_columns)
roc_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,PCA,MCD,OCSVM,ABOD,CBLOF,KNN,HBOS,LOF,IForest,FB


## precision_n_scores Performance Evolution Table

In [19]:
prf_df = pd.DataFrame(columns= df_columns)
prf_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,PCA,MCD,OCSVM,ABOD,CBLOF,KNN,HBOS,LOF,IForest,FB


## Time Performance Evolution Table

In [20]:
time_df = pd.DataFrame(columns= df_columns)
time_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,PCA,MCD,OCSVM,ABOD,CBLOF,KNN,HBOS,LOF,IForest,FB


## Exploring All Mat Files

In [21]:
random_state = np.random.RandomState(42)

for mat_file in mat_file_list:
    print("\n...Processing", mat_file, '...')
    #mat = loadmat(mat_file)
    #incase files are in folder or other loaction
    mat = loadmat(os.path.join("Anamoly_detect_data",mat_file))
    
    X = mat['X']
    y= mat['y'].ravel()
    
    outliers_fraction = np.count_nonzero(y)/len(y)
    
    outliers_percentage =round(outliers_fraction * 100, ndigits=4)
    
    # construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prf_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    
    # Spliting testdata
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.4, random_state = random_state)
    
    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train,X_test)
    
    classifiers = {'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction,random_state=random_state),
                  'Minimum Covariance Determinant (MCD)':MCD(contamination= outliers_fraction,random_state=random_state),
                  'One-Class Support Vector Machine (OCSVM)':OCSVM(contamination= outliers_fraction),
                  'Angle-Based Outlier Detection (ABOD)':ABOD(contamination= outliers_fraction),
                  'Clustering Based LOF (CBLOF)':CBLOF(contamination= outliers_fraction,random_state=random_state,check_estimator=False),
                  'K-Nearest-Neighbors (KNN)':KNN(contamination= outliers_fraction),
                  'Histogram Based Outlier Score (HBOS)':HBOS(contamination=outliers_fraction),
                  'Local Outlier Factor (LOF)':LOF(contamination = outliers_fraction),
                  'Isolation Forest':IForest(contamination= outliers_fraction,random_state=random_state),
                  'Feature bagging (FB)':FeatureBagging(contamination = outliers_fraction,random_state=random_state)
                  }
    
    from time import time
    for clf_name, clf in classifiers.items():
        t0 =time()
        clf.fit(X_train_norm)
        test_scores =clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1-t0, ndigits=4)
        
        
        roc =round(roc_auc_score(y_test,test_scores),ndigits=4)
        prf =round(precision_n_scores(y_test,test_scores),ndigits=4)
        
        print('{clf_name} ROC:{roc}, Precision @rank n:{prf}, ''execution time:{duration}s'.format(clf_name =clf_name,roc=roc,prf =prf, duration=duration))
        
        roc_list.append(roc)
        prf_list.append(prf)
        time_list.append(duration)
        
    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prf_list).transpose()
    temp_df.columns = df_columns
    prf_df = pd.concat([prf_df, temp_df], axis=0)


...Processing arrhythmia.mat ...
Principal Component Analysis (PCA) ROC:0.7997, Precision @rank n:0.5, execution time:0.3008s




Minimum Covariance Determinant (MCD) ROC:0.8228, Precision @rank n:0.4286, execution time:0.5635s
One-Class Support Vector Machine (OCSVM) ROC:0.7986, Precision @rank n:0.5, execution time:0.0539s
Angle-Based Outlier Detection (ABOD) ROC:0.7687, Precision @rank n:0.3571, execution time:1.7846s
Clustering Based LOF (CBLOF) ROC:0.788, Precision @rank n:0.4643, execution time:1.599s
K-Nearest-Neighbors (KNN) ROC:0.782, Precision @rank n:0.5, execution time:0.0997s
Histogram Based Outlier Score (HBOS) ROC:0.8511, Precision @rank n:0.5714, execution time:1.657s
Local Outlier Factor (LOF) ROC:0.7787, Precision @rank n:0.4643, execution time:0.0708s
Isolation Forest ROC:0.8343, Precision @rank n:0.5357, execution time:0.3594s
Feature bagging (FB) ROC:0.7768, Precision @rank n:0.4286, execution time:0.5206s

...Processing cardio.mat ...
Principal Component Analysis (PCA) ROC:0.9444, Precision @rank n:0.6267, execution time:0.005s




Minimum Covariance Determinant (MCD) ROC:0.8221, Precision @rank n:0.44, execution time:0.4997s
One-Class Support Vector Machine (OCSVM) ROC:0.9292, Precision @rank n:0.5467, execution time:0.0938s
Angle-Based Outlier Detection (ABOD) ROC:0.6114, Precision @rank n:0.2533, execution time:0.3401s
Clustering Based LOF (CBLOF) ROC:0.8566, Precision @rank n:0.52, execution time:0.1695s
K-Nearest-Neighbors (KNN) ROC:0.7601, Precision @rank n:0.3467, execution time:0.1924s
Histogram Based Outlier Score (HBOS) ROC:0.8684, Precision @rank n:0.4533, execution time:0.006s
Local Outlier Factor (LOF) ROC:0.5969, Precision @rank n:0.1867, execution time:0.0878s
Isolation Forest ROC:0.9309, Precision @rank n:0.6, execution time:0.3421s
Feature bagging (FB) ROC:0.6202, Precision @rank n:0.1867, execution time:0.6652s

...Processing glass.mat ...
Principal Component Analysis (PCA) ROC:0.8512, Precision @rank n:0.0, execution time:0.006s
Minimum Covariance Determinant (MCD) ROC:0.7738, Precision @rank n



Clustering Based LOF (CBLOF) ROC:0.9405, Precision @rank n:0.0, execution time:0.0489s
K-Nearest-Neighbors (KNN) ROC:0.9405, Precision @rank n:0.0, execution time:0.009s
Histogram Based Outlier Score (HBOS) ROC:0.8452, Precision @rank n:0.0, execution time:0.003s
Local Outlier Factor (LOF) ROC:0.9762, Precision @rank n:0.0, execution time:0.0049s
Isolation Forest ROC:0.9048, Precision @rank n:0.0, execution time:0.2623s
Feature bagging (FB) ROC:0.9762, Precision @rank n:0.0, execution time:0.0299s

...Processing ionosphere.mat ...
Principal Component Analysis (PCA) ROC:0.7879, Precision @rank n:0.54, execution time:0.0029s
Minimum Covariance Determinant (MCD) ROC:0.9556, Precision @rank n:0.86, execution time:0.0628s
One-Class Support Vector Machine (OCSVM) ROC:0.8543, Precision @rank n:0.74, execution time:0.005s
Angle-Based Outlier Detection (ABOD) ROC:0.9211, Precision @rank n:0.82, execution time:0.0648s
Clustering Based LOF (CBLOF) ROC:0.8756, Precision @rank n:0.76, execution tim



Isolation Forest ROC:1.0, Precision @rank n:1.0, execution time:0.2832s
Feature bagging (FB) ROC:1.0, Precision @rank n:1.0, execution time:0.0309s

...Processing mnist.mat ...
Principal Component Analysis (PCA) ROC:0.8477, Precision @rank n:0.3675, execution time:0.1246s




Minimum Covariance Determinant (MCD) ROC:0.8677, Precision @rank n:0.371, execution time:2.3836s
One-Class Support Vector Machine (OCSVM) ROC:0.8457, Precision @rank n:0.3781, execution time:4.2885s
Angle-Based Outlier Detection (ABOD) ROC:0.7947, Precision @rank n:0.371, execution time:6.5834s
Clustering Based LOF (CBLOF) ROC:0.8432, Precision @rank n:0.3958, execution time:1.0182s
K-Nearest-Neighbors (KNN) ROC:0.8417, Precision @rank n:0.4205, execution time:6.2906s
Histogram Based Outlier Score (HBOS) ROC:0.5715, Precision @rank n:0.1095, execution time:0.0459s
Local Outlier Factor (LOF) ROC:0.707, Precision @rank n:0.3357, execution time:5.9122s
Isolation Forest ROC:0.7933, Precision @rank n:0.3216, execution time:1.7814s
Feature bagging (FB) ROC:0.7049, Precision @rank n:0.3498, execution time:46.8646s

...Processing musk.mat ...
Principal Component Analysis (PCA) ROC:0.9998, Precision @rank n:0.9677, execution time:0.1416s
Minimum Covariance Determinant (MCD) ROC:1.0, Precision @



Minimum Covariance Determinant (MCD) ROC:0.3704, Precision @rank n:0.0, execution time:1.09s
One-Class Support Vector Machine (OCSVM) ROC:0.4634, Precision @rank n:0.0, execution time:1.2906s
Angle-Based Outlier Detection (ABOD) ROC:0.484, Precision @rank n:0.0175, execution time:2.2111s
Clustering Based LOF (CBLOF) ROC:0.7586, Precision @rank n:0.0, execution time:0.4148s
K-Nearest-Neighbors (KNN) ROC:0.3506, Precision @rank n:0.0, execution time:1.5888s
Histogram Based Outlier Score (HBOS) ROC:0.8247, Precision @rank n:0.1579, execution time:0.0279s
Local Outlier Factor (LOF) ROC:0.4359, Precision @rank n:0.0526, execution time:1.4721s
Isolation Forest ROC:0.6848, Precision @rank n:0.0351, execution time:0.8507s
Feature bagging (FB) ROC:0.4713, Precision @rank n:0.0526, execution time:10.853s

...Processing pendigits.mat ...
Principal Component Analysis (PCA) ROC:0.9421, Precision @rank n:0.3448, execution time:0.007s
Minimum Covariance Determinant (MCD) ROC:0.8383, Precision @rank n





Minimum Covariance Determinant (MCD) ROC:0.9903, Precision @rank n:0.7477, execution time:13.9916s
One-Class Support Vector Machine (OCSVM) ROC:0.9927, Precision @rank n:0.957, execution time:39.6042s
Angle-Based Outlier Detection (ABOD) ROC:0.6315, Precision @rank n:0.1889, execution time:12.3408s
Clustering Based LOF (CBLOF) ROC:0.5863, Precision @rank n:0.2657, execution time:0.8337s
K-Nearest-Neighbors (KNN) ROC:0.6463, Precision @rank n:0.2271, execution time:7.5189s
Histogram Based Outlier Score (HBOS) ROC:0.9857, Precision @rank n:0.9345, execution time:0.017s
Local Outlier Factor (LOF) ROC:0.5325, Precision @rank n:0.1283, execution time:9.4866s
Isolation Forest ROC:0.9972, Precision @rank n:0.9542, execution time:2.7566s
Feature bagging (FB) ROC:0.449, Precision @rank n:0.0437, execution time:48.0744s

...Processing vertebral.mat ...
Principal Component Analysis (PCA) ROC:0.4428, Precision @rank n:0.0667, execution time:0.002s
Minimum Covariance Determinant (MCD) ROC:0.3885, P

In [22]:
roc_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,PCA,MCD,OCSVM,ABOD,CBLOF,KNN,HBOS,LOF,IForest,FB
0,arrhythmia,452,274,14.6018,0.7997,0.8228,0.7986,0.7687,0.788,0.782,0.8511,0.7787,0.8343,0.7768
0,cardio,1831,21,9.6122,0.9444,0.8221,0.9292,0.6114,0.8566,0.7601,0.8684,0.5969,0.9309,0.6202
0,glass,214,9,4.2056,0.8512,0.7738,0.9405,0.9524,0.9405,0.9405,0.8452,0.9762,0.9048,0.9762
0,ionosphere,351,33,35.8974,0.7879,0.9556,0.8543,0.9211,0.8756,0.9295,0.533,0.8943,0.8503,0.8947
0,letter,1600,32,6.25,0.4951,0.7683,0.57,0.8471,0.721,0.8382,0.5515,0.8452,0.5665,0.8508
0,lympho,148,18,4.0541,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,mnist,7603,100,9.2069,0.8477,0.8677,0.8457,0.7947,0.8432,0.8417,0.5715,0.707,0.7933,0.7049
0,musk,3062,166,3.1679,0.9998,1.0,1.0,0.0853,1.0,0.7503,1.0,0.6148,0.9994,0.584
0,optdigits,5216,64,2.8758,0.481,0.3704,0.4634,0.484,0.7586,0.3506,0.8247,0.4359,0.6848,0.4713
0,pendigits,6870,16,2.2707,0.9421,0.8383,0.9372,0.6691,0.7858,0.7372,0.9228,0.418,0.9601,0.4224


In [23]:
prf_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,PCA,MCD,OCSVM,ABOD,CBLOF,KNN,HBOS,LOF,IForest,FB
0,arrhythmia,452,274,14.6018,0.5,0.4286,0.5,0.3571,0.4643,0.5,0.5714,0.4643,0.5357,0.4286
0,cardio,1831,21,9.6122,0.6267,0.44,0.5467,0.2533,0.52,0.3467,0.4533,0.1867,0.6,0.1867
0,glass,214,9,4.2056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,ionosphere,351,33,35.8974,0.54,0.86,0.74,0.82,0.76,0.86,0.36,0.76,0.68,0.76
0,letter,1600,32,6.25,0.0833,0.1111,0.0833,0.2222,0.1111,0.2222,0.0556,0.3611,0.0278,0.3611
0,lympho,148,18,4.0541,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,mnist,7603,100,9.2069,0.3675,0.371,0.3781,0.371,0.3958,0.4205,0.1095,0.3357,0.3216,0.3498
0,musk,3062,166,3.1679,0.9677,1.0,1.0,0.0323,1.0,0.2903,0.9677,0.1935,0.9355,0.1613
0,optdigits,5216,64,2.8758,0.0,0.0,0.0,0.0175,0.0,0.0,0.1579,0.0526,0.0351,0.0526
0,pendigits,6870,16,2.2707,0.3448,0.0517,0.3103,0.0345,0.1207,0.0345,0.2759,0.0345,0.3621,0.0345


In [24]:
time_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,PCA,MCD,OCSVM,ABOD,CBLOF,KNN,HBOS,LOF,IForest,FB
0,arrhythmia,452,274,14.6018,0.3008,0.5635,0.0539,1.7846,1.599,0.0997,1.657,0.0708,0.3594,0.5206
0,cardio,1831,21,9.6122,0.005,0.4997,0.0938,0.3401,0.1695,0.1924,0.006,0.0878,0.3421,0.6652
0,glass,214,9,4.2056,0.006,0.0379,0.0009,0.0369,0.0489,0.009,0.003,0.0049,0.2623,0.0299
0,ionosphere,351,33,35.8974,0.0029,0.0628,0.005,0.0648,0.0598,0.016,0.008,0.007,0.3361,0.0609
0,letter,1600,32,6.25,0.004,0.9126,0.0688,0.3191,0.1167,0.1536,0.009,0.0858,0.365,0.6533
0,lympho,148,18,4.0541,0.002,0.0309,0.001,0.0299,0.0429,0.006,0.004,0.002,0.2832,0.0309
0,mnist,7603,100,9.2069,0.1246,2.3836,4.2885,6.5834,1.0182,6.2906,0.0459,5.9122,1.7814,46.8646
0,musk,3062,166,3.1679,0.1416,9.5265,1.1759,2.2034,0.3899,1.6526,0.0578,1.5519,1.2127,12.9214
0,optdigits,5216,64,2.8758,0.0529,1.09,1.2906,2.2111,0.4148,1.5888,0.0279,1.4721,0.8507,10.853
0,pendigits,6870,16,2.2707,0.007,2.1403,0.9096,1.3105,0.2583,0.6932,0.009,0.5306,0.6364,3.8039
