In [None]:
!pip install pyod

Collecting pyod
[?25l  Downloading https://files.pythonhosted.org/packages/2b/1d/22a6c4e796fff1066bf80bf59b4494d6e3582e22012a61721f4cb730b3c3/pyod-0.8.4.tar.gz (98kB)
[K     |███▎                            | 10kB 15.5MB/s eta 0:00:01[K     |██████▋                         | 20kB 20.3MB/s eta 0:00:01[K     |██████████                      | 30kB 12.6MB/s eta 0:00:01[K     |█████████████▎                  | 40kB 10.6MB/s eta 0:00:01[K     |████████████████▋               | 51kB 8.5MB/s eta 0:00:01[K     |████████████████████            | 61kB 9.3MB/s eta 0:00:01[K     |███████████████████████▎        | 71kB 8.5MB/s eta 0:00:01[K     |██████████████████████████▋     | 81kB 8.8MB/s eta 0:00:01[K     |██████████████████████████████  | 92kB 8.7MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 5.4MB/s 
[?25hCollecting combo
  Downloading https://files.pythonhosted.org/packages/0a/2a/61b6ac584e75d8df16dc27962aa5fe99d76b09da5b6710e83d4862c84001/combo-0.1.1.t

* ___Import Python Packages___

In [None]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat
import warnings
# Ignore every warning category for the rest of the session, presumably to
# keep the long benchmark output uncluttered. NOTE(review): this also hides
# deprecation and convergence warnings raised by the detectors.
warnings.simplefilter("ignore")

* ___Import PyOD outlier-detection models___

In [None]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

* ___Import Metrics Packages___

In [None]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

* ___Define the data files and result columns___

In [None]:
# File names of the .mat datasets to process, in processing order.
mat_file_list=[
    'arrhythmia.mat',
    'cardio.mat',
    'glass.mat',
    'ionosphere.mat',
    'letter.mat',
    'lympho.mat',
    'mnist.mat',
    'musk.mat',
    'optdigits.mat',
    'pendigits.mat',
    'pima.mat',
    'satellite.mat',
    'satimage-2.mat',
    'shuttle.mat',
    'vertebral.mat',
    'vowels.mat',
    'wbc.mat',
]

In [None]:
# Display the dataset file names as a quick sanity check.
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [None]:
# Column layout shared by the result tables: four dataset-description
# columns followed by one column per detector.
df_columns=[
    'Data',
    '#Samples',
    '#Dimensions',
    'Outlier Perc',
    'PCA',
    'MCD',
    'OCSVM',
    'LOF',
    'CBLOF',
    'KNN',
    'HBOS',
    'ABOD',
    'IFOREST',
    'Feature Bagging',
]

In [None]:
# Display the result-table column names as a quick sanity check.
df_columns

['Data',
 '#Samples',
 '#Dimensions',
 'Outlier Perc',
 'PCA',
 'MCD',
 'OCSVM',
 'LOF',
 'CBLOF',
 'KNN',
 'HBOS',
 'ABOD',
 'IFOREST',
 'Feature Bagging']

In [None]:
# Three independent, empty result tables (ROC, precision @ rank n, timing),
# all created with the same column layout.
roc_df,prn_df,time_df=(pd.DataFrame(columns=df_columns) for _ in range(3))

* ___Benchmark every detector on each .mat file___

In [None]:
from time import time

# Benchmark every detector on every dataset and build three result tables
# (`roc_df`, `prn_df`, `time_df`), each with one row per dataset.
#
# For each .mat file: load X / y, split 60/40 into train/test, standardize,
# fit each detector on the training part, score the test part, and record
#   * ROC AUC of the test scores,
#   * precision @ rank n of the test scores,
#   * wall-clock seconds spent in fit + decision_function.
#
# Results are stored under the matching `df_columns` name, so a value can
# never land under another detector's column regardless of fit order. The
# tables are built once after the loop, which gives them a 0..n-1 index and
# makes re-running the cell replace the results instead of duplicating rows.

# Folder that holds the .mat files.
DATA_DIR='/content/drive/My Drive'
random_state=np.random.RandomState(42)

# The first four columns describe the dataset; the rest are one per detector.
meta_columns,detector_columns=df_columns[:4],df_columns[4:]
roc_rows,prn_rows,time_rows=[],[],[]

for mat_file in mat_file_list:
    print("\n...Processing",mat_file,'...')
    mat=loadmat(os.path.join(DATA_DIR,mat_file))
    X=mat['X']
    y=mat['y'].ravel()
    # Non-zero labels are counted as outliers.
    outliers_fraction=np.count_nonzero(y)/len(y)
    outliers_percentage=round(outliers_fraction*100,ndigits=4)
    # Dataset description shared by the three result rows
    # (`mat_file[:-4]` strips the '.mat' extension).
    dataset_info=dict(zip(meta_columns,[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]))
    # Splitting Data for Training and Testing
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=random_state)
    # Standardizing Data for processing
    X_train_norm,X_test_norm=standardizer(X_train,X_test)
    # Result column -> (display name, detector). The insertion order is the
    # fit order; it is kept as-is because several detectors share the single
    # `random_state` instance, so reordering them could change their results.
    classifiers={'ABOD':('Angle-Based Outlier Detector(ABOD)',ABOD(contamination=outliers_fraction)),
                 'CBLOF':('Cluster-Based Local Outlier Factor(CBLOF)',CBLOF(contamination=outliers_fraction,check_estimator=False,random_state=random_state)),
                 'Feature Bagging':('Feature Bagging',FeatureBagging(contamination=outliers_fraction,random_state=random_state)),
                 'HBOS':('Histogram-Base Outlier Detection(HBOS)',HBOS(contamination=outliers_fraction)),
                 'IFOREST':('Isolation Forest',IForest(contamination=outliers_fraction,random_state=random_state)),
                 'KNN':('K Nearest Neighbors(KNN)',KNN(contamination=outliers_fraction)),
                 'LOF':('Local Outlier Factor(LOF)',LOF(contamination=outliers_fraction)),
                 'MCD':('Minimum Covariance Determinant(MCD)',MCD(contamination=outliers_fraction,random_state=random_state)),
                 'OCSVM':('One-Class SVM(OCSVM)',OCSVM(contamination=outliers_fraction)),
                 'PCA':('Principal Component Analysis(PCA)',PCA(contamination=outliers_fraction,random_state=random_state))
                }
    # Fail loudly if the detectors and the table columns ever drift apart.
    mismatched=set(classifiers)^set(detector_columns)
    if mismatched:
        raise ValueError(f'classifiers and df_columns disagree on: {sorted(mismatched)}')
    roc_row,prn_row,time_row=dict(dataset_info),dict(dataset_info),dict(dataset_info)
    for column,(clf_name,clf) in classifiers.items():
        # Time fitting on the training part plus scoring of the test part.
        t0=time()
        clf.fit(X_train_norm)
        test_scores=clf.decision_function(X_test_norm)
        t1=time()
        duration=round(t1-t0,ndigits=4)
        roc=round(roc_auc_score(y_test,test_scores),ndigits=4)
        prn=round(precision_n_scores(y_test,test_scores),ndigits=4)
        print(f'{clf_name} ROC:{roc}, precision @ rank n:{prn},execution time:{duration}s')
        time_row[column]=duration
        roc_row[column]=roc
        prn_row[column]=prn
    time_rows.append(time_row)
    roc_rows.append(roc_row)
    prn_rows.append(prn_row)

# Build each table once; `columns=df_columns` fixes the column order.
time_df=pd.DataFrame(time_rows,columns=df_columns)
roc_df=pd.DataFrame(roc_rows,columns=df_columns)
prn_df=pd.DataFrame(prn_rows,columns=df_columns)


...Processing arrhythmia.mat ...
Angle-Based Outlier Detector(ABOD) ROC:0.7687, precision @ rank n:(prn),execution time:1.2354s
Cluster-Based Local Outlier Factor(CBLOF) ROC:0.7684, precision @ rank n:(prn),execution time:1.6636s
Feature Bagging ROC:0.7799, precision @ rank n:(prn),execution time:0.6777s
Histogram-Base Outlier Detection(HBOS) ROC:0.8511, precision @ rank n:(prn),execution time:1.5133s
Isolation Forest ROC:0.8527, precision @ rank n:(prn),execution time:0.4626s
K Nearest Neighbors(KNN) ROC:0.782, precision @ rank n:(prn),execution time:0.1018s
Local Outlier Factor(LOF) ROC:0.7787, precision @ rank n:(prn),execution time:0.0864s
Minimum Covariance Determinant(MCD) ROC:0.8228, precision @ rank n:(prn),execution time:1.2855s
One-Class SVM(OCSVM) ROC:0.7986, precision @ rank n:(prn),execution time:0.0606s
Principal Component Analysis(PCA) ROC:0.8, precision @ rank n:(prn),execution time:0.0644s

...Processing cardio.mat ...
Angle-Based Outlier Detector(ABOD) ROC:0.5763, pr

In [None]:
# ROC AUC on the held-out test split, one row per dataset.
roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,Feature Bagging
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.8
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9414,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8309,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5778,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9127,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7801,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9996,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5764,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9422,0.7602,0.481,0.8271,0.93,0.9332


In [None]:
# Precision @ rank n on the held-out test split, one row per dataset.
prn_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,Feature Bagging
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.1406,0.4688,0.5,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3664,0.1199,0.2979,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.1667,0.9667,0.9333,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2769,0.0462,0.0462,0.0615,0.2923,0.3385


In [None]:
# Wall-clock seconds for fit + decision_function, one row per dataset.
time_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,Feature Bagging
0,arrhythmia,452,274,14.6018,1.2354,1.6636,0.6777,1.5133,0.4626,0.1018,0.0864,1.2855,0.0606,0.0644
0,cardio,1831,21,9.6122,0.365,0.22,0.8161,0.0088,0.4055,0.1456,0.1016,0.724,0.1378,0.0057
0,glass,214,9,4.2056,0.0367,0.0435,0.0373,0.005,0.2921,0.0082,0.0035,0.051,0.0034,0.0026
0,ionosphere,351,33,35.8974,0.0609,0.0507,0.0808,0.0128,0.3161,0.0161,0.0098,0.1071,0.0099,0.004
0,letter,1600,32,6.25,0.3324,0.1604,0.8049,0.011,0.4241,0.1373,0.0986,1.5642,0.1321,0.0071
0,lympho,148,18,4.0541,0.0262,0.0472,0.034,0.0062,0.2927,0.0063,0.0031,0.0501,0.0027,0.0028
0,mnist,7603,100,9.2069,7.906,1.1626,59.0246,0.0542,1.4433,7.5972,7.3231,3.7118,6.0131,0.133
0,musk,3062,166,3.1679,2.319,0.4231,14.9609,0.0763,1.0318,2.0218,1.8958,16.4091,1.5391,0.1243
0,optdigits,5216,64,2.8758,2.7194,0.5605,16.9657,0.0328,0.928,2.1447,1.9516,1.8501,1.9301,0.0461
0,pendigits,6870,16,2.2707,1.3324,0.4078,3.9395,0.0104,0.7314,0.5477,0.4781,2.9538,1.7988,0.0207
