# 🛡️ Anomaly Detection Using ML 
## 👨‍💻 Abhijit Mahajan

__📌Import packages :__

In [1]:
import os
import sys

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

#for loading matlab file
from scipy.io import loadmat


__📌Import pyod packages :__

In [2]:
#this all are the anolmaly detection models and mothod
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



__📌Import metrics packages :__

In [3]:
#using this methods we envolving performance of models
from pyod.utils.utility import standardizer 
from pyod.utils.utility import precision_n_scores #measure to identify the performance of model
from sklearn.metrics import roc_auc_score

__📌How to load mat file :__

In [4]:
data=loadmat('Dataset/Anamoly_detec_data/cardio.mat')
print(data)
print(">>  __header__ and __version are predifined X and Y define by uer")
print(">>  Length: ",len(data))
print(">>  Keys in MatFile:",data.keys())
print(">>  Type and Independent Var Shape: ",type(data['X']),data['X'].shape)
print(">>  Type and Dependent Var Shape: ",type(data['y']),data['y'].shape)

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '__version__': '1.0', '__globals__': [], 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), 'y': array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])}
>>  __header__ and __version are predifined X and Y define by uer
>>  Length:  5
>>  Keys in MatFile: dict_keys(['__header__', '__version__', '__globals__

__📌Define dataset file and read X and y :__

In [5]:
mat_file_list=['arrhythmia.mat',
              'cardio.mat',
              'glass.mat',
              'ionosphere.mat',
              'letter.mat',
              'lympho.mat',
              'mnist.mat',
              'musk.mat',
              'optdigits.mat',
              'pendigits.mat',
              'pima.mat',
              'satellite.mat',
              'satimage-2.mat',
              'shuttle.mat',
              'vertebral.mat',
              'vowelS.mat',
              'wbc.mat']

#Define nineoutlier detection tools to be compared
df_columns = ['Data','#Samples','# Dimensions', 'Outlier Perc',
              'ABOD','CBLOF','FB','HBOS','IForest','KNN','LOF',
              'MCD','OCSVM','PCA']

from time import time
random_state=np.random.RandomState(42)

__📌ROC Performance Evolution Table :__ 

In [6]:
roc_df=pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


__📌Precision_n_score performance evolution table :__

In [7]:
prn_df=pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


__📌Time dataframe :__

In [8]:
time_df=pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


__📌Exploring mat files :__

In [9]:
for mat_file in mat_file_list:
    print("\n...Processing",mat_file)
    mat=loadmat(os.path.join('data',mat_file))
    
    X=mat['X']
    y=mat['y'].ravel()# ravel convert 2 dimension to 1 dimension
    
    outliers_fraction=np.count_nonzero(y)/len(y)
    outliers_percentage=round(outliers_fraction*100,ndigits=4)
    
    #constructing container for saving result
    roc_list=[mat_file[:-1],X.shape[0],X.shape[1],outliers_percentage]
    prn_list=[mat_file[:-1],X.shape[0],X.shape[1],outliers_percentage]
    time_list=[mat_file[:-1],X.shape[0],X.shape[1],outliers_percentage]
    
    #60% training and 40% testing
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=random_state)
    
    
    #standardizing data for processsing
    X_train_norm,X_test_norm=standardizer(X_train,X_test)
    classifiers={'Angle-based Outlier Detector (ABOD)':
                 ABOD(contamination=outliers_fraction),#contamination means on which factor outlier being detected 
                 
                 'Cluster-based Local Outlier Factor (CBLOF)':
                 CBLOF(contamination=outliers_fraction,check_estimator=False),
                 
                 'feature Bagging':
                 FeatureBagging(contamination=outliers_fraction,random_state=random_state),
                 
                 'Histogram-base Outlier Detection(HBOS)':
                 HBOS(contamination=outliers_fraction),
                 
                 'Isolation Forest':
                 IForest(contamination=outliers_fraction,random_state=random_state),
                 
                 'K Nearst Neighbors (KNN)' : 
                 KNN(contamination=outliers_fraction),
                 
                 'Local Outlier Factors(LOF)':
                 LOF(contamination=outliers_fraction),
                 
                 'Minimum Covariance Determination(MCD)':
                 MCD(contamination=outliers_fraction,random_state=random_state),
                 
                 'One Class SVM':
                 OCSVM(contamination=outliers_fraction),
                 
                 'Principle Component Analysis(PCA)':
                 PCA(contamination=outliers_fraction,random_state=random_state),
                
                }
                 
    for clf_name,clf in classifiers.items():
            t0=time()#time before  training
            clf.fit(X_train_norm)#all algo training model here
            test_scores=clf.decision_function(X_test_norm)#test scores of model
            t1=time()#time after training
            duration=round(t1-t0,ndigits=4)#training testing time
            time_list.append(duration) 
            
            roc=round(roc_auc_score(y_test, test_scores),ndigits=4)
            prn=round(precision_n_scores(y_test, test_scores),ndigits=4)
            
            #display all calculation for each algo
            print('{clf_name} ROC:{roc}, precision @ rank n :{prn},'
                  'extecution time :{duration}s'.format(clf_name=clf_name,roc=roc,prn=prn,duration=duration))
           
            roc_list.append(roc)
            prn_list.append(prn)
            
    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns=df_columns
    time_df=pd.concat([time_df, temp_df],axis=0)
            
    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns=df_columns
    roc_df=pd.concat([roc_df, temp_df],axis=0)
            
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns=df_columns
    prn_df=pd.concat([prn_df, temp_df],axis=0)



...Processing arrhythmia.mat
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n :0.3571,extecution time :12.663s
Cluster-based Local Outlier Factor (CBLOF) ROC:0.7927, precision @ rank n :0.5,extecution time :15.2883s
feature Bagging ROC:0.7806, precision @ rank n :0.4643,extecution time :5.0838s
Histogram-base Outlier Detection(HBOS) ROC:0.8511, precision @ rank n :0.5714,extecution time :13.4449s
Isolation Forest ROC:0.8527, precision @ rank n :0.5714,extecution time :3.0056s
K Nearst Neighbors (KNN) ROC:0.782, precision @ rank n :0.5,extecution time :0.6961s
Local Outlier Factors(LOF) ROC:0.7787, precision @ rank n :0.4643,extecution time :0.586s




Minimum Covariance Determination(MCD) ROC:0.8228, precision @ rank n :0.4286,extecution time :6.0658s
One Class SVM ROC:0.7986, precision @ rank n :0.5,extecution time :0.368s
Principle Component Analysis(PCA) ROC:0.7997, precision @ rank n :0.5,extecution time :0.456s

...Processing cardio.mat
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @ rank n :0.1875,extecution time :3.5714s
Cluster-based Local Outlier Factor (CBLOF) ROC:0.7029, precision @ rank n :0.2344,extecution time :1.4483s
feature Bagging ROC:0.4956, precision @ rank n :0.125,extecution time :5.5535s
Histogram-base Outlier Detection(HBOS) ROC:0.8453, precision @ rank n :0.4688,extecution time :0.08s
Isolation Forest ROC:0.9449, precision @ rank n :0.4844,extecution time :3.5529s
K Nearst Neighbors (KNN) ROC:0.6959, precision @ rank n :0.2812,extecution time :1.2002s
Local Outlier Factors(LOF) ROC:0.4715, precision @ rank n :0.125,extecution time :0.7525s




Minimum Covariance Determination(MCD) ROC:0.8627, precision @ rank n :0.4219,extecution time :4.2137s
One Class SVM ROC:0.9507, precision @ rank n :0.5938,extecution time :0.6253s
Principle Component Analysis(PCA) ROC:0.9638, precision @ rank n :0.6875,extecution time :0.0399s

...Processing glass.mat
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @ rank n :0.25,extecution time :0.384s
Cluster-based Local Outlier Factor (CBLOF) ROC:0.8506, precision @ rank n :0.25,extecution time :0.3121s
feature Bagging ROC:0.7439, precision @ rank n :0.25,extecution time :0.2644s
Histogram-base Outlier Detection(HBOS) ROC:0.6524, precision @ rank n :0.0,extecution time :0.016s
Isolation Forest ROC:0.7287, precision @ rank n :0.25,extecution time :2.1926s
K Nearst Neighbors (KNN) ROC:0.7805, precision @ rank n :0.25,extecution time :0.0879s
Local Outlier Factors(LOF) ROC:0.7774, precision @ rank n :0.25,extecution time :0.016s
Minimum Covariance Determination(MCD) ROC:0.7165, precision @ ra



Minimum Covariance Determination(MCD) ROC:0.9357, precision @ rank n :0.6667,extecution time :0.28s
One Class SVM ROC:0.9532, precision @ rank n :0.6667,extecution time :0.016s
Principle Component Analysis(PCA) ROC:0.9591, precision @ rank n :0.6667,extecution time :0.016s

...Processing mnist.mat
Angle-based Outlier Detector (ABOD) ROC:0.7708, precision @ rank n :0.35,extecution time :54.5005s
Cluster-based Local Outlier Factor (CBLOF) ROC:0.854, precision @ rank n :0.4107,extecution time :7.1044s
feature Bagging ROC:0.6872, precision @ rank n :0.3036,extecution time :344.2075s
Histogram-base Outlier Detection(HBOS) ROC:0.5679, precision @ rank n :0.1214,extecution time :0.48s
Isolation Forest ROC:0.7999, precision @ rank n :0.2714,extecution time :14.3032s
K Nearst Neighbors (KNN) ROC:0.8403, precision @ rank n :0.4,extecution time :49.513s
Local Outlier Factors(LOF) ROC:0.6905, precision @ rank n :0.3071,extecution time :47.5837s




Minimum Covariance Determination(MCD) ROC:0.8454, precision @ rank n :0.2,extecution time :18.8947s
One Class SVM ROC:0.8586, precision @ rank n :0.3893,extecution time :35.2252s
Principle Component Analysis(PCA) ROC:0.8555, precision @ rank n :0.3679,extecution time :0.9881s

...Processing musk.mat
Angle-based Outlier Detector (ABOD) ROC:0.1139, precision @ rank n :0.0312,extecution time :16.2573s
Cluster-based Local Outlier Factor (CBLOF) ROC:1.0, precision @ rank n :1.0,extecution time :2.4496s
feature Bagging ROC:0.6284, precision @ rank n :0.3438,extecution time :88.1617s
Histogram-base Outlier Detection(HBOS) ROC:1.0, precision @ rank n :0.9688,extecution time :0.504s
Isolation Forest ROC:0.9998, precision @ rank n :0.9375,extecution time :8.8446s
K Nearst Neighbors (KNN) ROC:0.7694, precision @ rank n :0.2812,extecution time :13.3421s
Local Outlier Factors(LOF) ROC:0.6308, precision @ rank n :0.1875,extecution time :12.9942s
Minimum Covariance Determination(MCD) ROC:0.9997, prec



Minimum Covariance Determination(MCD) ROC:0.3674, precision @ rank n :0.0,extecution time :8.5202s
One Class SVM ROC:0.4972, precision @ rank n :0.0,extecution time :10.592s
Principle Component Analysis(PCA) ROC:0.504, precision @ rank n :0.0,extecution time :0.384s

...Processing pendigits.mat
Angle-based Outlier Detector (ABOD) ROC:0.7028, precision @ rank n :0.0606,extecution time :11.5995s
Cluster-based Local Outlier Factor (CBLOF) ROC:0.8508, precision @ rank n :0.197,extecution time :1.9286s
feature Bagging ROC:0.4582, precision @ rank n :0.0606,extecution time :31.1362s
Histogram-base Outlier Detection(HBOS) ROC:0.9284, precision @ rank n :0.2879,extecution time :0.064s
Isolation Forest ROC:0.9563, precision @ rank n :0.3939,extecution time :5.5443s
K Nearst Neighbors (KNN) ROC:0.7678, precision @ rank n :0.0909,extecution time :5.1769s
Local Outlier Factors(LOF) ROC:0.4507, precision @ rank n :0.0758,extecution time :5.2009s
Minimum Covariance Determination(MCD) ROC:0.8302, pre







Minimum Covariance Determination(MCD) ROC:0.9898, precision @ rank n :0.7359,extecution time :125.4108s
One Class SVM ROC:0.9936, precision @ rank n :0.9585,extecution time :379.6533s
Principle Component Analysis(PCA) ROC:0.992, precision @ rank n :0.9535,extecution time :0.3099s

...Processing vertebral.mat
Angle-based Outlier Detector (ABOD) ROC:0.3812, precision @ rank n :0.0625,extecution time :0.3295s
Cluster-based Local Outlier Factor (CBLOF) ROC:0.4469, precision @ rank n :0.125,extecution time :0.4435s
feature Bagging ROC:0.3781, precision @ rank n :0.0,extecution time :0.409s
Histogram-base Outlier Detection(HBOS) ROC:0.3555, precision @ rank n :0.0625,extecution time :0.0207s
Isolation Forest ROC:0.4039, precision @ rank n :0.0625,extecution time :2.6012s
K Nearst Neighbors (KNN) ROC:0.3797, precision @ rank n :0.0,extecution time :0.0879s
Local Outlier Factors(LOF) ROC:0.3844, precision @ rank n :0.0,extecution time :0.0281s




Minimum Covariance Determination(MCD) ROC:0.3742, precision @ rank n :0.0,extecution time :0.3407s
One Class SVM ROC:0.4547, precision @ rank n :0.0625,extecution time :0.016s
Principle Component Analysis(PCA) ROC:0.4875, precision @ rank n :0.1875,extecution time :0.008s

...Processing vowelS.mat
Angle-based Outlier Detector (ABOD) ROC:0.9165, precision @ rank n :0.5,extecution time :2.2891s
Cluster-based Local Outlier Factor (CBLOF) ROC:0.8927, precision @ rank n :0.1667,extecution time :0.9998s
feature Bagging ROC:0.9396, precision @ rank n :0.1667,extecution time :2.4152s
Histogram-base Outlier Detection(HBOS) ROC:0.681, precision @ rank n :0.0556,extecution time :0.035s
Isolation Forest ROC:0.7428, precision @ rank n :0.2778,extecution time :3.1501s
K Nearst Neighbors (KNN) ROC:0.9796, precision @ rank n :0.5,extecution time :0.7347s
Local Outlier Factors(LOF) ROC:0.9479, precision @ rank n :0.3333,extecution time :0.2732s
Minimum Covariance Determination(MCD) ROC:0.8655, precisio

__📌 Lets Analyze Output :__

In [10]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia.ma,452,274,14.6018,0.7687,0.7927,0.7806,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio.ma,1831,21,9.6122,0.5763,0.7029,0.4956,0.8453,0.9449,0.6959,0.4715,0.8627,0.9507,0.9638
0,glass.ma,214,9,4.2056,0.7104,0.8506,0.7439,0.6524,0.7287,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere.ma,351,33,35.8974,0.9004,0.8855,0.8891,0.5195,0.8315,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter.ma,1600,32,6.25,0.8465,0.7506,0.8581,0.5728,0.5742,0.845,0.8409,0.7568,0.5744,0.48
0,lympho.ma,148,18,4.0541,0.8304,0.9123,0.9474,0.9942,0.9649,0.9415,0.9415,0.9357,0.9532,0.9591
0,mnist.ma,7603,100,9.2069,0.7708,0.854,0.6872,0.5679,0.7999,0.8403,0.6905,0.8454,0.8586,0.8555
0,musk.ma,3062,166,3.1679,0.1139,1.0,0.6284,1.0,0.9998,0.7694,0.6308,0.9997,1.0,1.0
0,optdigits.ma,5216,64,2.8758,0.4428,0.7718,0.4715,0.8822,0.5714,0.3824,0.4584,0.3674,0.4972,0.504
0,pendigits.ma,6870,16,2.2707,0.7028,0.8508,0.4582,0.9284,0.9563,0.7678,0.4507,0.8302,0.9333,0.9341


In [13]:
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia.ma,452,274,14.6018,0.3571,0.5,0.4643,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio.ma,1831,21,9.6122,0.1875,0.2344,0.125,0.4688,0.4844,0.2812,0.125,0.4219,0.5938,0.6875
0,glass.ma,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere.ma,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter.ma,1600,32,6.25,0.275,0.2,0.3,0.125,0.05,0.3,0.325,0.1,0.1,0.05
0,lympho.ma,148,18,4.0541,0.3333,0.6667,0.6667,0.6667,0.6667,0.6667,0.6667,0.6667,0.6667,0.6667
0,mnist.ma,7603,100,9.2069,0.35,0.4107,0.3036,0.1214,0.2714,0.4,0.3071,0.2,0.3893,0.3679
0,musk.ma,3062,166,3.1679,0.0312,1.0,0.3438,0.9688,0.9375,0.2812,0.1875,0.9375,1.0,0.9688
0,optdigits.ma,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits.ma,6870,16,2.2707,0.0606,0.197,0.0606,0.2879,0.3939,0.0909,0.0758,0.0606,0.303,0.3485


In [14]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia.ma,452,274,14.6018,12.663,15.2883,5.0838,13.4449,3.0056,0.6961,0.586,6.0658,0.368,0.456
0,cardio.ma,1831,21,9.6122,3.5714,1.4483,5.5535,0.08,3.5529,1.2002,0.7525,4.2137,0.6253,0.0399
0,glass.ma,214,9,4.2056,0.384,0.3121,0.2644,0.016,2.1926,0.0879,0.016,0.2964,0.008,0.016
0,ionosphere.ma,351,33,35.8974,0.6093,0.4083,0.5366,0.072,3.1863,0.128,0.0566,0.5738,0.048,0.016
0,letter.ma,1600,32,6.25,2.5368,0.7843,5.8412,0.072,2.9933,1.1844,0.7687,8.2043,0.6641,0.048
0,lympho.ma,148,18,4.0541,0.2241,0.3164,0.232,0.04,2.088,0.0564,0.024,0.28,0.016,0.016
0,mnist.ma,7603,100,9.2069,54.5005,7.1044,344.207,0.48,14.3032,49.513,47.5837,18.8947,35.2252,0.9881
0,musk.ma,3062,166,3.1679,16.2573,2.4496,88.1617,0.504,8.8446,13.3421,12.9942,75.9832,9.3187,1.1041
0,optdigits.ma,5216,64,2.8758,18.4897,3.9236,103.132,0.264,6.4913,13.7271,12.4292,8.5202,10.592,0.384
0,pendigits.ma,6870,16,2.2707,11.5995,1.9286,31.1362,0.064,5.5443,5.1769,5.2009,22.3748,8.8,0.2


# 🔜
__Next Module Anonaly Point Detection and Correction will coming soon...__