# Anomaly Detection Example

<a href="https://www.analyticsvidhya.com/blog/2019/02/outlier-detection-python-pyod/"><img src="https://cdn.analyticsvidhya.com/wp-content/uploads/2019/02/Outliers.jpeg" /></a>

<blockquote>

    “Outliers are not necessarily a bad thing. These are just observations that are not following the same pattern as the other ones. But it can be the case that an outlier is very interesting. For example, if in a biological experiment, a rat is not dead whereas all others are, then it would be very interesting to understand why. This could lead to new scientific discoveries.  So, it is important to detect outliers.”
                                                                                                          
    – Pierre Lafaye de Micheaux, Author and Statistician
</blockquote>

The following example was inspired by <a href="https://www.analyticsvidhya.com/blog/2019/02/outlier-detection-python-pyod/">this example</a>.

It uses a Python toolkit dedicated to outlier detection called <a href="https://pyod.readthedocs.io/en/latest/index.html">PyOD</a>; additional information is available <a href="http://www.jmlr.org/papers/volume20/19-011/19-011.pdf">here</a>. 
<br />
<br />
PyOD is a comprehensive and scalable Python toolkit for detecting outlying objects in multivariate data. This exciting yet challenging field is commonly referred to as Outlier Detection or Anomaly Detection.

In [None]:
#import std packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from scipy import stats


# Import models from PyOD
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF

#Import data-generation tool from PyOD
from pyod.utils.data import generate_data, get_outliers_inliers



## Setup 

In [None]:
# Seeded random source handed to the detectors that accept one, and the
# share of outliers assumed both when generating data and when fitting.
random_state = np.random.RandomState(3)
outliers_fraction = 0.1

# The six outlier detectors to compare. The keys are the human-readable
# labels used when reporting and plotting; insertion order fixes the order
# in which the detectors are processed.
classifiers = dict([
    ('Angle-based Outlier Detector (ABOD)',
     ABOD(contamination=outliers_fraction)),
    ('Histogram-base Outlier Detection (HBOS)',
     HBOS(contamination=outliers_fraction)),
    ('Cluster-based Local Outlier Factor (CBLOF)',
     CBLOF(contamination=outliers_fraction,
           check_estimator=False,
           random_state=random_state)),
    ('Isolation Forest',
     IForest(contamination=outliers_fraction,
             random_state=random_state)),
    ('K Nearest Neighbors (KNN)',
     KNN(contamination=outliers_fraction)),
    ('Average KNN',
     KNN(method='mean', contamination=outliers_fraction)),
])

## Data gathering and visualization

In [None]:
# Generate random data with two features: 500 training and 200 test samples.
#
# generate_data has returned its four arrays in two different orders across
# PyOD releases (X_train, y_train, X_test, y_test versus
# X_train, X_test, y_train, y_test). Unpacking by position therefore risks
# silently swapping labels and features, so the result is split by
# dimensionality instead: feature matrices are 2-D, label vectors are 1-D,
# and the training array precedes the test array in both layouts.
generated = generate_data(n_train=500, n_test=200, n_features=2,
                          random_state=3, contamination=outliers_fraction)
X_train, X_test = (arr for arr in generated if np.ndim(arr) == 2)
Y_train, Y_test = (arr for arr in generated if np.ndim(arr) == 1)

# Store outliers and inliers in different numpy arrays
x_outliers, x_inliers = get_outliers_inliers(X_train, Y_train)
xt_outliers, xt_inliers = get_outliers_inliers(X_test, Y_test)

n_inliers = len(x_inliers)
n_outliers = len(x_outliers)

# Each training feature as its own column vector; indexing with a list
# keeps the column axis, so the result already has shape (n_samples, 1).
F1 = X_train[:, [0]]
F2 = X_train[:, [1]]

# Regular 200 x 200 grid over [-10, 10] x [-10, 10]
xx, yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))

# Scatter plot of the labelled training data
plt.figure(figsize=[15, 9])
plt.scatter(x_outliers[:, 0], x_outliers[:, 1],
            c='black', edgecolor='k', label='Outliers')
plt.scatter(x_inliers[:, 0], x_inliers[:, 1],
            c='white', edgecolor='k', label='Inliers')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()

## Train different models, evaluate and visualize results

In [None]:
# Fit every detector on the training data, print how many training points it
# mislabels, and draw its decision regions in a 2 x 3 grid of subplots.
plt.figure(figsize=(19, 20))

# Training features (columns 0 and 1) next to the ground-truth label 'y';
# the 'outlier' column is overwritten with each detector's predictions.
dfx = pd.DataFrame(X_train)
dfx['y'] = Y_train

for i, (clf_name, clf) in enumerate(classifiers.items()):
    # Fit the model to the training data
    clf.fit(X_train)

    # Raw anomaly score of every training point, negated
    scores_pred = clf.decision_function(X_train) * -1

    # Predicted category of every training point: 1 = outlier, 0 = inlier
    y_pred = clf.predict(X_train)

    # Number of predictions that disagree with the ground truth
    n_errors = (y_pred != Y_train).sum()

    dfx['outlier'] = y_pred.tolist()

    # IX1 / IX2 - features 1 and 2 of the points predicted as inliers
    IX1 = dfx[0][dfx['outlier'] == 0].values.reshape(-1, 1)
    IX2 = dfx[1][dfx['outlier'] == 0].values.reshape(-1, 1)

    # OX1 / OX2 - features 1 and 2 of the points predicted as outliers
    OX1 = dfx[0][dfx['outlier'] == 1].values.reshape(-1, 1)
    OX2 = dfx[1][dfx['outlier'] == 1].values.reshape(-1, 1)

    # TX1 / TX2 - features 1 and 2 of the true (labelled) outliers
    TX1 = dfx[0][dfx['y'] == 1].values.reshape(-1, 1)
    TX2 = dfx[1][dfx['y'] == 1].values.reshape(-1, 1)

    # Report the error count; a perfect score is wrapped in ANSI escape
    # sequences (bold + bright red) so it stands out in the output.
    text = 'No of mis-detected outliers : ' + clf_name + " " + str(n_errors)
    if n_errors == 0:
        text = "\033[1m" + "\033[91m" + text + "\033[0m"
    print(text)

    # The rest of the loop body creates the visualization.

    # Score separating the two filled regions: the percentile of the negated
    # training scores that corresponds to the configured outlier fraction.
    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)

    # Negated raw anomaly score at every grid point, shaped like the grid
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)

    subplot = plt.subplot(2, 3, i + 1)

    # Blue colormap from the minimum grid score up to the threshold
    subplot.contourf(xx, yy, Z,
                     levels=np.linspace(Z.min(), threshold, 10),
                     cmap=plt.cm.Blues_r)

    # Red contour line where the score equals the threshold
    boundary = subplot.contour(xx, yy, Z, levels=[threshold],
                               linewidths=2, colors='red')

    # Orange fill from the threshold up to the maximum grid score
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')

    # Predicted inliers as white dots, predicted outliers as black dots,
    # and the true outliers as smaller red dots drawn on top.
    inlier_dots = subplot.scatter(IX1, IX2, c='white', s=100, edgecolor='k')
    outlier_dots = subplot.scatter(OX1, OX2, c='black', s=100, edgecolor='k')
    true_dots = subplot.scatter(TX1, TX2, c='red', s=20)
    subplot.axis('tight')

    # Take the legend handle from legend_elements() rather than from
    # ContourSet.collections, which recent Matplotlib releases no longer
    # provide.
    boundary_handle = boundary.legend_elements()[0][0]
    subplot.legend(
        [boundary_handle, inlier_dots, outlier_dots, true_dots],
        ['learned decision function', 'inliers', 'detected outliers',
         'true outliers'],
        loc='lower right')

    subplot.set_title(clf_name)
    subplot.set_xlim((-10, 10))
    subplot.set_ylim((-10, 10))
plt.show()

## Test Dataset

In [None]:
# Apply every already-fitted detector to the test data, print how many test
# points it mislabels, and draw its decision regions in a 2 x 3 grid.
plt.figure(figsize=(19, 20))

# Test features (columns 0 and 1) next to the ground-truth label 'y';
# the 'outlier' column is overwritten with each detector's predictions.
dfxt = pd.DataFrame(X_test)
dfxt['y'] = Y_test

for i, (clf_name, clf) in enumerate(classifiers.items()):

    # Raw anomaly score of every test point, negated
    scores_pred = clf.decision_function(X_test) * -1

    # Predicted category of every test point: 1 = outlier, 0 = inlier
    y_pred = clf.predict(X_test)

    # Number of predictions that disagree with the ground truth
    n_errors = (y_pred != Y_test).sum()

    dfxt['outlier'] = y_pred.tolist()

    # IX1 / IX2 - features 1 and 2 of the points predicted as inliers.
    # The mask must come from dfxt: the training frame has a different
    # length and holds predictions for the training data, not for these rows.
    IX1 = dfxt[0][dfxt['outlier'] == 0].values.reshape(-1, 1)
    IX2 = dfxt[1][dfxt['outlier'] == 0].values.reshape(-1, 1)

    # OX1 / OX2 - features 1 and 2 of the points predicted as outliers
    OX1 = dfxt[0][dfxt['outlier'] == 1].values.reshape(-1, 1)
    OX2 = dfxt[1][dfxt['outlier'] == 1].values.reshape(-1, 1)

    # TX1 / TX2 - features 1 and 2 of the true (labelled) outliers
    TX1 = dfxt[0][dfxt['y'] == 1].values.reshape(-1, 1)
    TX2 = dfxt[1][dfxt['y'] == 1].values.reshape(-1, 1)

    # Report the error count; a perfect score is wrapped in ANSI escape
    # sequences (bold + bright red) so it stands out in the output.
    text = 'No of mis-detected outliers : ' + clf_name + " " + str(n_errors)
    if n_errors == 0:
        text = "\033[1m" + "\033[91m" + text + "\033[0m"
    print(text)

    # The rest of the loop body creates the visualization.

    # Score separating the two filled regions: the percentile of the negated
    # test scores that corresponds to the configured outlier fraction.
    # NOTE(review): this is re-estimated from the test scores, so the red
    # contour may not coincide with the boundary clf.predict applies
    # (presumably the threshold learned during fit) - confirm intent.
    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)

    # Negated raw anomaly score at every grid point, shaped like the grid
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)

    subplot = plt.subplot(2, 3, i + 1)

    # Blue colormap from the minimum grid score up to the threshold
    subplot.contourf(xx, yy, Z,
                     levels=np.linspace(Z.min(), threshold, 10),
                     cmap=plt.cm.Blues_r)

    # Red contour line where the score equals the threshold
    boundary = subplot.contour(xx, yy, Z, levels=[threshold],
                               linewidths=2, colors='red')

    # Orange fill from the threshold up to the maximum grid score
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')

    # Predicted inliers as white dots, predicted outliers as black dots,
    # and the true outliers as smaller red dots drawn on top.
    inlier_dots = subplot.scatter(IX1, IX2, c='white', s=100, edgecolor='k')
    outlier_dots = subplot.scatter(OX1, OX2, c='black', s=100, edgecolor='k')
    true_dots = subplot.scatter(TX1, TX2, c='red', s=20)
    subplot.axis('tight')

    # Take the legend handle from legend_elements() rather than from
    # ContourSet.collections, which recent Matplotlib releases no longer
    # provide.
    boundary_handle = boundary.legend_elements()[0][0]
    subplot.legend(
        [boundary_handle, inlier_dots, outlier_dots, true_dots],
        ['learned decision function', 'inliers', 'detected outliers',
         'true outliers'],
        loc='lower right')

    subplot.set_title(clf_name)
    subplot.set_xlim((-10, 10))
    subplot.set_ylim((-10, 10))
plt.show()