In [9]:
import numpy as np
from scipy import stats
 
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from keras.models import Model
from keras.layers import Dense, Input, BatchNormalization, Dropout, Activation

def Autoencoder():
    """Build, compile and summarise the dense autoencoder for 2-feature inputs.

    Layer widths are 2 -> 4 -> 8 -> 16 -> 8 -> 4 -> 2. Every hidden width
    except the 16-unit middle layer is a Dense layer followed by batch
    normalisation, ReLU and 20% dropout; the 16-unit layer and the final
    2-unit layer are plain linear Dense layers.

    Returns:
        The Keras ``Model`` compiled with the Adam optimizer and MSE loss.

    Side effects:
        Prints the model summary.
    """
    def hidden_block(tensor, units):
        # Dense -> BatchNormalization -> ReLU -> Dropout(0.2)
        tensor = Dense(units)(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = Activation('relu')(tensor)
        return Dropout(rate=0.2)(tensor)

    inputs = Input(shape=(2, ))

    # Encoder half: widen step by step up to the 16-unit linear code layer.
    hidden = inputs
    for units in (4, 8):
        hidden = hidden_block(hidden, units)
    hidden = Dense(16, activation='linear')(hidden)

    # Decoder half: mirror the encoder back down to the 2 input features.
    for units in (8, 4):
        hidden = hidden_block(hidden, units)
    outputs = Dense(2, activation='linear')(hidden)

    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mse')
    model.summary()
    return model
    
# Random number generator (passed to IsolationForest as its random_state).
rng = np.random.RandomState(42)

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# Outlier detection tools to be compared, keyed by display name.
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
    "Isolation Forest": IsolationForest(max_samples=n_samples,
                                        contamination=outliers_fraction,
                                        random_state=rng),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=35,
                                               contamination=outliers_fraction),
    "Autoencoder": Autoencoder()}

# Compare the given classifiers under the given settings.
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
# The evaluation grid never changes, so flatten it once instead of per classifier.
grid_points = np.c_[xx.ravel(), yy.ravel()]
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
# Ground truth: 1 = inlier, -1 = outlier (the last n_outliers rows of X).
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = -1

# `accuracy` holds the scores of the most recently evaluated separation
# (the last one once the loop finishes); `accuracy_by_offset` keeps a copy
# for every separation so earlier results are not lost.
accuracy = {}
accuracy_by_offset = {}

# Fit the problem with varying cluster separation.
for offset in clusters_separation:
    np.random.seed(42)
    # Data generation: two Gaussian inlier clusters centred at -offset / +offset.
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add uniformly distributed outliers after the inliers.
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit every model on this data set.
    for clf_name, clf in classifiers.items():
        # Fit the data and tag outliers.
        if clf_name == "Local Outlier Factor":
            y_pred = clf.fit_predict(X)
            scores_pred = clf.negative_outlier_factor_
        elif clf_name == "Autoencoder":
            # Train only on the rows that were generated as inliers.
            # NOTE(review): the same Keras model object is fitted again for
            # every offset, so training presumably continues from the weights
            # of the previous offset instead of starting fresh -- confirm
            # whether that is intended.
            X_ae = X[:(n_samples - n_outliers)]
            result = clf.fit(X_ae, X_ae, batch_size=8, shuffle=True, epochs=50, verbose=0)
            # Threshold = mean training loss over the last 10 epochs.
            loss = np.mean(result.history['loss'][-10:])
            recon_error = np.mean(np.square(clf.predict(X) - X), axis=1)
            # Positive score <=> reconstruction error below the threshold.
            scores_pred = loss - recon_error
            # 1 = inlier (error below threshold), -1 = outlier.
            y_pred = np.where(recon_error < loss, 1, -1)
        else:
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            y_pred = clf.predict(X)

        n_errors = (y_pred != ground_truth).sum()
        accuracy[clf_name] = (1 - n_errors / n_samples) * 100

        # Decision surface over the evaluation grid.
        if clf_name == "Local Outlier Factor":
            # decision_function is private for LOF.
            Z = clf._decision_function(grid_points)
        elif clf_name == "Autoencoder":
            # Previously this branch left Z untouched, so Z silently kept the
            # preceding classifier's surface. Use the same score as above:
            # threshold minus per-point reconstruction error.
            grid_error = np.mean(
                np.square(clf.predict(grid_points, verbose=0) - grid_points), axis=1)
            Z = loss - grid_error
        else:
            Z = clf.decision_function(grid_points)
        Z = Z.reshape(xx.shape)

    # Snapshot this separation's scores before the next one overwrites them.
    accuracy_by_offset[offset] = dict(accuracy)

print(accuracy)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 2)                 0         
_________________________________________________________________
dense_31 (Dense)             (None, 4)                 12        
_________________________________________________________________
batch_normalization_21 (Batc (None, 4)                 16        
_________________________________________________________________
activation_21 (Activation)   (None, 4)                 0         
_________________________________________________________________
dropout_21 (Dropout)         (None, 4)                 0         
_________________________________________________________________
dense_32 (Dense)             (None, 8)                 40        
_________________________________________________________________
batch_normalization_22 (Batc (None, 8)                 32        
__________



{'One-Class SVM': 93.0, 'Robust covariance': 93.0, 'Isolation Forest': 97.0, 'Local Outlier Factor': 97.0, 'Autoencoder': 97.0}


## Note
1. 這些方法(除AE)都必須知道outlier的比例，一改了這個參數果然準確率下降。
2. 除LOF以外的方法，contamination fraction似乎可以為0，那這樣準確度也就會些微下降。

In [113]:
# Inspect what the re-seeded global NumPy generator produces.
# NOTE: the draw order here (outliers, then "+offset", then "-offset") is not
# the order used by the experiment loop ("-offset", "+offset", then outliers),
# so these values are not the samples the classifiers were fitted on.
np.random.seed(42)
# Bind the separation explicitly instead of relying on the loop variable that
# leaked out of the experiment cell; after a full run that variable equals the
# last entry of clusters_separation.
offset = clusters_separation[-1]
print(np.random.uniform(low=-6, high=6, size=(n_outliers, 2)))
print(0.3 * np.random.randn(n_inliers // 2, 2) + offset)
print(0.3 * np.random.randn(n_inliers // 2, 2) - offset)

[[-1.50551857  5.40857168]
 [ 2.7839273   1.18390181]
 [-4.12777631 -4.12806576]
 [-5.30299665  4.39411375]
 [ 1.21338014  2.49687093]
 [-5.75298607  5.63891823]
 [ 3.98931169 -3.45193067]
 [-3.81810039 -3.79914588]
 [-2.34909308  0.29707718]
 [-0.81665978 -2.50525032]
 [ 1.34223474 -4.32607367]
 [-2.49426422 -1.60365788]
 [-0.52716019  3.42211154]
 [-3.60391461  0.17081326]
 [ 1.10897483 -5.44259505]
 [ 1.29053822 -3.95371052]
 [-5.21938088  5.38662645]
 [ 5.5875844   3.70076818]
 [-2.34463477 -4.82793463]
 [ 2.21079632 -0.71817008]
 [-4.53554118 -0.05787708]
 [-5.58733775  4.91184482]
 [-2.89464022  1.95026741]
 [-2.25946709  0.24081625]
 [ 0.56052335 -3.78174653]
 [ 5.63501553  3.30159388]
 [ 5.2739873   4.73792821]
 [ 1.17479975  5.06249082]
 [-4.93808998 -3.64820565]
 [-5.45727253 -2.09603603]
 [-1.33587252 -2.74381162]
 [ 3.94485011 -1.71896008]
 [-2.62878588  0.512353  ]
 [-4.3089093   3.62636377]
 [-5.10539228  5.84264324]
 [ 3.26693723 -3.61541182]
 [-5.93373459  3.78553714]
 