# Outlier Detection With Autoencoders

based on: https://towardsdatascience.com/outlier-detection-with-autoencoders-6c7ac3e2aa90

<b>Environment</b>

In [2]:
import tensorflow as tf
import tensorflow.keras.layers as layers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<b>Define data</b>

In [None]:
# Sample noisy points along the unit circle; these are the "normal" data.
N_POINTS = 1000       # number of samples on the circle
NOISE_STDDEV = 0.05   # standard deviation of the Gaussian jitter added to each coordinate

# Fix TensorFlow's global seed so the random draws below are reproducible.
tf.random.set_seed(1234)

# Angles covering one full turn. np.pi replaces the truncated literal 3.14 so
# the sweep ends exactly at 2*pi. expand_dims turns the vector into a column
# of shape (N_POINTS, 1) so cos/sin can be concatenated side by side.
t = tf.expand_dims(tf.linspace(0., 2 * np.pi, N_POINTS), -1)
noise = tf.random.normal((N_POINTS, 2), stddev=NOISE_STDDEV)

# Columns are (x, y) = (cos t, sin t) plus noise -> shape (N_POINTS, 2).
points = tf.concat([tf.cos(t), tf.sin(t)], axis=1) + noise

<b>Define outliers</b>

In [None]:
# Two hand-placed points that do not lie on the unit circle:
# (0, 0) is the circle's centre and (2, 2) is well outside it.
outliers = tf.constant([
    [0.0, 0.0],
    [2.0, 2.0],
])

# Append the outliers as extra rows after the regular samples.
points_with_outliers = tf.concat(values=[points, outliers], axis=0)

<b>Plot data</b>

In [None]:
# Convert to a NumPy array for plotting and tabulation. np.asarray accepts
# both a TensorFlow tensor and an existing ndarray, so this cell can be re-run
# safely (calling .numpy() a second time would fail on the ndarray).
points_with_outliers = np.asarray(points_with_outliers)

fig, ax = plt.subplots()
ax.scatter(points_with_outliers[:, 0], points_with_outliers[:, 1])
# Equal aspect keeps the circle round instead of stretching it to the figure.
ax.set(title="Data with outliers", xlabel="x", ylabel="y", aspect="equal")
plt.show()

<b>Define autoencoder model</b>

In [None]:
def _dense_stack(output_units):
    """Build three 16-unit ReLU layers followed by a linear layer.

    Args:
        output_units: number of units in the final (activation-free) layer.

    Returns:
        An unbuilt ``tf.keras.Sequential`` model.
    """
    hidden_layers = [layers.Dense(units=16, activation="relu") for _ in range(3)]
    output_layer = layers.Dense(units=output_units)
    return tf.keras.Sequential(hidden_layers + [output_layer])


# The encoder squeezes each 2-D point into a single number (the bottleneck);
# the decoder expands that number back into a 2-D point.
encoder = _dense_stack(output_units=1)
decoder = _dense_stack(output_units=2)

# Chain both halves and train on mean squared reconstruction error.
# No optimizer is passed, so Keras falls back to its default.
autoencoder = tf.keras.Sequential([encoder, decoder])
autoencoder.compile(loss="mse")

<b>Shuffle training data</b>

In [None]:
# `points` is ordered by angle, and Keras takes the validation split from the
# end of the data, so shuffle the rows first; otherwise the validation set
# would be one contiguous arc of the circle.
shuffled_points = tf.random.shuffle(value=points)

<b>Training (without outliers)</b>

In [None]:
EPOCHS = 500            # number of passes over the training data
VALIDATION_SPLIT = 0.2  # fraction of the (shuffled) data held out for validation

# Input and target are the same array: the network learns to reproduce its
# input. Only the clean points are used, so the outliers are never seen.
# The returned History is kept so the loss curve can be inspected, and
# verbose=0 avoids flooding the notebook with 500 epochs of progress output.
history = autoencoder.fit(
    x=shuffled_points,
    y=shuffled_points,
    validation_split=VALIDATION_SPLIT,
    epochs=EPOCHS,
    verbose=0,
)

# Show the loss / val_loss of the last few epochs as a compact summary.
pd.DataFrame(history.history).tail()

<b>Inference (reconstruct points with outliers)</b>

In [None]:
# Run every point, outliers included, through encoder + decoder.
# training=False spells out that this is a plain inference pass.
reconstructed_points = autoencoder(points_with_outliers, training=False)

<b>Plot reconstructed points</b>

In [None]:
# Overlay the inputs and their reconstructions on one set of axes.
# np.asarray makes the cell work whether the data are tensors or ndarrays.
original_xy = np.asarray(points_with_outliers)
reconstructed_xy = np.asarray(reconstructed_points)

fig, ax = plt.subplots()
ax.scatter(original_xy[:, 0], original_xy[:, 1],
           s=10, c='b', marker="s", label='original data with outlier')
ax.scatter(reconstructed_xy[:, 0], reconstructed_xy[:, 1],
           s=10, c='r', marker="o", label='autoencoder')
# Equal aspect keeps the circle round, which makes it easy to see where the
# reconstructions land relative to the original points.
ax.set(title="Original vs. reconstructed points", xlabel="x", ylabel="y", aspect="equal")
ax.legend(loc='upper left')
plt.show()

<b>Reconstruction error</b>

The autoencoder maps every point (including the outliers) onto the circle. Non-outliers should therefore have a small reconstruction error, whereas outliers should have a large one.

In [None]:
# Squared Euclidean distance between each point and its reconstruction.
# Reuses `reconstructed_points` from the inference cell instead of running a
# second forward pass through the model.
reconstruction_errors = tf.reduce_sum(
    (reconstructed_points - points_with_outliers) ** 2, axis=1
)

# Convert everything to NumPy explicitly before handing it to pandas.
points_xy = np.asarray(points_with_outliers)
errors_df = pd.DataFrame({
    "x": points_xy[:, 0],
    "y": points_xy[:, 1],
    "reconstruction_error": reconstruction_errors.numpy(),
})

# Show only the points with the largest error rather than the whole table;
# the outliers are expected to appear at the top.
errors_df.sort_values("reconstruction_error", ascending=False).head(10)