## Import TensorFlow, PyTorch, and other libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split



## Third example: Anomaly detection

## Overview


In this example, you will train an autoencoder to detect anomalies on the [ECG5000 dataset](http://www.timeseriesclassification.com/description.php?Dataset=ECG5000). This dataset contains 5,000 [Electrocardiograms](https://en.wikipedia.org/wiki/Electrocardiography), each with 140 data points. You will use a simplified version of the dataset, where each example has been labeled either `0` (corresponding to an abnormal rhythm), or `1` (corresponding to a normal rhythm). You are interested in identifying the abnormal rhythms.

Note: This is a labeled dataset, so you could phrase this as a supervised learning problem. The goal of this example is to illustrate anomaly detection concepts you can apply to larger datasets, where you do not have labels available (for example, if you had many thousands of normal rhythms, and only a small number of abnormal rhythms).

How will you detect anomalies using an autoencoder? Recall that an autoencoder is trained to minimize reconstruction error. You will train an autoencoder on the normal rhythms only, then use it to reconstruct all the data. Our hypothesis is that the abnormal rhythms will have higher reconstruction error. You will then classify a rhythm as an anomaly if the reconstruction error surpasses a fixed threshold.

### Load ECG data

The dataset you will use is based on one from [timeseriesclassification.com](http://www.timeseriesclassification.com/description.php?Dataset=ECG5000).


In [None]:
import pandas as pd

# Location of the ECG CSV file (the file has no header row).
ECG_CSV_URL = 'http://storage.googleapis.com/download.tensorflow.org/data/ecg.csv'

# Download the dataset and keep both the DataFrame and its raw array view.
dataframe = pd.read_csv(ECG_CSV_URL, header=None)
raw_data = dataframe.values

# Trailing expression so the notebook displays the table.
dataframe

In [None]:
# The last column holds the labels; every column before it is
# electrocardiogram data.
data, labels = raw_data[:, :-1], raw_data[:, -1]

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
splits = train_test_split(data, labels, test_size=0.2, random_state=21)

# Convert each split (features and labels alike) to a float32 tensor.
train_data, test_data, train_labels, test_labels = (
    torch.tensor(split, dtype=torch.float32) for split in splits
)



Normalize the data to `[0,1]`.


In [None]:
# Min-max scaling statistics are taken from the training split only and then
# applied to both splits, so no information from the test split is used.
min_val = torch.min(train_data)
max_val = torch.max(train_data)
value_range = max_val - min_val

# The arithmetic already yields new tensors, so no re-wrapping is needed:
# calling torch.tensor() on an existing tensor only makes a redundant copy
# and triggers PyTorch's copy-construct UserWarning.
train_data = (train_data - min_val) / value_range
test_data = (test_data - min_val) / value_range

You will train the autoencoder using only the normal rhythms, which are labeled in this dataset as `1`. Separate the normal rhythms from the abnormal rhythms.

In [None]:
# Turn the labels into boolean masks: non-zero (1 = normal rhythm) -> True.
train_labels, test_labels = train_labels.bool(), test_labels.bool()

# Rows where the mask is True are the normal rhythms ...
normal_train_data, normal_test_data = (
    train_data[train_labels],
    test_data[test_labels],
)

# ... and rows where the inverted mask is True are the anomalous ones.
anomalous_train_data, anomalous_test_data = (
    train_data[~train_labels],
    test_data[~test_labels],
)


Plot a normal ECG.

In [None]:
# Draw the first normal training example against its 140 sample indices.
first_normal_ecg = normal_train_data[0]
plt.grid()
plt.title("A Normal ECG")
plt.plot(np.arange(140), first_normal_ecg)
plt.show()

Plot an anomalous ECG.

In [None]:
# Draw the first anomalous training example against its 140 sample indices.
first_anomalous_ecg = anomalous_train_data[0]
plt.grid()
plt.title("An Anomalous ECG")
plt.plot(np.arange(140), first_anomalous_ecg)
plt.show()

### Build the model

In [None]:
class AnomalyDetector(nn.Module):
    """Fully connected autoencoder.

    The encoder narrows the input through 64 -> 32 -> 16 units (ReLU after
    every layer); the decoder mirrors it through 32 -> 64 -> ``input_dim``
    units, with no activation after the final layer.

    Args:
        input_dim: Number of features per example (defaults to 140).
    """

    def __init__(self, input_dim=140):
        super().__init__()

        # Compress the input down to a 16-dimensional code.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 16), nn.ReLU(),
        )

        # Expand the code back to the original dimensionality.
        self.decoder = nn.Sequential(
            nn.Linear(16, 32), nn.ReLU(),
            nn.Linear(32, 64), nn.ReLU(),
            nn.Linear(64, input_dim),
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))


# Each ECG has 140 data points, so the model reads and writes 140 values.
input_dim = 140
autoencoder = AnomalyDetector(input_dim=input_dim)

In [None]:
# Reconstruction objective: mean squared error between output and target.
criterion = nn.MSELoss()

# Adam over all autoencoder parameters, with a small weight decay.
optimizer = torch.optim.Adam(
    params=autoencoder.parameters(),
    lr=1e-3,
    weight_decay=1e-5,
)


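Notice that the autoencoder is trained using only the normal ECGs. In this version, the validation loss is also computed on the normal ECGs from the test set; the full test set is used later, when classifying anomalies.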

In [None]:
# Split the data into batches.
# An autoencoder reconstructs its own input, so each dataset pairs every
# example with itself as (input, target).
batch_size = 8

# Shuffle the training batches every epoch.
train_loader = DataLoader(
    TensorDataset(normal_train_data, normal_train_data),
    batch_size=batch_size,
    shuffle=True,
)

# The validation loss is an average over the whole set, so the batch order
# does not matter and shuffling is unnecessary.
val_loader = DataLoader(
    TensorDataset(normal_test_data, normal_test_data),
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Show the shape of the normal-only training tensor that feeds train_loader
# (note: the printed label says "train_data", but the value is normal_train_data).
print("train_data.shape:", normal_train_data.shape)

In [None]:
# Train the autoencoder and record the average loss of every epoch.
train_losses = []
val_losses = []

epochs = 100

for epoch in range(epochs):
    # ----- training pass -----
    autoencoder.train()
    train_total = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(autoencoder(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        # Weight each batch loss by its batch size so that dividing by the
        # dataset size below gives a per-example average (the last batch may
        # be smaller than the others).
        train_total += batch_loss.item() * inputs.size(0)

    epoch_train_loss = train_total / len(train_loader.dataset)
    train_losses.append(epoch_train_loss)

    # ----- validation pass (no gradient tracking) -----
    autoencoder.eval()
    val_total = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            batch_loss = criterion(autoencoder(inputs), targets)
            val_total += batch_loss.item() * inputs.size(0)

    epoch_val_loss = val_total / len(val_loader.dataset)
    val_losses.append(epoch_val_loss)

    # Report progress every fifth epoch.
    if epoch % 5 == 0:
        print(f"Epoch {epoch:03d} | Train Loss: {epoch_train_loss:.6f} | Val Loss: {epoch_val_loss:.6f}")

In [None]:
# Plot the per-epoch training and validation loss histories.
plt.figure(figsize=(8, 5))
plt.plot(train_losses, label="Training Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epoch")
# The model is trained with nn.MSELoss, so these curves are mean squared
# error, not mean absolute error.
plt.ylabel("Loss (MSE)")
plt.title("Training and Validation Loss")
plt.legend()
plt.show()

You will soon classify an ECG as anomalous if the reconstruction error is more than one standard deviation above the mean error of the normal training examples. First, let's plot a normal ECG from the test set, the reconstruction after it's encoded and decoded by the autoencoder, and the reconstruction error.

In [None]:
# Reconstruct the first normal test ECG with the trained autoencoder.
autoencoder.eval()
normal_sample = normal_test_data[0]
with torch.no_grad():
    # unsqueeze(0) adds the batch dimension the model expects.
    encode_data = autoencoder.encoder(normal_sample.unsqueeze(0))
    decoded_data = autoencoder.decoder(encode_data)

# Convert both curves to NumPy once, up front, for plotting.
input_curve = normal_sample.numpy()
reconstructed_curve = decoded_data[0].numpy()

# Plot an example
plt.figure(figsize=(10, 4))
plt.plot(input_curve, 'b')           # original
plt.plot(reconstructed_curve, 'r')   # reconstructed
# Shade the gap between the two curves: the reconstruction error.
plt.fill_between(np.arange(input_dim), reconstructed_curve, input_curve, color='lightcoral')
plt.legend(labels=["Input", "Reconstruction", "Error"])
plt.show()

Create a similar plot, this time for an anomalous test example.

In [None]:
# Reconstruct the first anomalous test ECG with the trained autoencoder.
autoencoder.eval()
anomalous_sample = anomalous_test_data[0]
with torch.no_grad():
    # unsqueeze(0) adds the batch dimension the model expects.
    encode_data = autoencoder.encoder(anomalous_sample.unsqueeze(0))
    decoded_data = autoencoder.decoder(encode_data)

# Hand matplotlib explicit NumPy arrays instead of relying on implicit
# tensor conversion.
anomalous_input = anomalous_sample.numpy()
anomalous_reconstruction = decoded_data[0].numpy()

plt.plot(anomalous_input, 'b')
plt.plot(anomalous_reconstruction, 'r')
# Derive the x-axis from the example length rather than hard-coding 140, and
# shade the gap between the curves (the reconstruction error).
plt.fill_between(
    np.arange(len(anomalous_input)),
    anomalous_reconstruction,
    anomalous_input,
    color='lightcoral',
)
plt.legend(labels=["Input", "Reconstruction", "Error"])
plt.show()

### Detect anomalies

Detect anomalies by calculating whether the reconstruction loss is greater than a fixed threshold. In this tutorial, you will calculate the mean absolute error for normal examples from the training set, then classify future examples as anomalous if the reconstruction error is more than one standard deviation above the training-set mean.


Plot the reconstruction error on normal ECGs from the training set

In [None]:
# Mean absolute reconstruction error of each normal training ECG.
autoencoder.eval()
loss_fn = nn.L1Loss(reduction='none')  # keep one error value per element
with torch.no_grad():
    reconstruc = autoencoder(normal_train_data)
    sample_losses = loss_fn(reconstruc, normal_train_data)
    # Average over the feature axis to get one error per example.
    # NOTE: this rebinds train_losses, which held the per-epoch training
    # loss history until now.
    train_losses = sample_losses.mean(dim=1).numpy()

print(sample_losses.shape)

# Distribution of the per-example errors.
plt.hist(train_losses, bins=50)
plt.xlabel("Train loss")
plt.ylabel("No of examples")
plt.show()

Choose a threshold value that is one standard deviation above the mean.

In [None]:
# Threshold: mean of the per-example training errors plus one standard deviation.
mean_train_error = np.mean(train_losses)
std_train_error = np.std(train_losses)
threshold = mean_train_error + std_train_error
print("Threshold: ", threshold)

Note: There are other strategies you could use to select a threshold value above which test examples should be classified as anomalous; the correct approach will depend on your dataset. You can learn more with the links at the end of this tutorial.

If you examine the reconstruction error for the anomalous examples in the test set, you'll notice most have greater reconstruction error than the threshold. By varying the threshold, you can adjust the [precision](https://developers.google.com/machine-learning/glossary#precision) and [recall](https://developers.google.com/machine-learning/glossary#recall) of your classifier.

In [None]:
# Mean absolute reconstruction error of each anomalous test ECG.
autoencoder.eval()
loss_fn = nn.L1Loss(reduction='none')  # keep one error value per element
with torch.no_grad():
    reconstruc = autoencoder(anomalous_test_data)
    sample_losses = loss_fn(reconstruc, anomalous_test_data)
    # Average over the feature axis to get one error per example.
    test_losses = sample_losses.mean(dim=1).numpy()

# Distribution of the per-example errors.
plt.hist(test_losses, bins=50)
plt.xlabel("Test loss")
plt.ylabel("No of examples")
plt.show()

Classify an ECG as an anomaly if the reconstruction error is greater than the threshold.

In [None]:
def predict(model, data, threshold):
    """Flag which examples reconstruct with an error below ``threshold``.

    Args:
        model: Module mapping ``data`` to a reconstruction of the same shape.
        data: 2-D tensor with one example per row.
        threshold: Upper bound (exclusive) on the per-example error.

    Returns:
        Boolean NumPy array, one entry per row of ``data``: ``True`` where
        the mean absolute reconstruction error is strictly below
        ``threshold``.
    """
    model.eval()
    elementwise_l1 = nn.L1Loss(reduction='none')
    with torch.no_grad():
        # Absolute error per element, averaged across each row.
        per_example_error = elementwise_l1(model(data), data).mean(dim=1)

    return per_example_error.numpy() < threshold

def print_stats(predictions, labels):
    """Print accuracy, precision and recall of ``predictions`` vs ``labels``.

    Args:
        predictions: Predicted class for each example.
        labels: Ground-truth class for each example.
    """
    # Each metric is called as metric(labels, predictions) and printed with
    # its template, in this order.
    reports = (
        ("Accuracy = {}", accuracy_score),
        ("Precision = {}", precision_score),
        ("Recall = {}", recall_score),
    )
    for template, metric in reports:
        print(template.format(metric(labels, predictions)))

In [None]:
# Classify every test ECG against the threshold and score the result.
preds = predict(model=autoencoder, data=test_data, threshold=threshold)
print_stats(predictions=preds, labels=test_labels)

## Next steps

To learn more about anomaly detection with autoencoders, check out this excellent [interactive example](https://anomagram.fastforwardlabs.com/#/) built with TensorFlow.js by Victor Dibia. For a real-world use case, you can learn how [Airbus Detects Anomalies in ISS Telemetry Data](https://blog.tensorflow.org/2020/04/how-airbus-detects-anomalies-iss-telemetry-data-tfx.html) using TensorFlow. To learn more about the basics, consider reading this [blog post](https://blog.keras.io/building-autoencoders-in-keras.html) by François Chollet. For more details, check out chapter 14 from [Deep Learning](https://www.deeplearningbook.org/) by Ian Goodfellow, Yoshua Bengio, and Aaron Courville.
