# Load water level data from DMI

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [None]:
from anomalydetection.detectors import AnomalyDetectionPipeline, RangeDetector, DiffRangeDetector, PeakDetector, HampelDetector

In [None]:
# Water level record for station Ballen, read from the repository's test data.
# The first CSV column is used as the index and parsed as timestamps.
file_path = os.path.join("..", "tests", "data", "Ballen_20150218-20201222.csv")
df = pd.read_csv(
    file_path,
    parse_dates=True,
    index_col=0,
)

In [None]:
# Nominal sampling period: the median spacing between consecutive timestamps
sampling_period = df.index.to_series().diff().median()
# Regular time grid covering the whole record at that period
equidistant_times = pd.date_range(start=df.index.min(), end=df.index.max(), freq=sampling_period)
# Put the measurements on the grid; grid timestamps without a measurement become NaN
data = df["water_level"].reindex(equidistant_times)

In [None]:
# Quick visual check of the regularised series before any detector is applied
data.plot()

# Detect anomalies outside manually set range

In [None]:
# Manually chosen bounds (presumably minimum, maximum — confirm against RangeDetector);
# samples outside them are flagged
range_detector = RangeDetector(-1, 1.3)
range_anomalies = range_detector.detect(data)

In [None]:
# Overlay the flagged samples (markers) on the full series (line);
# non-flagged timestamps are NaN in the "anomalies" column after index alignment
n_range_anomalies = range_anomalies.sum()
detected = data.to_frame().assign(anomalies=data[range_anomalies.values])
detected.plot(title=f"Anomalies detected: {n_range_anomalies}", figsize=(8, 3), style=["-", "o"])

In [None]:
# Cleaned copy of the series: samples flagged by the range detector are replaced with NaN,
# the original `data` is left untouched
data_clean = data.mask(range_anomalies.values)

# Detect anomalies outside automatically set range

In [None]:
# The first N samples serve as the "normal" reference used for fitting;
# everything after them is the data to be checked
N = 1000
normal_data = data.iloc[:N]
test_data = data.iloc[N:]

In [None]:
# Detectors are created without explicit limits and fitted on the reference period,
# then the fitted pipeline is applied to the remaining data
pipeline_steps = [RangeDetector(), DiffRangeDetector()]
anomaly_detector = AnomalyDetectionPipeline(pipeline_steps)
anomaly_detector.fit(normal_data)
detected_anomalies = anomaly_detector.detect_detailed(test_data)

In [None]:
# Peek at the start of the detailed detection result for the test period
detected_anomalies.head()

In [None]:
# Test-period series (line) with the samples the pipeline flagged (markers)
detected = test_data.to_frame().assign(anomalies=test_data[detected_anomalies.is_anomaly])
detected.plot(figsize=(8, 3), style=["-", "o"])

# Detect peaks

In [None]:
# Peak detector applied to the full series.
# NOTE(review): the meaning of the two positional arguments is not visible here —
# confirm against PeakDetector's signature.
detector = PeakDetector(10, 0.1)
std_anomalies = detector.detect(data)
# Un-flag the very first sample. Use .iloc for this positional write: the result is
# presumably indexed by timestamps like `data`, so a bare integer key in [] is not a
# label, and pandas has deprecated falling back to positions in that case.
std_anomalies.iloc[0] = False

In [None]:
# Full series (line) with the samples flagged by the peak detector (markers)
detected = data.to_frame().assign(anomalies=data[std_anomalies.values])
detected.plot(figsize=(8, 3), style=["-", "o"])

# Hampel filter

The default threshold of the HampelDetector is 3, which means that a sample that deviates by more than three times the rolling window's standard deviation is marked as an anomaly. **Increasing** the threshold marks **fewer** samples as anomalies, **decreasing** the threshold marks **more**.

In [None]:
# Hampel detector over a 20-sample rolling window, using the threshold of 3 described above.
# Note: this rebinds `detector`, which previously held the peak detector.
detector = HampelDetector(
    threshold=3,
    window_size=20,
    use_numba=True,
)

In [None]:
# Run the Hampel detector over the full regularised series
anomalies = detector.detect(data)

In [None]:
# Full series (line) with the samples flagged by the Hampel detector (markers).
detected = data.to_frame()
detected["anomalies"] = data[anomalies]
# Count with the vectorised .sum() rather than the builtin sum(), which would iterate the
# whole multi-year mask element by element in Python; same number, same title text.
detected.plot(style=['-', 'o'], figsize=(8,3), title=f'Anomalies detected: {anomalies.sum()}')

# AutoEncoder

In [None]:
from anomalydetection.detectors import AutoEncoder
# The lag-feature helper used in this section (lag_time_series_features) is defined in
# this notebook, so nothing else needs to be imported here.

In [None]:
# Reference ("normal") period for training: all cleaned samples from 2015,
# with gaps filled by interpolation
normal_data = data_clean.loc["2015"].interpolate()

In [None]:
def lag_time_series_features(time_series, lags):
    """Build a table of lagged copies of ``time_series``, one column per lag.

    Parameters
    ----------
    time_series : pandas.Series
        Series to shift; its index is kept unchanged.
    lags : iterable of int
        Number of periods each copy is shifted by (passed to ``Series.shift``).

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``lags``, named ``str(lag)``, holding the series
        shifted by that many periods. Positions with no source value are NaN.
    """
    shifted_by_lag = {str(step): time_series.shift(step) for step in lags}
    return pd.concat(shifted_by_lag, axis=1)

In [None]:
max_lag = 100  # window_size
# One row per timestamp holding the current value and the max_lag - 1 preceding ones;
# rows with any missing lag are dropped
normal_lagged_all = lag_time_series_features(normal_data, lags=range(max_lag))
normal_data_lagged_features = normal_lagged_all.dropna()  # TODO put in pipeline

In [None]:
# Autoencoder-based detector trained for 20 epochs; the trailing comment records a
# hidden-layer layout that is currently NOT passed in.
# Note: this rebinds `detector`, which previously held the Hampel detector.
# NOTE(review): no random seed is set here, so results may differ between runs —
# confirm whether AutoEncoder exposes one.
detector = AutoEncoder(epochs=20) # hidden_neurons=[max_lag, 3, 3, max_lag]

In [None]:
# Train on lagged windows drawn only from the "normal" 2015 period
detector.fit(normal_data_lagged_features)

In [None]:
# Lag the gap-filled full series the same way as the training data;
# dropna() discards every row in which any lag is still missing
data_gap_filled = data.interpolate()
data_lagged_features = lag_time_series_features(data_gap_filled, lags=range(max_lag)).dropna()

In [None]:
# Compare the detector output with 1 to obtain a boolean mask
# (presumably 1 marks an anomalous row — confirm against AutoEncoder.detect)
anomalies = detector.detect(data_lagged_features) == 1

In [None]:
# Timestamps of the flagged rows; a row's index is the time of its lag-0 (most recent) sample
anomalies_time = data_lagged_features.index[anomalies]

In [None]:
# Full series (line), the samples flagged by the autoencoder (markers) and the
# interpolated 2015 training period for reference.
detected = data.to_frame()
# Label-based selection of the flagged timestamps
detected["anomalies"] = data.loc[anomalies_time]
detected["normal_data"] = normal_data
# Only two styles are given for three columns; "normal_data" falls back to the default style.
# Count with the vectorised .sum() instead of the builtin sum(), which iterates the mask in Python.
detected.plot(style=['-', 'o'], figsize=(8,3), title=f'Anomalies detected: {anomalies.sum()}')

In [None]:
# TODO extract reconstructed signal