In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils.evaluation import label_evaluation

# Select seaborn's "whitegrid" plotting style.
sns.set_style("whitegrid")

## Load Data

In [2]:
# Input CSV locations, relative to the working directory.
data_folder = '../'
train_file, test_file = 'train.csv', 'test.csv'

# Paths are built by plain string concatenation, so data_folder must end
# with a path separator.
train_path = f'{data_folder}{train_file}'
test_path = f'{data_folder}{test_file}'

In [3]:
# Read the training CSV, then rename the 'KPI ID' column to 'kpi_id'.
df_train = pd.read_csv(train_path)
df_train = df_train.rename(columns={'KPI ID': 'kpi_id'})
print(df_train.shape)
df_train.head(2)

(2476315, 4)


Unnamed: 0,timestamp,value,label,kpi_id
0,1493568000,1.901639,0,02e99bd4f6cfb33f
1,1493568060,1.786885,0,02e99bd4f6cfb33f


In [4]:
# Add a datetime column, reading `timestamp` as seconds (unit='s').
epoch_seconds = df_train['timestamp']
df_train['datetime'] = pd.to_datetime(epoch_seconds, unit='s')

### Generate test evaluation files

To run the evaluation from the command line: ```python evaluation.py 'ground_truth.hdf' 'predict.csv' 7```

#### Performance Metric

The anomaly detection algorithm needs to give a predicted label (0 or 1) for every observation in the testing set. We use the F-score on the test set as the final performance metric in the ranking.
In real applications, the human operators generally do not care about the point-wise metrics. It is acceptable for an algorithm to trigger an alert for any point in a contiguous anomaly segment if the delay is not too long.
**For an anomaly segment with start point *i*, if any point from *i* to *T+i* in the ground truth was detected, we say this segment is detected correctly, and all points in this segment are treated as true positives. Otherwise, all points in this segment are regarded as false negatives**. Meanwhile, the points outside the anomaly segments are treated as usual. For example (see the figure below), when *T=1*, the first anomaly segment was detected and the second segment was not, so precision=0.75 and recall=0.5. Based on the above strategy, we calculate the F-score.

<img src="./assets/evaluation.png">

In [5]:
# Arguments handed to label_evaluation in the cells below: the ground-truth
# file, the prediction file and the delay value.
truth_file, result_file = 'ground_truth.hdf', 'predict.csv'
delay = 7

#### Perfect Prediction

In [6]:
# "Perfect" predictions: the true labels, exposed under the `predict` column
# and with the KPI id column renamed back to 'KPI ID'.
prediction_columns = ['timestamp', 'value', 'kpi_id', 'label']
predict = df_train[prediction_columns].rename(
    columns={'kpi_id': 'KPI ID', 'label': 'predict'}
)
predict.to_csv(result_file)

In [7]:
# Ground truth keeps the `label` column as-is; only the KPI id column is
# renamed back to 'KPI ID'.
# NOTE: DataFrame.to_hdf requires the optional PyTables dependency.
truth_columns = ['timestamp', 'value', 'kpi_id', 'label']
ground_truth = df_train[truth_columns].rename(columns={'kpi_id': 'KPI ID'})
ground_truth.to_hdf(truth_file, key='df')

ImportError: Missing optional dependency 'pytables'.  Use pip or conda to install pytables.

In [None]:
# Evaluate result_file against truth_file with the configured delay and
# print whatever label_evaluation returns.
perfect_score = label_evaluation(truth_file, result_file, delay)
print(perfect_score)

#### Random Prediction

In [None]:
# Random baseline: same layout as the prediction file written from the true
# labels, but every entry of the `predict` column is a random 0 or 1.
predict = df_train.loc[:, ['timestamp', 'value', 'kpi_id', 'label']].rename(
    columns={'kpi_id': 'KPI ID', 'label': 'predict'}
)
# Seeded generator so that Restart & Run All reproduces the same baseline.
rng = np.random.default_rng(0)
# Bracket assignment targets the column explicitly; attribute assignment
# (predict.predict = ...) only works while a column of that name already
# exists and is easy to misread.
predict['predict'] = rng.choice([0, 1], size=len(predict))
predict.to_csv(result_file)

In [None]:
# Write the ground-truth file again: the original `label` column is kept
# and only the KPI id column is renamed back to 'KPI ID'.
# NOTE: DataFrame.to_hdf requires the optional PyTables dependency.
truth_columns = ['timestamp', 'value', 'kpi_id', 'label']
ground_truth = df_train[truth_columns].rename(columns={'kpi_id': 'KPI ID'})
ground_truth.to_hdf(truth_file, key='df')

In [None]:
# Evaluate the random predictions in result_file against truth_file with the
# configured delay and print whatever label_evaluation returns.
random_score = label_evaluation(truth_file, result_file, delay)
print(random_score)