In [None]:
import json
import os
import sys

# Extend the module search path BEFORE the non-stdlib imports below. The
# original cell appended '../' only after every import had already run, so on a
# fresh kernel the append could not help any of them (it only took effect when
# the cell was executed a second time). Appending keeps installed packages
# ahead of the parent directory in the lookup order.
sys.path.append('../')

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras as K
from tfsnippet.modules import Sequential
from tfsnippet.utils import get_variables_as_dict, VariableSaver

from donut import complete_timestamp, standardize_kpi
from donut import DonutTrainer, DonutPredictor
from donut import Donut
from evaluation import label_evaluation

In [None]:
# Input CSV files, located one directory above the notebook.
train_file, test_file = '../train.csv', '../test.csv'

# Scratch files: evaluate() writes the predictions / ground truth here and
# passes both paths to label_evaluation().
result_file, truth_file = 'predict.csv', 'ground_truth.hdf'

# Third argument handed to label_evaluation().
delay = 7

In [None]:
# Rename the 'KPI ID' column to 'kpi_id' so later cells can address it as a
# plain identifier (e.g. train_df.kpi_id).
_kpi_column = {'KPI ID': 'kpi_id'}
train_df = pd.read_csv(train_file).rename(columns=_kpi_column)
test_df = pd.read_csv(test_file).rename(columns=_kpi_column)

In [None]:
# Show the distinct KPI ids present in the training data.
train_df['kpi_id'].unique()

In [None]:
# KPI series processed by the rest of the notebook. The same id is reused as
# the TensorFlow variable scope name, the checkpoint sub-directory under
# models/, and the name of the final prediction CSV.
kpi_id = '02e99bd4f6cfb33f'

In [None]:
# Rows belonging to the selected KPI in the training split.
kpi = train_df[train_df['kpi_id'] == kpi_id]

# Same selection on the test split, with a constant placeholder 'label' column
# so the frame can go through complete_timestamp() like the training one.
# .assign() returns a new DataFrame; the original wrote the column into a
# filtered slice of test_df, which is the pattern that triggers pandas'
# SettingWithCopyWarning.
t_kpi = test_df[test_df['kpi_id'] == kpi_id].assign(label=0)

In [None]:
# Run both splits through donut's complete_timestamp(); each call yields the
# timestamp array, a missing-point indicator and the (values, labels) pair.
train_timestamp, train_missing, (train_values, train_labels) = complete_timestamp(
    kpi['timestamp'],
    (kpi['value'], kpi['label']),
)
test_timestamp, test_missing, (test_values, test_labels) = complete_timestamp(
    t_kpi['timestamp'],
    (t_kpi['value'], t_kpi['label']),
)

In [None]:
# Points that are labelled anomalous or flagged missing are passed as
# `excludes` when standardizing the training series.
train_excluded = np.logical_or(train_labels, train_missing)
train_values, mean, std = standardize_kpi(train_values, excludes=train_excluded)

# The test series reuses the mean/std obtained from the training series.
# NOTE(review): train_values / test_values are overwritten here, so re-running
# this cell alone presumably re-standardizes already standardized data and
# replaces mean/std -- re-run from the complete_timestamp cell instead.
test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)

In [None]:
SLIDING_WINDOW = 120


def _hidden_network():
    """Build a fresh stack of two 100-unit ReLU Dense layers with L2(0.001) kernel regularization."""
    return Sequential([
        K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                       activation=tf.nn.relu)
        for _ in range(2)
    ])


# The model is created inside a variable scope named after the KPI; the scope
# object (model_vs) is what the trainer and the checkpoint code refer to.
with tf.variable_scope(kpi_id) as model_vs:
    model = Donut(
        h_for_p_x=_hidden_network(),
        h_for_q_z=_hidden_network(),
        x_dims=SLIDING_WINDOW,
        z_dims=5,
    )

trainer = DonutTrainer(model=model, model_vs=model_vs)
predictor = DonutPredictor(model)

In [None]:
# Checkpoints for this KPI are stored in their own directory.
save_dir = "models/{}/".format(kpi_id)
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# A non-empty checkpoint directory is taken to mean the model was already
# trained and saved by a previous run, in which case training is skipped.
saved = len(os.listdir(save_dir)) > 0

if not saved:
    # `with tf.Session():` installs the session as the default one AND closes
    # it when the block exits. The original `tf.Session().as_default()` only
    # did the former, so the session and its resources were never released.
    with tf.Session():
        # train the model
        trainer.fit(train_values, train_labels, train_missing, mean, std)
        # save variables to 'save_dir' directory
        var_dict = get_variables_as_dict(model_vs)
        saver = VariableSaver(var_dict, save_dir)
        saver.save()
    saved = True

Performance Metric function

In [None]:
def evaluate(preds, labels, kpi_id, missing, timestamp):
    """Score predictions against labels through ``label_evaluation``.

    The ground truth is written to ``truth_file`` (HDF, key ``'df'``) and the
    predictions to ``result_file`` (CSV). In both cases only the rows whose
    ``missing`` flag equals 0 are kept. The two files are then passed to
    ``label_evaluation`` together with the module-level ``delay``.

    Args:
        preds: Predicted value per point, stored in the ``'predict'`` column.
        labels: Ground-truth value per point, stored in the ``'label'`` column.
        kpi_id: Value stored in the ``'KPI ID'`` column of both files.
        missing: Missing-point flag per point; rows with a non-zero flag are
            dropped before writing.
        timestamp: Timestamp per point.

    Returns:
        The ``'data'`` entry of the JSON document returned by
        ``label_evaluation`` (the caller treats it as the F1 score).
    """
    ground_truth = pd.DataFrame({
        'missing': missing,
        'label': labels,
        'KPI ID': kpi_id,
        'timestamp': timestamp,
    })
    observed_truth = ground_truth[ground_truth['missing'] == 0]
    observed_truth.to_hdf(truth_file, key='df')

    predicted = pd.DataFrame({
        'missing': missing,
        'predict': preds,
        'KPI ID': kpi_id,
        'timestamp': timestamp,
    })
    observed_predicted = predicted[predicted['missing'] == 0]
    observed_predicted.to_csv(result_file)

    report = json.loads(label_evaluation(truth_file, result_file, delay))
    return report['data']

Donut inference: restore the trained model, score the train and test sets, and select a threshold

In [None]:
# `with tf.Session():` makes the session the default one and closes it when the
# block exits; the original `tf.Session().as_default()` never closed it.
with tf.Session():
    # restore variables from 'save_dir'; the variable dict is built from
    # model_vs, the same scope object used when the checkpoint was saved
    saver = VariableSaver(get_variables_as_dict(model_vs), save_dir)
    saver.restore()
    # make predictions
    train_score = predictor.get_score(train_values, train_missing)
    test_score = predictor.get_score(test_values, test_missing)
    # try different thresholds
    best_threshold, best_f1, best_predictions = 0, 0, []
    thresholds = np.arange(0, 50, 0.2)

    # The predictions are left-padded with SLIDING_WINDOW - 1 zeros before being
    # compared with the labels (presumably the first window has no score --
    # TODO confirm against DonutPredictor.get_score). The padding does not
    # depend on the threshold, so it is built once outside the loop.
    window_padding = [0] * (SLIDING_WINDOW - 1)

    for t in thresholds:
        # a point is flagged when the magnitude of its score exceeds t
        anomaly_predictions = np.where(np.abs(train_score) > t, 1, 0)
        f1 = evaluate(np.concatenate([window_padding, anomaly_predictions]),
                      train_labels, kpi_id, train_missing, train_timestamp)

        # keep the first threshold that reaches the highest score seen so far
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = t
            best_predictions = anomaly_predictions

    anomaly_predictions = best_predictions
    print("Best f1 score: {}".format(best_f1))

In [None]:
# Display the threshold selected by the search over the training scores.
best_threshold

In [None]:
# Flag the test points whose absolute score exceeds the selected threshold,
# then left-pad with SLIDING_WINDOW - 1 zeros, as done for the training scores.
test_flags = np.where(abs(test_score) > best_threshold, 1, 0)
padded_test_flags = np.concatenate([[0] * (SLIDING_WINDOW - 1), test_flags])

predictions = pd.DataFrame({
    'missing': test_missing,
    'predict': padded_test_flags,
    'KPI ID': kpi_id,
    'timestamp': test_timestamp,
})

# Only rows whose missing flag is 0 are written to '<kpi_id>.csv'.
observed_predictions = predictions[predictions['missing'] == 0]
observed_predictions.to_csv('{}.csv'.format(kpi_id))