In [2]:
"""
Adapted from https://github.com/wagner-d/TimeSeAD/tree/master/timesead
Implementation of the Server Machine Dataset [Su2019]_.
The data consists of traces from 28 different servers recorded over several weeks. We consider each trace to be a
separate dataset.

.. note::
    Automatically downloading the dataset currently requires that you have `git` installed on your system!

.. [Su2019] Y. Su, Y. Zhao, C. Niu, R. Liu, W. Sun, D. Pei.
    Robust anomaly detection for multivariate time series through stochastic recurrent neural network.
    In: Proceedings of the 25th ACM SIGKDD international conference on knowledge discovery & data mining,
    2019 Jul 25 (pp. 2828-2837).
"""
# Git repository referenced as the download source for the raw SMD files.
GITHUB_LINK = 'https://github.com/NetManAIOps/OmniAnomaly.git'

# One trace file per server, named machine-<group>-<index>.txt (8, 9 and 11 servers in groups 1, 2 and 3).
# The names are sorted as plain strings, so 'machine-3-10.txt' and 'machine-3-11.txt' come before
# 'machine-3-2.txt'. TRAIN_LENS and TEST_LENS below appear to follow this same order.
# For a quick single-server run, temporarily replace the list with ['machine-1-1.txt'].
FILENAMES = sorted(
    f'machine-{group}-{index}.txt'
    for group, count in ((1, 8), (2, 9), (3, 11))
    for index in range(1, count + 1)
)

# Presumably the number of rows in each train trace, one entry per file in FILENAMES -- TODO confirm against the
# raw files.
TRAIN_LENS = [
    28479, 23694, 23702, 23706, 23705, 23688, 23697,
    23698, 23693, 23699, 23688, 23689, 23688, 28743,
    23696, 23702, 28722, 28700, 23692, 28695, 23702,
    23703, 23687, 23690, 28726, 28705, 28703, 28713,
]

# Presumably the number of rows in each test trace, one entry per file in FILENAMES -- TODO confirm against the
# raw files.
TEST_LENS = [
    28479, 23694, 23703, 23707, 23706, 23689, 23697,
    23699, 23694, 23700, 23689, 23689, 23689, 28743,
    23696, 23703, 28722, 28700, 23693, 28696, 23703,
    23703, 23687, 23691, 28726, 28705, 28704, 28713,
]


In [3]:
import os
from typing import List

import numpy as np
import pandas as pd

def save_statistics(frame: pd.DataFrame, path: str):
    """
    Store column-wise summary statistics of a single :class:`~pandas.DataFrame` in a `.npz` archive.

    The archive contains the arrays ``mean``, ``std``, ``min``, ``max`` and ``median``, each obtained from the
    DataFrame method of the same name and converted to a NumPy array.

    :param frame: The dataset for which to compute and save statistics.
    :param path: Destination passed to :func:`numpy.savez`.
    """
    # The archive keys double as the names of the DataFrame reductions to call.
    stat_names = ('mean', 'std', 'min', 'max', 'median')
    stats = {name: getattr(frame, name)().to_numpy() for name in stat_names}

    np.savez(path, **stats)

# preprocess data
def preprocess_smd_data(dataset_dir: str, out_dir: str, filenames: List[str]):
    """
    Compute and store feature statistics for the training traces of the SMD dataset.

    Each file ``<dataset_dir>/train/<filename>`` is read as a comma-separated float table, and its statistics are
    written via :func:`save_statistics` to ``<out_dir>/<stem>_stats.npz``, where ``<stem>`` is the part of the
    file name in front of the first dot.

    :param dataset_dir: Path to the dataset folder
    :param out_dir: Directory where the preprocessed data should be saved. This directory should exist already.
    :param filenames: Names of the trace files (inside the ``train`` sub-folder) to process.
    """
    train_dir = os.path.join(dataset_dir, 'train')

    for trace_name in filenames:
        trace = pd.DataFrame(
            np.genfromtxt(os.path.join(train_dir, trace_name), dtype=np.float32, delimiter=',')
        )

        # Everything before the first dot names the machine, e.g. 'machine-1-1'.
        stem = trace_name.partition('.')[0]
        save_statistics(trace, os.path.join(out_dir, f'{stem}_stats.npz'))

# Root folder of the SMD data, given relative to the current working directory.
smd_path = "../data/ServerMachineDataset"
# Sub-folder for preprocessed output; created here if missing (no error when it already exists).
# NOTE(review): presumably meant as `out_dir` for preprocess_smd_data -- that call is not in the visible cells.
processed_dir = os.path.join(smd_path, 'processed')
os.makedirs(processed_dir, exist_ok=True)

In [4]:
import h5py
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt

def read_data_to_clips(smd_path, test_str, clip_len=720, test_split=14400, scaler=None, shots=-1):
    """
    Cut the SMD traces listed in ``FILENAMES`` into fixed-length clips, normalize them and write the result to
    ``<smd_path>/smd_<test_str>.h5``.

    For ``test_str == 'train'`` every file contributes its complete ``train`` trace (labelled all-zero) followed
    by the first ``test_split`` rows of its ``test`` trace together with the matching rows of ``test_label``.
    For any other value, the rows from ``test_split`` onwards of ``<smd_path>/<test_str>/<file>`` and of
    ``test_label/<file>`` are used. Each trace is split into consecutive, non-overlapping windows of ``clip_len``
    rows; a trailing remainder shorter than ``clip_len`` is dropped. A clip gets label 1 if any of its rows has a
    non-zero label, otherwise 0.

    The HDF5 file holds four datasets: ``X`` (all clips), ``y`` (clip labels), and ``X_anom`` / ``y_anom`` (the
    clips labelled 1, truncated to ``shots`` entries when ``shots`` is positive).

    :param smd_path: Path to the SMD dataset folder.
    :param test_str: ``'train'`` to build the training split; otherwise the name of the sub-folder to read
        (e.g. ``'test'``).
    :param clip_len: Number of rows (time steps) per clip.
    :param test_split: Row index at which every test trace is divided between the two splits.
    :param scaler: Optional scikit-learn style scaler. ``None`` creates a new ``MinMaxScaler``. A scaler that is
        not fitted yet is fitted on this split; an already fitted scaler is only applied, so that a scaler
        returned for the training split normalizes the test split with the training statistics.
    :param shots: If positive, keep at most this many anomalous clips in ``X_anom`` / ``y_anom``.
    :return: Tuple ``(scaler, mean, std, data, target)``: the scaler that was used, the per-feature mean and
        standard deviation of the clips before normalization, the normalized clips with shape
        ``(num_clips, num_features, clip_len)`` and the clip labels.
    :raises ValueError: If no trace is long enough to yield a single clip.
    """
    # Imported locally so the function does not rely on anything beyond the cell's existing sklearn dependency.
    from sklearn.exceptions import NotFittedError
    from sklearn.utils.validation import check_is_fitted

    output_file = os.path.join(smd_path, "smd_" + test_str + ".h5")

    def load_csv(subdir, filename):
        # All SMD files are read as comma-separated float tables.
        return np.genfromtxt(os.path.join(smd_path, subdir, filename), dtype=np.float32, delimiter=',')

    all_clips = []
    all_targets = []

    for filename in tqdm(FILENAMES):
        if test_str == 'train':
            train_data = load_csv(test_str, filename)
            head_data = load_csv('test', filename)[0:test_split]
            head_target = load_csv('test_label', filename)[0:test_split]
            data = np.concatenate((train_data, head_data))
            # The train trace carries no labels; it is treated as entirely normal.
            target = np.concatenate((np.zeros(train_data.shape[0]), head_target))
        else:
            data = load_csv(test_str, filename)[test_split:]
            target = load_csv('test_label', filename)[test_split:]

        # Split data into clips; rows that do not fill a complete clip are discarded.
        num_clips = data.shape[0] // clip_len

        for i in range(num_clips):
            window = slice(i * clip_len, (i + 1) * clip_len)
            all_clips.append(data[window, :])
            all_targets.append(int(np.any(target[window])))

    if not all_clips:
        raise ValueError(f'No clips of length {clip_len} could be extracted for the {test_str} set')

    data = np.stack(all_clips, axis=0)
    # Statistics of the raw clips, taken over the clip and time axes (one value per feature).
    mean = np.mean(data, axis=(0, 1))
    std = np.std(data, axis=(0, 1))

    # Normalize data. The feature count is read from the data instead of being hard-coded.
    num_features = data.shape[-1]
    data = np.reshape(data, (-1, num_features))
    if scaler is None:
        scaler = MinMaxScaler()
    try:
        check_is_fitted(scaler)
    except (NotFittedError, TypeError):
        # Fresh (or non-standard) scaler: learn the scaling from this split.
        data = scaler.fit_transform(data)
    else:
        # Already fitted: reuse its statistics unchanged. Values may then fall outside the fitted range.
        data = scaler.transform(data)
    data = np.reshape(data, (-1, clip_len, num_features))
    data = np.transpose(data, (0, 2, 1))

    target = np.stack(all_targets, axis=0)

    anomalous = target == 1
    abnormal_data = data[anomalous]
    abnormal_target = target[anomalous]

    if shots > 0:
        abnormal_data = abnormal_data[:shots]
        abnormal_target = abnormal_target[:shots]

    print(f'Number of isolated anomaly clips: {abnormal_data.shape[0]}')

    print(f'Number of clips in {test_str} set: {data.shape[0]}')
    print(f'Percent anomaly in {test_str} set: {np.mean(target) * 100}%')

    # The file is opened only once every array is ready, so a failure while reading the traces does not
    # replace an existing output file with an empty one.
    with h5py.File(output_file, 'w') as hdf:
        hdf.create_dataset('X', data=data)
        hdf.create_dataset('y', data=target)
        hdf.create_dataset('X_anom', data=abnormal_data)
        hdf.create_dataset('y_anom', data=abnormal_target)

    return scaler, mean, std, data, target

# Build the train split first: the call returns the scaler object that is handed to the test call below.
# clip_len=200 rows per clip; the first 20000 rows of every test trace are added to the train split.
# shots=10 limits the separately stored anomalous clips (X_anom / y_anom) to ten.
scaler, train_mean, train_std, data, target = read_data_to_clips(smd_path, test_str='train', clip_len=200, test_split=20000, shots=10)
# Build the test split from the rows after index 20000 of every test trace, passing in the scaler from above.
_, test_mean, test_std, _, _ = read_data_to_clips(smd_path, test_str='test', clip_len=200, test_split=20000, scaler=scaler)

# Compare the per-feature statistics of the clips (computed before normalization) between the two splits.
print(f'Difference in mean: {train_mean - test_mean}')
print(f'Difference in std: {train_std - test_std}')

100%|██████████| 28/28 [00:26<00:00,  1.08it/s]


Number of isolated anomaly clips: 10
Number of clips in train set: 6328
Percent anomaly in train set: 5.847029077117573%


100%|██████████| 28/28 [00:12<00:00,  2.33it/s]

Number of isolated anomaly clips: 88
Number of clips in test set: 728
Percent anomaly in test set: 12.087912087912088%
Difference in mean: [-5.29941022e-02 -5.60901910e-02 -6.14546984e-02 -6.89448342e-02
  1.36468530e-01 -7.22491741e-03 -1.09375715e-02  0.00000000e+00
 -5.20095229e-03 -5.40256547e-03 -2.21763067e-02  1.00264698e-03
 -2.26004291e-02 -1.06054172e-02 -6.77057728e-03 -1.62157193e-02
 -1.33024834e-04 -1.79709605e-05 -4.10780907e-02 -2.99901217e-02
 -3.79961729e-02 -2.76627243e-02 -2.79120654e-02 -2.65056789e-02
  2.46123299e-02 -6.28837943e-03  2.23241095e-05 -3.53244096e-02
  4.26945808e-05  4.50034440e-03 -2.42450088e-02  1.23772025e-02
 -1.90595910e-03  2.15535238e-03 -9.53087211e-03 -1.47540569e-02
 -5.37676364e-03  8.36248510e-04]
Difference in std: [-0.04996069 -0.05103254 -0.05180858 -0.04964791  0.11900964  0.00413758
  0.01541451  0.         -0.00886978 -0.0242696  -0.02245094 -0.00131411
 -0.09674101 -0.0125635  -0.0066585  -0.0275014  -0.00419776 -0.00101301
 -0.


