In [1]:

import os
import sys
import numpy as np
import json as json
import pandas as pd
from datetime import datetime

# Directory layout. The current working directory is treated as the data
# directory; its parent is the project root, which holds `benchmarks/`.
data_path = os.getcwd()
project_path = os.path.split(data_path)[0]
benchmarks_path = os.path.join(project_path, 'benchmarks')

# Raw NAB benchmark: the time-series files live under `data/`, the anomaly
# labels under `labels/`.
NAB_path = os.path.join(benchmarks_path, 'NAB')
NAB_dataset, NAB_labels = (os.path.join(NAB_path, leaf) for leaf in ('data', 'labels'))

# Output root for the re-windowed copy of NAB written by this notebook.
NAB_clean_path = os.path.join(data_path, 'NAB')

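# All time series x all task families: 345 groups
    # 20 time series per (task family x task) group
        # 3 pieces of info for each character image
        # NOTE: this wording appears to be carried over from a character-image
        # dataset; the processing cell in this notebook stores only a
        # (values, is_anom) pair per window - confirm whether [1] and [2] still apply.
            #[0] - Time series (vector of 50 values)
            #[1] - Character number (in alphabet)
            #[2] - Language number (in list of language directories)

# Num Sequences: 6900
# Num Anomalies: 107
# Class Balance = 0.015507246376811595

# 345*0.8 = 276
# 345-276 = 69  (originally written as "345-275 = 70"; confirm the intended split sizes)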


In [2]:
# Task manifest: one entry per NAB task family. Each entry records the dataset
# it belongs to and the family's folder name, which is the same as its key.
task_manifest = {
    family: {"dataset": "NAB", "name": family}
    for family in (
        "artificialNoAnomaly",
        "artificialWithAnomaly",
        "realAdExchange",
        "realAWSCloudwatch",
        "realKnownCause",
        "realTraffic",
        "realTweets",
    )
}

In [3]:
# Registry of source datasets, keyed by dataset name.
#   path    - directory that contains one sub-folder per task family
#   ts_name - name of the timestamp column in that dataset's CSV files
datasets = dict(
    NAB=dict(
        path=NAB_dataset,
        # 'labels': get_nab_labels,   (label-loader hook, currently disabled)
        ts_name='timestamp',
    ),
)


In [4]:
# Read the benchmark's original anomaly labels. The processing cell looks
# entries up by "<task family>/<file name>" and parses each entry as a
# timestamp string.
labels_file = os.path.join(NAB_labels, 'combined_labels.json')
benchmark_labels = dict()
with open(labels_file, mode='r') as labels_fh:
    benchmark_labels = json.loads(labels_fh.read())

In [5]:
# Accumulators filled by the loop below:
#   data  - one entry per group; each group is a list of `splits_per_task`
#           (window values, anomaly flag) tuples
#   anoms - flat list with one 0/1 anomaly flag per window, in processing order
data = []

anoms = []

# Windowing parameters: every group consists of `splits_per_task` consecutive,
# non-overlapping windows of `split_size` rows each.
splits_per_task = 20
split_size = 50
group_size = split_size * splits_per_task

# Timestamp layout shared by the label file and the 'timestamp' CSV column.
ts_format = "%Y-%m-%d %H:%M:%S"


def _window_has_anomaly(window_df, anomaly_times):
    """Return True if any labelled anomaly falls inside the window.

    Args:
        window_df: non-empty DataFrame slice with a 'timestamp' column of
            strings in `ts_format`.
        anomaly_times: list of `datetime` objects for the file's anomalies.

    Returns:
        bool: True when first-row timestamp <= anomaly <= last-row timestamp
        holds for at least one anomaly (both ends inclusive).
    """
    # Only parse the window bounds when there is something to compare against,
    # so files without labelled anomalies never have their timestamps parsed.
    if not anomaly_times:
        return False
    window_start = datetime.strptime(window_df.iloc[0]['timestamp'], ts_format)
    window_end = datetime.strptime(window_df.iloc[-1]['timestamp'], ts_format)
    return any(window_start <= anomaly <= window_end for anomaly in anomaly_times)


# Loop through each task family in the manifest
for task_family_name, task_data in task_manifest.items():
    dataset_cfg = datasets[task_data['dataset']]

    # Make the output directory for the task family
    proml_task_family = os.path.join(NAB_clean_path, task_family_name)
    os.makedirs(proml_task_family, exist_ok=True)

    # Loop through the data files of the task family. os.listdir returns
    # entries in arbitrary order, so sort them to keep the order of samples in
    # `data` / `anoms` reproducible across runs and machines.
    task_path = os.path.join(dataset_cfg['path'], task_data['name'])
    for task_file in sorted(os.listdir(task_path)):

        # Split off the extension without assuming exactly one dot in the
        # name, and skip anything that is not a CSV (e.g. hidden files).
        task_file_name, task_file_ext = os.path.splitext(task_file)
        if task_file_ext.lower() != '.csv':
            continue

        task_file_path = os.path.join(task_path, task_file)
        task_df = pd.read_csv(task_file_path)

        # Get the benchmark labels for this file and parse them once
        benchmark_labels_key = f"{task_family_name}/{task_file}"
        benchmark_task_labels = benchmark_labels[benchmark_labels_key]
        anomaly_times = [
            datetime.strptime(anom_ts, ts_format) for anom_ts in benchmark_task_labels
        ]

        # Dataset cleaning: normalise the timestamp column name.
        # `columns=` is required here - a bare mapping renames index labels.
        task_df = task_df.rename(columns={dataset_cfg['ts_name']: 'timestamp'})

        # Cut the file into groups of `splits_per_task` windows of
        # `split_size` rows each.
        split_nu = 0
        split_start = 0
        # `<=` so that a final group which fits exactly is still emitted
        while split_start + group_size <= len(task_df):

            # Make the directory for this group of windows
            char_name = f"{task_file_name}_{split_nu}"
            char_path = os.path.join(proml_task_family, char_name)
            os.makedirs(char_path, exist_ok=True)

            group_samples = []
            for j in range(splits_per_task):

                # Take the next window and advance the cursor
                split_df = task_df.iloc[split_start:split_start + split_size, :]
                split_start += split_size

                # Flag the window if it contains a labelled anomaly
                is_anom = _window_has_anomaly(split_df, anomaly_times)

                # Drop the timestamp; only the values are exported
                split_values = split_df['value']

                # Save the window as <group dir>/<1-based index>.csv
                proml_file_path = os.path.join(char_path, f'{j+1}.csv')
                split_values.to_csv(proml_file_path, index=False)

                # Collect the (values, anomaly flag) sample for the array
                split_arr = split_values.values.tolist()
                group_samples.append((np.array(split_arr), int(is_anom)))
                anoms.append(int(is_anom))

            data.append(group_samples)
            split_nu += 1

# Each sample is an (ndarray, int) tuple, so the nested list is ragged at the
# innermost level. dtype=object is required on NumPy >= 1.24 and yields an
# array of shape (n_groups, splits_per_task, 2).
data = np.array(data, dtype=object)
print(data.shape)
# NOTE: an object array is pickled by np.save; load it with allow_pickle=True.
np.save('nab.npy', data)

(345, 20, 2)




In [6]:
# Class balance: fraction of all generated windows flagged as anomalous
# (`anoms` holds one 0/1 flag per window).
anomalous_windows, total_windows = sum(anoms), len(anoms)
anomalous_windows / total_windows

0.015507246376811595