In [1]:

import os
import sys
import numpy as np
import json as json
import pandas as pd
from datetime import datetime

# General Paths
data_path = os.getcwd()
project_path = os.path.dirname(data_path)
benchmarks_path = os.path.join(project_path, 'benchmarks')

SKAB_path = os.path.join(benchmarks_path, 'SKAB')
SKAB_dataset = os.path.join(SKAB_path, 'data')
SKAB_labels = os.path.join(SKAB_path, 'labels')

SKAB_clean_path = os.path.join(data_path, 'SKAB')

# ALL TS x all Task Families 345
    # 20 TS Per (TaskFamilyxTask)
        # 3  Pieces of Info for Each Charachter Image
            #[0] - Time Series ( [50] Vector ) 
            #[1] - Charchter Number (in aplhabet)
            #[2] - Language Number (in list of language directories)b

# Num Sequences: 6900
# Num Anomalies: 107
# Class Balance = 0.015507246376811595

# 345*0.8 = 276
# 345-275 = 70


In [2]:
# Load the Task Manifest
task_manifest = {
    "anomaly-free": {
        "dataset": "SKAB",
        "name": "anomaly-free",
        "anomaly_keys": []
    },
    "other": {
        "dataset": "SKAB",
        "name": "other",
        "anomaly_keys": ["anomaly", "changepoint"]
    },
    # "valve1": {
    #     "dataset": "SKAB",
    #     "name": "valve1",
    #     "anomaly_keys": ["anomaly", "changepoint"]
    # },
    # "valve2": {
    #     "dataset": "SKAB",
    #     "name": "valve2",
    #     "anomaly_keys": ["anomaly", "changepoint"]
    # }
}

In [3]:
# Datasets
datasets = {
    'SKAB': {
        'path': SKAB_dataset,
        # 'labels': get_nab_labels,
        'ts_name': 'datetime'
    },
}


In [17]:
data = []

anoms = []

# Loop through each of the individual Tasks in the Dataset
for task_family_number, (task_family_name, task_data) in enumerate(task_manifest.items()):
    
    # Make the Directory for the Task
    proml_task_family = os.path.join(SKAB_clean_path, task_family_name)
    if not os.path.exists(proml_task_family):
        os.makedirs(proml_task_family)

    # Loop through the Data Files in the Task Dataset
    task_path = os.path.join(datasets[task_data['dataset']]['path'], task_data['name'])
    for i, task_file in enumerate(os.listdir(task_path)):
        
        # Get the Name of the Data File for the given Family
        task_file_name, _ = task_file.split('.')

        task_file_path = os.path.join(task_path, task_file)
        task_df = pd.read_csv(task_file_path, sep=';')

        print(f"{task_family_name:20s} : {task_file:40s} : {len(task_df):8d}")

        # Do Any Datset Cleaning we want
        task_df.rename({datasets[task_data['dataset']]['ts_name']: 'timestamp'}, inplace=True)


        # Loop thrugh the Task Data to Create Frames Consistent Size
        splits_per_task = 20
        split_nu = 0
        split_size = 50
        split_start = 0
        split_end = split_size
        # (While we can still get 20(tracks)x50(samples)
        while split_start + (split_size*splits_per_task) < len(task_df): 
            print(f"\t\tSplit: {split_nu}")

            # Make the Charachter Directory
            char_name = f"{task_file_name}_{split_nu}"
            char_path = os.path.join(proml_task_family, char_name)
            if not os.path.exists(char_path):
                os.makedirs(char_path)

            # Make Sure there are 20 Samples
            group_samples = []
            for j in range(splits_per_task):

                # DO THE SPLITZ
                split_df = task_df.iloc[split_start:split_end, :]
                split_start += split_size
                split_end += split_size

                # Check to See if there is an ANOM in the SPLIT
                is_anom = False
                for anomaly_key in task_data['anomaly_keys']:
                    split_anomaly_labels = split_df[anomaly_key].values

                    if any([True if i == 1 else False for i in split_anomaly_labels ]):
                        is_anom = True

                # Remove the TS / Drop Columns for Anomaly Labels
                cols_to_drop = ['datetime']
                cols_to_drop.extend(task_data['anomaly_keys'])
                print(cols_to_drop)
                split_df = split_df.drop(labels=cols_to_drop, axis=1)

                # Save the Dataframe to a new ProML folder
                proml_file_path = os.path.join(char_path, f'{j+1}.csv')
                split_df.to_csv(proml_file_path, index=False)

                # Add the Sample to Tensor to be Loaded by VERSA
                split_arr = split_df.values.tolist()
                group_samples.append((np.array(split_arr), int(is_anom)))
                anoms.append(int(is_anom))
            
            # Append Data
            data.append(group_samples)

            # Increment Split NU
            split_nu += 1
display(split_df.head())

anomaly-free         : anomaly-free.csv                         :     9405
		Split: 0
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
		Split: 1
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
		Split: 2
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
		Split: 3
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime']
['datetime

Unnamed: 0,Accelerometer1RMS,Accelerometer2RMS,Current,Pressure,Temperature,Thermocouple,Voltage,Volume Flow RateRMS
950,0.217197,0.263931,3.04239,0.054711,88.5686,29.974,228.55,127.0
951,0.216577,0.265632,2.85443,0.054711,88.6397,29.9737,230.951,127.0
952,0.213146,0.266581,2.63666,0.382638,88.741,29.9883,224.276,127.685
953,0.214368,0.264552,2.71573,0.382638,88.7698,29.9851,208.547,128.0
954,0.214268,0.263457,2.7173,0.054711,88.646,29.9967,221.889,127.317


In [18]:
data = np.array(data)
print(data.shape)
np.save('skab.npy', data)


(19, 20, 2)


  """Entry point for launching an IPython kernel.


In [19]:
sum(anoms)/len(anoms)

0.21842105263157896

In [20]:
print(data[0][0][0].shape)

(50, 8)
