In [2]:
import asyncio
import httpx
import sys
sys.path.append('../')
sys.path.append('/app/python-modules')

!{sys.executable} -m pip install tqdm
from tqdm import tqdm

import os
import json
import pandas as pd
import datetime
from pathlib import Path

from utils.time_function import time_function as timeit
from utils.TimeSeriesClient import TimeSeriesClient

#custom imports
import config
import nab_utils

from time import perf_counter
# Shared API client used by every request in this notebook. It targets a service
# on the local machine (127.0.0.1:8000).
# NOTE(review): timeout=30 is presumably seconds — confirm against TimeSeriesClient.
client = TimeSeriesClient(base_url='http://127.0.0.1:8000', timeout=30)

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Build Collections

In [None]:
# Folder that holds one sub-folder per collection of raw datasets.
collections_path = Path(config.ROOT_PATH) / "static" / "nab" / "raw_datasets"

# Names of the immediate sub-directories only: element 1 of the first
# (dirpath, dirnames, filenames) triple yielded by os.walk.
collection_folders = next(os.walk(collections_path))[1]

# Human-readable description for each known collection, keyed by folder name.
collection_descriptions = {
    "realAWSCloudwatch": "AWS server metrics as collected by the AmazonCloudwatch service. Example metrics include CPU Utilization, Network Bytes In, and Disk Read Bytes.",
    "realAdExchange": "Online advertisement clicking rates, where the metrics are cost-per-click (CPC) and cost per thousand impressions (CPM). One of the files is normal, without anomalies.",
    "realKnownCause": "This is data for which we know the anomaly causes; no hand labeling.",
    "realTraffic": "Real time traffic data from the Twin Cities Metro area in Minnesota, collected by the Minnesota Department of Transportation. Included metrics include occupancy, speed, and travel time from specific sensors.",
    "realTweets": "A collection of Twitter mentions of large publicly-traded companies such as Google and IBM. The metric value represents the number of mentions for a given ticker symbol every 5 minutes.",
    "artificialNoAnomaly": "Artificially-generated data without any anomalies.",
    "artificialWithAnomaly": "Artificially-generated data with varying types of anomalies.",
}

def build_collection_object(collection_name):
    """Return the creation payload for a single collection.

    Args:
        collection_name: Key into the module-level ``collection_descriptions``.

    Returns:
        dict with ``name``, ``description`` and ``tags`` (always ``["nab"]``).

    Raises:
        KeyError: if ``collection_name`` has no entry in ``collection_descriptions``.
    """
    return {
        "name": collection_name,
        "description": collection_descriptions[collection_name],
        "tags": ["nab"],
    }

async def register_collections(collection_folders):
    """Create every named collection with a single request.

    Args:
        collection_folders: Iterable of collection names; each is turned into a
            payload by ``build_collection_object``.

    Returns:
        list of ``{"name": ..., "id": ...}`` dicts, one per entry in the response
        of the ``/collection/create_many`` call.
    """
    payload = [build_collection_object(folder) for folder in collection_folders]

    created = await client.request("post", "/collection/create_many", json=payload)

    # Keep only the two fields that later cells need.
    return [{"name": item['name'], "id": item['id']} for item in created]

## Get Labels and Set Up the Labeler Detector

In [None]:
# Ground-truth anomaly labels. build_objects looks entries up by dataset name
# ("<folder>/<file>.csv") and iterates the timestamps stored under that key.
LABELS_FILE = Path(config.NAB_ASSETS) / "labels" / "combined_labels.json"

with LABELS_FILE.open() as json_file:
    labels = json.load(json_file)

async def register_original_dectector():
    """Create the "labeler" detector and return the service's response.

    The payload is posted to ``/detector/create`` wrapped under the ``"payload"``
    key. Later code reads ``id`` and ``name`` from the returned object.
    """
    labeler = {
        "name": "labeler",
        "description": "A collection of the original datasets with the provided ground truth labels for anomalies.",
        "source": "https://github.com/numenta/NAB",
        "documentation": "https://github.com/numenta/NAB/wiki#nab-whitepaper",
        "tags": ['Demo'],
    }

    return await client.request("post", "/detector/create", json={"payload": labeler})

# Register the labeler detector once. build_objects and register_datafeed read
# this module-level object (its 'id' and 'name') when building anomalies and predictions.
detector_obj = await register_original_dectector()

### Build the datafeed, dataset, health, and anomaly objects for a single datafeed. Each datafeed gets one dataset (the file it represents), plus all of its anomalies, which are later added to a prediction object.

In [6]:
def build_objects(dataset_name):
    """Build the datafeed, dataset, health and anomaly payloads for one dataset file.

    The CSV at ``<config.NAB_ASSETS>/raw_datasets/<dataset_name>`` is loaded and
    indexed by its ``timestamp`` column. An ``anomaly_label`` column is added
    (all False) and ``nab_utils.add_labels_to_dataset`` is called on the frame.
    One anomaly record is built per entry in the module-level
    ``labels[dataset_name]``, attributed to the module-level ``detector_obj``.

    Args:
        dataset_name: Relative name of the form ``"<folder>/<filename>.csv"``
            (split on ``"/"``).

    Returns:
        tuple ``(datafeed_object, dataset_object, health_object, anomaly_records)``
        where the first three are dicts and the last is a list of dicts.

    Raises:
        KeyError: if ``dataset_name`` has no entry in ``labels`` or a labelled
            timestamp is not present in the dataset index.
        IndexError: if ``dataset_name`` has no ``"/"`` or the CSV has no rows.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%f%z'

    # Load the dataset and index it by timestamp.
    # Path(...) mirrors how LABELS_FILE is built and also accepts a plain string.
    dataset = pd.read_csv(Path(config.NAB_ASSETS) / "raw_datasets" / dataset_name)
    dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])
    dataset = dataset.set_index("timestamp")
    dataset["anomaly_label"] = False  # default; the helper below marks labelled rows
    # Return value is ignored. NOTE(review): presumably mutates `dataset` in place — confirm.
    nab_utils.add_labels_to_dataset(dataset, dataset_name)

    name_parts = dataset_name.split("/")
    folder = name_parts[0]
    filename = name_parts[1]

    start_time = dataset.index[0].strftime(timestamp_format)
    # Last row of the file. The previous code read index[1] (the second row), so
    # every object reported an end time one sample after its start.
    end_time = dataset.index[-1].strftime(timestamp_format)
    anomaly_count = len(dataset[dataset['anomaly_label'] == True])

    # One tag describes the data's origin; it must match on both objects.
    # Previously the dataset object was tagged 'real' even for artificial data.
    origin_tag = 'real' if 'real' in dataset_name else 'artificial'

    datafeed_object = {
        # Underscores become spaces, words are title-cased, and the trailing
        # four characters (the ".csv" extension) are dropped.
        "name": filename.replace('_', ' ').title()[:-4],
        "folder": folder,
        "filename": filename,
        "feed_type": 'file',
        "start_time": start_time,
        "end_time": end_time,
        "anomaly_count": anomaly_count,
        "tags": [origin_tag],
    }

    health_object = {
        "heartbeat_frequency": 3600,
        "heartbeat_timeout": 3,
        "last_received": datetime.datetime.now().strftime(timestamp_format),
        "score": 100,
        "status": "active",
    }

    dataset_object = {
        "anomaly_count": anomaly_count,
        "labeled": 'True',
        "file_type": 'csv',
        "path": dataset_name,
        "folder": folder,
        "filename": filename,
        "start_time": start_time,
        "end_time": end_time,
        "tags": [origin_tag],
    }

    # One anomaly record per ground-truth label; scores are placeholders (0.0).
    anomaly_records = [
        {
            "dataset_name": dataset_name,
            "detector_id": detector_obj['id'],
            "detector_name": detector_obj['name'],
            "value": float(dataset.at[item, "value"]),
            "anomaly_score": float(0),
            "threshold": float(0),
            "time": item,
            "status": 'test',
            "severity": "low",
            "tags": ['label'],
        }
        for item in labels[dataset_name]
    ]

    return datafeed_object, dataset_object, health_object, anomaly_records


def build_prediction_object(dataset_obj,detector_obj):
    """Build the prediction payload that ties a dataset to a detector.

    Must be called after the dataset has been created, because it needs the
    dataset's ``id``.

    Args:
        dataset_obj: Mapping providing ``id``, ``path``, ``folder`` and ``filename``.
        detector_obj: Mapping providing ``id`` and ``name``.

    Returns:
        dict with ``dataset_id``, ``dataset_name``, ``detector_name``,
        ``detector_id`` and ``url``. ``url`` is the string path
        ``<NAB_ASSETS>/predictions/<detector>/<folder>/<detector>_<filename>``.
    """
    detector_name = detector_obj['name']

    predictions_dir = Path(config.NAB_ASSETS / "predictions" / detector_name)
    prediction_file = Path(detector_name + "_" + dataset_obj["filename"])

    return {
        "dataset_id": dataset_obj['id'],
        "dataset_name": dataset_obj['path'],
        "detector_name": detector_name,
        "detector_id": detector_obj['id'],
        "url": str(predictions_dir / dataset_obj["folder"] / prediction_file),
    }

## Build Datafeeds / Datasets / Predictions / Anomalies

### Outer Loop: process_collection: for each collection object, makes a datafeed object for each file in the folder; runs as an asyncio.gather over register_datafeed
### Inner Loop: register_datafeed:  
* For each datafeed, builds the datafeed, dataset, health, and anomaly objects, using the dataset name to load the CSV file and generate all the object attributes
* Registers the datafeed with the dataset and health objects added in the same call 
* Generates a prediction object for the dataset, and adds all the anomalies in the prediction create call

In [7]:
@timeit
async def process_collection(collection_name, collection_id):
    """Register a datafeed for every ``*.csv`` file in one collection folder.

    All files are submitted together through ``asyncio.gather``.

    Args:
        collection_name: Sub-folder of the module-level ``collections_path``.
        collection_id: Id of the already-created collection; passed through to
            ``register_datafeed``.

    Returns:
        list with one ``register_datafeed`` result per CSV file.
    """
    collection_dir = Path(collections_path / collection_name)
    pending = [
        register_datafeed(csv_path, collection_id)
        for csv_path in collection_dir.glob('*.csv')
    ]
    return await asyncio.gather(*pending)

async def register_datafeed(filename,collection_id):
    """Create the datafeed for one CSV file, then its prediction with anomalies.

    Steps:
      1. Build the datafeed, dataset, health and anomaly payloads from the file.
      2. POST the datafeed to ``/datafeed/create/<collection_id>`` with the
         dataset and health objects as children.
      3. Build a prediction for the created dataset and POST it to
         ``/prediction/create`` with the anomalies as children.

    Args:
        filename: ``pathlib.Path`` of the CSV file; its parent folder name and
            file name form the dataset name.
        collection_id: Id of the collection the datafeed belongs to.

    Returns:
        The response of the datafeed-creation call. Previously nothing was
        returned, so the lists gathered by callers contained only ``None``.
    """
    # as_posix() keeps the "<folder>/<file>" form on every OS; build_objects
    # splits the name on "/", which str(Path) would break on Windows.
    dataset_name = (Path(filename.parent.name) / filename.name).as_posix()
    datafeed_data, dataset_data, health_data, anomaly_entries = build_objects(dataset_name)

    # Create the datafeed, passing the health and dataset objects as children.
    # The f-string also accepts a non-string collection id.
    datafeed = await client.request(
        "post",
        f"/datafeed/create/{collection_id}",
        json={"payload": datafeed_data, "datasets": [dataset_data], "health": health_data},
    )

    # Exactly one dataset is sent above, so only the first entry is read back.
    dataset_object = datafeed["datasets"][0]
    prediction = build_prediction_object(dataset_object, detector_obj)

    # Create the prediction, passing the anomalies as children.
    await client.request(
        "post",
        "/prediction/create",
        json={"payload": prediction, "anomalies": anomaly_entries},
    )

    return datafeed

In [8]:
collections = await register_collections(collection_folders)
#collections = await register_collections(["realAdExchange"])
datafeeds = await asyncio.gather(*[process_collection(item["name"],item["id"]) for item in collections])

process_collection.time
process_collection.time
process_collection.time
process_collection.time
process_collection.time
process_collection.time
process_collection.time
>>> 14.561225414276123
>>> 14.596603631973267
>>> 14.599840879440308
>>> 14.780881643295288
>>> 14.928279638290405
>>> 14.94666838645935
>>> 14.956026315689087
