In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import sys
from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

sys.path.append(str(Path(os.getcwd()).parent) + '\src')
from graph_dataset import GraphDataset
from simulation_anomalylabels import simulate_anomaly_labels
from simulation_continuous import get_cont_ts_df, populate_flow_all_nodes
from simulation_categorical import get_cat_ts_df
from simulation_monotonic import get_monotonic_ts_df
from simulation_binary import get_binary_ts_df
from pattern_anomalies import get_pattern_anomalies
from utils.utils_data_generation import generate_relationship_json, plot_ts

In [2]:
# Customize
experiment_name = 'v13_010121_020121'

plot = False
save = False
timestamp_noise = False
seed = 5

In [3]:
data_path = f'../data/synthetic_data/{experiment_name}/'
if save:
    Path(data_path).mkdir(parents=True, exist_ok=True)
np.random.seed(seed)

## Chapter 1. Graph Object Creation

#### Step 1.1. Ingest Topology Table

In [4]:
# Create a sample topology table
topo_df = pd.DataFrame(columns=['relationshipName', 'sourceId', 'targetId'])
topo_df = topo_df.append({"relationshipName": "isParent", "sourceId": "B", 'targetId': 'E'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isParent", "sourceId": "B", 'targetId': 'F'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isParent", "sourceId": "B", 'targetId': 'G'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isParent", "sourceId": "B", 'targetId': 'I'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isParent", "sourceId": "D", 'targetId': 'H'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isParent", "sourceId": "H", 'targetId': 'J'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isParent", "sourceId": "A", 'targetId': 'C'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isParent", "sourceId": "A", 'targetId': 'D'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isParent", "sourceId": "A", 'targetId': 'E'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isRedundant", "sourceId": "I", 'targetId': 'F'}, ignore_index=True)
topo_df = topo_df.append({"relationshipName": "isRedundant", "sourceId": "F", 'targetId': 'I'}, ignore_index=True)

display(topo_df)

Unnamed: 0,relationshipName,sourceId,targetId
0,isParent,B,E
1,isParent,B,F
2,isParent,B,G
3,isParent,B,I
4,isParent,D,H
5,isParent,H,J
6,isParent,A,C
7,isParent,A,D
8,isParent,A,E
9,isRedundant,I,F


#### Step 1.2. Convert Tabular Topology into Graph

In [5]:
# Instantiate a GraphDataset object with topology given
init_kwargs = {'topo_df': topo_df,
               'relationship_to_flow': 'isParent',
               'simulated_nodes': ['A', 'B']}
gd = GraphDataset(**init_kwargs)

# Plot graph
if plot:
    gd.plot_graph()
               
# num_simulated_sensors = len(G.simulated_nodes)

## Chapter 2. Anomaly Labels Simulation

In [6]:
simulate_anomaly_labels_kwargs = {'num_simulated_anomaly_ts': 1}
simulate_anomaly_labels_kwargs_contd = {'time_range_lst': [['2021-01-01 00:00:00', '2021-02-01 00:00:00'] for _ in range(simulate_anomaly_labels_kwargs['num_simulated_anomaly_ts'])], 
                                        'freq_lst': ['1min' for _ in range(simulate_anomaly_labels_kwargs['num_simulated_anomaly_ts'])], 
                                        'start_of_day_range_lst': [['07:00:00', '09:00:00'] for _ in range(simulate_anomaly_labels_kwargs['num_simulated_anomaly_ts'])], 
                                        'end_of_day_range_lst': [['16:00:00', '17:00:00'] for _ in range(simulate_anomaly_labels_kwargs['num_simulated_anomaly_ts'])], 
                                        'surge_occurrence_range_lst': [[10, 10] for _ in range(simulate_anomaly_labels_kwargs['num_simulated_anomaly_ts'])],
                                        'surge_length_range_lst': [[100, 100] for _ in range(simulate_anomaly_labels_kwargs['num_simulated_anomaly_ts'])]}
simulate_anomaly_labels_kwargs.update(simulate_anomaly_labels_kwargs_contd)

anomaly_label_lst, surge_start_end_indices_lst = simulate_anomaly_labels(**simulate_anomaly_labels_kwargs)

if save:
    for i in range(simulate_anomaly_labels_kwargs['num_simulated_anomaly_ts']):
        anomaly_label_lst[i].reset_index().to_csv(data_path + f'anomaly_label_{experiment_name}_{i}.csv', index=False)

Simulating Anomaly Labels for #0...
#0 Anomaly Labels Simulation Done.


## Chapter 3. Synthetic Data Simulation

### Part 3.1. Data Profile - Continuous

#### Step 3.1.1. Simulate Telemetry Time-series with Anomalies 
#### - First Simulate Time-series for Selected Source Nodes
#### - Then Populate Time-series for the Rest Nodes from Topology Flow Top-down

In [7]:
# E.g. Simulate Channel Amps.Ia for Current Telemetry
simulate_ts_kwargs_continuous = {'unique_anomaly_label': True, 
                                 'anomaly_label_lst': anomaly_label_lst, 
                                 'surge_start_end_indices_lst': surge_start_end_indices_lst, 
                                 'simulate_surge_lst': [True for _ in range(len(gd.simulated_nodes))], 
                                 'surge_ratio_range_lst': [[5, 10], [5, 10]], 
                                 'normal_mean_range_lst': [[6, 7], [26, 27]], 
                                 'normal_std_lst': [0.5 for _ in range(len(gd.simulated_nodes))],
                                 'gd': gd,
                                 'key_name': 'Amps_Ia', 
                                 'missing_ratio': 0.1,
                                 'value_noise': True, 
                                 'timestamp_noise': timestamp_noise,
                                 'surge_with_decay': False}

ts_df_ia = get_cont_ts_df(**simulate_ts_kwargs_continuous)

# Plot time-series for all nodes
if plot:
    plot_ts(ts_df=ts_df_ia, \
            anomaly_label=anomaly_label_lst[0])
    display(ts_df_ia)


Simulating Time-series for Sensor 0...

Sensor 0 Time-series Simulation Done.

Simulating Time-series for Sensor 1...

Sensor 1 Time-series Simulation Done.


In [8]:
# import datetime
# df_anom = anomaly_label_lst[0].copy()
# df_anom = df_anom[ (df_anom["isAnomaly"]!=0)]
# df_anom["subgroup"] =(df_anom['date'] != df_anom['date'].shift(1)).cumsum()
# df_anom["weekday"] = df_anom.index.weekday
# df_anom = df_anom.reset_index()
# df_anom.to_csv("../Data/debug_anom_03142022.csv")

# # df_anom.head()

# df_anom.groupby("subgroup", as_index=False).apply(lambda x: '_'.join([str(x["Timestamp"].values[0]), str(x["weekday"].values[0]), str(x.shape[0])]))

In [9]:
# df_anom = anomaly_label_lst[0]

# for i in surge_start_end_indices_lst[0]:
#     df_wanted = df_anom[(df_anom.index>=i[0]) & (df_anom.index<=i[1])]
#     print(i, len(df_wanted), df_wanted.isAnomaly.sum())

In [10]:
# Repeat for Channel Amps.Ib
simulate_ts_kwargs_continuous = {'unique_anomaly_label': True, 
                                 'anomaly_label_lst': anomaly_label_lst, 
                                 'surge_start_end_indices_lst': surge_start_end_indices_lst, 
                                 'simulate_surge_lst': [True for _ in range(len(gd.simulated_nodes))], 
                                 'surge_ratio_range_lst': [[5, 10], [5, 10]], 
                                 'normal_mean_range_lst': [[6, 7], [26, 27]], 
                                 'normal_std_lst': [0.5 for _ in range(len(gd.simulated_nodes))],
                                 'gd': gd,
                                 'key_name': 'Amps_Ib', 
                                 'missing_ratio': 0.1,
                                 'value_noise': True, 
                                 'timestamp_noise': timestamp_noise}
ts_df_ib = get_cont_ts_df(**simulate_ts_kwargs_continuous)

# Repeat for Channel Amps.Ic
simulate_ts_kwargs_continuous = {'unique_anomaly_label': True, 
                                 'anomaly_label_lst': anomaly_label_lst, 
                                 'surge_start_end_indices_lst': surge_start_end_indices_lst, 
                                 'simulate_surge_lst': [True for _ in range(len(gd.simulated_nodes))], 
                                 'surge_ratio_range_lst': [[5, 10], [5, 10]], 
                                 'normal_mean_range_lst': [[6, 7], [26, 27]], 
                                 'normal_std_lst': [0.5 for _ in range(len(gd.simulated_nodes))],
                                 'gd': gd,
                                 'key_name': 'Amps_Ic', 
                                 'missing_ratio': 0.1,
                                 'value_noise': True, 
                                 'timestamp_noise': timestamp_noise}
ts_df_ic = get_cont_ts_df(**simulate_ts_kwargs_continuous)


Simulating Time-series for Sensor 0...

Sensor 0 Time-series Simulation Done.

Simulating Time-series for Sensor 1...

Sensor 1 Time-series Simulation Done.

Simulating Time-series for Sensor 0...

Sensor 0 Time-series Simulation Done.

Simulating Time-series for Sensor 1...

Sensor 1 Time-series Simulation Done.


In [11]:
# # A quick one-dim side example to illustrate topology flow
# supply_mat_sample = np.array([300, 400, 0, 0, 0, 0, 0, 0, 0, 0])
# sln_mat_sample = populate_flow_all_nodes(gd=gd, \
#                                          supply_mat=supply_mat_sample)
# # gd.plot_graph(sln_mat=sln_mat_sample)

#### Step 3.1.2. Generate Continuous Output: 1. Topology_continuous.json&csv 2. Update_stream_continuous.csv

In [12]:
# Output Topology_continuous.json & Topology_continuous.csv
generate_relationship_json(topo_df=gd.topo_df, 
                           save=save, 
                           output_data_path=data_path, 
                           output_json_file_name=f'topology_continuous_{experiment_name}.json',
                           output_csv_file_name=f'topology_continuous_{experiment_name}.csv')

Sample Json File:
[
    {
        "$relationshipId": "B_isParent_E",
        "$sourceId": "B",
        "$targetId": "E",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "$etag": "NA"
    },
    {
        "$relationshipId": "B_isParent_F",
        "$sourceId": "B",
        "$targetId": "F",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "$etag": "NA"
    },
    {
        "$relationshipId": "B_isParent_G",
        "$sourceId": "B",
        "$targetId": "G",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "$etag": "NA"
    },
    {
        "$relationshipId": "B_isParent_I",
        "$sourceId": "B",
        "$targetId": "I",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "$etag": "NA"
    },
    {
        "$relationshipId": "D_isParent_H",
        "$sourceId": "D",
        "$targetId": "H",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "

In [13]:
# Output Update_stream_continuous.csv 
update_stream_continuous = pd.concat([ts_df_ia, ts_df_ib, ts_df_ic])
update_stream_continuous['ModelId'] = np.nan
update_stream_continuous.loc[update_stream_continuous['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:sourcemachine;1'
update_stream_continuous.loc[~update_stream_continuous['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:feedmachine;1'
update_stream_continuous = update_stream_continuous[['Id', 'ModelId', 'Key', 'Timestamp', 'Value']]\
                           .sort_values(['Timestamp', 'Id', 'Key']).reset_index(drop=True)

if save:
    update_stream_continuous.to_csv(data_path + f'update_stream_continuous_{experiment_name}.csv', index=False)
print('Sample Update_stream_continuous.csv:')
display(update_stream_continuous.head())

Sample Update_stream_continuous.csv:


Unnamed: 0,Id,ModelId,Key,Timestamp,Value
0,A,dtmi:syntheticfactory:sourcemachine;1,Amps_Ia,2021-01-01,0.0
1,A,dtmi:syntheticfactory:sourcemachine;1,Amps_Ib,2021-01-01,0.0
2,A,dtmi:syntheticfactory:sourcemachine;1,Amps_Ic,2021-01-01,0.0
3,B,dtmi:syntheticfactory:sourcemachine;1,Amps_Ib,2021-01-01,0.0
4,B,dtmi:syntheticfactory:sourcemachine;1,Amps_Ic,2021-01-01,0.059607


#### Step 3.1.3. Add Pattern Anomalies

In [14]:
# timerange_str_lst = [['2021-01-07 11:00:00', '2021-01-07 14:00:00'], ['2021-01-08 11:00:00', '2021-01-08 14:00:00']]
    
# update_stream_continuous = get_pattern_anomalies(update_stream_continuous, 'C', 'Amps_Ia', timerange_str_lst, ['multiple' for _ in range(len(timerange_str_lst))], [10 for _ in range(len(timerange_str_lst))])
# update_stream_continuous = get_pattern_anomalies(update_stream_continuous, 'D', 'Amps_Ia', timerange_str_lst, ['multiple' for _ in range(len(timerange_str_lst))], [10 for _ in range(len(timerange_str_lst))])
# update_stream_continuous = get_pattern_anomalies(update_stream_continuous, 'E', 'Amps_Ia', timerange_str_lst, ['multiple' for _ in range(len(timerange_str_lst))], [1/100 for _ in range(len(timerange_str_lst))])

# plot_ts(ts_df=update_stream_continuous, \
#         anomaly_label=anomaly_label_lst[0])

In [15]:
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# df = pd.read_csv('synthetic_results.csv')
# df['timestamp'] = pd.to_datetime(df['timestamp'])

# fig = make_subplots(subplot_titles=['AD Results'])

# fig.add_trace(
#     go.Scatter(x=df.timestamp, y=df.isAnomaly, name='AD Results', mode='markers'),
#     row=1, col=1
#     )

### Part 3.2. Data Profile - Categorical

#### Step 3.2.1. Simulate Categorical Time-series (e.g. PowerLevel as one of 'High'/'Mid'/'Low' for devices A&B)

In [16]:
simulate_ts_kwargs_categorical = {'anomaly_label_lst': anomaly_label_lst, 
                                  'num_simulated_ts': 2, 
                                  'freq_lst': ['1min', '1min'], 
                                  'id_name_lst': ['A', 'B'], 
                                  'key_name_lst': ['PowerLevel', 'PowerLevel'], 
                                  'cat_names_lst': [['High', 'Mid', 'Low'], ['High', 'Mid', 'Low']], 
                                  'cat_ratio_lst': [[1/3, 1/3, 1/3], [1/3, 1/3, 1/3]], 
                                  'missing_ratio_lst': [0.8, 0.9], 
                                  'timestamp_noise_lst': [timestamp_noise, timestamp_noise]}
cat_ts_df = get_cat_ts_df(**simulate_ts_kwargs_categorical)

if plot:
    plot_ts(ts_df=cat_ts_df, \
            anomaly_label=anomaly_label_lst[0], \
            mode='markers')
    display(cat_ts_df)

Sensor 0 Categorical Simulation Done.
Sensor 1 Categorical Simulation Done.


#### Step 3.2.2. Generate Categorical Output: 1. Update_stream_categorical.csv

In [17]:
update_stream_categorical = cat_ts_df.copy()
update_stream_categorical['ModelId'] = np.nan
update_stream_categorical.loc[update_stream_categorical['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:sourcemachine;1'
update_stream_categorical.loc[~update_stream_categorical['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:feedmachine;1'

update_stream_categorical = update_stream_categorical[['Id', 'ModelId', 'Key', 'Timestamp', 'Value']]\
                            .sort_values(['Timestamp', 'Id', 'Key']).reset_index(drop=True)

if save:
    update_stream_categorical.to_csv(data_path + f'update_stream_categorical_{experiment_name}.csv', index=False)
print('Sample Update_stream_categorical.csv:')
display(update_stream_categorical.head())

Sample Update_stream_categorical.csv:


Unnamed: 0,Id,ModelId,Key,Timestamp,Value
0,B,dtmi:syntheticfactory:sourcemachine;1,PowerLevel,2021-01-01 00:00:00,Low
1,A,dtmi:syntheticfactory:sourcemachine;1,PowerLevel,2021-01-01 00:03:00,Low
2,B,dtmi:syntheticfactory:sourcemachine;1,PowerLevel,2021-01-01 00:04:00,Mid
3,A,dtmi:syntheticfactory:sourcemachine;1,PowerLevel,2021-01-01 00:07:00,Mid
4,A,dtmi:syntheticfactory:sourcemachine;1,PowerLevel,2021-01-01 00:14:00,Mid


### Part 3.3. Data Profile - Monotonic

#### Step 3.3.1. Simulate Monotonic Time-series based on Continuous

In [18]:
simulate_ts_kwargs_monotonic = {'unique_anomaly_label': True, 
                                'anomaly_label_lst': anomaly_label_lst, 
                                'surge_start_end_indices_lst': surge_start_end_indices_lst, 
                                'simulate_surge_lst': [True for _ in range(len(gd.simulated_nodes))], 
                                'surge_ratio_range_lst': [[5, 10], [5, 10]], 
                                'normal_mean_range_lst': [[2, 3], [22, 23]], 
                                'normal_std_lst': [0.5 for _ in range(len(gd.simulated_nodes))],
                                'gd': gd,
                                'key_name': 'PowerMeter', 
                                'missing_ratio': 0.1,
                                'value_noise': True, 
                                'timestamp_noise': timestamp_noise}
                                 
monotonic_ts_df = get_monotonic_ts_df(**simulate_ts_kwargs_monotonic)

if plot:
    plot_ts(ts_df=monotonic_ts_df, \
            anomaly_label=anomaly_label_lst[0])
    display(monotonic_ts_df)


Simulating Time-series for Sensor 0...

Sensor 0 Time-series Simulation Done.

Simulating Time-series for Sensor 1...

Sensor 1 Time-series Simulation Done.


#### Step 3.3.2. Generate Monotonic Output: 1. Topology_monotonic.json&csv 2. Update_stream_monotonic.csv

In [19]:
# Output Topology_monotonic.json & Topology_monotonic.csv
generate_relationship_json(topo_df=gd.topo_df, 
                        save=save, 
                        output_data_path=data_path, 
                        output_json_file_name=f'topology_monotonic_{experiment_name}.json',
                        output_csv_file_name=f'topology_monotonic_{experiment_name}.csv')

Sample Json File:
[
    {
        "$relationshipId": "B_isParent_E",
        "$sourceId": "B",
        "$targetId": "E",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "$etag": "NA"
    },
    {
        "$relationshipId": "B_isParent_F",
        "$sourceId": "B",
        "$targetId": "F",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "$etag": "NA"
    },
    {
        "$relationshipId": "B_isParent_G",
        "$sourceId": "B",
        "$targetId": "G",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "$etag": "NA"
    },
    {
        "$relationshipId": "B_isParent_I",
        "$sourceId": "B",
        "$targetId": "I",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "$etag": "NA"
    },
    {
        "$relationshipId": "D_isParent_H",
        "$sourceId": "D",
        "$targetId": "H",
        "$relationshipName": "isParent",
        "targetModel": "NA",
        "

In [20]:
# Output Update_stream_monotonic.csv 
update_stream_monotonic = monotonic_ts_df.copy()
update_stream_monotonic['ModelId'] = np.nan
update_stream_monotonic.loc[update_stream_monotonic['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:sourcemachine;1'
update_stream_monotonic.loc[~update_stream_monotonic['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:feedmachine;1'

update_stream_monotonic = update_stream_monotonic[['Id', 'ModelId', 'Key', 'Timestamp', 'Value']]\
                          .sort_values(['Timestamp', 'Id', 'Key']).reset_index(drop=True)

if save:
    update_stream_monotonic.to_csv(data_path + f'update_stream_monotonic_{experiment_name}.csv', index=False)
print('\nSample Update_stream_monotonic.csv:')
display(update_stream_monotonic.head())


Sample Update_stream_monotonic.csv:


Unnamed: 0,Id,ModelId,Key,Timestamp,Value
0,A,dtmi:syntheticfactory:sourcemachine;1,PowerMeter,2021-01-01 00:02:00,0.0
1,B,dtmi:syntheticfactory:sourcemachine;1,PowerMeter,2021-01-01 00:02:00,0.0
2,C,dtmi:syntheticfactory:feedmachine;1,PowerMeter,2021-01-01 00:02:00,0.0
3,D,dtmi:syntheticfactory:feedmachine;1,PowerMeter,2021-01-01 00:02:00,0.0
4,E,dtmi:syntheticfactory:feedmachine;1,PowerMeter,2021-01-01 00:02:00,0.0


### Part 3.4. Data Profile - Binary

#### Step 3.4.1. Simulate Binary Time-series

In [21]:
simulate_ts_kwargs_binary = {'num_simulated_ts': 2, \
                             'time_range_lst': [['2021-01-01 00:00:00', '2021-02-01 00:00:00'], ['2021-01-01 00:00:00', '2021-02-01 00:00:00']], \
                             'freq_lst': ['5min', '5min'], \
                             'id_name_lst': ['A', 'B'], \
                             'key_name_lst': ['Status', 'Status'], \
                             'start_of_day_range_lst': [['07:00:00', '09:00:00'], ['07:00:00', '09:00:00']], \
                             'end_of_day_range_lst': [['16:00:00', '17:00:00'], ['16:00:00', '17:00:00']], \
                             'on_occurrence_range_lst': [[100, 100], [100, 100]], \
                             'on_length_range_lst': [[5, 10], [5, 10]], \
                             'missing_ratio_lst': [0.1, 0.1], \
                             'timestamp_noise_lst': [timestamp_noise, timestamp_noise]}

binary_ts_df = get_binary_ts_df(**simulate_ts_kwargs_binary)
print('Binary count for each Id:')
print(binary_ts_df.groupby('Id')['Value'].value_counts())

if plot:
    plot_ts(ts_df=binary_ts_df, \
            anomaly_label=anomaly_label_lst[0])
    display(binary_ts_df)

Simulating Binary Labels for Sensor 0...
Sensor 0 Binary Simulation Done.
Simulating Binary Labels for Sensor 1...
Sensor 1 Binary Simulation Done.
Binary count for each Id:
Id  Value
A   0        7506
    1         531
B   0        7512
    1         525
Name: Value, dtype: int64


#### Step 3.4.2. Generate Binary Output: 1. Update_stream_binary.csv

In [22]:
update_stream_binary = binary_ts_df.copy()
update_stream_binary['ModelId'] = np.nan
update_stream_binary.loc[update_stream_binary['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:sourcemachine;1'
update_stream_binary.loc[~update_stream_binary['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:feedmachine;1'

update_stream_binary = update_stream_binary[['Id', 'ModelId', 'Key', 'Timestamp', 'Value']]\
                       .sort_values(['Timestamp', 'Id', 'Key']).reset_index(drop=True)

if save:
    update_stream_binary.to_csv(data_path + f'update_stream_binary_{experiment_name}.csv', index=False)
print('Sample Update_stream_binary.csv:')
display(update_stream_binary.head(10))

Sample Update_stream_binary.csv:


Unnamed: 0,Id,ModelId,Key,Timestamp,Value
0,A,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:00:00,0
1,B,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:00:00,0
2,A,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:05:00,0
3,B,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:05:00,0
4,A,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:10:00,0
5,B,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:10:00,0
6,A,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:15:00,0
7,B,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:15:00,0
8,A,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:20:00,0
9,B,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01 00:20:00,0


### Part 3.5. Combine Different Data Profile of Same Timerange Together

In [23]:
update_stream = pd.concat([update_stream_continuous, update_stream_categorical, update_stream_monotonic, update_stream_binary])
update_stream = update_stream.sort_values('Timestamp').reset_index(drop=True)

if save:
    update_stream.to_csv(data_path + f'update_stream_{experiment_name}.csv', index=False)
print('Sample Update_stream.csv:')
display(update_stream.head())

Sample Update_stream.csv:


Unnamed: 0,Id,ModelId,Key,Timestamp,Value
0,A,dtmi:syntheticfactory:sourcemachine;1,Amps_Ia,2021-01-01,0.0
1,A,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01,0.0
2,B,dtmi:syntheticfactory:sourcemachine;1,Status,2021-01-01,0.0
3,J,dtmi:syntheticfactory:feedmachine;1,Amps_Ic,2021-01-01,0.0
4,J,dtmi:syntheticfactory:feedmachine;1,Amps_Ib,2021-01-01,0.0


In [24]:
# plot_ts(ts_df=update_stream, \
#         anomaly_label=anomaly_label_lst[0])

### Part 3.6. Repeat and Concatenate Different Timerange Together

In [25]:
# update_stream_tmp = pd.concat([pd.read_csv(data_path + 'update_stream_010121_020121.csv'), \
#                                pd.read_csv(data_path + 'update_stream_020121_030121.csv'), \
#                                pd.read_csv(data_path + 'update_stream_030121_040121.csv')])\
#                     .sort_values('Timestamp').reset_index(drop=True)

# tmp = update_stream_tmp[(update_stream_tmp['Id'].isin(['A', 'C', 'D', 'E']))&(update_stream_tmp['Key']=='Amps_Ia')]
# plot_ts(ts_df=tmp, \
#         anomaly_label=anomaly_label_lst[0])

# update_stream_tmp.to_csv(data_path + 'update_stream_010121_040121.csv', index=False)

### Part 3.7. Get Initial Twins

In [26]:
initial_df = update_stream.loc[update_stream.groupby(['Id', 'Key'])['Timestamp'].idxmin()].reset_index(drop=True)
initial_df['ModelId'] = np.nan
initial_df.loc[initial_df['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:sourcemachine2;1'
initial_df.loc[~initial_df['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:feedmachine2;1'
initial_df = initial_df[['Id', 'ModelId', 'Key', 'Timestamp', 'Value']]\
                        .sort_values(['Timestamp', 'Id', 'Key']).reset_index(drop=True)
if save:
    initial_df.to_csv(data_path + f'initial_twins_{experiment_name}.csv', index=False)
print('Sample Initial_twins.csv:')
display(initial_df)

Sample Initial_twins.csv:


Unnamed: 0,Id,ModelId,Key,Timestamp,Value
0,A,dtmi:syntheticfactory:sourcemachine2;1,Amps_Ia,2021-01-01 00:00:00,0.0
1,A,dtmi:syntheticfactory:sourcemachine2;1,Amps_Ib,2021-01-01 00:00:00,0.0
2,A,dtmi:syntheticfactory:sourcemachine2;1,Amps_Ic,2021-01-01 00:00:00,0.0
3,A,dtmi:syntheticfactory:sourcemachine2;1,Status,2021-01-01 00:00:00,0
4,B,dtmi:syntheticfactory:sourcemachine2;1,Amps_Ib,2021-01-01 00:00:00,0.0
5,B,dtmi:syntheticfactory:sourcemachine2;1,Amps_Ic,2021-01-01 00:00:00,0.059607
6,B,dtmi:syntheticfactory:sourcemachine2;1,PowerLevel,2021-01-01 00:00:00,Low
7,B,dtmi:syntheticfactory:sourcemachine2;1,Status,2021-01-01 00:00:00,0
8,C,dtmi:syntheticfactory:feedmachine2;1,Amps_Ia,2021-01-01 00:00:00,0.0
9,C,dtmi:syntheticfactory:feedmachine2;1,Amps_Ib,2021-01-01 00:00:00,0.0


In [27]:
# update_stream_tmp = pd.read_csv(data_path + 'update_stream_010121_040121_train.csv')
# update_stream_tmp['Timestamp'] = pd.to_datetime(update_stream_tmp['Timestamp'])
# initial_df = update_stream_tmp.loc[update_stream_tmp.groupby(['Id', 'Key'])['Timestamp'].idxmin()].reset_index(drop=True)
# initial_df['ModelId'] = np.nan
# initial_df.loc[initial_df['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:sourcemachine2;1'
# initial_df.loc[~initial_df['Id'].isin(['A', 'B']), 'ModelId'] = 'dtmi:syntheticfactory:feedmachine2;1'
# initial_df = initial_df[['Id', 'ModelId', 'Key', 'Timestamp', 'Value']]\
#                         .sort_values(['Timestamp', 'Id', 'Key']).reset_index(drop=True)
# initial_df.to_csv(os.path.join(data_path, 'initial_twins.csv'), index=False)
# print('Sample Initial_twins.csv:')
# display(initial_df)

In [28]:
# bbb = anomaly_label_lst[0].loc[surge_start_end_indices_lst[0][2][0]: surge_start_end_indices_lst[0][2][1]]
# bbb[bbb['isAnomaly']==0]