# Visualize Synthetic

Goal:

- For each sensor
    - For wet and dry periods
        - Visualize the sensor data with each synthetic anomaly injected

In [None]:
import os
import time
import copy
import itertools
import yaml

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import h5py
import nexusformat.nexus as nx
import pickle
import json
from tqdm import tqdm
import datetime
import matplotlib.dates as mdates
import matplotlib.gridspec as gridspec
from datetime import timedelta




from fault_management_uds.data.hdf_functions import print_tree, load_dataframe_from_HDF5
from fault_management_uds.data.process import remove_nans_from_start_end

from fault_management_uds.modelling.classifiers import classify_rain_events
from fault_management_uds.plots import get_segment_start_end_color, set_meaningful_xticks
from fault_management_uds.plots import visualize_error_span
from fault_management_uds.config import indicator_2_meta, bools_2_meta, error_indicators, natural_sensor_order


from fault_management_uds.config import PROJ_ROOT
from fault_management_uds.config import DATA_DIR, RAW_DATA_DIR, INTERIM_DATA_DIR, PROCESSED_DATA_DIR, EXTERNAL_DATA_DIR
from fault_management_uds.config import MODELS_DIR, REPORTS_DIR, FIGURES_DIR, REFERENCE_DIR


from fault_management_uds.data.load import import_external_metadata, import_metadata
from fault_management_uds.data.load import load_data_period, filenames_based_on_period, provided_2_full_range, get_event
from fault_management_uds.data.format import create_individual_indicators, create_indicator


from fault_management_uds.synthetic.synthetic_generator import AnomalyHandler


# set random seed
# Seeds both NumPy's global RNG and the standard-library `random` module.
# The same `seed` value is also passed to `insert_anomaly` further down.
seed = 42
np.random.seed(seed)
random.seed(seed)

2024-11-14 10:48:32.431 | INFO     | fault_management_uds.config:<module>:11 - PROJ_ROOT path is: /Users/arond.jacobsen/Documents/GitHub/fault_management_uds


In [2]:
# Processed HDF5 file; the load_dataframe_from_HDF5 calls in this notebook read from it.
data_file_path = PROCESSED_DATA_DIR / 'Bellinge.h5'
# NOTE(review): `import_external_metadata` is imported at the top of the notebook,
# but the external metadata file is read with `import_metadata` — confirm this is intended.
external_metadata = import_metadata(REFERENCE_DIR / 'external_metadata.csv')
# Sensor metadata; later cells read 'IdMeasurement', 'UnitAlias', 'obvious_min' and 'obvious_max' from it.
metadata = import_metadata(REFERENCE_DIR / 'sensor_metadata.csv')

# Raw sensor path
raw_sensor_path = RAW_DATA_DIR / 'Bellinge' / 'sensor-data'

In [3]:
# create a figure save folder
# Root folder for the figures saved by this notebook. `parents=True` also creates
# missing parent directories and `exist_ok=True` makes re-running the cell safe.
figure_save_folder = FIGURES_DIR / 'synthetic'
figure_save_folder.mkdir(parents=True, exist_ok=True)

### Get dry and wet periods for each sensor

- We want non-erroneous data

In [None]:
# # load the classified rain events
# clf_rain_events = pd.read_csv(REFERENCE_DIR / 'events' / 'rain_events.csv', index_col=0)
# clf_rain_events['start'] = pd.to_datetime(clf_rain_events['start'])
# clf_rain_events['end'] = pd.to_datetime(clf_rain_events['end'])

# sub_rain_events = clf_rain_events[clf_rain_events['duration'] < 60].copy()
# # sort by total rain
# sub_rain_events.sort_values('total_rain', ascending=False, inplace=True)
# sub_rain_events.reset_index(drop=True, inplace=True)

In [5]:
# dry_wet_periods = {}

# # create a dry event dataframe that goes from rain event end to next rain event start
# dry_events = []
# # sort clf_rain_events by start
# clf_rain_events.sort_values('start', inplace=True)
# for i in range(clf_rain_events.shape[0] - 1):
#     end = clf_rain_events.loc[i, 'end']
#     start = clf_rain_events.loc[i+1, 'start']
#     duration = (start - end).total_seconds() / 60
#     dry_events.append({'start': end, 'end': start, 'duration': duration})

# dry_events = pd.DataFrame(dry_events)
# # sort by duration
# dry_events.sort_values('duration', ascending=False, inplace=True)
# dry_events.reset_index(drop=True, inplace=True)



# for sensor_name in natural_sensor_order:
#     print(f"Sensor: {sensor_name}")
#     dry_wet_periods[sensor_name] = {}
#     # load the data
#     data, _, _, _ = load_dataframe_from_HDF5(data_file_path, f"single_series/sewer_data/{sensor_name}/{sensor_name}_clean")
#     data = remove_nans_from_start_end(data, 'value')
    
    
#     ### Find a rain event
#     dry_wet_periods[sensor_name]['rain_event'] = {}
#     # filter rain events given sensor time range
#     start, end = data.index[0], data.index[-1]
#     sensor_rain_events = sub_rain_events[(sub_rain_events['start'] < end) & (sub_rain_events['end'] > start)].copy()
#     # iterate rain events until one is found, where the data is not missing
#     for i, rain_event in sensor_rain_events.iterrows():
#         # get the data for the rain event
#         data_rain_period = data.loc[rain_event['start']:rain_event['end']]
#         # check if the data is missing
#         if data_rain_period['value'].isna().sum() == 0:
#             print(f"    Found rain event on attempt {i+1}")
#             # round the start down to the full hour; the window ends two hours later
#             dry_wet_periods[sensor_name]['rain_event']['start'] = rain_event['start'].replace(minute=0, second=0, microsecond=0)
#             dry_wet_periods[sensor_name]['rain_event']['end'] = rain_event['start'].replace(minute=0, second=0, microsecond=0) + timedelta(hours=2)
#             break
#     # this else will only be executed if the for loop is not broken
#     else:
#         print("    No rain event found")
#         dry_wet_periods[sensor_name]['rain_event']['start'] = None
#         dry_wet_periods[sensor_name]['rain_event']['end'] = None


#     ### Find a dry period
#     dry_wet_periods[sensor_name]['dry_period'] = {}
#     # filter out dry events given sensor time range
#     sensor_dry_events = dry_events[(dry_events['start'] < end) & (dry_events['end'] > start)].copy()
#     # iterate dry events until one is found, where the data is not missing
#     for i, dry_event in sensor_dry_events.iterrows():
#         # get the data for the dry event
#         data_dry_period = data.loc[dry_event['start']:dry_event['end']]
#         # check if the data is missing
#         if data_dry_period['value'].isna().sum() == 0:
#             print(f"    Found dry event on attempt {i+1}")
#             dry_wet_periods[sensor_name]['dry_period']['start'] = dry_event['start'].replace(minute=0, second=0, microsecond=0)
#             dry_wet_periods[sensor_name]['dry_period']['end'] = dry_event['start'].replace(minute=0, second=0, microsecond=0) + timedelta(hours=2)
#             break

#     # this else will only be executed if the for loop is not broken
#     else:
#         print("    No dry event found")
#         dry_wet_periods[sensor_name]['dry_period']['start'] = None
#         dry_wet_periods[sensor_name]['dry_period']['end'] = None

# del data
# # save the sensor dry wet periods
# with open(REFERENCE_DIR / 'events' / 'dry_wet_periods.json', 'w') as f:
#     json.dump(dry_wet_periods, f, default=str, indent=4)


### Load dry and wet periods

In [6]:
# Read the stored dry/wet periods; the file handle is only needed for parsing.
with open(REFERENCE_DIR / 'events' / 'dry_wet_periods.json', 'r') as f:
    dry_wet_periods = json.load(f)

# Parse every stored boundary (sensor -> period type -> boundary) with pd.to_datetime.
for sensor_name, sensor_periods in dry_wet_periods.items():
    for event_type, boundaries in sensor_periods.items():
        for event in list(boundaries):
            boundaries[event] = pd.to_datetime(boundaries[event])
                

### Load sensor ranges

In [7]:
# Parse the per-sensor ranges stored in the reference folder.
with (REFERENCE_DIR / 'sensor_ranges.json').open('r') as ranges_file:
    sensor_ranges = json.load(ranges_file)

### Load configurations


In [8]:
# Parse the synthetic-anomaly configuration with the safe YAML loader.
synthetic_config = yaml.safe_load((REFERENCE_DIR / 'synthetic_config.yaml').read_text())


## Visualize anomalies

In [11]:
def get_data_dict(sensor_name, period_type, dry_wet_periods, buffer=None):
    """Load both rain gauges and the clean sensor series for one stored period.

    Args:
        sensor_name: Key into ``dry_wet_periods`` and name of the sensor group
            in the HDF5 file.
        period_type: Key of the period to load (this notebook passes
            ``'dry_period'`` and ``'rain_event'``).
        dry_wet_periods: Nested mapping
            ``sensor -> period type -> {'start', 'end'}``.
        buffer: ``"day"`` replaces the window by the calendar day that contains
            the stored start (midnight to midnight); any other value keeps the
            stored start and end unchanged.

    Returns:
        dict with the keys ``'5425'``, ``'5427'``, ``'original'``, ``'start'``
        and ``'end'``.
    """
    period = dry_wet_periods[sensor_name][period_type]
    start, end = period['start'], period['end']

    if buffer == "day":
        # Midnight of the start day up to midnight of the following day.
        start = start.replace(hour=0, minute=0, second=0, microsecond=0)
        end = start + timedelta(days=1)

    # Result key -> location of the series inside the HDF5 file.
    hdf_paths = {
        '5425': "single_series/rain_gauge_data/5425",
        '5427': "single_series/rain_gauge_data/5427",
        'original': f"single_series/sewer_data/{sensor_name}/clean",
    }

    data_dict = {}
    for key, hdf_path in hdf_paths.items():
        frame, _, _, _ = load_dataframe_from_HDF5(data_file_path, hdf_path, starttime=start, endtime=end, complete_range=True, verbose=True)
        data_dict[key] = frame

    data_dict['start'] = start
    data_dict['end'] = end
    return data_dict



def insert_anomaly(data_dict, anomaly_config, anomaly, sensor_scale, obvious_min, obvious_max, seed, center=False):
    """Inject one anomaly type into ``data_dict['original']``.

    ``data_dict`` is updated in place and also returned: ``'polluted'`` holds
    the result of ``AnomalyHandler.inject_anomalies`` and ``'indicator_dict'``
    holds the handler's indicator together with a fixed colormap
    (0 -> 'none', 1 -> 'firebrick').

    Args:
        data_dict: Mapping with at least the key ``'original'``.
        anomaly_config, anomaly, sensor_scale, obvious_min, obvious_max, seed:
            Forwarded unchanged to ``AnomalyHandler``.
        center: When truthy, the injection start is fixed at index
            ``n_obs*2 // 5`` (two fifths into the series). Otherwise the
            handler's ``initialize_injections`` chooses the placement.

    Returns:
        The same ``data_dict`` object.
    """
    original = data_dict['original']
    n_obs = original.shape[0]
    handler = AnomalyHandler(anomaly_config, anomaly, 'value', n_obs, sensor_scale, obvious_min, obvious_max, seed)

    if not center:
        handler.initialize_injections()
    else:
        # Fixed start for the example figures: two fifths into the series.
        handler.set_injection_start(n_obs*2 // 5)

    data_dict['polluted'] = handler.inject_anomalies(original)
    data_dict['indicator_dict'] = {
        'indicator': handler.get_indicator(),
        'colormap': {
            0: 'none',
            1: 'firebrick',
        },
    }
    return data_dict




In [12]:
def visualize_rain(ax, title, data_dict, marker, linewidth=1):
    """Plot both rain gauges on ``ax`` and configure the axis.

    Args:
        ax: Axes-like object to draw on.
        title: Axis title.
        data_dict: Mapping with the gauge series under ``'5427'`` and
            ``'5425'`` (each exposing ``.index`` and ``.value``), plus
            ``'max_rain'``, ``'start'`` and ``'end'``.
        marker: Marker style passed to ``ax.plot``.
        linewidth: Line width passed to ``ax.plot``.

    Returns:
        The ``ax`` that was passed in.
    """
    ax.set_title(title, fontsize=20)

    gauge_colors = {'5427': 'purple', '5425': 'darkblue'}
    for rain_gauge, rain_color in gauge_colors.items():
        series = data_dict[rain_gauge]
        ax.plot(series.index, series.value,
            label=f'Rain gauge {rain_gauge}', color=rain_color,
            linewidth=linewidth, linestyle='-',
            marker=marker, markersize=1, alpha=1)

    # None of the axis settings depend on the gauge, so apply them once after
    # both lines are drawn instead of once per gauge.
    ax.set_ylabel('Rain (mm)')
    ax.legend(loc='upper right')
    ax.set_xticks([])
    # Lower limit is -1; the upper limit is supplied by the caller.
    ax.set_ylim(-1, data_dict['max_rain'])
    ax.set_xlim(data_dict['start'], data_dict['end'])
    return ax


def visualize_injected_synthetics(ax, title, data_dict, unit, marker):
    """Draw the original and polluted series on ``ax`` and shade the error spans.

    Args:
        ax: Axes to draw on.
        title: Axis title.
        data_dict: Mapping with ``'original'``, ``'polluted'``,
            ``'indicator_dict'``, ``'start'`` and ``'end'``.
        unit: Text used as the y-axis label.
        marker: Marker style for the polluted series only.

    Returns:
        The axes returned by ``visualize_error_span``, after limits, label,
        legend and ticks have been set.
    """
    ax.set_title(title, fontsize=20)

    # Thin reference line without markers.
    clean = data_dict['original']
    ax.plot(clean.index, clean.value,
        label='Original data', color='grey',
        linewidth=1, linestyle='-',
        marker='', markersize=1, alpha=1)

    # Thicker line for the series with the injected anomaly.
    polluted = data_dict['polluted']
    ax.plot(polluted.index, polluted.value,
        label='Erroneous data', color='grey',
        linewidth=2, linestyle='-',
        marker=marker, markersize=2, alpha=1)

    # Visualize the error span over the plotted window.
    window_start, window_end = data_dict['start'], data_dict['end']
    ax = visualize_error_span(ax, data_dict['indicator_dict'], window_start, window_end, adjust='full-point')

    ax.set_xlim(window_start, window_end)
    ax.set_ylabel(unit)
    ax.legend()
    ax.set_xticks([])
    return ax



In [13]:
# Sub-folder that collects the example figures.
examples_save_folder = figure_save_folder / 'examples'
examples_save_folder.mkdir(exist_ok=True)

# One severity level is plotted for every anomaly type named in the config.
severity = "medium"
anomalies = list(synthetic_config['anomalies'])


In [14]:

# The per-anomaly figures used to sit behind an unconditional `continue`, so they
# were never produced, and their plotting calls lacked the required `marker`
# argument. They are now opt-in; False reproduces the output of the original cell.
plot_individual_anomalies = False

for sensor_name in tqdm(natural_sensor_order, total=len(natural_sensor_order)):
    # Only the example sensor is plotted; every other sensor is skipped.
    if sensor_name != 'G80F11B_Level1':
        continue

    # create a folder for the sensor
    sensor_save_folder = examples_save_folder / sensor_name
    sensor_save_folder.mkdir(exist_ok=True)

    # extract meta
    sensor_range = sensor_ranges[sensor_name]['clean']['range']
    sensor_meta = metadata[metadata['IdMeasurement'] == sensor_name]
    unit = sensor_meta['UnitAlias'].values[0]
    obvious_min = sensor_meta['obvious_min'].values[0]
    obvious_max = sensor_meta['obvious_max'].values[0]

    # 'period' plots the stored window with point markers,
    # 'day' plots the whole calendar day of the stored start without markers.
    for buffer in ['period', 'day']:
        marker = "o" if buffer == 'period' else ""

        dry_data_dict = get_data_dict(sensor_name, 'dry_period', dry_wet_periods, buffer=buffer)
        wet_data_dict = get_data_dict(sensor_name, 'rain_event', dry_wet_periods, buffer=buffer)

        # Shared rain y-limit for both columns: largest gauge value plus 1.
        abs_max_rain = max([dry_data_dict['5425'].value.max(), dry_data_dict['5427'].value.max(), wet_data_dict['5425'].value.max(), wet_data_dict['5427'].value.max()]) + 1
        dry_data_dict['max_rain'] = abs_max_rain
        wet_data_dict['max_rain'] = abs_max_rain

        ### Visualizing all anomalies in one
        # One rain row plus one row per anomaly; left column dry, right column wet.
        fig, axs = plt.subplots(1+len(anomalies), 2, figsize=(14, 14), sharey="row")
        # set main title
        fig.suptitle(f"{sensor_name}\n{buffer.capitalize()}", fontsize=24)
        axs[0, 0] = visualize_rain(axs[0, 0], 'Dry period', dry_data_dict, marker)
        axs[0, 1] = visualize_rain(axs[0, 1], 'Rain event', wet_data_dict, marker)
        # iterate over the anomalies
        for i, anomaly in enumerate(anomalies):
            anomaly_config = synthetic_config['anomalies'][anomaly][severity]

            dry_data_dict = insert_anomaly(dry_data_dict, anomaly_config, anomaly, sensor_range, obvious_min, obvious_max, seed, center=True)
            wet_data_dict = insert_anomaly(wet_data_dict, anomaly_config, anomaly, sensor_range, obvious_min, obvious_max, seed, center=True)

            # Visualize the injected anomalies
            sensor_title = f"{anomaly.capitalize()}"
            axs[i+1, 0] = visualize_injected_synthetics(axs[i+1, 0], sensor_title, dry_data_dict, unit, marker)
            axs[i+1, 1] = visualize_injected_synthetics(axs[i+1, 1], sensor_title, wet_data_dict, unit, marker)

        # Only the bottom row gets x ticks; the helpers clear them on every axis.
        axs[-1, 0] = set_meaningful_xticks(axs[-1, 0], dry_data_dict['start'], dry_data_dict['end'])
        axs[-1, 1] = set_meaningful_xticks(axs[-1, 1], wet_data_dict['start'], wet_data_dict['end'])

        plt.tight_layout()
        # save the figure
        fig.savefig(sensor_save_folder / f"all_{buffer}.png", dpi=150)
        plt.close(fig)

        if not plot_individual_anomalies:
            continue

        ### Visualizing each anomaly
        # One figure per anomaly: rain on top, sensor data with the anomaly below.
        for anomaly in anomalies:
            anomaly_config = synthetic_config['anomalies'][anomaly][severity]

            dry_data_dict = insert_anomaly(dry_data_dict, anomaly_config, anomaly, sensor_range, obvious_min, obvious_max, seed, center=True)
            wet_data_dict = insert_anomaly(wet_data_dict, anomaly_config, anomaly, sensor_range, obvious_min, obvious_max, seed, center=True)

            fig, axs = plt.subplots(2, 2, figsize=(16, 6), sharey="row", height_ratios=[1, 2])
            # set main title
            fig.suptitle(f"{sensor_name}\n{anomaly.capitalize()}", fontsize=24)
            sensor_title = f"{severity.capitalize()} severity"
            axs[0, 0] = visualize_rain(axs[0, 0], 'Dry period', dry_data_dict, marker)
            axs[1, 0] = visualize_injected_synthetics(axs[1, 0], sensor_title, dry_data_dict, unit, marker)
            axs[-1, 0] = set_meaningful_xticks(axs[-1, 0], dry_data_dict['start'], dry_data_dict['end'])

            axs[0, 1] = visualize_rain(axs[0, 1], 'Rain event', wet_data_dict, marker)
            axs[1, 1] = visualize_injected_synthetics(axs[1, 1], sensor_title, wet_data_dict, unit, marker)
            axs[-1, 1] = set_meaningful_xticks(axs[-1, 1], wet_data_dict['start'], wet_data_dict['end'])

            plt.tight_layout()
            # save the figure
            fig.savefig(sensor_save_folder / f"{buffer}_{anomaly}.png", dpi=150)
            plt.close(fig)


100%|██████████| 19/19 [00:22<00:00,  1.20s/it]


### Visualize full timeline

In [15]:
sensor_name = 'G80F11B_Level1'

# Output folder for this sensor's figures.
sensor_save_folder = examples_save_folder / sensor_name
sensor_save_folder.mkdir(exist_ok=True)

# Metadata rows of the sensor: first value of the unit and the two "obvious" bounds.
sensor_meta = metadata[metadata['IdMeasurement'] == sensor_name]
unit, obvious_min, obvious_max = (
    sensor_meta[column].values[0]
    for column in ('UnitAlias', 'obvious_min', 'obvious_max')
)

# Range of the clean series, later passed to insert_anomaly as the sensor scale.
sensor_range = sensor_ranges[sensor_name]['clean']['range']


In [16]:
# load the full data
sensor_data, _, _, _ = load_dataframe_from_HDF5(data_file_path, f"single_series/sewer_data/{sensor_name}/clean")

start, end = sensor_data.index[0], sensor_data.index[-1]

# load the event data
rain_5425, _, _, _ = load_dataframe_from_HDF5(data_file_path, "single_series/rain_gauge_data/5425", starttime=start, endtime=end, complete_range=True)
rain_5427, _, _, _ = load_dataframe_from_HDF5(data_file_path, "single_series/rain_gauge_data/5427", starttime=start, endtime=end, complete_range=True)


data_dict = {
    '5425': rain_5425,
    '5427': rain_5427,
    'original': sensor_data,
    'start': start,
    'end': end,
}

In [17]:

def visualize_injected_synthetics(ax, title, data_dict, unit, marker):
    """Full-timeline variant: plot only the polluted series and shade the error spans.

    NOTE: this redefines the function of the same name declared earlier in the
    notebook. Once this cell has run, the earlier version (which also draws the
    original series) is no longer reachable under this name.

    Args:
        ax: Axes to draw on.
        title: Axis title.
        data_dict: Mapping with ``'polluted'``, ``'indicator_dict'``,
            ``'start'`` and ``'end'``.
        unit: Text used as the y-axis label.
        marker: Marker style for the plotted series.

    Returns:
        The axes returned by ``visualize_error_span``, after limits, label,
        legend and ticks have been set.
    """
    ax.set_title(title, fontsize=20)

    polluted = data_dict['polluted']
    line_style = dict(
        label='Erroneous data', color='grey',
        linewidth=0.3, linestyle='-',
        marker=marker, markersize=1, alpha=1,
    )
    ax.plot(polluted.index, polluted.value, **line_style)

    # Visualize the error span.
    # NOTE(review): adjust=60*3 presumably widens each span so it stays visible
    # at this zoom level — confirm against visualize_error_span.
    window_start, window_end = data_dict['start'], data_dict['end']
    ax = visualize_error_span(ax, data_dict['indicator_dict'], window_start, window_end, adjust=60*3, alpha=0.5)

    ax.set_xlim(window_start, window_end)
    ax.set_ylabel(unit)
    ax.legend(loc='upper right')
    ax.set_xticks([])
    return ax


In [18]:
# No point markers on the full timeline.
marker = ""

# Rain y-limit: the larger of the two gauge maxima plus 1.
abs_max_rain = max([data_dict['5425'].value.max(), data_dict['5427'].value.max()]) + 1
data_dict['max_rain'] = abs_max_rain

### Visualizing all anomalies in one
# One row for the rain plus one row per anomaly type.
fig, axs = plt.subplots(1+len(anomalies), 1, figsize=(14, 14), sharey="row", dpi=150)
# set main title
fig.suptitle(f"{sensor_name}\n{' '}", fontsize=24)
axs[0] = visualize_rain(axs[0], 'Rain', data_dict, marker, linewidth=0.3)
print('Rain done')

# The scale is the same for every anomaly, so look it up once before the loop.
sensor_range = sensor_ranges[sensor_name]['clean']['range']

# iterate over the anomalies
for i, anomaly in enumerate(anomalies):
    anomaly_config = synthetic_config['anomalies'][anomaly][severity]

    # center=False: the AnomalyHandler decides where the injections are placed.
    data_dict = insert_anomaly(data_dict, anomaly_config, anomaly, sensor_range, obvious_min, obvious_max, seed, center=False)

    # Visualize the injected anomalies
    sensor_title = f"{anomaly.capitalize()}"
    axs[i+1] = visualize_injected_synthetics(axs[i+1], sensor_title, data_dict, unit, marker)

    # Sum of the indicator divided by its length.
    indicator = data_dict['indicator_dict']['indicator']
    print(f"{anomaly} done: {indicator.sum() / indicator.shape[0]}")

# Only the bottom axis gets x ticks.
axs[-1] = set_meaningful_xticks(axs[-1], data_dict['start'], data_dict['end'])
plt.tight_layout()
# save the figure
fig.savefig(sensor_save_folder / "all_complete.png", dpi=150)
plt.close(fig)

Rain done
spike done: 0.00010136105958880849
noise done: 0.0020669296481099298
frozen done: 0.005003265498053763
offset done: 0.020494788265106194
drift done: 0.020130097442461924



---
# Todo

- compare this with real errors?
- some synthetic data summary?


---