# Error Analysis

In [7]:
import os

import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, BoundaryNorm
from matplotlib.patches import Patch
import h5py
from anytree import Node, RenderTree
import nexusformat.nexus as nx
import pickle

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import matplotlib.cm as cm
from mlxtend.plotting import scatterplotmatrix




from fault_management_uds.data.data_utilities import print_tree
from fault_management_uds.data.load_save_data import load_dataframe, save_dataframe, overwrite_dataset, save_filtered_data
from fault_management_uds.data.load_save_data import save_metadata, load_metadata, import_metadata
from fault_management_uds.plots import set_meaningful_xaxis_timestamps, get_segment_start_end_color


from fault_management_uds.config import PROJ_ROOT
from fault_management_uds.config import DATA_DIR, RAW_DATA_DIR, INTERIM_DATA_DIR, PROCESSED_DATA_DIR, EXTERNAL_DATA_DIR
from fault_management_uds.config import MODELS_DIR, REPORTS_DIR, FIGURES_DIR
from fault_management_uds.config import rain_gauges, natural_structure_order, natural_sensor_order, series_order, error_indicators

In [2]:
# Fix NumPy's global random seed so any random draws in this notebook are repeatable.
SEED = 11
np.random.seed(SEED)

#### Load subset

In [3]:
# Path to the processed Bellinge HDF5 store inside the project's processed-data directory.
data_file_path = PROCESSED_DATA_DIR / 'Bellinge.h5'
# Print the file's group/dataset hierarchy (shown in the cell output); the return value is kept in `f`.
f = print_tree(data_file_path)

root
├── dummy
└── processed
    ├── combined_data
    │   ├── cleaned
    │   │   ├── columns
    │   │   ├── data
    │   │   └── timestamps
    │   ├── interpolated
    │   │   ├── columns
    │   │   ├── data
    │   │   └── timestamps
    │   ├── is_error
    │   │   ├── columns
    │   │   ├── data
    │   │   └── timestamps
    │   ├── no_data
    │   │   ├── columns
    │   │   ├── data
    │   │   └── timestamps
    │   └── raw
    │       ├── columns
    │       ├── data
    │       └── timestamps
    └── single_series
        ├── 5425
        │   ├── columns
        │   ├── data
        │   └── timestamps
        ├── 5427
        │   ├── columns
        │   ├── data
        │   └── timestamps
        ├── G71F04R_Level1
        │   ├── columns
        │   ├── data
        │   └── timestamps
        ├── G71F04R_Level2
        │   ├── columns
        │   ├── data
        │   └── timestamps
        ├── G71F05R_LevelBasin
        │   ├── columns
        │   ├── data
        │   └

In [4]:

# Groups inside the data file that hold the combined series: the raw values,
# the error flags and the no-data mask.
raw_path, is_error_path, no_data_path = (
    '/processed/combined_data/raw',
    '/processed/combined_data/is_error',
    '/processed/combined_data/no_data',
)

# Time window handed to the loader.
starttime, endtime = "01-01-2019 00:00:00", "01-01-2021 00:00:00"

# Container type requested from the loader ("df" is the value used throughout this notebook).
return_type = "df"

In [5]:
# Load the raw values, the error flags and the no-data mask for the same time window.
# All three reads use identical options, so they are collected once.
load_options = dict(return_type=return_type, starttime=starttime, endtime=endtime)

# NOTE: the auxiliary outputs (timestamps, columns, start_idx, end_idx,
# column_indices) are rebound by every call, so after this cell they hold the
# values returned for the last group loaded (no_data_path).
raw, timestamps, columns, start_idx, end_idx, column_indices = load_dataframe(
    data_file_path, raw_path, **load_options
)
is_error, timestamps, columns, start_idx, end_idx, column_indices = load_dataframe(
    data_file_path, is_error_path, **load_options
)
no_data, timestamps, columns, start_idx, end_idx, column_indices = load_dataframe(
    data_file_path, no_data_path, **load_options
)

# Replace missing entries with 0, then arrange the columns in the order given
# by `series_order` from the project config.
# NOTE(review): for `raw`, a filled 0 is indistinguishable from a genuine zero
# reading -- confirm this is intended for the analysis that follows.
raw = raw.fillna(0)[series_order]
is_error = is_error.fillna(0)[series_order]
no_data = no_data.fillna(0)[series_order]

        Data loaded from group '/processed/combined_data/raw'
        Data loaded from group '/processed/combined_data/is_error'
        Data loaded from group '/processed/combined_data/no_data'


In [8]:
# Display the error-type labels imported from the project config.
error_indicators

['stamped', 'man_remove', 'outbound', 'outlier', 'frozen', 'frozen_high']

In [9]:
# Pick a single error type to look at; 'stamped' is one of the labels in `error_indicators`.
error = 'stamped'

# Display the series (column) names of the loaded raw frame.
raw.columns

Index(['5425', '5427', 'G80F11B_Level1', 'G80F11B_Level2', 'G80F66Y_Level1',
       'G80F66Y_Level2', 'G80F13P_LevelPS', 'G73F010', 'G72F040',
       'G71F05R_LevelInlet', 'G71F05R_LevelBasin', 'G71F05R_position',
       'G71F04R_Level1', 'G71F04R_Level2', 'G71F06R_LevelInlet',
       'G71F68Y_LevelPS', 'G71F68Yp1'],
      dtype='object')

# TODO: convert `is_error` into a separate indicator for each error type

Per sensor per error
- Average
- Boxplot
- Time series visualization

Correlation?
- Do errors occur next to each other?
- Are errors related to the time of day?
