# Diagnosing Model Driven Telemetry timeseries

This notebook loads an MDT dataset and uses "semantic feature selection" to choose the most "relevant" features for a change point.
For details, see T. Feltin, J. A. C. Fuertes, F. Brockners and T. H. Clausen, ["Understanding Semantics in Feature Selection for Fault Diagnosis in Network Telemetry Data"](https://www.researchgate.net/publication/371814291_Understanding_Semantics_in_Feature_Selection_for_Fault_Diagnosis_in_Network_Telemetry_Data), NOMS 2023 - 2023 IEEE/IFIP Network Operations and Management Symposium.

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's `modules` package take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Load dataset information

In [None]:
import modules.dataset as ds
# NOTE(review): presumably unpacks the demo archive into ./output, the directory
# the next cell passes as `datasets_dir` — confirm in modules.dataset.
ds.extract_dataset('./datasets/mdt-demo.tgz', './output')

In [None]:
import modules.mdt.datasets as mdt_ds

# Point the dataset helper at the same ./output directory used by extract_dataset above.
datasets = mdt_ds.Datasets(datasets_dir='./output')
# NOTE(review): presumably shows an interactive dataset/device picker whose choice
# later cells read back through `datasets` — confirm in modules.mdt.datasets.
datasets.jupyter_select_dataset_device(select_file=False)

## Show Dataset Sample

In [None]:
import pandas as pd
import modules.utils as utils

# Look up the merged telemetry CSV; only the first element of the returned pair
# (used as a file path below and in later cells) is needed here.
data_fn, _ = datasets.get_input_data_file("merged.csv")

# Read through a context manager so the file handle is closed as soon as parsing is done
# (the bare open() call previously left the handle open for the life of the kernel).
with open(data_fn, 'rb') as csv_file:
    df = pd.read_csv(csv_file)

# Preview only the top-left corner (first 9 rows x first 9 columns) to keep the output small.
utils.displayDataFrame(df.iloc[0:9,0:9])


### Helper functions (load data)

In [None]:
import re
from datetime import datetime, timezone

import pandas as pd
import numpy as np

# Default bounds for load_data()'s time filter, in POSIX seconds:
# -62135596800 is 0001-01-01T00:00:00Z and 253402214400 is 9999-12-31T00:00:00Z,
# so by default no realistic timestamp is filtered out.
MIN_TIMESTAMP = -62135596800
MAX_TIMESTAMP = 253402214400

# Keys understood by load_data(data_selection=...); each one names a group of
# feature columns that can be included in the returned frame.
ORIGINAL_DATA     = "original data"
REDUCED_DATA      = "reduced data"
FIRST_DERIVATIVE  = "first derivative"
SECOND_DERIVATIVE = "second derivative"

def get_feature_names_bis(path, delimiter=','):
    """Return the column names found on the first line of a delimited text file.

    Args:
        path: File to read; only its first line is consumed.
        delimiter: Separator between column names (default ``','``).

    Returns:
        list[str]: The header fields in file order. Newline characters at either
        end of the line are removed; other whitespace is left untouched.
    """
    with open(path, "r") as handle:
        first_line = handle.readline()
    fields = first_line.strip('\n').split(delimiter)
    return fields

def scale_data(d):
    """Standardise ``d`` to zero mean and unit standard deviation along axis 0.

    For a 2-D array every column (feature) is centred and scaled independently;
    a 1-D array is treated as a single feature.

    Args:
        d: Numeric array; statistics are computed along axis 0.

    Returns:
        A new array of the same shape as ``d``. Features whose standard deviation
        is below 1e-6 are only centred (divided by 1), so constant features come
        out as zeros instead of NaN/inf.
    """
    centered = d - np.mean(d, axis=0)
    ft_scale = np.std(centered, axis=0)
    # np.std returns a scalar for 1-D input, which does not support item assignment;
    # the three-argument np.where handles scalars and arrays alike.
    ft_scale = np.where(ft_scale < 1e-6, 1, ft_scale)
    return centered / ft_scale

def _to_epoch_seconds(moment):
    """Convert a ``datetime`` bound to POSIX seconds; any other value is returned unchanged.

    Naive datetimes are interpreted as UTC. Timezone-aware datetimes keep their own
    offset instead of being relabelled as UTC.
    """
    if isinstance(moment, datetime):
        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=timezone.utc)
        return moment.timestamp()
    return moment


def load_data(in_fn, reduced=None, startTime=MIN_TIMESTAMP, endTime=MAX_TIMESTAMP,
              scale=False, data_selection=None, ft_regex=None, remove_nan=False,
              remove_inf=False) -> "tuple[np.ndarray, pd.DataFrame]":
    """Load a comma-separated telemetry file and assemble the requested feature groups.

    The first column of the file is taken as the timestamp; all remaining columns are
    features named by the header line.

    Args:
        in_fn: Path of the CSV file (one header line, numeric values).
        reduced: 2-D array with one row per sample; required when ``REDUCED_DATA``
            is selected.
        startTime: Lower bound (inclusive) of the time filter, as POSIX seconds or a
            ``datetime`` (naive values are interpreted as UTC).
        endTime: Upper bound (inclusive) of the time filter, same conventions.
        scale: If true, standardise the feature columns with ``scale_data``.
        data_selection: Either one of ``ORIGINAL_DATA``, ``REDUCED_DATA``,
            ``FIRST_DERIVATIVE``, ``SECOND_DERIVATIVE``, or a dict mapping those keys
            to booleans. Missing keys count as ``False``; with nothing selected only
            the timestamp column is returned.
        ft_regex: Optional pattern, matched case-insensitively from the start of each
            feature name; only matching features are kept.
        remove_nan: Drop feature columns containing at least one NaN.
        remove_inf: Drop feature columns containing at least one infinite value.

    Returns:
        ``(timestamps, frame)`` where ``frame`` has the timestamp in column ``'ts'``
        followed by the selected feature groups, and ``timestamps`` is that first
        column as an array. Both only contain rows inside ``[startTime, endTime]``.

    Raises:
        ValueError: If ``REDUCED_DATA`` is selected but ``reduced`` is ``None``.
    """
    header = get_feature_names_bis(in_fn)
    data = np.genfromtxt(in_fn, dtype=float, delimiter=',', skip_header=1)
    if data.ndim < 2:
        # genfromtxt squeezes its result, so a single-row (or single-column) file
        # comes back 1-D; restore the (rows, columns) layout.
        if data.size == 0:
            data = data.reshape(0, len(header))
        elif len(header) == 1:
            data = data.reshape(-1, 1)
        else:
            data = data.reshape(1, -1)

    # Normalise the selection to a dict; absent keys are treated as "not selected".
    if data_selection is None:
        data_selection = {}
    elif isinstance(data_selection, str):
        data_selection = {data_selection: True}

    def selected(kind):
        return bool(data_selection.get(kind, False))

    tstp = data[:, 0]
    data = data[:, 1:]
    ft_names = np.asarray(header[1:])

    if ft_regex:
        ft_filter = re.compile(ft_regex, re.IGNORECASE)
        keep = np.array([ft_filter.match(name) is not None for name in ft_names], dtype=bool)
        # A boolean mask keeps the row dimension even when no feature matches.
        data = data[:, keep]
        ft_names = ft_names[keep]

    if remove_nan:
        keep = ~np.isnan(data).any(axis=0)
        data = data[:, keep]
        ft_names = ft_names[keep]

    if remove_inf:
        keep = ~np.isinf(data).any(axis=0)
        data = data[:, keep]
        ft_names = ft_names[keep]

    if scale:
        data = scale_data(data)

    # The timestamp always comes first, followed by the groups in a fixed order.
    column_blocks = [tstp.reshape(-1, 1)]
    final_names = ['ts']

    derivative = None
    if selected(FIRST_DERIVATIVE) or selected(SECOND_DERIVATIVE):
        derivative = np.diff(data, axis=0)

    if selected(ORIGINAL_DATA):
        column_blocks.append(data)
        final_names.extend(str(name) for name in ft_names)

    if selected(REDUCED_DATA):
        if reduced is None:
            raise ValueError("data_selection requests reduced data but 'reduced' is None")
        reduced = np.asarray(reduced)
        column_blocks.append(reduced)
        final_names.extend(f"{x}_bytes-sent_reduced" for x in range(reduced.shape[1]))

    if selected(FIRST_DERIVATIVE):
        # np.diff yields one row less; repeat the first difference to realign with tstp.
        column_blocks.append(np.vstack([derivative[0, :], derivative]))
        # NOTE(review): the suffix says "send" while the other groups say "sent";
        # kept as is because downstream code may rely on the column names.
        final_names.extend(f"{x}_bytes-send_deriv" for x in ft_names)

    if selected(SECOND_DERIVATIVE):
        second_derivative = np.diff(derivative, axis=0)
        # Two rows are lost by differencing twice; pad with the first value twice.
        second_derivative = np.vstack([second_derivative[0, :], second_derivative[0, :], second_derivative])
        column_blocks.append(second_derivative)
        final_names.extend(f"{x}_bytes-sent_deriv2" for x in ft_names)

    final_data = np.hstack(column_blocks)

    # Filter by time (both bounds inclusive).
    lower = _to_epoch_seconds(startTime)
    upper = _to_epoch_seconds(endTime)
    in_range = (final_data[:, 0] >= lower) & (final_data[:, 0] <= upper)
    final_data = final_data[in_range]
    final_tstp = final_data[:, 0]

    return final_tstp, pd.DataFrame(final_data, columns=final_names)

### Retrieve Relevant Features

Note: To select features for a different change point, adjust ``timestamp = 4820`` accordingly. The value is an offset from the first timestamp in the dataset, not an absolute time.

In [None]:
from collections import defaultdict

from modules.mdt import explain_lib as explib
from modules.mdt.traffic_leaf_classifier import traffic_leaf_test
from modules.mdt.selection_lib import run_opti
from modules.mdt.feature_store import FeatureStore
from modules.mdt.utils import minmax, adaptive_diff, ft_dissect
import modules.utils as utils

# Load the unscaled original features. The regex keeps only columns whose name does not
# contain time/second/minute/hour/pid/port (load_data matches case-insensitively);
# columns containing NaN or inf values are dropped.
tstp, dataframe = load_data(data_fn, scale=False, data_selection=ORIGINAL_DATA, ft_regex="^(?!.*(time|second|minute|hour|pid|port)).*",
                            remove_nan=True, remove_inf=True)

# Tunables for the selection.
one_sided_window = 150        # half-width of the window around the change point (timestamp units)
max_selection_size = 5        # maximum number of features reported
regularization_term = 2       # passed to run_opti as `alpha`
max_number_of_epochs = 20     # passed to run_opti as `N_max_epochs`
traffic_metric_name = 'sort_metric_step_eyeQ'  # name of the scoring function looked up in explain_lib
timestamp = 4820              # change point, as an offset from the first timestamp

# get metric func
metric_func = getattr(explib, traffic_metric_name)

# Split the frame into the timestamp column and the feature matrix.
fulldata = dataframe.to_numpy(dtype=float)
tstp = fulldata[:,0]
data = fulldata[:,1:]
ft_names = dataframe.columns.values[1:]

ft_store = FeatureStore(ft_names)
ft_names_idx = list(range(len(ft_names)))

ft_class = []  # type of each feature: 1 = traffic leaf, -1 = any other ("main") leaf
# leaf count associate to each kv
mainft_leaf_cnt = defaultdict(int)
traffic_leaf_cnt = defaultdict(int)

# Feature names containing '[' are handled through the FeatureStore path/kv accessors,
# all others are split with ft_dissect.
onbox_name_format = '[' in ft_store.get_flat_name(0)

for fti in ft_names_idx:
    if onbox_name_format:
        leaf = ft_store.get_joined_path(fti)
        kv_str = ft_store.get_joined_kv(fti)
    else:
        _, _, kv, leaf = ft_dissect(ft_store.get_flat_name(fti))
        kv_str = ':'.join(kv)

    if traffic_leaf_test(leaf):
        ft_class.append(1)
        traffic_leaf_cnt[kv_str] += 1
    else:
        ft_class.append(-1)
        mainft_leaf_cnt[kv_str] += 1

ft_class = np.array(ft_class)
main_ft_idx = np.where(ft_class == -1)[0]
traffic_ft_idx = np.where(ft_class == 1)[0]

# area of interest
changepoint = tstp[0] + timestamp
win_idx = np.where(abs(tstp - changepoint) <= one_sided_window)[0]
# in window data preprocessing, take care of the numpy view and copy in slicing the data
cp_window = data[win_idx,:]
# adaptive_diff is deliberately evaluated twice so the normalised and the raw slice
# never share a buffer (NOTE(review): minmax may work in place — confirm before merging the calls).
data_slice_shape = minmax(adaptive_diff(cp_window))  # cares only about the shape
data_slice_raw = adaptive_diff(cp_window)

# sorting metrics on a per counter/feature base
main_metric = metric_func(data_slice_shape[:, main_ft_idx])
traffic_metric = metric_func(data_slice_raw[:, traffic_ft_idx])

# Rows are (position within the class, score), ordered by descending score.
sorted_main = np.array(sorted(enumerate(main_metric.tolist()), key=lambda s: s[1], reverse=True))
sorted_traffic = np.array(sorted(enumerate(traffic_metric.tolist()), key=lambda s: s[1], reverse=True))

if len(sorted_traffic) > 0:
    traffic_ft_names = [ft_names_idx[i] for i in traffic_ft_idx]
    t_names = [traffic_ft_names[i] for i in sorted_traffic[:,0].astype(int)]
    # Traffic scores are normalised by their maximum. When that maximum is 0 the
    # division would produce NaN/inf scores, so the raw values are kept instead.
    traffic_peak = max(sorted_traffic[:,1])
    if traffic_peak != 0:
        traffic_scores = sorted_traffic[:,1] / traffic_peak
    else:
        traffic_scores = sorted_traffic[:,1].copy()
else:
    t_names = []
    traffic_scores = np.array([])

if len(sorted_main) > 0:
    main_ft_names = [ft_names_idx[i] for i in main_ft_idx]
    m_names = [main_ft_names[i] for i in sorted_main[:,0].astype(int)]
    main_scores = sorted_main[:,1]
else:
    m_names = []
    main_scores = np.array([])

# Candidates (feature indices) and their scores, main features first.
full_names = m_names + t_names
scores = np.concatenate((main_scores, traffic_scores))

selection = run_opti(ft_store, full_names, scores, alpha=regularization_term,
                        N_max_epochs=max_number_of_epochs)

# Report at most max_selection_size features: the flat name without its first
# ':'-separated component, plus the last-minus-first value inside the window.
features = []
for x in range(0, min(len(selection), max_selection_size)):
    cp_feature = ':'.join(ft_store.get_flat_name(selection[x]).split(':')[1:]) + " CHANGE: "
    cp_feature +=  str(cp_window[:, selection[x]][-1] - cp_window[:, selection[x]][0])
    features.append(cp_feature)

utils.displayDictionary({'Features': '\n'.join(features)})