# Diagnosing Model-Driven Telemetry time series

This notebook loads a Model-Driven Telemetry (MDT) dataset, projects it to two dimensions with t-SNE for visualization, and runs DBSCAN on that projection to detect clusters and the state transitions between them ("change-points").

In [None]:
# Enable IPython's autoreload extension. Per the IPython documentation, mode 2
# reloads imported modules before each cell is executed, so edits to the local
# `modules` package should be picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Load dataset information

In [None]:
import modules.dataset as ds
# Pass the bundled demo archive and the './output' directory to the project
# helper (presumably it unpacks the archive there -- the helper is defined
# outside this notebook, TODO confirm).
ds.extract_dataset('./datasets/mdt-demo.tgz', './output')

In [None]:
import modules.mdt.datasets as mdt_ds

# Point the dataset catalogue at './output', the same directory that was
# handed to extract_dataset().
datasets = mdt_ds.Datasets(datasets_dir='./output')
# Presumably renders an interactive dataset/device picker (TODO confirm against
# modules.mdt.datasets). Later cells query this object through
# datasets.get_input_data_file() and datasets.get_device().
datasets.jupyter_select_dataset_device(select_file=False)

## Show Dataset Sample

In [None]:
import pandas as pd
import modules.utils as utils

# Resolve the path of the preprocessed CSV; the second value returned by the
# helper is not used in this notebook.
data_fn, _ = datasets.get_input_data_file("preprocessed_offline.csv")

# Read through a context manager so the file handle is closed as soon as the
# CSV has been parsed, instead of lingering until garbage collection.
with open(data_fn, 'rb') as csv_file:
    df = pd.read_csv(csv_file)

# Preview the first 19 rows and first 9 columns of the dataset.
utils.displayDataFrame(df.iloc[0:19, 0:9])


### Helper functions (load data)

In [None]:
import re
from datetime import datetime, timezone


import numpy as np

# Default bounds of the time window, in POSIX seconds. They correspond to
# 0001-01-01T00:00:00Z and 9999-12-31T00:00:00Z, i.e. wide enough that the
# default window keeps every row.
MIN_TIMESTAMP = -62_135_596_800
MAX_TIMESTAMP = 253_402_214_400

# Names of the column groups load_data() can emit; they are the keys looked up
# in its `data_selection` argument.
ORIGINAL_DATA = "original data"
REDUCED_DATA = "reduced data"
FIRST_DERIVATIVE = "first derivative"
SECOND_DERIVATIVE = "second derivative"

def get_feature_names_bis(path, delimiter=','):
    """Return the column names found on the first line of a delimited text file.

    Only the header line is read. Its newline is stripped and the remainder is
    split on ``delimiter``; no quoting or escaping rules are applied.

    Args:
        path: Path of the file to inspect.
        delimiter: Separator between column names.

    Returns:
        The list of header fields, in file order. An empty file yields ``['']``.
    """
    with open(path, "r") as handle:
        first_line = handle.readline()
    columns = first_line.strip('\n').split(delimiter)
    return columns

def scale_data(d):
    """Standardise features to zero mean and unit standard deviation.

    The mean and the (population) standard deviation are taken along axis 0,
    so for a 2-D array every column is scaled independently. A 1-D array is
    treated as a single feature.

    Args:
        d: Numeric array-like; rows are samples when 2-D.

    Returns:
        A new array with the same shape as ``d``. Features whose standard
        deviation is below 1e-6 are only centred, not scaled.
    """
    d = d - np.mean(d, axis=0)
    ft_scale = np.std(d, axis=0)
    # (Near-)constant features would cause a division by ~0: divide them by 1
    # instead. The functional np.where form is used rather than in-place index
    # assignment because np.std returns a scalar for 1-D input, and a scalar
    # does not support item assignment.
    ft_scale = np.where(ft_scale < 1e-6, 1, ft_scale)
    return d / ft_scale

def _to_epoch_seconds(moment):
    """Convert a datetime bound to POSIX seconds; pass numeric bounds through unchanged."""
    if isinstance(moment, datetime):
        # Naive datetimes are interpreted as UTC. Aware datetimes already carry
        # an offset, which must be honoured rather than overwritten.
        if moment.tzinfo is None or moment.utcoffset() is None:
            moment = moment.replace(tzinfo=timezone.utc)
        return moment.timestamp()
    return moment


def load_data(in_fn, reduced=None, startTime=MIN_TIMESTAMP, endTime=MAX_TIMESTAMP,
              scale=False, data_selection=None, ft_regex=None, remove_nan=False,
              remove_inf=False) -> "tuple[np.ndarray, pd.DataFrame]":
    """Load a comma-separated time-series file and assemble the selected column groups.

    The first column of the file is taken as the timestamp and the remaining
    columns as features. Features can be filtered by name, cleaned, scaled and
    augmented with derivatives before the rows are restricted to a time window.

    Args:
        in_fn: Path of the CSV file; its first line must be the header.
        reduced: 2-D array with one row per data row (e.g. a low-dimensional
            embedding). Required only when ``REDUCED_DATA`` is selected.
        startTime: Inclusive lower bound of the time window, as POSIX seconds
            or a ``datetime`` (naive values are interpreted as UTC).
        endTime: Inclusive upper bound of the time window, same conventions as
            ``startTime``.
        scale: When true, standardise the features with ``scale_data``.
        data_selection: Either the name of a single column group, or a mapping
            from group name (``ORIGINAL_DATA``, ``REDUCED_DATA``,
            ``FIRST_DERIVATIVE``, ``SECOND_DERIVATIVE``) to a truthy/falsy
            flag. Missing keys, or ``None``, count as "not selected".
        ft_regex: Optional regular expression, matched case-insensitively
            against the start of each feature name; only matching features
            are kept.
        remove_nan: Drop every feature column containing at least one NaN.
        remove_inf: Drop every feature column containing at least one
            infinite value.

    Returns:
        ``(timestamps, frame)``. ``frame`` has a leading ``'ts'`` column
        followed by the selected groups in the order original, reduced, first
        derivative, second derivative. ``timestamps`` is the ``'ts'`` column
        as a NumPy array. Both only contain rows inside the time window.

    Raises:
        ValueError: If reduced data is selected but ``reduced`` is missing or
            is not a 2-D array with one row per data row.

    Note:
        The first derivative needs at least two data rows and the second
        derivative at least three, because the leading rows are padded by
        repeating the first computed difference.
    """
    data = np.genfromtxt(in_fn, dtype=float, delimiter=',', skip_header=1)

    # Normalise data_selection to a mapping so every group is looked up the same way.
    if data_selection is None:
        data_selection = {}
    elif isinstance(data_selection, str):
        data_selection = {data_selection: True}
    want_original = bool(data_selection.get(ORIGINAL_DATA, False))
    want_reduced = bool(data_selection.get(REDUCED_DATA, False))
    want_first = bool(data_selection.get(FIRST_DERIVATIVE, False))
    want_second = bool(data_selection.get(SECOND_DERIVATIVE, False))

    tstp = data[:, 0]
    data = data[:, 1:]
    ft_names = np.asarray(get_feature_names_bis(in_fn)[1:])

    if ft_regex:
        ft_filter = re.compile(ft_regex, re.IGNORECASE)
        # An explicit integer dtype keeps the index array valid when nothing
        # matches: the result is then an (n_rows, 0) array, so the row count
        # stays aligned with the timestamps.
        ft_idx = np.array(
            [i for i, name in enumerate(ft_names) if ft_filter.match(name) is not None],
            dtype=int,
        )
        data = data[:, ft_idx]
        ft_names = ft_names[ft_idx]

    if remove_nan:
        keep = ~np.isnan(data).any(axis=0)
        data = data[:, keep]
        ft_names = ft_names[keep]

    if remove_inf:
        keep = ~np.isinf(data).any(axis=0)
        data = data[:, keep]
        ft_names = ft_names[keep]

    if scale:
        data = scale_data(data)

    # Collect the column groups and join them once at the end, rather than
    # re-allocating the whole matrix for every group.
    blocks = [tstp.reshape(-1, 1)]
    names = ['ts']

    if want_original:
        blocks.append(data)
        names.extend(ft_names.tolist())

    if want_reduced:
        if reduced is None:
            raise ValueError("reduced data was selected but `reduced` is None")
        reduced = np.asarray(reduced)
        if reduced.ndim != 2 or reduced.shape[0] != len(data):
            raise ValueError(
                "`reduced` must be a 2-D array with one row per data row "
                f"(expected {len(data)} rows, got shape {reduced.shape})"
            )
        blocks.append(reduced)
        names.extend(f"{x}_bytes-sent_reduced" for x in range(reduced.shape[1]))

    derivative = None
    if want_first or want_second:
        derivative = np.diff(data, axis=0)

    if want_first:
        # np.diff yields one row fewer than the input; repeat the first
        # difference so the block lines up with the timestamps.
        blocks.append(np.vstack([derivative[0, :], derivative]))
        # NOTE(review): "send" (vs. "sent" in the other suffixes) looks like a
        # typo, but it is kept because it is part of the emitted column names.
        names.extend(f"{x}_bytes-send_deriv" for x in ft_names)

    if want_second:
        second_derivative = np.diff(derivative, axis=0)
        # Two rows are lost by differencing twice; pad with the first value.
        blocks.append(np.vstack([second_derivative[0, :], second_derivative[0, :], second_derivative]))
        names.extend(f"{x}_bytes-sent_deriv2" for x in ft_names)

    final_data = np.concatenate(blocks, axis=1)

    # Keep only the rows inside the inclusive [startTime, endTime] window.
    window_start = _to_epoch_seconds(startTime)
    window_end = _to_epoch_seconds(endTime)
    in_window = (final_data[:, 0] >= window_start) & (final_data[:, 0] <= window_end)
    final_data = final_data[in_window]

    return final_data[:, 0], pd.DataFrame(final_data, columns=names)

### Detect Changepoints

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

# Neighbourhood radius handed to DBSCAN as `eps`; it is applied to the
# embedding after MinMaxScaler has rescaled it.
max_data_point_distance = 0.05

# Load the unscaled original features, skipping every column whose name
# contains "time" or "second".
tstp, dataframe = load_data(
    data_fn,
    scale=False,
    data_selection=ORIGINAL_DATA,
    ft_regex="^(?!.*(time|second)).*",
)

# Column 0 of the frame holds the timestamps, the rest are the features.
fulldata = dataframe.to_numpy(dtype=float)
tstp, data = fulldata[:, 0], fulldata[:, 1:]

# Project the feature vectors onto two dimensions.
solver = TSNE(n_components=2, init='pca', random_state=0)
reduced = solver.fit_transform(data)

# Cluster the rescaled embedding.
solver = DBSCAN(eps=max_data_point_distance)
normalized = MinMaxScaler().fit_transform(reduced)
solver.fit(normalized)
clusters = solver.labels_

# Position i is a change when samples i and i+1 carry different cluster labels;
# the timestamp reported is the one of sample i.
changes = np.flatnonzero(clusters[1:] != clusters[:-1])
changepoints = [tstp[idx] for idx in changes]

print(changepoints)

## Show Changepoints

In [None]:
from modules.mdt.data_utils import plot_data_anime
import plotly.graph_objects as go

# One event per detected change, numbered from 1 and placed halfway in time
# between the two samples on either side of the change.
events = [
    dict(
        timestamp=(tstp[pos] + tstp[pos + 1]) / 2.0,
        event=str(rank + 1),
        device=datasets.get_device(),
        interface=None,
    )
    for rank, pos in enumerate(changes)
]

plot_data, frames = plot_data_anime(reduced, tstp, events, color='rgb(128,177,211)')

# "Go" starts (or resumes) the animation from the current frame.
play_button = {
    'args': [None, {
        'frame': {'duration': 100, 'redraw': False},
        'fromcurrent': True,
        'transition': {'duration': 50, 'easing': 'quadratic-in-out'},
    }],
    'label': 'Go',
    'method': 'animate',
}

# "Pause" requests an immediate, zero-duration animation.
pause_button = {
    'args': [[None], {
        'frame': {'duration': 0, 'redraw': False},
        'mode': 'immediate',
        'transition': {'duration': 0},
    }],
    'label': 'Pause',
    'method': 'animate',
}

animation_controls = {
    'buttons': [play_button, pause_button],
    'direction': 'left',
    'pad': {'r': 10, 't': 10},
    'showactive': False,
    'type': 'buttons',
    'x': 0.1,
    'xanchor': 'right',
    'y': 1,
    'yanchor': 'bottom',
}

figure_layout = {
    'title': "tSNE 2-D Visualization",
    'autosize': False,
    'width': 1000,
    'height': 1000,
    'updatemenus': [animation_controls],
}

fig = go.Figure(data=plot_data, layout=figure_layout, frames=frames)

fig.show()