# 数据可视化


In [27]:
import os
import pandas as pd
import warnings
from tqdm.notebook import tqdm, trange
import random
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import plotly_express as px
import plotly as py
import plotly.io as pio
from plotly.offline import download_plotlyjs, init_notebook_mode
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Notebook-wide setup: ignore all warnings, initialise plotly's notebook
# mode with connected=True, and call tqdm.pandas() to hook tqdm into pandas.
warnings.filterwarnings(action="ignore")
init_notebook_mode(connected=True)
tqdm.pandas()


## 画图函数

In [None]:
def create_shapes(starts, _min, _max, type=None, xref=None, yref=None,
                  duration=None):
    """Build plotly rectangle shapes that highlight labelled time windows.

    One semi-transparent rectangle (opacity 0.3, zero-width border) is
    produced per entry of ``starts``. It spans ``[start, start + duration]``
    horizontally and ``[_min, _max]`` vertically.

    Args:
        starts: Iterable of window start times; every element must support
            ``+ timedelta``.
        _min: Lower y bound of every rectangle.
        _max: Upper y bound of every rectangle.
        type: Label level selecting the fill colour: ``'service'`` -> red,
            ``'pod'`` -> blue, ``'node'`` -> green, anything else -> red.
            The name shadows the builtin but is kept because callers pass
            it by keyword.
        xref: Optional axis reference. When it is not ``None`` both ``xref``
            and ``yref`` are written into every shape.
        yref: Axis reference stored together with ``xref``.
        duration: Width of every rectangle as a ``timedelta``. Defaults to
            10 minutes.

    Returns:
        A list of shape dicts, one per start time, in input order.
    """
    color = {'service': 'red', 'pod': 'blue', 'node': 'green'}.get(type, 'red')

    # NaN never compares equal to itself. The max of an empty or all-NaN
    # series is NaN, which would otherwise give rectangles with no height.
    has_nan_bound = _min != _min or _max != _max
    if has_nan_bound or _min == _max:
        # Degenerate band: fall back to a unit-height rectangle.
        _min, _max = 0, 1

    # The width is the same for every window, so build it once.
    width = timedelta(minutes=10) if duration is None else duration

    shapes = []
    for start in starts:
        shape = {
            'type': 'rect',
            'x0': start,
            'y0': _min,
            'x1': start + width,
            'y1': _max,
            'fillcolor': color,
            'opacity': 0.3,
            'line': {
                'width': 0,
            },
        }
        if xref is not None:
            shape['xref'] = xref
            shape['yref'] = yref
        shapes.append(shape)

    return shapes

## 读取数据

### 标签数据

In [29]:
# Load the ground-truth label files for 2022-03-20 and 2022-03-21.
# NOTE(review): these two paths start with '../../' while every other cell
# reads from '../data/...' - confirm which prefix is correct.
label_data1, label_data2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/training_data_with_faults/groundtruth/groundtruth-k8s-1-2022-03-20.csv',
        '../../data/training_data_with_faults/groundtruth/groundtruth-k8s-1-2022-03-21.csv',
    )
)



In [None]:
# Stack both days of labels into one frame, then display it.
label_data = pd.concat(objs=(label_data1, label_data2))
label_data


In [None]:
# Order the labels by level, component id and time, renumber the rows, and
# derive a 'datetime' column from the epoch-second 'timestamp' column.
label_data = label_data.sort_values(
    by=['level', 'cmdb_id', 'timestamp']).reset_index(drop=True)
label_data['datetime'] = pd.to_datetime(label_data['timestamp'], unit='s')

# Create the target directory first, as the other export cells do, so the
# write does not fail when '../data/label/' does not exist yet.
label_output_path = '../data/label/label1.csv'
os.makedirs(os.path.dirname(label_output_path), exist_ok=True)
label_data.to_csv(label_output_path, index=False)
label_data


### 业务指标

#### service级别

In [None]:
# Reload the exported labels and convert the 'datetime' column back into
# datetime values, then display the frame.
label_data = pd.read_csv('../data/label/label1.csv').assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime']))
label_data

In [None]:
# Load the service-level metric CSVs and stack them into `service_metric_data`.
# Only the two cloudbed1 fault days (2022-03-20 and 2022-03-21) are active;
# the commented-out reads below are alternative inputs kept for reference.

# Alternative input: fault-free ("normal") training data, cloudbed 1-3.
# service_metric_data1 = pd.read_csv(
#     '../data/training_data_normal/cloudbed-1/metric/service/metric_service.csv')
# service_metric_data2 = pd.read_csv(
#     '../data/training_data_normal/cloudbed-2/metric/service/metric_service.csv')
# service_metric_data3 = pd.read_csv(
#     '../data/training_data_normal/cloudbed-3/metric/service/metric_service.csv')


# Active input: training data with faults, cloudbed1.
service_metric_data1 = pd.read_csv(
    '../data/training_data_with_faults/tar/2022-03-20-cloudbed1/metric/service/metric_service.csv')
service_metric_data2 = pd.read_csv(
    '../data/training_data_with_faults/tar/2022-03-21-cloudbed1/metric/service/metric_service.csv')
# Alternative input: the same fault data for cloudbed2 / cloudbed3.
# service_metric_data3 = pd.read_csv(
#     '../data/training_data_with_faults/tar/2022-03-20-cloudbed2/metric/service/metric_service.csv')
# service_metric_data4 = pd.read_csv(
#     '../data/training_data_with_faults/tar/2022-03-21-cloudbed2/metric/service/metric_service.csv')
# service_metric_data5 = pd.read_csv(
#     '../data/training_data_with_faults/tar/2022-03-20-cloudbed3/metric/service/metric_service.csv')
# service_metric_data6 = pd.read_csv(
#     '../data/training_data_with_faults/tar/2022-03-21-cloudbed3/metric/service/metric_service.csv')
# service_metric_data7 = pd.read_csv(
#     '../data/training_data_with_faults/tar/2022-03-24-cloudbed3/metric/service/metric_service.csv')

# Alternative combinations matching the commented-out reads above.
# service_metric_data=pd.concat([service_metric_data1,service_metric_data2,service_metric_data3])
# service_metric_data = pd.concat(
#     [service_metric_data1, service_metric_data2, service_metric_data3, service_metric_data4, service_metric_data5,service_metric_data6,service_metric_data7])
# Active combination: both cloudbed1 days (original row indices are kept).
service_metric_data = pd.concat(
    [service_metric_data1, service_metric_data2])
# service_metric_data=service_metric_data1
# Bare expression: displays the frame in the notebook.
service_metric_data


In [None]:
# Split the service metrics per service, order each slice by time, add a
# 'datetime' column and export it as '<service>_metrics.csv'.

# The output directory is the same for every service, so it is prepared once
# instead of on every iteration.
# Alternative output for the normal dataset:
# processed_data_path = '../data/training_data_normal/processed_service_metric_data/'
processed_data_path = '../data/training_data_with_faults/tar/processed_service_metric_data/'
os.makedirs(processed_data_path, exist_ok=True)

processed_service_metric_data = list(service_metric_data.groupby('service'))
for service_name, service_data in tqdm(processed_service_metric_data):
    # Build a sorted, re-indexed copy rather than mutating the groupby slice.
    service_data = service_data.sort_values(
        by='timestamp').reset_index(drop=True)
    service_data['datetime'] = pd.to_datetime(
        service_data['timestamp'], unit='s')
    service_data.to_csv(
        processed_data_path + service_name + '_metrics.csv', index=False)


In [None]:
# For every service, draw its four metrics (rr, sr, mrt, count) as stacked
# subplots, overlay the service-level and pod-level label windows on each
# subplot, and write the figure to '<service>.html'.

# The output directory is the same for every service: prepare it once.
visualization_path = '../result/visualization/with_faults/service/'
os.makedirs(visualization_path, exist_ok=True)

# Metric column -> trace styling, listed in subplot-row order.
# NOTE(review): the colours are 4-component 'rgb(...)' strings; presumably
# 'rgba(...)' was intended. They are left untouched here.
metric_styles = [
    ('rr', dict(mode='markers',
                marker=dict(color='rgb(255, 127, 14, 1)', size=1))),
    ('sr', dict(mode='markers',
                marker=dict(color='rgb(0, 204, 150, 1)', size=1))),
    ('mrt', dict(mode='lines',
                 line=dict(color='rgb(31, 119, 180, 1)', width=1))),
    ('count', dict(mode='lines',
                   line=dict(color='rgb(0, 0, 0, 1)', width=1))),
]
# Label level -> legend colour (matches the fill colours of create_shapes).
label_levels = [('service', 'red'), ('pod', 'blue')]

processed_service_metric_data = list(service_metric_data.groupby('service'))
for service_name, service_data in tqdm(processed_service_metric_data):
    # Build a sorted, re-indexed copy rather than mutating the groupby slice.
    service_data = service_data.sort_values(
        by='timestamp').reset_index(drop=True)
    service_data['datetime'] = pd.to_datetime(
        service_data['timestamp'], unit='s')

    # Labels are keyed by the part of the service name before the first '-'.
    service_prefix = service_name.split('-')[0]
    label_data_service = label_data[(label_data['level'] == 'service') & (
        label_data['cmdb_id'] == service_prefix)]
    # regex=False: the prefix is a literal substring, not a pattern.
    label_data_pod = label_data[(label_data['level'] == 'pod') & (
        label_data['cmdb_id'].str.contains(service_prefix, regex=False))]
    labels_by_level = {'service': label_data_service, 'pod': label_data_pod}

    fig = make_subplots(rows=len(metric_styles), cols=1, shared_xaxes=True)
    shapes = []

    for row, (metric, style) in enumerate(metric_styles, start=1):
        fig.append_trace(
            go.Scatter(x=service_data['datetime'], y=service_data[metric],
                       name=metric, legendgroup="group1",
                       legendgrouptitle_text="Data Type", **style),
            row=row, col=1)
        # Label rectangles span from 0 up to this metric's maximum.
        metric_max = service_data[metric].max()
        for level, _ in label_levels:
            shapes += create_shapes(
                labels_by_level[level]['datetime'], _min=0, _max=metric_max,
                type=level, xref=f'x{row}', yref=f'y{row}')

    # Single-point placeholder traces whose only job is to add a legend
    # entry per label level (the rectangles themselves have no legend).
    for level, legend_color in label_levels:
        fig.append_trace(
            go.Scatter(x=[service_data['datetime'][0]],
                       y=[service_data['rr'][0]],
                       legendgroup='group2',
                       legendgrouptitle_text='Label Type',
                       name=f'{level} label', mode="lines",
                       line=dict(color=legend_color)),
            row=1, col=1)

    fig.update_layout(title_text=service_name, shapes=shapes)
    pio.write_html(fig, file=visualization_path + service_name + '.html')
    

### 性能指标

#### node级别

In [None]:
# Reload the exported labels with 'datetime' converted back to datetime
# values, then list the distinct cmdb_id values of the node-level labels.
label_data = pd.read_csv('../data/label/label1.csv').assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime']))
cmdb_id = (
    label_data.loc[label_data['level'] == 'node', 'cmdb_id']
    .drop_duplicates()
    .tolist()
)
cmdb_id

In [None]:
# Load the node-level KPI files of cloudbed1 for 03-20 and 03-21 and stack
# them into one frame.
node_metric_data1, node_metric_data2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/training_data_with_faults/tar/2022-03-20-cloudbed1/metric/node/kpi_cloudbed1_metric_0320.csv',
        '../data/training_data_with_faults/tar/2022-03-21-cloudbed1/metric/node/kpi_cloudbed1_metric_0321.csv',
    )
)
node_metric_data = pd.concat(objs=(node_metric_data1, node_metric_data2))

# Alternative input (normal dataset):
# node_metric_data = pd.read_csv(
#     '../data/training_data_normal/cloudbed-1/metric/node/kpi_cloudbed1_metric_0319.csv')

# Derive a 'datetime' column from the epoch-second 'timestamp' column.
node_metric_data = node_metric_data.assign(
    datetime=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'))

node_metric_data

In [None]:
# Split the node metrics per KPI, order each slice by node and time, and
# export it as '<kpi_name>_metrics.csv'.

# The output directory is the same for every KPI, so it is prepared once
# instead of on every iteration.
# Alternative output for the normal dataset:
# processed_data_path = '../data/training_data_normal/processed_node_metric_data/'
processed_data_path = '../data/training_data_with_faults/tar/processed_node_metric_data/'
os.makedirs(processed_data_path, exist_ok=True)

processed_node_metric_data = list(node_metric_data.groupby('kpi_name'))
for kpi_name, kpi_data in tqdm(processed_node_metric_data):
    # Build a sorted, re-indexed copy rather than mutating the groupby slice.
    kpi_data = kpi_data.sort_values(
        by=['cmdb_id', 'timestamp']).reset_index(drop=True)
    kpi_data.to_csv(
        processed_data_path + kpi_name + '_metrics.csv', index=False)


In [None]:
# For every KPI category and every node, draw one subplot per KPI, overlay
# the node-level label windows, and write '<category>/<node>.html'.
# A category string may bundle several KPI categories joined by '&'.
categories = ['cpu&load', 'mem', 'disk&fs', 'io',
              'net&can_connect&tcp&udp', 'swap', 'os&user', 'process']
colors = ['blue', 'green', 'orange', 'purple', 'brown',
          'pink', 'gray', 'olive', 'cyan', 'magenta']

# The KPI category is the second dot-separated token of the KPI name. It
# does not depend on the plot category, so it is computed once up front.
node_metric_data['kpi_cat'] = node_metric_data['kpi_name'].apply(
    lambda x: x.split('.')[1])

for cat in tqdm(categories):
    cat_list = cat.split('&')
    # Select all rows of the bundled categories in one pass.
    node_data = node_metric_data[node_metric_data['kpi_cat'].isin(cat_list)]

    # Number of subplot rows: distinct KPIs of this category over all nodes.
    kpi_num = node_data['kpi_name'].nunique()

    visualization_path = '../result/visualization/with_faults/node/' + cat + '/'
    os.makedirs(visualization_path, exist_ok=True)

    for node_id, node_kpi_data in node_data.groupby('cmdb_id'):
        node_kpi_data = node_kpi_data.sort_values(
            by='timestamp').reset_index(drop=True)

        fig = make_subplots(
            rows=kpi_num, cols=1, shared_xaxes=True,
            subplot_titles=node_kpi_data['kpi_name'].drop_duplicates().sort_values().tolist())
        shapes = []

        label_data_i = label_data[(label_data['level'] == 'node') & (
            label_data['cmdb_id'] == node_id)]

        # Group by a scalar key (not a one-element list) so that `kpi` is a
        # plain string on every pandas version; pandas 2.x yields 1-tuples
        # for list keys, which are not valid trace names.
        for i, (kpi, data) in enumerate(node_kpi_data.groupby('kpi_name')):
            # Rows keep their time order within each group; only renumber.
            data = data.reset_index(drop=True)

            shapes += create_shapes(
                label_data_i['datetime'], _min=0, _max=data['value'].max(),
                xref='x' + str(i + 1), yref='y' + str(i + 1))
            fig.append_trace(
                go.Scatter(x=data['datetime'], y=data['value'], name=kpi,
                           line=dict(color=colors[i % len(colors)], width=1.5),
                           mode='lines'),
                row=i + 1, col=1)

        fig.update_layout(title_text=node_id, shapes=shapes)
        pio.write_html(fig, file=visualization_path + node_id + '.html')


#### container级别

In [None]:
# Reload the labels and build the id lists used to map container cmdb_ids
# onto pods and services.
label_data = pd.read_csv('../data/label/label1.csv')
label_data['datetime'] = pd.to_datetime(label_data['datetime'])

cmdb_id_service = label_data[label_data['level'] ==
                             'service']['cmdb_id'].drop_duplicates().tolist()

# Same content as cmdb_id_service; kept as its own list under the original
# name instead of repeating the identical query.
cmdb_id_raw = list(cmdb_id_service)

# Pod ids are generated as '<service><suffix>-<n>' for suffix in `pre` and
# n in 0..2, e.g. '<service>-0' and '<service>2-0'.
pre = ['', '2']
cmdb_id_pod = [
    service + f'{p}-{replica}'
    for service in cmdb_id_raw
    for p in pre
    for replica in range(3)
]

cmdb_id_node = label_data[label_data['level'] ==
                          'node']['cmdb_id'].drop_duplicates().tolist()

# Inspection showed that the container network metrics contain this extra
# cmdb_id, although it never appears in the labels.
cmdb_id_pod.append('redis-cart')
cmdb_id_pod

In [None]:
# Container metric directories for the two cloudbed1 days, the plot
# categories ('&' joins several categories into one), and the line colours.
dir_path1 = '../data/training_data_with_faults/tar/2022-03-20-cloudbed1/metric/container/'
dir_path2 = '../data/training_data_with_faults/tar/2022-03-21-cloudbed1/metric/container/'
categories = [
    'cpu',
    'memory',
    'fs',
    'network',
    'spec',
    'threads&processes&ulimits',
]
colors = [
    'blue', 'green', 'orange', 'purple', 'brown',
    'pink', 'gray', 'olive', 'cyan', 'magenta',
]
# File names are taken from the first day's directory only.
dir_content = os.listdir(dir_path1)


In [None]:
def _collapse_cmdb_id(raw_id, known_ids):
    """Map a raw container cmdb_id onto a known id that it contains.

    The known ids are tested in order and every match replaces the current
    value before the next id is tested, which reproduces applying
    ``i if i in x else x`` once per known id.
    """
    for known in known_ids:
        if known in raw_id:
            raw_id = known
    return raw_id


# For every container KPI category and every pod, draw one subplot per KPI
# with one line per container, overlay the service-level and pod-level label
# windows, and write '<category>/<pod>.html'.
for cat in tqdm(categories):
    # Match the third '_'-separated token of the file stem against the
    # category's '&'-separated parts exactly. A substring test on the joined
    # string would also accept fragments, and files with fewer tokens would
    # raise IndexError.
    cat_parts = cat.split('&')
    file_name_set = []
    for file_name in dir_content:
        stem_tokens = file_name.split('.')[0].split('_')
        if len(stem_tokens) > 2 and stem_tokens[2] in cat_parts:
            file_name_set.append(file_name)

    # Read both days of every matching file, then concatenate once.
    frames = []
    for filename in file_name_set:
        frames.append(pd.read_csv(dir_path1 + filename))
        frames.append(pd.read_csv(dir_path2 + filename))
    if not frames:
        # No metric file for this category: nothing to plot.
        continue
    container_data = pd.concat(frames)

    container_data['datetime'] = pd.to_datetime(
        container_data['timestamp'], unit='s')
    container_data['cmdb_id_pod'] = container_data['cmdb_id'].apply(
        lambda raw: _collapse_cmdb_id(raw, cmdb_id_pod))
    container_data['cmdb_id_service'] = container_data['cmdb_id'].apply(
        lambda raw: _collapse_cmdb_id(raw, cmdb_id_service))
    # Node mapping was disabled in the original notebook; the column is kept
    # as a plain copy of cmdb_id.
    container_data['cmdb_id_node'] = container_data['cmdb_id']

    # The output directory only depends on the category: prepare it once.
    visualization_path = '../result/visualization/with_faults/container/' + cat + '/'
    os.makedirs(visualization_path, exist_ok=True)

    # Group by a scalar key so `pod_id` is a plain string on every pandas
    # version (pandas 2.x yields 1-tuples for one-element list keys, which
    # breaks both the label lookup and the output file name).
    for pod_id, pod_data in container_data.groupby('cmdb_id_pod'):
        pod_data = pod_data.sort_values(by='timestamp').reset_index(drop=True)

        # A pod is attributed to a service only when the mapping is unique.
        service_ids = pod_data['cmdb_id_service'].drop_duplicates().tolist()
        service_id = service_ids[0] if len(service_ids) == 1 else ''

        label_data_pod = label_data[(label_data['level'] == 'pod') & (
            label_data['cmdb_id'] == pod_id)]
        label_data_service = label_data[(label_data['level'] == 'service') & (
            label_data['cmdb_id'] == service_id)]

        # One subplot row per KPI, in sorted KPI-name order.
        kpi_groups = list(pod_data.groupby('kpi_name'))
        fig = make_subplots(rows=len(kpi_groups), cols=1, shared_xaxes=True,
                            subplot_titles=[kpi for kpi, _ in kpi_groups])

        # Fixed colour per container so a container keeps its colour on every
        # row even when it lacks some KPI.
        container_ids = sorted(pod_data['cmdb_id'].drop_duplicates())
        color_of = {container: colors[k % len(colors)]
                    for k, container in enumerate(container_ids)}

        shapes = []
        for row, (kpi, kpi_frame) in enumerate(kpi_groups, start=1):
            kpi_max = kpi_frame['value'].max()
            shapes += create_shapes(
                label_data_service['datetime'], _min=0, _max=kpi_max,
                type='service', xref='x' + str(row), yref='y' + str(row))
            shapes += create_shapes(
                label_data_pod['datetime'], _min=0, _max=kpi_max,
                type='pod', xref='x' + str(row), yref='y' + str(row))

            # Iterate the containers that actually have this KPI instead of
            # indexing a flat (kpi, container) list, which assumed that every
            # container reports every KPI.
            for cmdb, data in kpi_frame.groupby('cmdb_id'):
                data = data.reset_index(drop=True)
                fig.append_trace(
                    go.Scatter(x=data['datetime'], y=data['value'], name=cmdb,
                               line=dict(color=color_of[cmdb], width=1.5),
                               mode='lines', legendgroup="group1",
                               legendgrouptitle_text="Data Type"),
                    row=row, col=1)

        # Single-point placeholder traces that only add legend entries for
        # the label colours. They live on row 1, so they use a point from the
        # first KPI rather than from whichever KPI was processed last.
        anchor = kpi_groups[0][1].iloc[0]
        for level, legend_color in (('service', 'red'), ('pod', 'blue')):
            fig.append_trace(
                go.Scatter(x=[anchor['datetime']], y=[anchor['value']],
                           legendgroup='group2',
                           legendgrouptitle_text='Label Type',
                           name=f'{level} label', mode="lines",
                           line=dict(color=legend_color)),
                row=1, col=1)

        fig.update_layout(title_text=pod_id, shapes=shapes)
        pio.write_html(fig, file=visualization_path + pod_id + '.html')
