In [None]:
import pandas as pd
from typing import Tuple, List
import numpy as np
import os
from ruptures_simple.cost import CostL2
from ruptures_simple.search import BinSeg
from ruptures_simple.search.bin_seg import SegNode
from plot_utils import plot_seg_tree
import graphviz
from tree_utils import postorder_traverse,child_sort
import matplotlib.pyplot as plt
import matplotlib
from collections import deque
# Base font settings applied to every matplotlib figure created afterwards.
font = dict(size=24)

matplotlib.rc("font", **font)

In [None]:
# Directory that holds the input CSV files (relative to the notebook).
DATASET_PATH = "../dataset"
# CSV column returned as the index array by read_dataset.
INDEX_FIELD = "timestamp"
# CSV column holding the series that is segmented and plotted.
DATA_FIELD = "num_request"
# Output directory for the rendered segmentation-tree PDFs.
TREE_IMG_ROOT = "binseg_tree_worldcup"
# Output directory for the change-point plots (PDF).
CPD_IMG_ROOT = "binseg_cpd_worldcup"
# Output directory for the candidate change points (JSON).
CPD_CANDIDATE_ROOT = "binseg_cpd_candidate"

In [None]:
def get_data_file_list(dataset_path: str) -> List[str]:
    """List the data files stored directly inside ``dataset_path``.

    Args:
        dataset_path: Directory to scan. It is not searched recursively.

    Returns:
        Names (not full paths) of the regular files in the directory,
        sorted alphabetically so repeated runs process them in the same order.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot be read.
    """
    # os.listdir yields entries in arbitrary order and also returns
    # sub-directories (e.g. ".ipynb_checkpoints"), which cannot be read as CSV.
    return sorted(
        entry
        for entry in os.listdir(dataset_path)
        if os.path.isfile(os.path.join(dataset_path, entry))
    )

In [None]:
def read_dataset(csv_path: str, index_field: str, data_field: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load two columns of a CSV file as NumPy arrays.

    Args:
        csv_path: Path of the CSV file to read.
        index_field: Name of the column returned as the first array.
        data_field: Name of the column returned as the second array.

    Returns:
        A ``(index_values, data_values)`` tuple with one entry per CSV row.

    Raises:
        KeyError: If either column is missing from the file.
    """
    frame = pd.read_csv(csv_path)
    index_values = frame[index_field].to_numpy()
    data_values = frame[data_field].to_numpy()
    return index_values, data_values

In [None]:
def save_tree_to_file(workload_name: str, graph: graphviz.Digraph):
    """Render a segmentation tree as a PDF inside ``TREE_IMG_ROOT``.

    Args:
        workload_name: Output file name (without extension) handed to
            ``graph.render``.
        graph: Graph to render. It is modified in place: its ``rankdir``
            attribute is set to ``'TB'`` before rendering.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(TREE_IMG_ROOT, exist_ok=True)
    graph.attr(rankdir='TB')  # Set direction to top to bottom
    # cleanup=True asks graphviz to remove its intermediate source file.
    graph.render(os.path.join(TREE_IMG_ROOT, workload_name), format="pdf", cleanup=True)

In [None]:
def get_candidate_cpds(root: SegNode, np_data: np.ndarray, epsilon: float) -> List:
    """Collect candidate change points from a binary segmentation tree.

    The tree is walked breadth-first from ``root``. A node is accepted as a
    final segment when its cost is below ``(end - start) * epsilon``;
    otherwise its children are examined instead. A node with no children is
    always accepted, because it cannot be refined any further and dropping it
    would silently lose the boundary at its start.

    Args:
        root: Root of the segmentation tree, or ``None`` for an empty tree.
            Nodes must expose ``start``, ``end``, ``cost``, ``left_child``
            and ``right_child``.
        np_data: Unused; kept so existing callers keep working.
        epsilon: Cost budget per unit of segment length.

    Returns:
        The ``start`` of every accepted segment except the first one, in
        ascending order. Empty when fewer than two segments are accepted.
    """
    pending = deque([root])
    segments = []
    while pending:
        node: SegNode = pending.popleft()
        if node is None:
            continue
        left, right = node.left_child, node.right_child
        is_leaf = left is None and right is None
        within_budget = node.cost < (node.end - node.start) * epsilon
        if within_budget or is_leaf:
            segments.append((node.start, node.end))
        else:
            # A missing child is queued as None and skipped on the next pop.
            pending.append(left)
            pending.append(right)
    segments.sort(key=lambda seg: seg[0])
    # The first segment starts at the beginning of the series, which is not
    # a change point.
    return [start for start, _ in segments[1:]]

In [None]:
def plot_cpd_result(candidate_cpds: List, np_data: np.ndarray, workload_name: str):
    """Plot a workload with its candidate change points and save it as a PDF.

    The figure is written to ``<CPD_IMG_ROOT>/<workload_name>.pdf``; the
    directory is created when missing. The figure is left open so the caller
    can display or close it.

    Args:
        candidate_cpds: X positions at which a dashed vertical line is drawn.
        np_data: Workload samples; plotted against their position index.
        workload_name: Used for the file name and, with underscores replaced
            by spaces, for the plot title.

    Returns:
        The ``(fig, ax)`` pair that was drawn.
    """
    color_workload, color_change = "#3F51B5", "#009688"  # material indigo, material teal
    fig, ax = plt.subplots()
    fig.set_size_inches(14, 7)
    # Values are divided by 10 to match the "x (10 requests)" axis label.
    ax.plot(np.arange(len(np_data)), np_data / 10, color=color_workload)
    for cp in candidate_cpds:
        ax.axvline(x=cp, color=color_change, linestyle='--', linewidth=1)
    ax.set_xlabel("time (min)")
    ax.set_ylabel("workload x (10 requests)")
    ax.set_title(workload_name.replace("_", " "))
    ax.grid(True, linestyle="--")
    # Work on the axes created above instead of looking them up again
    # through pyplot's global "current axes" state.
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    os.makedirs(CPD_IMG_ROOT, exist_ok=True)
    fig.savefig(os.path.join(CPD_IMG_ROOT, workload_name + ".pdf"))
    return fig, ax

In [None]:
import json

def save_candidate_cpds(candidate_cpds: List, workload_name: str):
    """Write candidate change points to ``<CPD_CANDIDATE_ROOT>/<workload_name>.json``.

    Args:
        candidate_cpds: JSON-serialisable list of change points.
        workload_name: File name (without extension) of the JSON output.

    Side effects:
        Creates ``CPD_CANDIDATE_ROOT`` when missing and closes the current
        pyplot figure, if any.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(CPD_CANDIDATE_ROOT, exist_ok=True)
    with open(os.path.join(CPD_CANDIDATE_ROOT, workload_name + ".json"), "w") as f:
        json.dump(candidate_cpds, f, indent=4)
    # Kept for backward compatibility: the driver loop calls this function
    # before plotting, so this releases the figure of the previous workload.
    plt.close()

In [None]:
# Cost budget per unit of segment length used when picking candidate change
# points (6250000 == 2500 ** 2).
# NOTE(review): with an L2 cost this presumably means a per-sample standard
# deviation of about 2500 requests -- confirm against CostL2.
CPD_EPSILON = 6250000

data_file_list = get_data_file_list(DATASET_PATH)
for file_name in data_file_list:
    # splitext strips only the final extension, so "a.b.csv" and "a.c.csv"
    # keep distinct workload names instead of both collapsing to "a" and
    # overwriting each other's outputs.
    workload_name = os.path.splitext(file_name)[0]
    print("run segment on %s" % (file_name))
    np_index, np_data = read_dataset(os.path.join(DATASET_PATH, file_name), INDEX_FIELD, DATA_FIELD)
    # Turn the 1-D series into a single-column 2-D array before fitting.
    np_data = np_data.reshape((-1, 1))
    algo = BinSeg(cost_model=CostL2(), init_seg_size=1)
    algo.fit(np_data)
    root_seg = algo.bin_search()
    child_sort(root_seg)
    graph = plot_seg_tree(root_seg)
    save_tree_to_file(workload_name, graph)
    candidate_cpds = get_candidate_cpds(root_seg, np_data, CPD_EPSILON)
    # save_candidate_cpds also closes the current pyplot figure, i.e. the one
    # left open by the previous iteration's plot_cpd_result call.
    save_candidate_cpds(candidate_cpds, workload_name)
    plot_cpd_result(candidate_cpds, np_data, workload_name)