# 无压力下Workload聚合数据分析

In [None]:
import sys
sys.path.append('./tools')

from aggregation import *
import analyze
import display

# Cap how many DataFrame rows are rendered in notebook output.
# `pd` is not imported explicitly in this cell; presumably it is re-exported
# by `from aggregation import *`.
pd.set_option("display.max_rows", 20)

# Directory of the no-stress Redis experiment to load.
exp_root = "/home/ict/appProfile/data/redis_1/no_stress/redis_no_20231102130306"

# Candidate QoS columns; a later cell selects one of them by index.
qos_columns = [
    "app_redis_qos_qps_of_redis_get",
    "app_redis_qos_qps_of_redis_total",
    "app_redis_qos_qps_of_redis_set",
    "app_redis_qos_p99_latency_set",
    "app_redis_qos_p99_latency_get",
]

exp_data = read_from_dir(exp_root)

# Register the per-workload preprocessing steps, in this order.
# NOTE(review): the last two prefixes are plain strings, not 1-tuples
# (a 1-tuple needs a trailing comma) - confirm the helpers accept a str.
exp_data.set_workload_preprocess_funcs(
    [
        filter_column_startswith(col_prefix=("stress", "vm", "app")),
        filter_column_useless(excol_prefix="stress"),
        filter_row_noise(col_prefix="app"),
    ]
)

# Workload keys recorded for the first epoch; later cells iterate over them.
first_epoch_workloads = exp_data.exp["info_per_epoch"][0]["workloads"]
keys = list(first_epoch_workloads.keys())
keys

In [None]:
# Correlation threshold used by the filters in the cells below.
c = 0.9

# QoS metric under study: the entry at index 3 of qos_columns.
qos_column = qos_columns[3]

# Aggregate the experiment data via exp_data.agg_epoch() and show the result.
df_workload = exp_data.agg_epoch()
df_workload

In [None]:
def _strong_corr_only(corr):
    """Select entries with |value| > c, then drop rows that are entirely NaN."""
    return corr[corr.abs() > c].dropna(axis=0, how="all")


corr_matrix = analyze.pearson_correlation(df_workload)

# NOTE(review): the target column is hard-coded to total QPS here rather than
# using `qos_column` - confirm this is intentional.
single_corr = analyze.single_corr(
    corr_matrix,
    "app_redis_qos_qps_of_redis_total",
    similarity_filter=[_strong_corr_only],
)

# Plot every column that survived the filter, then show the filtered result.
display.plt_by_column(df_workload, columns=list(single_corr.index))
single_corr

In [None]:
# Metrics to draw one box plot each for.
columns = [
    "vm_cpu_cpi",
    "vm_cache_misses_per_thousand_instructions",
    "vm_mem_bandwidth_total_numa_3",
]

for column in columns:
    # Collect this column across workloads, applying the row-noise filter
    # with the column name itself as the prefix.
    column_on_workloads = exp_data.one_column_on_workloads(
        column,
        df_funcs=[filter_row_noise(col_prefix=column)],
    )
    # The second positional argument is an empty string; the third is the column name.
    display.plt_box(column_on_workloads, "", column)

# 一、相关性分析

## (1) 皮尔逊相关性

In [None]:
# Compute the correlation matrix of the aggregated data with
# analyze.pearson_correlation and show it as the cell output.
pearson_corr_matrix = analyze.pearson_correlation(df_workload)
pearson_corr_matrix

In [None]:
# Render pearson_corr_matrix with display.plt_corr_heatmap.
display.plt_corr_heatmap(pearson_corr_matrix)

### 1. 相关性排序

In [None]:
# Flatten the Pearson matrix via analyze.flatten_corr, passing one filter that
# selects entries with |value| > c and then drops rows that are entirely NaN.
flat_corr_df = analyze.flatten_corr(
    pearson_corr_matrix,
    similarity_filter=[
        lambda corr: corr[corr.abs() > c].dropna(axis=0, how="all"),
    ],
)
flat_corr_df

### 2. QoS 相关指标

In [None]:
def _keep_strong_corr(corr):
    """Select entries with |value| > c, then drop rows that are entirely NaN."""
    return corr[corr.abs() > c].dropna(axis=0, how="all")


# Correlations against the selected QoS column, filtered by the threshold.
qos_corr = analyze.single_corr(
    pearson_corr_matrix,
    qos_column,
    similarity_filter=[_keep_strong_corr],
)
qos_corr

In [None]:
# Plot the columns of df_workload named by the index of qos_corr.
# qos_corr is whatever the most recently run QoS-correlation cell bound.
display.plt_by_column(df_workload, columns=list(qos_corr.index))

### 3. 相关性最大团

In [None]:
# Build a graph and its cliques from the Pearson matrix using threshold c.
G, cliques = analyze.cliques_from_corr(pearson_corr_matrix, threshold=c)
# NOTE(review): the first clique is treated as the largest - presumably
# cliques_from_corr returns them largest-first; confirm in analyze.
largest_clique = cliques[0]
largest_clique

In [None]:
# Draw largest_clique on graph G using display.plt_clique_on_graph.
display.plt_clique_on_graph(G, largest_clique, scaling=1.25, label_pos=0.35)

### 4. 不同Workload的最大团

In [None]:
import functools


# For every workload key, compute the Pearson correlation matrix of that
# workload's data and record the first clique from analyze.cliques_from_corr.
largest_clique_dict = {}
for key in keys:
    # Per the original author's note, a no-stress run always has exactly one
    # workload per key, so only the first entry is used.
    workload_info = exp_data.workloads_of(key)[0]
    corr_matrix = analyze.pearson_correlation(exp_data.workload_df(workload_info))
    # NOTE(review): threshold is hard-coded to 0.95 with abs=True here instead
    # of using the notebook-wide threshold `c` - confirm this is intentional.
    G, cliques = analyze.cliques_from_corr(corr_matrix, threshold=0.95, abs=True)
    largest_clique_dict[key] = set(cliques[0])

# Measure how far each workload's clique is from the intersection of all of them.
sets = list(largest_clique_dict.values())
# reduce() without an initial value raises TypeError on an empty sequence,
# so fall back to an empty set when no workloads were processed.
common_clique = functools.reduce(set.intersection, sets) if sets else set()
print("common_clique len:", len(common_clique))
diff_largest_clique = {
    k: len(v) - len(common_clique) for k, v in largest_clique_dict.items()
}
diff_largest_clique

In [None]:
# Draw common_clique on graph G.
# NOTE(review): G is whatever the last executed cell bound; after the
# per-workload loop it is the graph of the final key iterated - confirm that
# is the graph intended for showing the common clique.
display.plt_clique_on_graph(G, common_clique, scaling=1.25, label_pos=0.35)

## (2) 余弦相似性

In [None]:
# Compute the similarity matrix of the aggregated data with
# analyze.cosine_similarity and show it as the cell output.
cos_similarity_matrix = analyze.cosine_similarity(df_workload)
cos_similarity_matrix

In [None]:
# Render cos_similarity_matrix with display.plt_corr_heatmap.
display.plt_corr_heatmap(cos_similarity_matrix)

### 1. 相似性排序

In [None]:
# Flatten the cosine-similarity matrix, keeping values above c.
# NOTE(review): this filter compares the signed value (no abs()), unlike the
# QoS cosine filter further down - confirm which is intended.
flat_corr_df = analyze.flatten_corr(
    cos_similarity_matrix,
    similarity_filter=[
        lambda sim: sim[sim > c].dropna(axis=0, how="all"),
    ],
)
flat_corr_df

### 2. QoS 相似指标

In [None]:
def _keep_similar(sim):
    """Select entries with |value| > c, then drop rows that are entirely NaN."""
    return sim[sim.abs() > c].dropna(axis=0, how="all")


def _drop_qos_columns(sim):
    """Remove the QoS labels themselves; labels that are absent are ignored."""
    return sim.drop(qos_columns, errors="ignore")


# Similarities against the selected QoS column; the filters are listed in
# the same order as before: threshold first, then QoS-label removal.
qos_corr = analyze.single_corr(
    cos_similarity_matrix,
    qos_column,
    similarity_filter=[_keep_similar, _drop_qos_columns],
)
qos_corr

In [None]:
# Plot the columns of df_workload named by the index of qos_corr.
# qos_corr is whatever the most recently run QoS-similarity cell bound.
display.plt_by_column(df_workload, columns=list(qos_corr.index))

### 3. 相似性最大团

In [None]:
# Build a graph and its cliques from the cosine-similarity matrix.
# NOTE(review): threshold is hard-coded to 0.99 here rather than using `c`.
G, cliques = analyze.cliques_from_corr(cos_similarity_matrix, threshold=0.99)
# NOTE(review): the first clique is treated as the largest - presumably
# cliques_from_corr returns them largest-first; confirm in analyze.
largest_clique = cliques[0]
largest_clique

In [None]:
# Draw largest_clique on graph G using display.plt_clique_on_graph.
display.plt_clique_on_graph(G, largest_clique, scaling=1.25, label_pos=0.35)

### 4. 不同Workload的最大团

In [None]:
import functools


# For every workload key, compute the cosine-similarity matrix of that
# workload's data and record the first clique from analyze.cliques_from_corr.
largest_clique_dict = {}
for key in keys:
    # Per the original author's note, a no-stress run always has exactly one
    # workload per key, so only the first entry is used.
    workload_info = exp_data.workloads_of(key)[0]
    corr_matrix = analyze.cosine_similarity(exp_data.workload_df(workload_info))
    G, cliques = analyze.cliques_from_corr(corr_matrix, threshold=c)
    largest_clique_dict[key] = set(cliques[0])

# Measure how far each workload's clique is from the intersection of all of them.
sets = list(largest_clique_dict.values())
# reduce() without an initial value raises TypeError on an empty sequence,
# so fall back to an empty set when no workloads were processed.
common_clique = functools.reduce(set.intersection, sets) if sets else set()
# Labelled to match the equivalent Pearson cell, which prints the same label.
print("common_clique len:", len(common_clique))
diff_largest_clique = {
    k: len(v) - len(common_clique) for k, v in largest_clique_dict.items()
}
diff_largest_clique

In [None]:
# Draw common_clique on graph G.
# NOTE(review): G is whatever the last executed cell bound; after the
# per-workload loop it is the graph of the final key iterated - confirm that
# is the graph intended for showing the common clique.
display.plt_clique_on_graph(G, common_clique, scaling=1.25, label_pos=0.35)