# 单 Workload 时序数据分析

In [None]:
import sys
sys.path.append('./tools')

from aggregation import *
import analyze
import display

# `pd` is not imported anywhere in this notebook; it was only reachable through
# `from aggregation import *`. Import pandas explicitly so this cell does not
# depend on what that wildcard import happens to expose.
import pandas as pd

# Limit how many DataFrame rows are rendered in cell outputs.
pd.set_option('display.max_rows', 20)

In [None]:
# Directory of the experiment run analyzed by this notebook.
exp_root = "/home/ict/appProfile/data/redis/no_stress/redis_no_20231102130306"

# Candidate QoS metric columns; a later cell selects one of them by index.
qos_columns = [
    "app_redis_qos_qps_of_redis_get",
    "app_redis_qos_qps_of_redis_total",
    "app_redis_qos_qps_of_redis_set",
    "app_redis_qos_p99_latency_set",
    "app_redis_qos_p99_latency_get",
]

exp_data = read_from_dir(exp_root)

# Register the per-workload preprocessing steps.
# The prefix arguments are written as real tuples: `("stress")` and `("app")`
# are plain strings (the parentheses are only grouping), so a helper that
# iterates over its prefixes would have walked the individual characters.
# NOTE(review): assumes these helpers accept a tuple of prefixes, as
# filter_column_startswith is given on the first line - confirm in aggregation.
exp_data.set_workload_preprocess_funcs([
    filter_column_startswith(col_prefix=("stress", "vm", "app")),
    filter_column_useless(excol_prefix=("stress",)),
    filter_row_noise(col_prefix=("app",)),
])

# Workload keys taken from the first entry of `info_per_epoch`; shown as the
# cell output so an index can be picked in the next cell.
keys = list(exp_data.exp["info_per_epoch"][0]["workloads"].keys())
keys

In [None]:
# Threshold applied to correlation / similarity values in the filters below.
c = 0.95
# QoS metric analyzed in the rest of the notebook.
qos_column = qos_columns[3]

# Take the first workload entry registered under the chosen key.
matching_workloads = exp_data.workloads_of(keys[8])
workload_info = matching_workloads[0]

# Data for that workload; displayed as the cell output.
df_workload = exp_data.workload_df(workload_info)
df_workload

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Samples of the selected QoS metric for this workload.
data_series = df_workload[qos_column]

# Empirical CDF: after sorting, the i-th smallest sample (1-based) sits at
# cumulative fraction i / n. (This was previously computed twice in a row.)
sorted_data = np.sort(data_series)
cdf = np.arange(1, len(sorted_data) + 1) / float(len(sorted_data))

# Draw the histogram and the CDF on one figure with two y-axes.
fig, ax1 = plt.subplots(figsize=(10, 6))

# Left y-axis: normalized histogram of the samples.
ax1.hist(sorted_data, bins=50, density=True, alpha=0.5, color='b', label='Histogram')
ax1.set_xlabel('p99 Latency')
ax1.set_ylabel('Fraction of Time', color='b')
ax1.tick_params(axis='y', labelcolor='b')
ax1.grid(True)

# Right y-axis (shares the x-axis via twinx): the CDF curve.
ax2 = ax1.twinx()
ax2.plot(sorted_data, cdf, color='r', label='CDF')
ax2.set_ylabel('CDF', color='r')
ax2.tick_params(axis='y', labelcolor='r')

# Build one legend from both axes; calling ax2.legend() alone only lists the
# CDF line and silently drops the histogram's label.
hist_handles, hist_labels = ax1.get_legend_handles_labels()
cdf_handles, cdf_labels = ax2.get_legend_handles_labels()
ax2.legend(hist_handles + cdf_handles, hist_labels + cdf_labels, loc='upper right')

plt.show()

# 一、相关性分析

## (1) 皮尔逊相关性

In [None]:
# Compute the Pearson correlation matrix for the workload data via the project
# helper, then display it as the cell output.
pearson_corr_matrix = analyze.pearson_correlation(df_workload)
pearson_corr_matrix

In [None]:
# Render the Pearson correlation matrix with the project's heatmap helper.
display.plt_corr_heatmap(pearson_corr_matrix)

### 1. 相关性排序

In [None]:
def _strong_pearson(corr):
    """Select entries with |value| > c, then drop rows that are entirely NaN."""
    above = corr[corr.abs() > c]
    return above.dropna(axis=0, how='all')


# Flatten the Pearson matrix, passing the threshold filter defined above.
flat_corr_df = analyze.flatten_corr(
    pearson_corr_matrix,
    similarity_filter=[_strong_pearson],
)
flat_corr_df

### 2. QoS 相关指标

In [None]:
def _strong_qos_pearson(corr):
    """Select entries with |value| > c, then drop rows that are entirely NaN."""
    above = corr[corr.abs() > c]
    return above.dropna(axis=0, how='all')


# Pearson correlations for the selected QoS column, passed through the
# threshold filter defined above.
qos_corr = analyze.single_corr(
    pearson_corr_matrix,
    qos_column,
    similarity_filter=[_strong_qos_pearson],
)
qos_corr

In [None]:
# Plot at most the first 16 entries of the QoS correlation result against the
# QoS metric.
leading_index = qos_corr.index[:16]
columns = list(leading_index)
display.plt_by_column(df_workload, x_column=qos_column, columns=columns)

### 3. 相关性最大团

In [None]:
# Build the graph and clique list from the Pearson matrix using a fixed 0.99
# threshold (separate from the notebook-wide threshold `c`).
G, cliques = analyze.cliques_from_corr(pearson_corr_matrix, threshold=0.99)
# NOTE(review): presumably `cliques` is ordered largest-first, so index 0 is the
# largest clique - confirm in analyze.cliques_from_corr. This raises IndexError
# if no clique is returned.
largest_clique = cliques[0]
largest_clique

In [None]:
# Draw the graph with the selected clique, using the project's plotting helper.
display.plt_clique_on_graph(G, largest_clique, scaling=1.25, label_pos=0.35)

## (2) 余弦相似性

In [None]:
# Compute the cosine similarity matrix for the workload data via the project
# helper, then display it as the cell output.
cos_similarity_matrix = analyze.cosine_similarity(df_workload)
cos_similarity_matrix

In [None]:
# Render the cosine similarity matrix with the project's heatmap helper.
display.plt_corr_heatmap(cos_similarity_matrix)

### 1. 相似性排序

In [None]:
def _high_similarity(sim):
    """Select entries with value > c, then drop rows that are entirely NaN."""
    # NOTE(review): the other threshold filters in this notebook compare
    # `.abs()` against c; this one compares the signed value - confirm intended.
    above = sim[sim > c]
    return above.dropna(axis=0, how='all')


# Flatten the cosine similarity matrix, passing the threshold filter above.
flat_corr_df = analyze.flatten_corr(
    cos_similarity_matrix,
    similarity_filter=[_high_similarity],
)
flat_corr_df

### 2. QoS 相似指标

In [None]:
def _above_threshold(sim):
    """Select entries with |value| > c, then drop rows that are entirely NaN."""
    above = sim[sim.abs() > c]
    return above.dropna(axis=0, how='all')


def _without_qos_columns(sim):
    """Drop the labels listed in qos_columns, ignoring any that are absent."""
    return sim.drop(qos_columns, errors='ignore')


# Cosine similarities for the selected QoS column; the filters are passed in
# this order: threshold first, then removal of the QoS columns.
qos_corr = analyze.single_corr(
    cos_similarity_matrix,
    qos_column,
    similarity_filter=[_above_threshold, _without_qos_columns],
)
qos_corr

In [None]:
# Plot every entry of the QoS similarity result against the QoS metric, passing
# ncols=3 to the plotting helper.
display.plt_by_column(df_workload, x_column=qos_column, columns=list(qos_corr.index), ncols=3)

### 3. 相似性最大团

In [None]:
# Build the graph and clique list from the cosine similarity matrix using a
# fixed 0.99 threshold (separate from the notebook-wide threshold `c`).
G, cliques = analyze.cliques_from_corr(cos_similarity_matrix, threshold=0.99)
# NOTE(review): presumably `cliques` is ordered largest-first, so index 0 is the
# largest clique - confirm in analyze.cliques_from_corr. This raises IndexError
# if no clique is returned.
largest_clique = cliques[0]
largest_clique

In [None]:
# Draw the graph with the selected clique, using the project's plotting helper.
display.plt_clique_on_graph(G, largest_clique, scaling=1.25, label_pos=0.35)