# Example with a regressed target metric due to Population Bias

Here is an example of a pair of Control and Treatment datasets where the regression in the target metric is due to population bias. We will see that after the datasets are normalized, there is no longer a statistically significant difference between the two datasets.

In [1]:
import os
import pandas as pd
import hashlib
import json
from mct.BiasTester import BiasTester
from mct.MetricComparer import MetricComparer

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [None]:
# Dataset identifier; it is interpolated into the input paths and the result
# directory names used by the cells below.
# NOTE(review): this name shadows the built-in id() for the rest of the notebook.
id='25969'
# Create the folder where the result files are saved.
results_dir = f'results_{id}'
os.makedirs(results_dir, exist_ok=True)

In [None]:
def generate_md5(input_string):
    """Return the hexadecimal MD5 digest of a string.

    Args:
        input_string: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The 32-character lowercase hex digest.
    """
    encoded = input_string.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()

In [None]:
# Paths of the training KPI / log inputs and of the merged training output.
kpi_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/sc/yid/{id}/train_kpi.csv'
log_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/sc/yid/{id}/train_log.csv'
train_output_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/sc/yid/{id}/train_combined.csv'

# Read both CSV files.
kpi_df = pd.read_csv(kpi_file)
log_df = pd.read_csv(log_file)

# The two frames are joined side by side, so they must have the same number of rows.
if len(kpi_df) != len(log_df):
    raise ValueError("两个 CSV 文件的行数不一致。请检查数据。")

# Log columns come first, KPI columns last.
combined_df = pd.concat([log_df, kpi_df], axis=1)

# Print the MD5 hash of every original column name (computed before the rename below).
md5_column_names = {col: generate_md5(col) for col in combined_df.columns}
for original, md5_name in md5_column_names.items():
    print(f"Original column name: {original}, MD5 hash: {md5_name}")

# Rename ONLY the last column to 'target_metric'. The rename is positional on
# purpose: a label-based rename(columns={...}) would also rename any earlier
# column that happens to share the last column's name, which is possible after
# concatenating two frames along axis=1.
renamed_columns = list(combined_df.columns)
renamed_columns[-1] = 'target_metric'
combined_df.columns = renamed_columns

# Save the merged data next to the inputs.
combined_df.to_csv(train_output_file, index=False)

# Also save a copy into the results directory, creating it if necessary.
target_dir = f'/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}'
os.makedirs(target_dir, exist_ok=True)
output_file = os.path.join(target_dir, 'train_combined.csv')
combined_df.to_csv(output_file, index=False)

print(f"数据已成功合并并保存到 {output_file}")

In [None]:
# Paths of the test KPI / log inputs and of the merged test output.
kpi_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/daily/yid/{id}/test_kpi.csv'
log_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/daily/yid/{id}/test_log.csv'
test_output_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/daily/yid/{id}/test_combined.csv'

# Read both CSV files.
kpi_df = pd.read_csv(kpi_file)
log_df = pd.read_csv(log_file)

# The two frames are joined side by side, so they must have the same number of rows.
if len(kpi_df) != len(log_df):
    raise ValueError("两个 CSV 文件的行数不一致。请检查数据。")

# Log columns come first, KPI columns last.
combined_df = pd.concat([log_df, kpi_df], axis=1)

# Print the MD5 hash of every original column name (computed before the rename below).
md5_column_names = {col: generate_md5(col) for col in combined_df.columns}
for original, md5_name in md5_column_names.items():
    print(f"Original column name: {original}, MD5 hash: {md5_name}")

# Rename ONLY the last column to 'target_metric'. The rename is positional on
# purpose: a label-based rename(columns={...}) would also rename any earlier
# column that happens to share the last column's name, which is possible after
# concatenating two frames along axis=1.
renamed_columns = list(combined_df.columns)
renamed_columns[-1] = 'target_metric'
combined_df.columns = renamed_columns

# Save the merged data next to the inputs.
combined_df.to_csv(test_output_file, index=False)

# Also save a copy into the results directory, creating it if necessary.
target_dir = f'/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}'
os.makedirs(target_dir, exist_ok=True)
output_file = os.path.join(target_dir, 'test_combined.csv')
combined_df.to_csv(output_file, index=False)
print(f"数据已成功合并并保存到 {output_file}")

In [2]:
# Strings that pandas should treat as missing values in both CSV files.
na_tokens = ["", "nan", "NaN", "#NULL#", "#NUL#"]

# Control is read from the merged training file, treatment from the merged test file.
control = pd.read_csv(train_output_file, na_values=na_tokens)
treatment = pd.read_csv(test_output_file, na_values=na_tokens)

# Load the JSON configuration handed to the comparison and bias tools below.
config_file = 'config_yid.json'
with open(config_file) as config_handle:
    config = json.load(config_handle)

In [3]:
# Build a MetricComparer from the loaded config and run it on the raw
# (not yet normalized) control and treatment frames.
delta_comparer = MetricComparer(config)
metric_delta = delta_comparer.compare(control, treatment)
# Bare expression so the notebook displays the comparison table.
metric_delta

Unnamed: 0,Percent Difference,Percent Control,Percent Treatment,P-Value,Is Stat-Sig
0,-0.01388,0.93693,0.92305,5.419659e-34,True


In [4]:
# Build a BiasTester from the same config and check control vs. treatment.
# check_bias returns three values; bias_results is displayed here and is
# passed to normalize_bias in a later cell.
bias_tester = BiasTester(config)
bias_results, deviation, is_biased = bias_tester.check_bias(control, treatment)
bias_results

Unnamed: 0,feature,chi_square,p_value,dof,Percentage Deviation,num bins,resample
0,country,5932.719164,0.0,13,7.1405,14,yes
1,platform,910.533358,4.594706e-197,3,2.9035,4,yes
4,city,5958.097785,0.0,36,1.9305,37,yes
5,network,275.556129,1.938026e-59,3,1.1075,4,yes
2,client_version,7.786619,0.5557849,9,0.123,10,no
3,media,0.126772,0.9385813,2,0.022,3,no


In [5]:
# Normalize the two datasets using the bias results from the earlier check,
# then re-run the bias check on the normalized frames (n_ prefix = normalized).
n_control, n_treatment =bias_tester.normalize_bias(control, treatment, bias_results)
n_bias_results, n_deviation, n_is_biased = bias_tester.check_bias(n_control, n_treatment)
# Bare expression so the notebook displays the post-normalization bias table.
n_bias_results

Unnamed: 0,feature,chi_square,p_value,dof,Percentage Deviation,num bins,resample
0,country,7.649257,0.865676,13,0.199221,14,no
1,platform,0.934298,0.817144,3,0.116893,4,no
4,city,9.669949,0.999913,31,0.079814,32,no
2,client_version,3.028545,0.963152,9,0.076043,10,no
3,media,0.691694,0.707621,2,0.05279,3,no
5,network,0.29593,0.960792,3,0.04462,4,no


In [6]:
# Re-run the metric comparison, this time on the normalized datasets, and
# display the resulting table.
n_metric_delta = delta_comparer.compare(n_control, n_treatment)
n_metric_delta

Unnamed: 0,Percent Difference,Percent Control,Percent Treatment,P-Value,Is Stat-Sig
0,-0.000817,0.933258,0.932441,0.521495,False


In [None]:
# Write every result table into the results folder as CSV, without the index column.
result_tables = [
    (metric_delta, 'metric_delta.csv'),
    (bias_results, 'bias_results.csv'),
    (n_bias_results, 'n_bias_results.csv'),
    (n_metric_delta, 'n_metric_delta.csv'),
]
for result_table, csv_name in result_tables:
    result_table.to_csv(os.path.join(results_dir, csv_name), index=False)