# Example with a regressed target metric due to Population Bias

Here is an example of a pair of Control and Treatment datasets where the regression in the target metric is due to population bias. We will see that, after the datasets are normalized, there is no longer a statistically significant difference between the two datasets.

In [1]:
import pandas as pd
import json
from mct.BiasTester import BiasTester
from mct.MetricComparer import MetricComparer

In [None]:
# NOTE(review): this cell has no execution count in the saved notebook, reads
# absolute machine-specific paths, and none of the names it defines are used by
# the other cells visible here — confirm whether it belongs in this example.

# Define the file paths.
# NOTE(review): kpi_file ends with '/', so it looks like a directory rather than
# a CSV file, yet it is passed to pd.read_csv below — confirm the intended file.
kpi_file = '/home/zhengtinghua/shenchao/baseline/new_dataset/70001/'
log_file = '/home/sunyongqian/liuheng/aiops-scwarn/data/sc/yid_k8s/10033/train_log.csv'
output_file = '/home/sunyongqian/liuheng/aiops-scwarn/data/sc/yid_k8s/10033/train_combined.csv'

# Read the two CSV files.
kpi_df = pd.read_csv(kpi_file)
log_df = pd.read_csv(log_file)

# Make sure both DataFrames have the same number of rows before combining them.
# The error message reads: "The two CSV files have different row counts.
# Please check the data."
if len(kpi_df) != len(log_df):
    raise ValueError("两个 CSV 文件的行数不一致。请检查数据。")

# Combine the DataFrames column-wise (axis=1), placing the KPI columns first.
combined_df = pd.concat([kpi_df, log_df], axis=1)

# Save the combined data to a new file, without writing the index column.
combined_df.to_csv(output_file, index=False)

# Report success; the message reads: "Data merged and saved to <output_file>".
print(f"数据已成功合并并保存到 {output_file}")

In [2]:
# Strings that should be parsed as missing values, shared by both CSV loads so
# the control and treatment datasets are read with identical NA handling.
na_markers = ["", "nan", "NaN", "#NULL#", "#NUL#"]

# Load the control and treatment datasets.
control = pd.read_csv('population_bias_control.csv', na_values=na_markers)
treatment = pd.read_csv('population_bias_treatment.csv', na_values=na_markers)

# Load the JSON configuration that is later passed to MetricComparer and BiasTester.
config_file = 'config.json'
with open(config_file) as file:
    config = json.load(file)

In [3]:
# Compare the target metric between the raw (un-normalized) control and
# treatment datasets, using the settings loaded from the JSON config.
delta_comparer = MetricComparer(config)
metric_delta = delta_comparer.compare(control, treatment)
# Bare last expression so the notebook renders the comparison table.
metric_delta

Unnamed: 0,Is Stat-Sig,P-Value,Percent Control,Percent Difference,Percent Treatment
0,True,5.419659e-34,0.93693,-0.01388,0.92305


In [4]:
# Test the control and treatment datasets for population bias.
# check_bias returns three values; only the per-feature results table is
# displayed here, and bias_results is reused by the normalization step.
bias_tester = BiasTester(config)
bias_results, deviation, is_biased = bias_tester.check_bias(control, treatment)
# Bare last expression so the notebook renders the per-feature table.
bias_results

Unnamed: 0,Percentage Deviation,chi_square,dof,feature,p_value,num bins,resample
0,7.1405,5932.719164,13,country,0.0,14,yes
1,2.9035,910.533358,3,platform,4.594706e-197,4,yes
4,1.9305,5958.097785,36,city,0.0,37,yes
5,1.1075,275.556129,3,network,1.938026e-59,4,yes
2,0.123,7.786619,9,client_version,0.5557849,10,no
3,0.022,0.126772,2,media,0.9385813,3,no


In [5]:
# Normalize the two datasets using the bias results from the previous check.
# NOTE(review): presumably this resamples on the features flagged
# resample == 'yes' — confirm against the mct BiasTester implementation.
n_control, n_treatment = bias_tester.normalize_bias(
    control,
    treatment,
    bias_results,
)

# Re-run the bias check on the normalized datasets to verify the result.
n_bias_results, n_deviation, n_is_biased = bias_tester.check_bias(
    n_control,
    n_treatment,
)

# Bare last expression so the notebook renders the per-feature table.
n_bias_results

Unnamed: 0,Percentage Deviation,chi_square,dof,feature,p_value,num bins,resample
0,0.199238,8.442611,13,country,0.813618,14,no
4,0.077935,11.416894,30,city,0.999135,31,no
2,0.04651,1.551026,9,client_version,0.99675,10,no
5,0.04211,0.245269,3,network,0.96997,4,no
1,0.040853,0.185716,3,platform,0.979862,4,no
3,0.036454,0.11496,2,media,0.944141,3,no


In [6]:
# Repeat the metric comparison with the same comparer, this time on the
# normalized datasets, to see whether the metric difference remains.
n_metric_delta = delta_comparer.compare(n_control, n_treatment)
# Bare last expression so the notebook renders the comparison table.
n_metric_delta

Unnamed: 0,Is Stat-Sig,P-Value,Percent Control,Percent Difference,Percent Treatment
0,False,0.718525,0.933013,-0.000465,0.932548
