# Example with a regressed target metric due to Population Bias

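Here is an example of a pair of Control and Treatment datasets where the regression in the target metric is attributed to population bias. The expectation is that, once the datasets are normalized, there is no longer a statistically significant difference between the two datasets. Note that in the run recorded below, the normalized comparison (`n_metric_delta`) still reports `Is Stat-Sig = True`, so population bias alone does not explain the difference for this dataset.  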

In [1]:
import os
import pandas as pd
import hashlib
import json
from mct.BiasTester import BiasTester
from mct.MetricComparer import MetricComparer

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [2]:
# Identifier of the dataset under analysis; later cells interpolate it into paths.
# NOTE(review): this name shadows the builtin id() for the rest of the notebook.
id = '25969'

# Folder (relative to the working directory) where result tables are saved.
results_dir = f'results_{id}'
os.makedirs(results_dir, exist_ok=True)

In [3]:
# Compute the MD5 hex digest of a string.
def generate_md5(input_string: str) -> str:
    """Return the MD5 digest of ``input_string`` as a lowercase hex string.

    The string is encoded as UTF-8 before hashing.
    """
    return hashlib.md5(input_string.encode('utf-8')).hexdigest()

In [4]:
# Input and output locations for the training split of dataset `id`.
kpi_file = f'/home/zhengtinghua/shenchao/baseline/new_dataset/train/{id}/train_kpi.csv'
log_file = f'/home/zhengtinghua/shenchao/baseline/new_dataset/train/{id}/train_log.csv'
train_output_file = f'/home/zhengtinghua/shenchao/baseline/new_dataset/train/{id}/train_combined.csv'

# Load the KPI table and the log table.
kpi_df = pd.read_csv(kpi_file)
log_df = pd.read_csv(log_file)

# The tables are placed side by side, so they must have the same number of rows.
if len(kpi_df) != len(log_df):
    raise ValueError("两个 CSV 文件的行数不一致。请检查数据。")

# Log columns first, KPI columns last (both frames carry the default row index).
combined_df = pd.concat([log_df, kpi_df], axis=1)

# Map each original column name to its MD5 hex digest and print the mapping.
md5_column_names = {col: generate_md5(col) for col in combined_df.columns}
for original, md5_name in md5_column_names.items():
    print(f"Original column name: {original}, MD5 hash: {md5_name}")

# Rename the last column to 'target_metric' BY POSITION. A label-based
# rename(columns={...}) would rename every column carrying that label, so a log
# column that happened to share the KPI column's name would be renamed as well.
last_column_name = combined_df.columns[-1]
combined_df.columns = [*combined_df.columns[:-1], 'target_metric']

# Save the merged table next to the source files.
combined_df.to_csv(train_output_file, index=False)

# Also save a copy into the per-dataset results directory, creating it if needed.
target_dir = f'/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}'
os.makedirs(target_dir, exist_ok=True)
output_file = os.path.join(target_dir, 'train_combined.csv')
combined_df.to_csv(output_file, index=False)

print(f"数据已成功合并并保存到 {output_file}")

Original column name: isEqualDirectSuperior failed, err=fanweiEmployee not exist in yid, MD5 hash: 676d60696ca737db0634b52c6d7625aa
Original column name: [isEqualDirectSuperior] fanwei Employee not exist in yid, MD5 hash: b0c52740b2c50bfce18ca4b4abe54589
Original column name: the employee does not exist, MD5 hash: 8ae34a213e21ba77a38bd7cb4c9ab59c
Original column name: not find dictionary type=<:NUM:>, MD5 hash: 49a31c561aad9ddcfc906e27c33d8de5
Original column name: AuthSessionVali<:updateFanweiEmployee failed:>, MD5 hash: 6689748c163274d0b5ff0da639c826c0
Original column name: [getOtpKey] <:*:> failed:>, MD5 hash: 3ef3bde3ec13d1e570e0609ce3ea2f39
Original column name: [securitySessionCheck] GetSecuritySessionFromReq failed, err=securecookie: expired timestamp [trace.id=<:SEQ:> <:transactionID:> <:spanID:>, MD5 hash: a52809c13042e72dfad83cec67c9eb77
Original column name: [securitySessionCheck] CreatAndSaveSessio<:updateFanweiEmployee failed:>, MD5 hash: 1a2716f0775e9300c7518464840cf343
O

In [5]:
# Input and output locations for the test split of dataset `id`.
kpi_file = f'/home/zhengtinghua/shenchao/baseline/new_dataset/test/{id}/test_kpi.csv'
log_file = f'/home/zhengtinghua/shenchao/baseline/new_dataset/test/{id}/test_log.csv'
test_output_file = f'/home/zhengtinghua/shenchao/baseline/new_dataset/test/{id}/test_combined.csv'

# Load the KPI table and the log table.
kpi_df = pd.read_csv(kpi_file)
log_df = pd.read_csv(log_file)

# The tables are placed side by side, so they must have the same number of rows.
if len(kpi_df) != len(log_df):
    raise ValueError("两个 CSV 文件的行数不一致。请检查数据。")

# Log columns first, KPI columns last (both frames carry the default row index).
combined_df = pd.concat([log_df, kpi_df], axis=1)

# Map each original column name to its MD5 hex digest and print the mapping.
md5_column_names = {col: generate_md5(col) for col in combined_df.columns}
for original, md5_name in md5_column_names.items():
    print(f"Original column name: {original}, MD5 hash: {md5_name}")

# Rename the last column to 'target_metric' BY POSITION. A label-based
# rename(columns={...}) would rename every column carrying that label, so a log
# column that happened to share the KPI column's name would be renamed as well.
last_column_name = combined_df.columns[-1]
combined_df.columns = [*combined_df.columns[:-1], 'target_metric']

# Save the merged table next to the source files.
combined_df.to_csv(test_output_file, index=False)

# Also save a copy into the per-dataset results directory, creating it if needed.
target_dir = f'/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}'
os.makedirs(target_dir, exist_ok=True)
output_file = os.path.join(target_dir, 'test_combined.csv')
combined_df.to_csv(output_file, index=False)
print(f"数据已成功合并并保存到 {output_file}")

Original column name: isEqualDirectSuperior failed, err=fanweiEmployee not exist in yid, MD5 hash: 676d60696ca737db0634b52c6d7625aa
Original column name: [isEqualDirectSuperior] fanwei Employee not exist in yid, MD5 hash: b0c52740b2c50bfce18ca4b4abe54589
Original column name: the employee does not exist, MD5 hash: 8ae34a213e21ba77a38bd7cb4c9ab59c
Original column name: not find dictionary type=<:NUM:>, MD5 hash: 49a31c561aad9ddcfc906e27c33d8de5
Original column name: AuthSessionVali<:updateFanweiEmployee failed:>, MD5 hash: 6689748c163274d0b5ff0da639c826c0
Original column name: [getOtpKey] <:*:> failed:>, MD5 hash: 3ef3bde3ec13d1e570e0609ce3ea2f39
Original column name: [securitySessionCheck] GetSecuritySessionFromReq failed, err=securecookie: expired timestamp [trace.id=<:SEQ:> <:transactionID:> <:spanID:>, MD5 hash: a52809c13042e72dfad83cec67c9eb77
Original column name: [securitySessionCheck] CreatAndSaveSessio<:updateFanweiEmployee failed:>, MD5 hash: 1a2716f0775e9300c7518464840cf343
O

In [6]:
# Load the merged train split as the control set and the merged test split as the
# treatment set. The listed spellings are parsed as missing values in both frames.
control = pd.read_csv(train_output_file, na_values=["", "nan", "NaN", "#NULL#", "#NUL#"])
treatment = pd.read_csv(test_output_file, na_values=["", "nan", "NaN", "#NULL#", "#NUL#"])

# Load the analysis configuration. The encoding is given explicitly because
# open() otherwise falls back to the locale's default encoding, while JSON
# documents are expected to be UTF-8.
config_file = 'config_yid.json'
with open(config_file, encoding='utf-8') as file:
    config = json.load(file)

In [7]:
# Build a MetricComparer from the loaded config and compare control against
# treatment. The comparison logic lives in mct.MetricComparer and is not visible
# here; presumably it operates on the 'target_metric' column - TODO confirm
# against config_yid.json.
delta_comparer = MetricComparer(config)
metric_delta = delta_comparer.compare(control, treatment)
# Bare last expression so the notebook renders the comparison table.
metric_delta

Unnamed: 0,Percent Difference,Percent Control,Percent Treatment,P-Value,Is Stat-Sig
0,0.471095,101.802841,102.273936,0.0,True


In [8]:
# Build a BiasTester from the same config and check control vs. treatment for
# bias. check_bias returns three values, unpacked here as a results table, a
# deviation value and a bias flag; their exact semantics are defined in
# mct.BiasTester (not visible here).
bias_tester = BiasTester(config)
bias_results, deviation, is_biased = bias_tester.check_bias(control, treatment)
# Bare last expression so the notebook renders the bias table.
bias_results

Unnamed: 0,feature,chi_square,p_value,dof,Percentage Deviation,num bins,resample
0,"isEqualDirectSuperior failed, err=fanweiEmploy...",925.761691,7.069314e-198,5,28.942409,6,yes
1,the employee does not exist,802.80133,5.647329e-144,38,16.974525,39,yes
6,[getOtpKey] <:*:> failed:>,17.189147,0.02819868,8,4.92456,9,no
7,[GetWeChatUserInfo] GetWeChatUserInf<:updateFa...,17.189147,0.02819868,8,4.92456,9,no
3,AuthSessionVali<:updateFanweiEmployee failed:>,22.170279,0.01426025,10,4.344909,11,no
4,[securitySessionCheck] GetSecuritySessionFromR...,18.760433,0.008971872,7,4.002285,8,yes
5,[securitySessionCheck] CreatAndSaveSessio<:upd...,18.760433,0.008971872,7,4.002285,8,yes
2,not find dictionary type=<:NUM:>,23.703851,0.03396598,13,1.905136,14,no
8,no employee,82.952183,7.139377e-18,3,1.56008,4,yes
9,"[SimplePost] failed, <:ERRORPost:>",1.6e-05,0.9968134,1,0.062536,2,no


In [9]:
# Normalize the two frames using the bias results from the earlier check, then
# re-run the bias check on the normalized frames (n_ prefix = normalized).
# How normalize_bias resamples or reweights is defined in mct.BiasTester and is
# not visible here.
n_control, n_treatment =bias_tester.normalize_bias(control, treatment, bias_results)
n_bias_results, n_deviation, n_is_biased = bias_tester.check_bias(n_control, n_treatment)
# Bare last expression so the notebook renders the post-normalization bias table.
n_bias_results

Unnamed: 0,feature,chi_square,p_value,dof,Percentage Deviation,num bins,resample
3,AuthSessionVali<:updateFanweiEmployee failed:>,69.63211,1.753225e-12,7,11.749347,8,yes
4,[securitySessionCheck] GetSecuritySessionFromR...,52.675101,1.364009e-09,6,10.313316,7,yes
5,[securitySessionCheck] CreatAndSaveSessio<:upd...,52.675101,1.364009e-09,6,10.313316,7,yes
6,[getOtpKey] <:*:> failed:>,53.744151,2.642482e-09,7,9.921671,8,yes
7,[GetWeChatUserInfo] GetWeChatUserInf<:updateFa...,53.744151,2.642482e-09,7,9.921671,8,yes
1,the employee does not exist,119.26624,3.995958e-11,35,8.093995,36,yes
9,"[SimplePost] failed, <:ERRORPost:>",2.663001,0.1027068,1,2.610966,2,no
11,"[updateFanweiEmployee] failed, <:updateEmploye...",2.399652,0.1213623,1,2.480418,2,no
10,[UpdateEmployee] <:SimplePos<:updateFanweiEmpl...,2.399652,0.1213623,1,2.480418,2,no
0,"isEqualDirectSuperior failed, err=fanweiEmploy...",3.282433,0.1937442,2,2.088773,3,no


In [10]:
# Repeat the metric comparison on the normalized frames, reusing the comparer
# built earlier, so the result can be set against the pre-normalization one.
n_metric_delta = delta_comparer.compare(n_control, n_treatment)
# Bare last expression so the notebook renders the comparison table.
n_metric_delta

Unnamed: 0,Percent Difference,Percent Control,Percent Treatment,P-Value,Is Stat-Sig
0,0.300345,101.969692,102.270038,7.828687e-109,True


In [11]:
# Write each result table to <results_dir>/<name>.csv, without the index column.
for table_name, table in (
    ('metric_delta', metric_delta),
    ('bias_results', bias_results),
    ('n_bias_results', n_bias_results),
    ('n_metric_delta', n_metric_delta),
):
    table.to_csv(os.path.join(results_dir, f'{table_name}.csv'), index=False)