# Example with a regressed target metric due to Population Bias

Here is an example of a pair of Control and Treatment datasets where the regression in the target metric is due to population bias. We will see that after the datasets are normalized, there are no longer any statistically significant differences between the two datasets.

In [1]:
import os
import pandas as pd
import hashlib
import json
from mct.BiasTester import BiasTester
from mct.MetricComparer import MetricComparer

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [2]:
# Identifier of the dataset under analysis; it is interpolated into the paths
# built from it.
# NOTE: this name shadows the built-in id(); it is kept because other cells
# reference it.
id = '25494'

# Make sure the folder that receives the result tables exists.
results_dir = 'results_' + id
os.makedirs(results_dir, exist_ok=True)

In [3]:
def generate_md5(input_string):
    """Return the hexadecimal MD5 digest of ``input_string`` encoded as UTF-8."""
    return hashlib.md5(input_string.encode('utf-8')).hexdigest()

In [4]:
# Input files for the training data and the path of the merged training table.
kpi_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/sc/yid/{id}/train_kpi.csv'
log_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/sc/yid/{id}/train_log.csv'
train_output_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/sc/yid/{id}/train_combined.csv'

# Load both CSV files.
kpi_df = pd.read_csv(kpi_file)
log_df = pd.read_csv(log_file)

# The tables are concatenated side by side on their default row index, so they
# must contain the same number of rows.
if len(kpi_df) != len(log_df):
    raise ValueError("两个 CSV 文件的行数不一致。请检查数据。")

# Log columns first, KPI columns last.
combined_df = pd.concat([log_df, kpi_df], axis=1)

# Map every original column name to its MD5 hash and print the mapping.
md5_column_names = {col: generate_md5(col) for col in combined_df.columns}
for original, md5_name in md5_column_names.items():
    print(f"Original column name: {original}, MD5 hash: {md5_name}")

# Rename only the last column, by position. A label-based
# rename(columns={last_column_name: ...}) would also rename any earlier column
# that carries the same label, which can happen after a side-by-side concat.
last_column_name = combined_df.columns[-1]
combined_df.columns = [*combined_df.columns[:-1], 'target_metric']

# Save the merged table next to the input files.
combined_df.to_csv(train_output_file, index=False)

# Save a second copy in the per-id results directory, creating it if needed.
target_dir = f'/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}'
os.makedirs(target_dir, exist_ok=True)
output_file = os.path.join(target_dir, 'train_combined.csv')
combined_df.to_csv(output_file, index=False)

Original column name: pageResource all api not hava log, MD5 hash: 1fbf83546230fce10599e69b1ac78a07
Original column name: isEqualDirectSuperior failed, err=fanweiEmployee not exist in yid, MD5 hash: 676d60696ca737db0634b52c6d7625aa
Original column name: [isEqualDirectSuperior] fanwei Employee not exist in yid, MD5 hash: b0c52740b2c50bfce18ca4b4abe54589
Original column name: the employee does not exist, MD5 hash: 8ae34a213e21ba77a38bd7cb4c9ab59c
Original column name: not find dictionary type=<:NUM:>, MD5 hash: 49a31c561aad9ddcfc906e27c33d8de5
Original column name: [ApplySyncEmp] ExMail err=<nil>, MD5 hash: 58f86cc0c6def7af9775cf8e7f4992bc
Original column name: AuthSessionVali<:updateFanweiEmployee failed:>, MD5 hash: 6689748c163274d0b5ff0da639c826c0
Original column name: [securitySessionCheck] GetSecuritySessionFromReq failed, err=securecookie: expired timestamp [trace.id=<:SEQ:> <:transactionID:> <:spanID:>, MD5 hash: a52809c13042e72dfad83cec67c9eb77
Original column name: [securitySess

In [5]:
# Input files for the test data and the path of the merged test table.
kpi_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/daily/yid/{id}/test_kpi.csv'
log_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/daily/yid/{id}/test_log.csv'
test_output_file = f'/home/sunyongqian/liuheng/aiops-scwarn/data/daily/yid/{id}/test_combined.csv'

# Load both CSV files.
kpi_df = pd.read_csv(kpi_file)
log_df = pd.read_csv(log_file)

# The tables are concatenated side by side on their default row index, so they
# must contain the same number of rows.
if len(kpi_df) != len(log_df):
    raise ValueError("两个 CSV 文件的行数不一致。请检查数据。")

# Log columns first, KPI columns last.
combined_df = pd.concat([log_df, kpi_df], axis=1)

# Map every original column name to its MD5 hash.
md5_column_names_1 = {col: generate_md5(col) for col in combined_df.columns}

# Iterating the dict yields its keys, so this is a list of the original column
# names with double quotes removed. It contains no MD5 values and is not applied
# to combined_df.
# NOTE(review): this reuses the name md5_column_names for a list; confirm whether
# the header of combined_df itself was meant to be cleaned here.
md5_column_names = [col.replace('"', '') for col in md5_column_names_1]

# Rename only the last column, by position. A label-based
# rename(columns={last_column_name: ...}) would also rename any earlier column
# that carries the same label, which can happen after a side-by-side concat.
last_column_name = combined_df.columns[-1]
combined_df.columns = [*combined_df.columns[:-1], 'target_metric']

# Save the merged table next to the input files.
combined_df.to_csv(test_output_file, index=False)

# Save a second copy in the per-id results directory, creating it if needed.
target_dir = f'/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}'
os.makedirs(target_dir, exist_ok=True)
output_file = os.path.join(target_dir, 'test_combined.csv')
combined_df.to_csv(output_file, index=False)

In [6]:
from sklearn.preprocessing import StandardScaler

# Directory that holds the merged tables; create it when it is missing.
target_dir = f'/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}'
os.makedirs(target_dir, exist_ok=True)

# One scaler instance: fitted on the training table, then applied to the test table.
scaler = StandardScaler()

train_output_file = f"/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}/train_combined.csv"
test_output_file = f"/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}/test_combined.csv"

# Load the merged tables.
train_data = pd.read_csv(train_output_file)
test_data = pd.read_csv(test_output_file)

# Every column whose name contains 'timestamp' is set aside and left unscaled.
train_timestamp = train_data.filter(like='timestamp')
test_timestamp = test_data.filter(like='timestamp')

# Remaining columns are the ones that get standardized.
train_data_no_timestamp = train_data.drop(columns=train_timestamp.columns)
test_data_no_timestamp = test_data.drop(columns=test_timestamp.columns)

# Fit on the training columns and reuse the fitted scaler for the test columns.
train_data_scaled_no_timestamp = pd.DataFrame(
    scaler.fit_transform(train_data_no_timestamp),
    columns=train_data_no_timestamp.columns,
)
test_data_scaled_no_timestamp = pd.DataFrame(
    scaler.transform(test_data_no_timestamp),
    columns=test_data_no_timestamp.columns,
)

# Append the untouched timestamp columns after the scaled columns.
train_data_scaled = pd.concat(
    [train_data_scaled_no_timestamp, train_timestamp.reset_index(drop=True)],
    axis=1,
)
test_data_scaled = pd.concat(
    [test_data_scaled_no_timestamp, test_timestamp.reset_index(drop=True)],
    axis=1,
)

# Write the standardized tables into the results directory.
train_scaled_output_file = os.path.join(target_dir, 'train_combined_scaled.csv')
test_scaled_output_file = os.path.join(target_dir, 'test_combined_scaled.csv')

train_data_scaled.to_csv(train_scaled_output_file, index=False)
test_data_scaled.to_csv(test_scaled_output_file, index=False)

print("训练集和测试集已成功标准化并保存。")


训练集和测试集已成功标准化并保存。


In [7]:
# target_dir = f'/home/sunyongqian/liuheng/shenchao/Lumos/results_{id}'

# train_scaled_output_file = os.path.join(target_dir, 'train_combined_scaled.csv')
# test_scaled_output_file = os.path.join(target_dir, 'test_combined_scaled.csv')

In [8]:
# Strings that are parsed as missing values in both scaled CSV files.
_NA_TOKENS = ["", "nan", "NaN", "#NULL#", "#NUL#"]

# Control is the scaled training table, treatment the scaled test table.
control = pd.read_csv(train_scaled_output_file, na_values=_NA_TOKENS)
treatment = pd.read_csv(test_scaled_output_file, na_values=_NA_TOKENS)

# Load the JSON configuration that is passed to the mct classes.
config_file = 'config_yid.json'
with open(config_file) as file:
    config = json.load(file)

In [9]:
# Build the metric comparer from the loaded config and compare the control
# table against the treatment table.
delta_comparer = MetricComparer(config)
metric_delta = delta_comparer.compare(control, treatment)
# Bare last expression so the notebook renders the comparison result.
metric_delta

Unnamed: 0,Percent Difference,Percent Control,Percent Treatment,P-Value,Is Stat-Sig
0,-0.939438,-8.062496e-14,-0.939438,0.0,True


In [10]:
# Build the bias tester from the same config and check control vs. treatment.
bias_tester = BiasTester(config)
# check_bias returns three values; only bias_results is displayed below.
# NOTE(review): the meaning of deviation / is_biased is defined in
# mct.BiasTester and is not visible here.
bias_results, deviation, is_biased = bias_tester.check_bias(control, treatment)
# Bare last expression so the notebook renders the bias table.
bias_results

Unnamed: 0,feature,chi_square,p_value,dof,Percentage Deviation,num bins,resample
0,"isEqualDirectSuperior failed, err=fanweiEmploy...",2058.52356,0.0,7,28.272493,8,yes
1,the employee does not exist,812.639603,2.392952e-145,39,14.736152,40,yes
2,not find dictionary type=<:NUM:>,202.251907,2.535978e-37,11,11.578441,12,yes
4,[securitySessionCheck] GetSecuritySessionFromR...,19.4288,0.006945062,7,5.33516,8,yes
5,[securitySessionCheck] CreatAndSaveSessio<:upd...,19.4288,0.006945062,7,5.33516,8,yes
3,AuthSessionVali<:updateFanweiEmployee failed:>,39.152511,2.387373e-05,10,5.088001,11,yes
6,[getOtpKey] <:*:> failed:>,14.902919,0.06106085,8,4.436451,9,no
7,[GetWeChatUserInfo] GetWeChatUserInf<:updateFa...,14.902919,0.06106085,8,4.436451,9,no
8,no employee,0.634214,0.8885572,3,0.751685,4,no
9,"[SimplePost] failed, <:ERRORPost:>",0.000265,0.9870235,1,0.041594,2,no


In [11]:
# Normalize the two tables using the bias results computed earlier, then run
# the bias check again on the normalized pair.
n_control, n_treatment =bias_tester.normalize_bias(control, treatment, bias_results)
n_bias_results, n_deviation, n_is_biased = bias_tester.check_bias(n_control, n_treatment)
# Bare last expression so the notebook renders the post-normalization bias table.
n_bias_results

Unnamed: 0,feature,chi_square,p_value,dof,Percentage Deviation,num bins,resample
3,AuthSessionVali<:updateFanweiEmployee failed:>,26.998772,0.000145,6,9.019608,7,yes
4,[securitySessionCheck] GetSecuritySessionFromR...,19.782986,0.003027,6,7.843137,7,yes
5,[securitySessionCheck] CreatAndSaveSessio<:upd...,19.782986,0.003027,6,7.843137,7,yes
6,[getOtpKey] <:*:> failed:>,26.187788,0.000466,7,7.647059,8,yes
7,[GetWeChatUserInfo] GetWeChatUserInf<:updateFa...,26.187788,0.000466,7,7.647059,8,yes
1,the employee does not exist,79.25697,2.8e-05,35,4.901961,36,yes
11,"[updateFanweiEmployee] failed, <:updateEmploye...",8.294263,0.003977,1,3.333333,2,yes
10,[UpdateEmployee] <:SimplePos<:updateFanweiEmpl...,8.294263,0.003977,1,3.333333,2,yes
9,"[SimplePost] failed, <:ERRORPost:>",8.294263,0.003977,1,3.333333,2,yes
0,"isEqualDirectSuperior failed, err=fanweiEmploy...",11.004482,0.026514,4,3.137255,5,no


In [12]:
# Repeat the metric comparison on the normalized control / treatment tables.
n_metric_delta = delta_comparer.compare(n_control, n_treatment)
# Bare last expression so the notebook renders the comparison result.
n_metric_delta

Unnamed: 0,Percent Difference,Percent Control,Percent Treatment,P-Value,Is Stat-Sig
0,-0.97828,0.145747,-0.832532,1.783648e-58,True


In [13]:
# Write each result table into results_dir as CSV, without the index column.
for _filename, _table in (
    ('metric_delta.csv', metric_delta),
    ('bias_results.csv', bias_results),
    ('n_bias_results.csv', n_bias_results),
    ('n_metric_delta.csv', n_metric_delta),
):
    _table.to_csv(os.path.join(results_dir, _filename), index=False)