In [7]:
import os
import json
from utils.vocabulary import *

import os
import json
from utils.vocabulary import *

# ---------------------------------------------------------------------------
# Notebook configuration: which dataset, models, bias types, debiasing methods
# and explanation methods are analysed by the cells below.
# ---------------------------------------------------------------------------

# Dataset whose decoder results are loaded; must be a key of NUM_EXAMPLES_BY_DATASET.
data = "jigsaw"
root_dir = f"/scratch/yifwang/fairness_x_explainability/decoder_results_{data}"

models = ["llama_3b", "qwen3_4b"]
bias_types = ["race", "gender", "religion"]
debiasing_methods = ["zero_shot", "few_shot", "fairness_imagination", "fairness_instruction"]

# Per-dataset example counts for each bias type. The count is part of the
# result directory name built by the loading cell, so it has to match the
# directories on disk.
NUM_EXAMPLES_BY_DATASET = {
    "civil": {"race": 2000, "gender": 2000, "religion": 1000},
    "jigsaw": {"race": 400, "gender": 800, "religion": 200},
}

# Fail here with a clear message instead of leaving `num_examples` undefined
# (which would only surface later as a NameError inside the loading loop).
if data not in NUM_EXAMPLES_BY_DATASET:
    raise ValueError(
        f"Unknown dataset '{data}'; expected one of {sorted(NUM_EXAMPLES_BY_DATASET)}"
    )
num_examples = NUM_EXAMPLES_BY_DATASET[data]

fairness_metrics = ["accuracy", "f1", "fpr", "fnr", "individual_fairness"]


# Explanation methods; each name is part of the correlation file name that is loaded.
methods = ["Attention", "Saliency", "DeepLift", "InputXGradient", "IntegratedGradients", "Occlusion", "KernelShap"]
# Alternative list without IntegratedGradients:
#methods = ["Attention", "Saliency", "DeepLift", "InputXGradient", "Occlusion", "KernelShap"]
# Keys read from each entry's 'abs_average' mapping.
reliance_keys = ["raw", "max", "len", "norm"]


In [8]:

# Collect the 'abs_average' correlation of every available result file into a
# column-oriented dict (one list per column, all lists kept the same length).
# Missing or empty result files are reported and skipped.
correlation_dict = {"model": [], "bias_type": [], "debiasing_method": [], "explanation_method": [], "correlation": [], "reliance_method": []}
for model in models:
    for bias_type in bias_types:
        # The run directory depends only on model and bias type, so build it once.
        run_dir = os.path.join(root_dir, f"{model}_{data}_{bias_type}_test_{num_examples[bias_type]}")
        for debiasing_method in debiasing_methods:
            for explanation_method in methods:
                file_path = os.path.join(run_dir, debiasing_method, "correlation", f"correlation_{explanation_method}_{bias_type}_test.json")
                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue
                with open(file_path, "r") as f:
                    correlation_data = json.load(f)
                # Covers a JSON `null` as well as an empty object.
                if not correlation_data:
                    print(f"No data found in file: {file_path}")
                    continue
                # Each top-level key of the file becomes the value of the
                # "explanation_method" column (not the loop variable above).
                for aggregation_method, value in correlation_data.items():
                    abs_average = value['abs_average']
                    for reliance_method in reliance_keys:
                        # Resolve the score before appending anything: a missing key
                        # then raises without leaving the columns at unequal lengths.
                        score = abs_average[reliance_method]
                        correlation_dict["model"].append(model)
                        correlation_dict["bias_type"].append(bias_type)
                        correlation_dict["debiasing_method"].append(debiasing_method)
                        correlation_dict["explanation_method"].append(aggregation_method)
                        correlation_dict["correlation"].append(score)
                        correlation_dict["reliance_method"].append(reliance_method)
                



File not found: /scratch/yifwang/fairness_x_explainability/decoder_results_jigsaw/llama_3b_jigsaw_religion_test_200/few_shot/correlation/correlation_IntegratedGradients_religion_test.json
File not found: /scratch/yifwang/fairness_x_explainability/decoder_results_jigsaw/qwen3_4b_jigsaw_religion_test_200/zero_shot/correlation/correlation_IntegratedGradients_religion_test.json
No data found in file: /scratch/yifwang/fairness_x_explainability/decoder_results_jigsaw/qwen3_4b_jigsaw_religion_test_200/few_shot/correlation/correlation_IntegratedGradients_religion_test.json


In [3]:
# Build a DataFrame from the collected columns: one row per appended correlation value.
import pandas as pd
correlation_df = pd.DataFrame(data=correlation_dict)
correlation_df

Unnamed: 0,model,bias_type,debiasing_method,explanation_method,correlation,reliance_method
0,qwen_3b,race,zero_shot,raw_attention,0.099832,raw
1,qwen_3b,race,zero_shot,raw_attention,0.082013,max
2,qwen_3b,race,zero_shot,raw_attention,0.136150,len
3,qwen_3b,race,zero_shot,raw_attention,0.090346,norm
4,qwen_3b,race,zero_shot,attention_rollout,0.094813,raw
...,...,...,...,...,...,...
491,qwen3_4b,religion,zero_shot,Occlusion_abs,0.173336,norm
492,qwen3_4b,religion,zero_shot,KernelShap,0.142212,raw
493,qwen3_4b,religion,zero_shot,KernelShap,0.114401,max
494,qwen3_4b,religion,zero_shot,KernelShap,0.159080,len


In [4]:
# Mean correlation per explanation method for each debiasing method,
# restricted to rows whose reliance method is 'raw'.
for debiasing_method in debiasing_methods:
    raw_rows = correlation_df['reliance_method'] == 'raw'
    method_rows = correlation_df['debiasing_method'] == debiasing_method
    selected_rows = correlation_df[raw_rows & method_rows]
    average_corr_df = selected_rows.groupby('explanation_method')['correlation'].mean()
    print(f"\nAverage correlation for debiasing method '{debiasing_method}':")
    print(average_corr_df)


Average correlation for debiasing method 'zero_shot':
explanation_method
DeepLift_L2                 0.313186
DeepLift_mean               0.140623
InputXGradient_L2           0.315880
InputXGradient_mean         0.186493
IntegratedGradients_L2      0.174136
IntegratedGradients_mean    0.083884
KernelShap                  0.080674
Occlusion                   0.322549
Occlusion_abs               0.340336
Saliency_L2                 0.318151
Saliency_mean               0.159238
attention_flow              0.074387
attention_rollout           0.073606
raw_attention               0.169940
Name: correlation, dtype: float64

Average correlation for debiasing method 'few_shot':
Series([], Name: correlation, dtype: float64)

Average correlation for debiasing method 'fairness_imagination':
Series([], Name: correlation, dtype: float64)

Average correlation for debiasing method 'fairness_instruction':
Series([], Name: correlation, dtype: float64)


In [5]:
# Mean correlation per explanation method for each bias type,
# restricted to rows whose reliance method is 'raw'.

for bias_type in bias_types:
    print(f"\nAverage correlation for bias type '{bias_type}':")
    raw_rows = correlation_df['reliance_method'] == 'raw'
    bias_rows = correlation_df['bias_type'] == bias_type
    selected_rows = correlation_df[raw_rows & bias_rows]
    average_corr_df = selected_rows.groupby('explanation_method')['correlation'].mean()
    print(average_corr_df)



Average correlation for bias type 'race':
explanation_method
DeepLift_L2                 0.231612
DeepLift_mean               0.140809
InputXGradient_L2           0.239944
InputXGradient_mean         0.204611
IntegratedGradients_L2      0.152402
IntegratedGradients_mean    0.069683
KernelShap                  0.080430
Occlusion                   0.246953
Occlusion_abs               0.219718
Saliency_L2                 0.239122
Saliency_mean               0.133599
attention_flow              0.058898
attention_rollout           0.077325
raw_attention               0.131889
Name: correlation, dtype: float64

Average correlation for bias type 'gender':
explanation_method
DeepLift_L2                 0.266448
DeepLift_mean               0.088323
InputXGradient_L2           0.266466
InputXGradient_mean         0.115941
IntegratedGradients_L2      0.160648
IntegratedGradients_mean    0.066596
KernelShap                  0.047511
Occlusion                   0.315709
Occlusion_abs             

In [7]:
# Mean correlation of each explanation method for every combination of
# bias type, model and debiasing method, restricted to the 'raw' reliance rows.
for bias_type in bias_types:
    for model in models:
        for debiasing_method in debiasing_methods:
            # Combine the four row filters into a single boolean mask.
            selection = (
                (correlation_df['model'] == model)
                & (correlation_df['bias_type'] == bias_type)
                & (correlation_df['reliance_method'] == 'raw')
                & (correlation_df['debiasing_method'] == debiasing_method)
            )
            average_corr_df = correlation_df[selection].groupby('explanation_method')['correlation'].mean()
            # Header fixed: the original ended with a stray doubled apostrophe ('':).
            print(f"\nAverage correlation for model '{model}', bias type '{bias_type}' debiasing method '{debiasing_method}':")
            # An empty Series here means no rows matched this combination.
            print(average_corr_df)


Average correlation for model 'qwen_3b', bias type 'race' debiasing method 'zero_shot'':
explanation_method
DeepLift_L2                 0.151971
DeepLift_mean               0.142815
InputXGradient_L2           0.157426
InputXGradient_mean         0.262467
IntegratedGradients_L2      0.022751
IntegratedGradients_mean    0.032969
KernelShap                  0.129740
Occlusion                   0.369940
Occlusion_abs               0.217725
Saliency_L2                 0.163181
Saliency_mean               0.048002
attention_flow              0.060826
attention_rollout           0.094813
raw_attention               0.099832
Name: correlation, dtype: float64

Average correlation for model 'qwen_3b', bias type 'race' debiasing method 'few_shot'':
Series([], Name: correlation, dtype: float64)

Average correlation for model 'qwen_3b', bias type 'race' debiasing method 'fairness_imagination'':
Series([], Name: correlation, dtype: float64)

Average correlation for model 'qwen_3b', bias type 'race