In [None]:
import os
import json
from utils.vocabulary import *

import os
import json
from utils.vocabulary import *

# --- Experiment configuration ---------------------------------------------
# Dataset tag. It is interpolated into the results root below and into every
# per-run directory name built by the loading cell.
data = "jigsaw"
root_dir = f"/scratch/yifwang/new_fairness_x_explainability/encoder_results_{data}"

# Axes of the experiment grid that the loading cell iterates over.
models = ["bert", "roberta", "distilbert"]
bias_types = ["race", "gender", "religion"]
debiasing_methods = ["no_debiasing", "group_balance", "group_class_balance", "cda", "dropout", "attention_entropy", "causal_debias"]

# "all axes" runs use the token "all" in their directory name; "one axis" runs
# use the bias type itself.
training_types = ["all axes", "one axis"]

# Per-bias-type example counts; they form the trailing part of each run
# directory name. Keyed by dataset tag so that an unsupported tag fails right
# here with a clear message instead of leaving `num_examples` undefined and
# raising a NameError in a later cell.
num_examples_by_dataset = {
    "civil": {"race": 2000, "gender": 2000, "religion": 1000},
    "jigsaw": {"race": 400, "gender": 800, "religion": 200},
}
if data not in num_examples_by_dataset:
    raise ValueError(
        f"Unsupported dataset {data!r}; expected one of {sorted(num_examples_by_dataset)}"
    )
num_examples = num_examples_by_dataset[data]

# Not referenced by the cells visible in this notebook section.
fairness_metrics = ["accuracy", "f1", "fpr", "fnr", "individual_fairness"]

# Explanation-method names as they appear in the correlation JSON file names.
methods = ["Attention", "Saliency", "DeepLift", "InputXGradient", "IntegratedGradients", "Occlusion", "KernelShap"]
# Keys read from each entry's "abs_average" mapping in the correlation JSON.
reliance_keys = ["raw", "max", "len", "norm"]


In [12]:

# Column-oriented accumulator: every key becomes a DataFrame column, and each
# (file, JSON entry, reliance key) combination adds one value to every list.
correlation_dict = {
    "model": [],
    "bias_type": [],
    "debiasing_method": [],
    "training_data": [],
    "explanation_method": [],
    "correlation": [],
    "reliance_method": [],
}
for model in models:
    for bias_type in bias_types:
        for training_type in training_types:
            # Directory token: the literal "all" for "all axes" runs, otherwise
            # the bias type itself.
            data_token = bias_type if training_type != "all axes" else "all"
            run_dir = f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}"
            for debiasing_method in debiasing_methods:
                for explanation_method in methods:
                    file_name = f"correlation_{explanation_method}_{bias_type}_test.json"
                    file_path = os.path.join(root_dir, run_dir, debiasing_method, "correlation", file_name)
                    if os.path.exists(file_path):
                        with open(file_path, "r") as f:
                            correlation_data = json.load(f)
                    else:
                        # Missing result files are reported and skipped rather
                        # than aborting the whole sweep.
                        print(f"File not found: {file_path}")
                        continue
                    # The top-level JSON key (not the file-level method name) is
                    # what gets stored in the "explanation_method" column.
                    for aggregation_method, value in correlation_data.items():
                        for reliance_method in reliance_keys:
                            for column, cell in (
                                ("model", model),
                                ("bias_type", bias_type),
                                ("debiasing_method", debiasing_method),
                                ("training_data", training_type),
                                ("explanation_method", aggregation_method),
                            ):
                                correlation_dict[column].append(cell)
                            correlation_dict["correlation"].append(value["abs_average"][reliance_method])
                            correlation_dict["reliance_method"].append(reliance_method)
                    



In [13]:
# Turn the column-oriented accumulator into a long-format DataFrame
# (one row per appended entry) and display it.
import pandas as pd
correlation_df = pd.DataFrame.from_dict(correlation_dict, orient="columns")
correlation_df

Unnamed: 0,model,bias_type,debiasing_method,training_data,explanation_method,correlation,reliance_method
0,bert,race,no_debiasing,all axes,raw_attention,0.619244,raw
1,bert,race,no_debiasing,all axes,raw_attention,0.579729,max
2,bert,race,no_debiasing,all axes,raw_attention,0.598964,len
3,bert,race,no_debiasing,all axes,raw_attention,0.572680,norm
4,bert,race,no_debiasing,all axes,attention_rollout,0.578462,raw
...,...,...,...,...,...,...,...
7051,distilbert,religion,causal_debias,one axis,Occlusion_abs,0.403299,norm
7052,distilbert,religion,causal_debias,one axis,ShapleyValue,0.124311,raw
7053,distilbert,religion,causal_debias,one axis,ShapleyValue,0.111509,max
7054,distilbert,religion,causal_debias,one axis,ShapleyValue,0.194168,len


In [14]:
# Mean correlation per explanation method, using only rows whose
# reliance_method is "raw"; every other column is pooled.
# Note: despite its name, `average_corr_df` is a Series indexed by explanation_method.
average_corr_df = (
    correlation_df.loc[correlation_df["reliance_method"].eq("raw")]
    .groupby("explanation_method")["correlation"]
    .mean()
)
print(average_corr_df)

explanation_method
DeepLift_L2                 0.438122
DeepLift_mean               0.251806
InputXGradient_L2           0.432942
InputXGradient_mean         0.296284
IntegratedGradients_L2      0.380105
IntegratedGradients_mean    0.298514
Occlusion                   0.480676
Occlusion_abs               0.501809
Saliency_L2                 0.433678
Saliency_mean               0.300379
ShapleyValue                0.154751
attention_flow              0.257263
attention_rollout           0.313854
raw_attention               0.318524
Name: correlation, dtype: float64


In [15]:
# Mean "raw"-reliance correlation per explanation method, reported separately
# for each bias type (debiasing methods are pooled here, not broken out).
for bias_type in bias_types:
    raw_mask = correlation_df["reliance_method"].eq("raw")
    bias_mask = correlation_df["bias_type"].eq(bias_type)
    average_corr_df = (
        correlation_df.loc[raw_mask & bias_mask]
        .groupby("explanation_method")["correlation"]
        .mean()
    )
    print(f"\nAverage correlation for bias type '{bias_type}':")
    print(average_corr_df)


Average correlation for bias type 'race':
explanation_method
DeepLift_L2                 0.544246
DeepLift_mean               0.239439
InputXGradient_L2           0.536281
InputXGradient_mean         0.323131
IntegratedGradients_L2      0.500451
IntegratedGradients_mean    0.332616
Occlusion                   0.571854
Occlusion_abs               0.617462
Saliency_L2                 0.536107
Saliency_mean               0.333400
ShapleyValue                0.158977
attention_flow              0.358707
attention_rollout           0.477709
raw_attention               0.492419
Name: correlation, dtype: float64

Average correlation for bias type 'gender':
explanation_method
DeepLift_L2                 0.308288
DeepLift_mean               0.195723
InputXGradient_L2           0.310008
InputXGradient_mean         0.224252
IntegratedGradients_L2      0.245095
IntegratedGradients_mean    0.234074
Occlusion                   0.355813
Occlusion_abs               0.356033
Saliency_L2               

In [16]:
# Mean "raw"-reliance correlation per explanation method, one report per
# debiasing method (models, bias types and training types are pooled).
for debiasing_method in debiasing_methods:
    keep_rows = (
        correlation_df["reliance_method"].eq("raw")
        & correlation_df["debiasing_method"].eq(debiasing_method)
    )
    per_method = correlation_df.loc[keep_rows].groupby("explanation_method")
    average_corr_df = per_method["correlation"].mean()
    print(f"\nAverage correlation for debiasing method '{debiasing_method}':")
    print(average_corr_df)


Average correlation for debiasing method 'no_debiasing':
explanation_method
DeepLift_L2                 0.456196
DeepLift_mean               0.269720
InputXGradient_L2           0.448412
InputXGradient_mean         0.334086
IntegratedGradients_L2      0.397259
IntegratedGradients_mean    0.309048
Occlusion                   0.546162
Occlusion_abs               0.539814
Saliency_L2                 0.447181
Saliency_mean               0.301602
ShapleyValue                0.156088
attention_flow              0.255589
attention_rollout           0.328570
raw_attention               0.328281
Name: correlation, dtype: float64

Average correlation for debiasing method 'group_balance':
explanation_method
DeepLift_L2                 0.457427
DeepLift_mean               0.269614
InputXGradient_L2           0.453581
InputXGradient_mean         0.247981
IntegratedGradients_L2      0.401997
IntegratedGradients_mean    0.329092
Occlusion                   0.471468
Occlusion_abs               0.5314

In [17]:
# Same per-debiasing-method report as the previous cell, but the mean is taken
# per (explanation_method, bias_type) pair instead of per explanation method.
for debiasing_method in debiasing_methods:
    keep_rows = (
        correlation_df["reliance_method"].eq("raw")
        & correlation_df["debiasing_method"].eq(debiasing_method)
    )
    group_keys = ["explanation_method", "bias_type"]
    average_corr_df = correlation_df.loc[keep_rows].groupby(group_keys)["correlation"].mean()
    print(f"\nAverage correlation for debiasing method '{debiasing_method}':")
    print(average_corr_df)


Average correlation for debiasing method 'no_debiasing':
explanation_method        bias_type
DeepLift_L2               gender       0.267080
                          race         0.613960
                          religion     0.487547
DeepLift_mean             gender       0.211517
                          race         0.250081
                          religion     0.347563
InputXGradient_L2         gender       0.274912
                          race         0.598037
                          religion     0.472287
InputXGradient_mean       gender       0.225975
                          race         0.399152
                          religion     0.377131
IntegratedGradients_L2    gender       0.224063
                          race         0.563290
                          religion     0.404424
IntegratedGradients_mean  gender       0.248755
                          race         0.357760
                          religion     0.320628
Occlusion                 gender       0.4

In [18]:
# Mean "raw"-reliance correlation per explanation method for every
# (training type, bias type) combination. The header is printed before the
# aggregation is computed, matching the original output order.
for training_type in training_types:
    for bias_type in bias_types:
        print(f"\nAverage correlation for training type '{training_type}' and bias type '{bias_type}':")
        keep_rows = (
            correlation_df["reliance_method"].eq("raw")
            & correlation_df["training_data"].eq(training_type)
            & correlation_df["bias_type"].eq(bias_type)
        )
        average_corr_df = (
            correlation_df.loc[keep_rows]
            .groupby("explanation_method")["correlation"]
            .mean()
        )
        print(average_corr_df)



Average correlation for training type 'all axes' and bias type 'race':
explanation_method
DeepLift_L2                 0.515741
DeepLift_mean               0.199269
InputXGradient_L2           0.509814
InputXGradient_mean         0.314981
IntegratedGradients_L2      0.464547
IntegratedGradients_mean    0.348157
Occlusion                   0.599968
Occlusion_abs               0.605123
Saliency_L2                 0.511414
Saliency_mean               0.353758
ShapleyValue                0.161724
attention_flow              0.322342
attention_rollout           0.433067
raw_attention               0.454428
Name: correlation, dtype: float64

Average correlation for training type 'all axes' and bias type 'gender':
explanation_method
DeepLift_L2                 0.293601
DeepLift_mean               0.192335
InputXGradient_L2           0.291429
InputXGradient_mean         0.197504
IntegratedGradients_L2      0.236013
IntegratedGradients_mean    0.224258
Occlusion                   0.343890
Occlu

In [19]:
# For each explanation method (in order of first appearance in the frame),
# report the mean "raw"-reliance correlation per (debiasing_method, training_data) pair.
# `explanation_methods` holds a single method name on each iteration.
for explanation_methods in correlation_df["explanation_method"].unique():
    keep_rows = (
        correlation_df["reliance_method"].eq("raw")
        & correlation_df["explanation_method"].eq(explanation_methods)
    )
    group_keys = ["debiasing_method", "training_data"]
    average_corr_df = correlation_df.loc[keep_rows].groupby(group_keys)["correlation"].mean()
    print(f"\nAverage correlation for explanation method '{explanation_methods}':")
    print(average_corr_df)


Average correlation for explanation method 'raw_attention':
debiasing_method     training_data
attention_entropy    all axes         0.283534
                     one axis         0.342257
causal_debias        all axes         0.268775
                     one axis         0.408254
cda                  all axes         0.220894
                     one axis         0.157803
dropout              all axes         0.302334
                     one axis         0.371142
group_balance        all axes         0.337411
                     one axis         0.378095
group_class_balance  all axes         0.387342
                     one axis         0.344925
no_debiasing         all axes         0.341881
                     one axis         0.314682
Name: correlation, dtype: float64

Average correlation for explanation method 'attention_rollout':
debiasing_method     training_data
attention_entropy    all axes         0.269448
                     one axis         0.338312
causal_debias     

In [20]:
# Mean "raw"-reliance correlation per explanation method, one report for every
# (debiasing method, training type) combination.
for debiasing_method in debiasing_methods:
    for training_type in training_types:
        keep_rows = (
            correlation_df["reliance_method"].eq("raw")
            & correlation_df["debiasing_method"].eq(debiasing_method)
            & correlation_df["training_data"].eq(training_type)
        )
        average_corr_df = (
            correlation_df.loc[keep_rows]
            .groupby("explanation_method")["correlation"]
            .mean()
        )
        print(f"\nAverage correlation for debiasing method '{debiasing_method}' and training type '{training_type}':")
        print(average_corr_df)


Average correlation for debiasing method 'no_debiasing' and training type 'all axes':
explanation_method
DeepLift_L2                 0.431904
DeepLift_mean               0.216874
InputXGradient_L2           0.422578
InputXGradient_mean         0.296455
IntegratedGradients_L2      0.395159
IntegratedGradients_mean    0.319097
Occlusion                   0.528216
Occlusion_abs               0.526822
Saliency_L2                 0.422076
Saliency_mean               0.292903
ShapleyValue                0.138839
attention_flow              0.269076
attention_rollout           0.336322
raw_attention               0.341881
Name: correlation, dtype: float64

Average correlation for debiasing method 'no_debiasing' and training type 'one axis':
explanation_method
DeepLift_L2                 0.480487
DeepLift_mean               0.322566
InputXGradient_L2           0.474246
InputXGradient_mean         0.371716
IntegratedGradients_L2      0.399360
IntegratedGradients_mean    0.298998
Occlusion     