In [10]:
import os
import json
from utils.vocabulary import *
import pandas as pd

# ---------------------------------------------------------------------------
# Experiment configuration used by the cells below.
# ---------------------------------------------------------------------------

# Name of the dataset whose results are analysed. It is interpolated into
# `root_dir` and into the per-run directory names, and selects `num_examples`.
data = "civil"

# Root directory holding the result files for the selected dataset.
# NOTE(review): hard-coded absolute path; adjust when running on another machine.
root_dir = f"/scratch/yifwang/fairness_x_explainability/encoder_results_{data}"

# Axes of the result grid that the loading cell iterates over.
models = ["bert", "roberta"]
bias_types = ["race", "gender", "religion"]
debiasing_methods = ["no_debiasing", "group_balance", "group_class_balance", "cda", "dropout", "attention_entropy", "causal_debias"]

# "all axes" maps to the directory token "all"; "one axis" maps to the bias type itself.
training_types = ["all axes", "one axis"]

# Per-dataset number that appears in each run directory name as
# `..._test_{num_examples[bias_type]}` (presumably the test-set size - TODO confirm).
_NUM_EXAMPLES_BY_DATASET = {
    "civil": {"race": 2000, "gender": 2000, "religion": 1000},
    "jigsaw": {"race": 400, "gender": 800, "religion": 200},
}
# Fail here, with a clear message, instead of leaving `num_examples` unbound
# (or silently reusing a stale value from an earlier run of this cell).
if data not in _NUM_EXAMPLES_BY_DATASET:
    raise ValueError(
        f"Unsupported dataset {data!r}; expected one of {sorted(_NUM_EXAMPLES_BY_DATASET)}"
    )
num_examples = _NUM_EXAMPLES_BY_DATASET[data]

# Not referenced by the cells visible in this part of the notebook.
fairness_metrics = ["accuracy", "f1", "fpr", "fnr", "individual_fairness"]

# Explanation methods; one correlation JSON file is looked up per method.
methods = ["Attention", "Saliency", "DeepLift", "InputXGradient", "IntegratedGradients", "Occlusion", "KernelShap"]
# Keys read from each correlation entry (only the "raw" reliance score here).
reliance_keys = ["raw"]


In [11]:
# Build one long-format table of correlation results.
#
# For every (model, bias type, training type, debiasing method, explanation
# method) combination, the matching JSON file is read if it exists. Each
# top-level key of that file is treated as one row label in the
# "explanation_method" column (the captured output shows values such as
# "raw_attention" or "Saliency_L2"). One row is appended per (reliance key,
# class, social group):
#   * "correlation" is value['abs_average'][reliance_method], so it is the same
#     for all group/class rows of one entry;
#   * "p" is element 1 of value[f'{group}_{class}'][reliance_method]
#     (presumably a (statistic, p-value) pair - TODO confirm);
#   * "group" / "class_label" record which group/class the p-value belongs to,
#     so rows are no longer indistinguishable.
correlation_dict = {
    "model": [],
    "bias_type": [],
    "debiasing_method": [],
    "training_data": [],
    "explanation_method": [],
    "correlation": [],
    "reliance_method": [],
    "p": [],
    # Appended after the original columns so existing positional access still works.
    "group": [],
    "class_label": [],
}

# Loop-invariant: the two class names used in the per-group keys of each file.
classes = ["positive", "negative"]

for model in models:
    for bias_type in bias_types:
        groups = SOCIAL_GROUPS[bias_type]
        for training_type in training_types:
            # Directory token: "all" for models trained on all axes, else the bias type.
            data_token = "all" if training_type == "all axes" else bias_type
            # The run directory does not depend on the two inner loops.
            run_dir = os.path.join(root_dir, f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}")
            for debiasing_method in debiasing_methods:
                for explanation_method in methods:
                    file_path = os.path.join(run_dir, debiasing_method, "correlation", f"correlation_{explanation_method}_{bias_type}_test.json")
                    if not os.path.exists(file_path):
                        # Missing results are reported and skipped, not treated as fatal.
                        print(f"File not found: {file_path}")
                        continue
                    # Explicit encoding so reading does not depend on the locale default.
                    with open(file_path, "r", encoding="utf-8") as f:
                        correlation_data = json.load(f)
                    for aggregation_method, value in correlation_data.items():
                        for reliance_method in reliance_keys:
                            # Shared by every group/class row of this entry.
                            average_correlation = value['abs_average'][reliance_method]
                            for one_class in classes:
                                for group in groups:
                                    correlation_dict["model"].append(model)
                                    correlation_dict["bias_type"].append(bias_type)
                                    correlation_dict["debiasing_method"].append(debiasing_method)
                                    correlation_dict["training_data"].append(training_type)
                                    correlation_dict["explanation_method"].append(aggregation_method)
                                    correlation_dict["correlation"].append(average_correlation)
                                    correlation_dict["reliance_method"].append(reliance_method)
                                    correlation_dict["p"].append(value[f'{group}_{one_class}'][reliance_method][1])
                                    correlation_dict["group"].append(group)
                                    correlation_dict["class_label"].append(one_class)

correlation_df = pd.DataFrame(correlation_dict)
    

In [12]:
# Report, for one fixed (model, training data, bias type, reliance method)
# selection, how many rows per debiasing method and explanation method have a
# p-value below `alpha`.
selected_model = "bert"
selected_train_data = "one axis"  # either "all axes" or "one axis"
selected_bias_type = "religion"
selected_reliance_method = "raw"
alpha = 0.05  # threshold compared against the `p` column

# These four filters do not depend on the loop variables, so apply them once.
selection = correlation_df[
    (correlation_df['model'] == selected_model)
    & (correlation_df['training_data'] == selected_train_data)
    & (correlation_df['bias_type'] == selected_bias_type)
    & (correlation_df['reliance_method'] == selected_reliance_method)
]
# Explanation methods are enumerated over the whole table, not only the
# selection, so a method without matching rows is reported as having no data.
all_explanation_methods = correlation_df['explanation_method'].unique()

for debiasing_method in debiasing_methods:
    # Banner for the current debiasing method, followed by an empty line.
    for banner_line in ("==========================", debiasing_method, "==========================", ""):
        print(banner_line)
    per_debiasing = selection[selection['debiasing_method'] == debiasing_method]
    for explanation_method in all_explanation_methods:
        print(f"--- {explanation_method} ---")
        # All p-values of the rows matching the full selection.
        p_values = per_debiasing.loc[per_debiasing['explanation_method'] == explanation_method, 'p'].to_numpy()
        total_count = len(p_values)
        if total_count == 0:
            print(f"{explanation_method}: No data available")
            continue
        # Number of p-values strictly below the threshold.
        significant_count = (p_values < alpha).sum()
        print(f"{explanation_method}: {significant_count}/{total_count} significant correlations (p < {alpha})")
        print("--------------------------")

no_debiasing

--- raw_attention ---
raw_attention: 6/6 significant correlations (p < 0.05)
--------------------------
--- attention_rollout ---
attention_rollout: 6/6 significant correlations (p < 0.05)
--------------------------
--- attention_flow ---
attention_flow: 6/6 significant correlations (p < 0.05)
--------------------------
--- Saliency_L2 ---
Saliency_L2: 5/6 significant correlations (p < 0.05)
--------------------------
--- Saliency_mean ---
Saliency_mean: 3/6 significant correlations (p < 0.05)
--------------------------
--- DeepLift_L2 ---
DeepLift_L2: 6/6 significant correlations (p < 0.05)
--------------------------
--- DeepLift_mean ---
DeepLift_mean: 4/6 significant correlations (p < 0.05)
--------------------------
--- InputXGradient_L2 ---
InputXGradient_L2: 5/6 significant correlations (p < 0.05)
--------------------------
--- InputXGradient_mean ---
InputXGradient_mean: 6/6 significant correlations (p < 0.05)
--------------------------
--- IntegratedGradients_L2 -