In [20]:
import os
import json
from utils.vocabulary import *

# ---------------------------------------------------------------------------
# Experiment configuration: everything the cells below read to locate result
# files and to decide which runs / metrics to load.
# ---------------------------------------------------------------------------
data = "civil"  # "civil" or "jigsaw"
root_dir = f"/scratch/yifwang/fairness_x_explainability/encoder_results_{data}"
debiased_dir = f"/scratch/yifwang/fairness_x_explainability/bias_mitigation_results_{data}"

models = ["bert", "roberta"] # ["bert", "roberta", "distilbert"]
bias_types = ["race", "gender", "religion"]

split = "test"  # "val" or "test"
debiasing_methods = ["no_debiasing", "group_balance", "group_class_balance", "cda", "dropout", "attention_entropy", "causal_debias"]

explanation_methods = ["Saliency", "InputXGradient", "raw_attention", "attention_rollout", "attention_flow", "Occlusion"]
aggregation = ["L1", "L2"]
alphas = [100.0, 10.0, 1.0, 0.1, 0.01]
seeds = [42, 1, 2]

training_types = ["one axis"] # ["all axes", "one axis"]

# Number of examples per bias type, keyed by dataset and then by split.
# The selected count is interpolated into the result directory names, so it
# must match the value used when the results were produced.
_NUM_EXAMPLES = {
    "civil": {
        "test": {"race": 2000, "gender": 2000, "religion": 1000},
        "val": {"race": 500, "gender": 500, "religion": 200},
    },
    "jigsaw": {
        "test": {"race": 400, "gender": 800, "religion": 200},
        "val": {"race": 200, "gender": 200, "religion": 200},
    },
}

# Fail here, with a clear message, instead of hitting a NameError further down
# (unknown dataset) or silently using the validation counts (unknown split).
if data not in _NUM_EXAMPLES:
    raise ValueError(f"Unknown dataset {data!r}; expected one of {sorted(_NUM_EXAMPLES)}")
if split not in ("val", "test"):
    raise ValueError(f"Unknown split {split!r}; expected 'val' or 'test'")

num_examples_test = _NUM_EXAMPLES[data]["test"]
num_examples_val = _NUM_EXAMPLES[data]["val"]
num_examples = num_examples_test if split == "test" else num_examples_val

fairness_metrics = ["accuracy", "fpr", "fnr", "individual_fairness"]

In [21]:
def compute_average_relative_fairness(mitigated_fairness_dict, orig_fairness_dict):
    """Average relative improvement of the mitigated scores over the originals, in percent.

    Higher is better: a positive result means the mitigated scores are, on
    average, smaller than the original ones.

    Args:
        mitigated_fairness_dict: metric name -> score after mitigation.
        orig_fairness_dict: metric name -> score of the original model.

    Returns:
        The mean over the metrics present in both dicts of
        ``(orig - mitigated) / abs(orig) * 100``. A metric whose original
        score is 0 contributes 0 (the ratio is undefined). Returns 0 when
        the two dicts share no metric.
    """
    shared_metrics = [name for name in mitigated_fairness_dict if name in orig_fairness_dict]
    if not shared_metrics:
        return 0

    improvements = []
    for name in shared_metrics:
        baseline = orig_fairness_dict[name]
        mitigated = mitigated_fairness_dict[name]
        if baseline == 0:
            # Relative change against a zero baseline is undefined; count it as no change.
            improvements.append(0)
        else:
            improvements.append((baseline - mitigated) / abs(baseline) * 100)
    return sum(improvements) / len(improvements)

def compute_average_fairness(mitigated_fairness_dict, orig_fairness_dict):
    """Average absolute improvement of the mitigated scores over the originals.

    Higher is better: a positive result means the mitigated scores are, on
    average, smaller than the original ones. The difference is scaled by 100.

    Unlike the relative variant, nothing is divided by the original score, so
    a zero original score needs no special case: a mitigated score above a
    zero baseline is reported as the (negative) degradation it is.

    Args:
        mitigated_fairness_dict: metric name -> score after mitigation.
        orig_fairness_dict: metric name -> score of the original model.

    Returns:
        The mean over the metrics present in both dicts of
        ``(orig - mitigated) * 100``, or 0 when the dicts share no metric.
    """
    differences = [
        (orig_fairness_dict[metric] - mitigated_fairness_dict[metric]) * 100
        for metric in mitigated_fairness_dict
        if metric in orig_fairness_dict
    ]
    if len(differences) == 0:
        return 0
    return sum(differences) / len(differences)

def _harmonic_mean_of_scores(scores):
    """Return the harmonic mean of ``scores``; 0 if any score is zero or negative."""
    if any(score <= 0 for score in scores):
        # A zero term drives the harmonic mean to 0, and the mean is not
        # defined for negative terms; both are reported as the worst value.
        return 0
    return len(scores) / sum(1.0 / score for score in scores)


def compute_harmonic_mean_performance_fairness(performance_metric_dict, fairness_metric_dict):
    """Combine task performance and fairness into one score on a 0-100 scale.

    Higher is better. Each performance value ``v`` is scored as ``v * 100``
    and each fairness value ``v`` as ``100 - v * 100`` (so a smaller fairness
    value gives a higher score). The harmonic mean is taken within each
    group, and the result is the harmonic mean of those two numbers.

    The zero checks are applied to the scores that are actually inverted. A
    fairness value of 0 is therefore a valid, perfect score of 100, and a
    fairness value of 1 or a performance of 0 yields 0 instead of a
    ZeroDivisionError.

    Args:
        performance_metric_dict: metric name -> performance value.
        fairness_metric_dict: metric name -> fairness value.

    Returns:
        The combined score, or 0 if either dict is empty or either group's
        harmonic mean is 0.
    """
    if len(performance_metric_dict) == 0 or len(fairness_metric_dict) == 0:
        return 0

    performance_scores = [v * 100 for v in performance_metric_dict.values()]
    fairness_scores = [100 - v * 100 for v in fairness_metric_dict.values()]

    harmonic_mean_performance = _harmonic_mean_of_scores(performance_scores)
    harmonic_mean_fairness = _harmonic_mean_of_scores(fairness_scores)
    if harmonic_mean_performance == 0 or harmonic_mean_fairness == 0:
        return 0

    harmonic_mean = 2 * (harmonic_mean_performance * harmonic_mean_fairness) / (harmonic_mean_performance + harmonic_mean_fairness)
    return harmonic_mean

In [24]:
import pandas as pd

# Task-performance metrics read from summary["Metrics"]["overall"].
_TASK_METRICS = ["accuracy", "f1"]
# Explanation methods that are skipped for "L2" and whose "L1" result
# directory is named by alpha alone (no "<aggregation>_" prefix).
_UNPREFIXED_METHODS = ["raw_attention", "attention_rollout", "attention_flow", "Occlusion"]


def _append_summary_rows(table, summary, groups, fairness_metric_names, *, model, bias_type,
                         method, training_type, aggregation_label, alpha_label, seed,
                         source_label):
    """Append one row per task metric and per fairness metric of ``summary`` to ``table``.

    Args:
        table: dict of column name -> list; every list receives one value per row.
        summary: parsed summary-stats JSON. The keys read are
            ``Metrics/overall/<metric>``,
            ``Group_Fairness/average/<group>/<metric>`` and
            ``Individual_Fairness/overall/predicted_class/abs_average``.
        groups: group names whose absolute group-fairness values are summed.
        fairness_metric_names: fairness metrics to record; the name
            "individual_fairness" is read from the individual-fairness section.
        model, bias_type, method, training_type, aggregation_label, alpha_label, seed:
            values stored verbatim in the corresponding columns.
        source_label: wording used in the "metric not found" notice.
    """
    def add_row(metric_name, score):
        # The score is computed before this is called, so all nine columns
        # grow together and can never end up with different lengths.
        table['model'].append(model)
        table['bias_type'].append(bias_type)
        table['debiasing_method'].append(method)
        table['training_data'].append(training_type)
        table['aggregation'].append(aggregation_label)
        table['alpha'].append(alpha_label)
        table['metrics'].append(metric_name)
        table['score'].append(score)
        table['seed'].append(seed)

    overall = summary['Metrics']['overall']
    for metric in _TASK_METRICS:
        if metric in overall:
            score = overall[metric]
        else:
            print(f"Metric {metric} not found in {source_label} for {method} on {bias_type}")
            score = None
        add_row(f"task_{metric}", score)

    for metric in fairness_metric_names:
        if metric != "individual_fairness":
            # Group fairness: sum of the absolute per-group values.
            score = sum([abs(summary['Group_Fairness']["average"][group][metric]) for group in groups])
        else:
            score = summary['Individual_Fairness']['overall']["predicted_class"]["abs_average"]
        add_row(metric, score)


# Long-format results table: one row per (model, bias type, method, setting, seed, metric).
fairness_dict = {"model": [], "bias_type": [], "debiasing_method": [], "training_data": [], "aggregation": [], "alpha": [], "metrics": [], "score": [], "seed": []}
for model in models:
    for bias_type in bias_types:
        groups = SOCIAL_GROUPS[bias_type]
        summary_name = f"fairness_{bias_type}_{split}_summary_stats.json"
        for training_type in training_types:
            data_token = "all" if training_type == "all axes" else bias_type
            run_name = f"{model}_{data}_{data_token}_{bias_type}_{split}_{num_examples[bias_type]}"

            # Baseline debiasing methods: read from root_dir (no seed in the
            # path) and tagged with seed 42 and "N/A" aggregation / alpha.
            for debiasing_method in debiasing_methods:
                file_path = os.path.join(root_dir, run_name, debiasing_method, "fairness", summary_name)
                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue
                with open(file_path, "r") as f:
                    fairness_data = json.load(f)

                _append_summary_rows(
                    fairness_dict, fairness_data, groups, fairness_metrics,
                    model=model, bias_type=bias_type, method=debiasing_method,
                    training_type=training_type, aggregation_label="N/A",
                    alpha_label="N/A", seed=42, source_label="fairness data",
                )

            # Explanation-based mitigation: one result per
            # (explanation method, aggregation, alpha, seed), read from debiased_dir.
            for explanation_method in explanation_methods:
                unprefixed = explanation_method in _UNPREFIXED_METHODS
                for aggregation_method in aggregation:
                    if aggregation_method == "L2" and unprefixed:
                        continue
                    for alpha in alphas:
                        if aggregation_method == "L1" and unprefixed:
                            setting_dir = f"{alpha}"
                        else:
                            setting_dir = f"{aggregation_method}_{alpha}"
                        for seed in seeds:
                            file_path = os.path.join(debiased_dir, f"{run_name}_{seed}", explanation_method, setting_dir, "fairness", summary_name)
                            if not os.path.exists(file_path):
                                print(f"File not found: {file_path}")
                                continue
                            with open(file_path, "r") as f:
                                explanation_data = json.load(f)

                            # alpha and aggregation are stored as strings, matching
                            # the "N/A" placeholders used for the baseline rows.
                            _append_summary_rows(
                                fairness_dict, explanation_data, groups, fairness_metrics,
                                model=model, bias_type=bias_type, method=f"{explanation_method}",
                                training_type=training_type, aggregation_label=f"{aggregation_method}",
                                alpha_label=f"{alpha}", seed=seed, source_label="explanation data",
                            )


# convert to a pandas DataFrame
fairness_df = pd.DataFrame(fairness_dict)
# fairness_df


File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/roberta_civil_gender_gender_test_2000_2/Saliency/L1_1.0/fairness/fairness_gender_test_summary_stats.json


In [23]:
# Report, for one (model, bias type, explanation method, aggregation, seed),
# which alpha gives the best fairness improvement and the best
# performance/fairness harmonic mean, separately for each fairness metric.
selected_model = "bert"
selected_bias_type = "race"
selected_explanation_method = "Saliency"
selected_training_data = "one axis"
selected_aggregation = "L2"
# The results loader skips the L2 aggregation for these methods, so fall back to L1.
if selected_aggregation == "L2" and selected_explanation_method in ["raw_attention", "attention_rollout", "attention_flow", "Occlusion"]:
    selected_aggregation = "L1"

selected_seed = 42

selected_performance_metrics = ["task_accuracy"]


def _scores_for(rows, metric_names):
    """Map each metric name to its first score in ``rows``; return None if any metric has no row."""
    scores = {}
    for metric in metric_names:
        values = rows[rows['metrics'] == metric]['score'].values
        if len(values) == 0:
            return None
        scores[metric] = values[0]
    return scores


# The baseline rows do not depend on the fairness metric, so select them once.
orig_fairness_result = fairness_df[
    (fairness_df['model'] == selected_model)
    & (fairness_df['bias_type'] == selected_bias_type)
    & (fairness_df['debiasing_method'] == 'no_debiasing')
    & (fairness_df['training_data'] == selected_training_data)
]
orig_performance_metric = _scores_for(orig_fairness_result, selected_performance_metrics)
if orig_performance_metric is None:
    raise IndexError(
        f"No 'no_debiasing' rows with metrics {selected_performance_metrics} for "
        f"{selected_model}/{selected_bias_type}/{selected_training_data}; was the results file found?"
    )

for selected_fairness_metrics in [["accuracy"], ["fpr"], ["fnr"], ["individual_fairness"]]:
    print("\n\n")
    print("================================")
    print(f"Selected fairness metrics: {selected_fairness_metrics}")
    print("================================")

    # original fairness results
    orig_fairness_metric = _scores_for(orig_fairness_result, selected_fairness_metrics)
    if orig_fairness_metric is None:
        raise IndexError(
            f"No 'no_debiasing' rows with metrics {selected_fairness_metrics} for "
            f"{selected_model}/{selected_bias_type}/{selected_training_data}; was the results file found?"
        )
    orig_harmonic_mean = compute_harmonic_mean_performance_fairness(orig_performance_metric, orig_fairness_metric)

    best_avg_fairness = -float('inf')
    best_harmonic_mean = -float('inf')
    best_avg_fairness_debiasing_method = None
    best_harmonic_mean_debiasing_method = None
    best_avg_fairness_fairness_metric = {}
    best_avg_fairness_performance_metric = {}
    best_harmonic_mean_fairness_metric = {}
    best_harmonic_mean_performance_metric = {}
    # Selection by relative fairness (compute_average_relative_fairness) is
    # currently disabled; its placeholders are still reset so the names stay defined.
    best_avg_relative_fairness = -float('inf')
    best_avg_relative_fairness_debiasing_method = None
    best_avg_relative_fairness_fairness_metric = {}
    best_avg_relative_fairness_performance_metric = {}

    for alpha in alphas:
        candidate_name = f"{selected_explanation_method}_{selected_aggregation}_{alpha}"
        mitigated_fairness_result = fairness_df[
            (fairness_df['model'] == selected_model)
            & (fairness_df['bias_type'] == selected_bias_type)
            & (fairness_df['debiasing_method'] == selected_explanation_method)
            & (fairness_df['training_data'] == selected_training_data)
            & (fairness_df['aggregation'] == selected_aggregation)
            & (fairness_df['seed'] == selected_seed)
            & (fairness_df['alpha'] == str(alpha))  # alpha is stored as a string
        ]
        mitigated_fairness_metric = _scores_for(mitigated_fairness_result, selected_fairness_metrics)
        mitigated_performance_metric = _scores_for(mitigated_fairness_result, selected_performance_metrics)
        if mitigated_fairness_metric is None or mitigated_performance_metric is None:
            # The results file for this setting was not loaded; skip it
            # instead of crashing on an empty selection.
            print(f"No results for {candidate_name} (seed {selected_seed}); skipping")
            continue

        avg_fairness = compute_average_fairness(mitigated_fairness_metric, orig_fairness_metric)
        harmonic_mean = compute_harmonic_mean_performance_fairness(mitigated_performance_metric, mitigated_fairness_metric)
        if avg_fairness > best_avg_fairness:
            best_avg_fairness = avg_fairness
            best_avg_fairness_debiasing_method = candidate_name
            best_avg_fairness_fairness_metric = mitigated_fairness_metric
            best_avg_fairness_performance_metric = mitigated_performance_metric
        if harmonic_mean > best_harmonic_mean:
            best_harmonic_mean = harmonic_mean
            best_harmonic_mean_debiasing_method = candidate_name
            best_harmonic_mean_fairness_metric = mitigated_fairness_metric
            best_harmonic_mean_performance_metric = mitigated_performance_metric

    print(f"Best average fairness: {best_avg_fairness} with method {best_avg_fairness_debiasing_method}")
    print(f"Best harmonic mean of performance and fairness: {best_harmonic_mean} with method {best_harmonic_mean_debiasing_method}")
    print("================================")
    # show original model results
    print("Original model results:")
    print(orig_performance_metric)
    print(orig_fairness_metric)

    print("================================")
    # show best average fairness results
    print("Best average fairness results:")
    print(best_avg_fairness_debiasing_method)
    print(best_avg_fairness_performance_metric)
    print(best_avg_fairness_fairness_metric)

    print("================================")
    # show best harmonic mean results
    print("Best harmonic mean results:")
    print(best_harmonic_mean_debiasing_method)
    print(best_harmonic_mean_performance_metric)
    print(best_harmonic_mean_fairness_metric)
    print("original harmonic mean: ", orig_harmonic_mean)
    print("bias mitigated harmonic mean: ", best_harmonic_mean)






Selected fairness metrics: ['accuracy']
Best average fairness: -0.04999999999999449 with method Saliency_L2_1.0
Best harmonic mean of performance and fairness: 86.7933466022178 with method Saliency_L2_1.0
Original model results:
{'task_accuracy': np.float64(0.78375)}
{'accuracy': np.float64(0.020500000000000074)}
Best average fairness results:
Saliency_L2_1.0
{'task_accuracy': np.float64(0.7795)}
{'accuracy': np.float64(0.02100000000000002)}
Best harmonic mean results:
Saliency_L2_1.0
{'task_accuracy': np.float64(0.7795)}
{'accuracy': np.float64(0.02100000000000002)}
original harmonic mean:  87.07592513823904
bias mitigated harmonic mean:  86.7933466022178



Selected fairness metrics: ['fpr']
Best average fairness: 0.16380979253372147 with method Saliency_L2_10.0
Best harmonic mean of performance and fairness: 87.4492122379971 with method Saliency_L2_0.1
Original model results:
{'task_accuracy': np.float64(0.78375)}
{'fpr': np.float64(0.005016185034525458)}
Best average fairness re