In [2]:
import os
import json
from utils.vocabulary import *

data = "civil"  # "civil" or "jigsaw"
root_dir = f"/scratch/yifwang/fairness_x_explainability/encoder_results_{data}"
debiased_dir = f"/scratch/yifwang/fairness_x_explainability/bias_mitigation_results_{data}"


models = ["bert"] # roberta gender有一个有问题，distilbert的attention都有问题
bias_types = ["race", "gender", "religion"]

debiasing_methods = ["no_debiasing", "group_balance", "group_class_balance", "cda", "dropout", "attention_entropy", "causal_debias"]

#explanation_methods = ["Saliency", "InputXGradient", "IntegratedGradients", "raw_attention", "attention_rollout", "attention_flow", "Occlusion"]
explanation_methods = ["Saliency", "InputXGradient", "raw_attention", "attention_rollout", "attention_flow", "Occlusion"]

aggregation = ["L1", "L2"]
seeds = [1, 2, 42]
alphas = [100.0, 10.0, 1.0, 0.1, 0.01]

training_types = ["one axis"] # ["all axes", "one axis"]
if data == "civil":
    num_examples = {"race": 2000, "gender": 2000, "religion": 1000}
elif data == "jigsaw":
    num_examples = {"race": 400, "gender": 800, "religion": 200}
    
fairness_metrics = ["accuracy", "f1", "fpr", "fnr", "individual_fairness"]



In [4]:
fairness_dict = {"model": [], "bias_type": [], "debiasing_method": [], "training_data": [], "seed": [], "fairness_metric": [], "score": []}
for model in models:
    for bias_type in bias_types:
        groups = SOCIAL_GROUPS[bias_type]
        # load baseline fairness results
        for training_type in training_types:
            data_token = "all" if training_type == "all axes" else bias_type
            for debiasing_method in debiasing_methods:
                
                file_path = os.path.join(root_dir, f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}", debiasing_method, "fairness", f"fairness_{bias_type}_test_summary_stats.json")
                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue
                with open(file_path, "r") as f:
                    fairness_data = json.load(f)
                
                for metric in fairness_metrics:
                    fairness_dict['model'].append(model)
                    fairness_dict['bias_type'].append(bias_type)
                    fairness_dict['seed'].append(-1)
                    fairness_dict['debiasing_method'].append(debiasing_method)
                    fairness_dict['training_data'].append(training_type)
                    if metric != "individual_fairness":
                        fairness_dict['fairness_metric'].append(metric)
                        fairness_dict['score'].append(sum([abs(fairness_data['Group_Fairness']["average"][group][metric]) for group in groups]))
                    else:
                        fairness_dict['fairness_metric'].append("individual_fairness")
                        fairness_dict['score'].append(fairness_data['Individual_Fairness']['overall']["predicted_class"]["abs_average"])
            for seed in seeds:
                for explanation_method in explanation_methods:
                    if explanation_method in ["raw_attention", "attention_rollout", "attention_flow", "Occlusion"]:
                        aggregation = ["none"]
                    else:
                        aggregation = ["L1", "L2"]
                    for aggregation_method in aggregation:
                        for alpha in alphas:
                            if explanation_method in ["raw_attention", "attention_rollout", "attention_flow", "Occlusion"]:
                                file_path = os.path.join(debiased_dir, f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}_{seed}", explanation_method, f"{alpha}","fairness", f"fairness_{bias_type}_test_summary_stats.json")
                            else:
                                file_path = os.path.join(debiased_dir, f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}_{seed}", explanation_method, f"{aggregation_method}_{alpha}","fairness", f"fairness_{bias_type}_test_summary_stats.json")
                            if not os.path.exists(file_path):
                                print(f"File not found: {file_path}")
                                continue
                            with open(file_path, "r") as f:
                                explanation_data = json.load(f)
                            
                            for metric in fairness_metrics:
                                fairness_dict['model'].append(model)
                                fairness_dict['bias_type'].append(bias_type)
                                fairness_dict['seed'].append(seed)
                                if explanation_method in ["raw_attention", "attention_rollout", "attention_flow", "Occlusion"]:
                                    fairness_dict['debiasing_method'].append(f"{explanation_method}_{alpha}")
                                else:
                                    fairness_dict['debiasing_method'].append(f"{explanation_method}_{aggregation_method}_{alpha}")
                                fairness_dict['training_data'].append(training_type)
                                fairness_dict['fairness_metric'].append(metric)
                                if metric != "individual_fairness":
                                    fairness_dict['score'].append(sum([abs(explanation_data['Group_Fairness']["average"][group][metric]) for group in groups]))
                                else:
                                    fairness_dict['score'].append(explanation_data['Individual_Fairness']['overall']["predicted_class"]["abs_average"])

# convert to a pandas DataFrame
import pandas as pd
fairness_df = pd.DataFrame(fairness_dict)
# fairness_df

In [12]:
# # print out each metric for different debiasing methods, for all models and bias types
# for metric in fairness_metrics:
#     print(f"\nMetric: {metric}")
#     for model in models:
#         for bias_type in bias_types:
#             print(f"\nMetric: {metric}, Model: {model}, Bias Type: {bias_type}")
#             for debiasing_method in fairness_df['debiasing_method'].unique():
#                 subset = fairness_df[(fairness_df['model'] == model) & (fairness_df['bias_type'] == bias_type) & (fairness_df['debiasing_method'] == debiasing_method) & (fairness_df['fairness_metric'] == metric)]
#                 if not subset.empty:
#                     print(f"{debiasing_method}: {subset['score'].values[0]}")

In [5]:
# show for each debiasing method, what is there average fairness score (across model types and bias types) for each metric and training type, show in one table, where each row is a debiasing method, and each column is a metric and training type

for train_type in training_types:
    print(f"Average fairness score for training type {train_type}:")
    # show the difference in the average fairness score for each debiasing method compared to no debiasing
    avg_score = fairness_df[fairness_df['training_data'] == train_type].groupby(['debiasing_method', 'fairness_metric'])['score'].mean().unstack().reset_index()
    avg_score = avg_score.set_index('debiasing_method')
    avg_score = avg_score.reindex(fairness_df['debiasing_method'].unique())
    # for each debiasing method and metric, calculate the difference from no debiasing of the same metric
    no_debiasing_scores = avg_score.loc['no_debiasing']
    for metric in avg_score.columns:
        if metric != 'debiasing_method':
            avg_score[metric] = avg_score[metric] - no_debiasing_scores[metric]
    print(avg_score)
    print('\n')

Average fairness score for training type one axis:
fairness_metric          accuracy        f1       fnr       fpr  \
debiasing_method                                                  
no_debiasing             0.000000  0.000000  0.000000  0.000000   
group_balance           -0.013278 -0.018615 -0.022577  0.024077   
group_class_balance     -0.013778 -0.011734 -0.006302  0.000687   
cda                     -0.011111 -0.011296 -0.009266  0.024185   
dropout                 -0.011333 -0.016634 -0.038154  0.006259   
attention_entropy       -0.012333 -0.022503 -0.025290 -0.000719   
causal_debias           -0.014556 -0.011030 -0.012716  0.022940   
Saliency_L1_100.0        0.041944 -0.051895 -0.176426 -0.020989   
Saliency_L1_10.0         0.045278 -0.036721 -0.133547 -0.011092   
Saliency_L1_1.0          0.021759 -0.031289 -0.073759  0.018616   
Saliency_L1_0.1         -0.011167 -0.015126 -0.011201  0.021209   
Saliency_L1_0.01        -0.013426 -0.018125 -0.021673  0.012338   
Saliency_L2

In [8]:
# show for each debiasing method, what is there average fairness score (across model types and bias types) for each metric and training type, show in one table, where each row is a debiasing method, and each column is a metric and training type

for train_type in training_types:
    print(f"Average fairness score for training type {train_type}:")
    # show the difference in the average fairness score for each debiasing method compared to no debiasing
    avg_score = fairness_df[fairness_df['training_data'] == train_type].groupby(['debiasing_method', 'fairness_metric'])['score'].mean().unstack().reset_index()
    avg_score = avg_score.set_index('debiasing_method')
    avg_score = avg_score.reindex(fairness_df['debiasing_method'].unique())
    # for each debiasing method and metric, calculate the difference from no debiasing of the same metric
    print(avg_score)
    print('\n')

Average fairness score for training type one axis:
fairness_metric          accuracy        f1       fnr       fpr  \
debiasing_method                                                  
no_debiasing             0.078056  0.094414  0.176426  0.020989   
group_balance            0.064778  0.075799  0.153849  0.045066   
group_class_balance      0.064278  0.082680  0.170124  0.021676   
cda                      0.066944  0.083118  0.167160  0.045175   
dropout                  0.066722  0.077781  0.138272  0.027248   
attention_entropy        0.065722  0.071911  0.151135  0.020270   
causal_debias            0.063500  0.083384  0.163709  0.043929   
Saliency_L1_100.0        0.120000  0.042519  0.000000  0.000000   
Saliency_L1_10.0         0.123333  0.057693  0.042879  0.009898   
Saliency_L1_1.0          0.099815  0.063125  0.102667  0.039605   
Saliency_L1_0.1          0.066889  0.079288  0.165225  0.042198   
Saliency_L1_0.01         0.064630  0.076289  0.154753  0.033327   
Saliency_L2

In [12]:
performance_dict = {"model": [], "bias_type": [], "debiasing_method": [], "training_data": [], "metric": [], "score": []}
for model in models:
    for bias_type in bias_types:
        groups = SOCIAL_GROUPS[bias_type]
        # load baseline fairness results
        for training_type in training_types:
            data_token = "all" if training_type == "all axes" else bias_type
            for debiasing_method in debiasing_methods:
                
                file_path = os.path.join(root_dir, f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}", debiasing_method, "fairness", f"fairness_{bias_type}_test_summary_stats.json")
                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue
                with open(file_path, "r") as f:
                    fairness_data = json.load(f)
                
                
                for metric in ["accuracy", "f1"]:
                    performance_dict['model'].append(model)
                    performance_dict['bias_type'].append(bias_type)
                    performance_dict['debiasing_method'].append(debiasing_method)
                    performance_dict['training_data'].append(training_type)
                    performance_dict['metric'].append(metric)
                    if metric in fairness_data['Metrics']['overall']:
                        performance_dict['score'].append(fairness_data['Metrics']['overall'][metric])
                    else:
                        print(f"Metric {metric} not found in fairness data for {debiasing_method} on {bias_type}")
                        performance_dict['score'].append(None)

            for explanation_method in explanation_methods:
                if explanation_method in ["raw_attention", "attention_rollout", "attention_flow", "Occlusion"]:
                    aggregation = ["none"]
                else:
                    aggregation = ["L1", "L2"]
                for aggregation_method in aggregation:
                    for alpha in alphas:
                        if explanation_method in ["raw_attention", "attention_rollout", "attention_flow", "Occlusion"]:
                            file_path = os.path.join(debiased_dir, f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}", explanation_method, f"{alpha}","fairness", f"fairness_{bias_type}_test_summary_stats.json")
                        else:
                            file_path = os.path.join(debiased_dir, f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}", explanation_method, f"{aggregation_method}_{alpha}","fairness", f"fairness_{bias_type}_test_summary_stats.json")
                        if not os.path.exists(file_path):
                            print(f"File not found: {file_path}")
                            continue
                        with open(file_path, "r") as f:
                            explanation_data = json.load(f)
                        
                        for metric in ["accuracy", "f1"]:
                            performance_dict['model'].append(model)
                            performance_dict['bias_type'].append(bias_type)
                            if explanation_method in ["raw_attention", "attention_rollout", "attention_flow", "Occlusion"]:
                                performance_dict['debiasing_method'].append(f"{explanation_method}_{alpha}")
                            else:
                                performance_dict['debiasing_method'].append(f"{explanation_method}_{aggregation_method}_{alpha}")
                            performance_dict['training_data'].append(training_type)
                            performance_dict['metric'].append(metric)
                            if metric in explanation_data['Metrics']['overall']:
                                performance_dict['score'].append(explanation_data['Metrics']['overall'][metric])
                            else:
                                print(f"Metric {metric} not found in explanation data for {explanation_method} on {bias_type}")
                                performance_dict['score'].append(None)

# convert to a pandas DataFrame
import pandas as pd
performance_df = pd.DataFrame(performance_dict)
# fairness_df
# show for each debiasing method, what is there average fairness score (across model types and bias types) for each metric and training type, show in one table, where each row is a debiasing method, and each column is a metric and training type



File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_race_race_test_2000/Saliency/L1_100.0/fairness/fairness_race_test_summary_stats.json
File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_race_race_test_2000/Saliency/L1_10.0/fairness/fairness_race_test_summary_stats.json
File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_race_race_test_2000/Saliency/L1_1.0/fairness/fairness_race_test_summary_stats.json
File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_race_race_test_2000/Saliency/L1_0.1/fairness/fairness_race_test_summary_stats.json
File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_race_race_test_2000/Saliency/L1_0.01/fairness/fairness_race_test_summary_stats.json
File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_result

File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_religion_religion_test_1000/Saliency/L1_100.0/fairness/fairness_religion_test_summary_stats.json
File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_religion_religion_test_1000/Saliency/L1_10.0/fairness/fairness_religion_test_summary_stats.json
File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_religion_religion_test_1000/Saliency/L1_1.0/fairness/fairness_religion_test_summary_stats.json
File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_religion_religion_test_1000/Saliency/L1_0.1/fairness/fairness_religion_test_summary_stats.json
File not found: /scratch/yifwang/fairness_x_explainability/bias_mitigation_results_civil/bert_civil_religion_religion_test_1000/Saliency/L1_0.01/fairness/fairness_religion_test_summary_stats.json
File not found: /scra

In [10]:
for train_type in training_types:
    print(f"Average task performance for training type {train_type}:")
    # show the difference in the average fairness score for each debiasing method compared to no debiasing
    avg_score = performance_df[performance_df['training_data'] == train_type].groupby(['debiasing_method', 'metric'])['score'].mean().unstack().reset_index()
    avg_score = avg_score.set_index('debiasing_method')
    avg_score = avg_score.reindex(performance_df['debiasing_method'].unique())
    # for each debiasing method and metric, calculate the difference from no debiasing of the same metric
    no_debiasing_scores = avg_score.loc['no_debiasing']
    for metric in avg_score.columns:
        if metric != 'debiasing_method':
            avg_score[metric] = avg_score[metric] - no_debiasing_scores[metric]
    print(avg_score)
    print('\n')

Average task performance for training type one axis:
metric               accuracy        f1
debiasing_method                       
no_debiasing         0.000000  0.000000
group_balance        0.003250  0.015668
group_class_balance -0.005222 -0.001163
cda                 -0.013333 -0.011149
dropout             -0.002000 -0.003421
attention_entropy   -0.002000 -0.003566
causal_debias       -0.003278  0.000532




In [17]:
for train_type in training_types:
    print(f"Average task performance for training type {train_type}:")
    # show the difference in the average fairness score for each debiasing method compared to no debiasing
    avg_score = performance_df[performance_df['training_data'] == train_type].groupby(['debiasing_method', 'metric'])['score'].mean().unstack().reset_index()
    avg_score = avg_score.set_index('debiasing_method')
    avg_score = avg_score.reindex(performance_df['debiasing_method'].unique())
    print(avg_score)
    print('\n')

Average task performance for training type one axis:
metric                   accuracy        f1
debiasing_method                           
no_debiasing              0.88050  0.837022
group_balance             0.87250  0.828152
group_class_balance       0.87025  0.825656
cda                       0.86700  0.815724
dropout                   0.88200  0.839743
attention_entropy         0.87675  0.835940
causal_debias             0.86175  0.798009
Saliency_L1_100.0         0.73500  0.423631
Saliency_L1_10.0          0.75375  0.523991
Saliency_L1_1.0           0.83250  0.736547
Saliency_L1_0.1           0.88175  0.843526
Saliency_L1_0.01          0.88525  0.847557
Saliency_L2_100.0         0.88200  0.847338
Saliency_L2_10.0          0.88425  0.844043
Saliency_L2_1.0           0.88725  0.850804
Saliency_L2_0.1           0.88275  0.842559
Saliency_L2_0.01          0.88400  0.846151
InputXGradient_L1_100.0   0.73500  0.423631
InputXGradient_L1_10.0    0.73525  0.424635
InputXGradient_L1_1.0  

In [11]:
# visualize the individual fairness scores and accuracy of the explanation debiased models in the same plot
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

for model in models:
    for bias_type in bias_types:
        for train_type in training_types:
            all_individual_fairness_scores = {}
            all_accuracy_scores = {}
            all_explanation_debiased_models = {}
            baseline_fairness = fairness_df[(fairness_df['model'] == model) & (fairness_df['bias_type'] == bias_type) & (fairness_df['training_data'] == train_type) & (fairness_df['debiasing_method'] == "no_debiasing") & (fairness_df["fairness_metric"] == "individual_fairness")]['score'].values[0]
            baseline_accuracy = performance_df[(performance_df['model'] == model) & (performance_df['bias_type'] == bias_type) & (performance_df['training_data'] == train_type) & (performance_df['debiasing_method'] == "no_debiasing") & (performance_df["metric"] == "accuracy")]['score'].values[0]
            for explanation_method in explanation_methods:
                if explanation_method in ["raw_attention", "attention_rollout", "attention_flow", "Occlusion"]:
                    aggregation = ["none"]
                else:
                    aggregation = ["L1", "L2"]
                for aggregation_method in aggregation:
                    explanation_debiased_models = [f"{explanation_method}_{aggregation_method}_{alpha}" if aggregation_method != "none" else f"{explanation_method}_{alpha}" for alpha in alphas]
                    # reverse the order of the explanation_debiased_models to have the lowest alpha first
                    explanation_debiased_models.reverse()
                    # make sure all explanation_debiased_models are in the performance_df and fairness_df
                    if any(explanation_debiased_model not in performance_df['debiasing_method'].unique() for explanation_debiased_model in explanation_debiased_models):
                        print(f"Skipping {model}, {bias_type}, {train_type}, {explanation_method}, {aggregation_method} due to missing debiasing methods")
                        continue
                    individual_fairness_scores = [fairness_df[(fairness_df['model'] == model) & (fairness_df['bias_type'] == bias_type) & (fairness_df['training_data'] == train_type) & (fairness_df['debiasing_method'] == explanation_debiased_model) & (fairness_df["fairness_metric"] == "individual_fairness")]['score'].values[0] for explanation_debiased_model in explanation_debiased_models]
                    accuracy_scores = [performance_df[(performance_df['model'] == model) & (performance_df['bias_type'] == bias_type) & (performance_df['training_data'] == train_type) & (performance_df['debiasing_method'] == explanation_debiased_model) & (performance_df["metric"] == "accuracy")]['score'].values[0] for explanation_debiased_model in explanation_debiased_models]

                    explanation_debiasing_method = f"{explanation_method}_{aggregation_method}" if aggregation_method != "none" else explanation_method
                    all_individual_fairness_scores[explanation_debiasing_method] = individual_fairness_scores
                    all_accuracy_scores[explanation_debiasing_method] = accuracy_scores
                    all_explanation_debiased_models[explanation_debiasing_method] = explanation_debiased_models

            # find the maximum and minimum individual fairness scores and accuracy scores across all explanation debiased models
            max_individual_fairness = max([max(scores) for scores in all_individual_fairness_scores.values()]+[baseline_fairness])
            min_individual_fairness = min([min(scores) for scores in all_individual_fairness_scores.values()]+[baseline_fairness])
            max_accuracy = max([max(scores) for scores in all_accuracy_scores.values()]+[baseline_accuracy])
            min_accuracy = min([min(scores) for scores in all_accuracy_scores.values()]+[baseline_accuracy])

            for explanation_debiasing_method in all_explanation_debiased_models.keys():
                
                explanation_debiased_models = all_explanation_debiased_models[explanation_debiasing_method]
                individual_fairness_scores = all_individual_fairness_scores[explanation_debiasing_method]
                accuracy_scores = all_accuracy_scores[explanation_debiasing_method] 
                # create two plots side by side
                # one plot shows the individual fairness scores using a line plot with a horizontal line at the baseline individual fairness score
                # the other plot shows the accuracy scores using a line plot with a horizontal line at the baseline accuracy score
                # rotate x axis for better readability
                fig, axes = plt.subplots(1, 2, figsize=(14, 6))
                sns.lineplot(x=explanation_debiased_models, y=individual_fairness_scores, ax=axes[0], marker='o')
                axes[0].axhline(y=baseline_fairness, color='r', linestyle='--', label='Baseline Individual Fairness')
                axes[0].set_title(f'Individual Fairness Scores: {model}, {bias_type} ({train_type}) with {explanation_debiasing_method}')
                # set x range
                axes[0].set_ylim(min_individual_fairness-0.001, max_individual_fairness+0.001)
                axes[0].set_xlabel('Debiasing Method')
                axes[0].set_ylabel('Individual Fairness Score')
                axes[0].legend()    
                sns.lineplot(x=explanation_debiased_models, y=accuracy_scores, ax=axes[1], marker='o')
                axes[1].axhline(y=baseline_accuracy, color='r', linestyle='--', label='Baseline Accuracy')

                axes[1].set_title(f'Accuracy Scores for {model} on {bias_type} ({train_type}) with {explanation_debiasing_method}')

                axes[1].set_ylim(min_accuracy-0.05, max_accuracy+0.05)
                axes[1].set_xlabel('Debiasing Method')
                axes[1].set_ylabel('Accuracy Score')
                axes[1].legend()

                for ax in axes:
                    ax.tick_params(axis='x', rotation=45)

                plt.tight_layout()
                plt.show()

Skipping bert, race, one axis, Saliency, L1 due to missing debiasing methods
Skipping bert, race, one axis, Saliency, L2 due to missing debiasing methods
Skipping bert, race, one axis, InputXGradient, L1 due to missing debiasing methods
Skipping bert, race, one axis, InputXGradient, L2 due to missing debiasing methods
Skipping bert, race, one axis, raw_attention, none due to missing debiasing methods
Skipping bert, race, one axis, attention_rollout, none due to missing debiasing methods
Skipping bert, race, one axis, attention_flow, none due to missing debiasing methods
Skipping bert, race, one axis, Occlusion, none due to missing debiasing methods
Skipping bert, gender, one axis, Saliency, L1 due to missing debiasing methods
Skipping bert, gender, one axis, Saliency, L2 due to missing debiasing methods
Skipping bert, gender, one axis, InputXGradient, L1 due to missing debiasing methods
Skipping bert, gender, one axis, InputXGradient, L2 due to missing debiasing methods
Skipping bert, 