In [None]:
import os
import json
from utils.vocabulary import *

data = "civil"  # "civil" or "jigsaw"
root_dir = f"/scratch/yifwang/new_fairness_x_explainability/encoder_results_{data}"
debiased_dir = f"/scratch/yifwang/new_fairness_x_explainability/bias_mitigation_results_{data}"


models = ["bert"] # ["bert", "roberta", "distilbert"]
bias_types = ["race"]  # ["race", "gender", "religion"]
seeds = [42]

debiasing_methods = ["no_debiasing", "group_balance", "group_class_balance", "cda", "dropout", "attention_entropy", "causal_debias"]

explanation_methods = ["Saliency"] # ["Saliency", "InputXGradient", "IntegratedGradients", "raw_attention", "attention_rollout", "attention_flow", "occlusion"]
aggregation = ["L1", "L2"]
alphas = [1.0, 0.1, 0.01, 0.001, 0.0001]

training_types = ["one axis"] # ["all axes", "one axis"]
if data == "civil":
    num_examples = {"race": 2000, "gender": 2000, "religion": 1000}
elif data == "jigsaw":
    num_examples = {"race": 400, "gender": 800, "religion": 200}
    
fairness_metrics = ["accuracy", "f1", "fpr", "fnr", "individual_fairness"]

In [None]:
fairness_dict = {"model": [], "bias_type": [], "debiasing_method": [], "training_data": [], "metrics": [], "score": [], "seed": []}
for model in models:
    for bias_type in bias_types:
        groups = SOCIAL_GROUPS[bias_type]
        # load baseline fairness results
        for training_type in training_types:
            data_token = "all" if training_type == "all axes" else bias_type
            for debiasing_method in debiasing_methods:
                
                file_path = os.path.join(root_dir, f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}", debiasing_method, "fairness", f"fairness_{bias_type}_test_summary_stats.json")
                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue
                with open(file_path, "r") as f:
                    fairness_data = json.load(f)
                
                
                for metric in ["accuracy", "f1"]:
                    fairness_dict['model'].append(model)
                    fairness_dict['bias_type'].append(bias_type)
                    fairness_dict['debiasing_method'].append(debiasing_method)
                    fairness_dict["seed"].append(42)
                    fairness_dict['training_data'].append(training_type)
                    fairness_dict['metrics'].append(f"task_{metric}")
                    if metric in fairness_data['Metrics']['overall']:
                        fairness_dict['score'].append(fairness_data['Metrics']['overall'][metric])
                    else:
                        print(f"Metric {metric} not found in fairness data for {debiasing_method} on {bias_type}")
                        fairness_dict['score'].append(None)
 

            for explanation_method in explanation_methods:
                for aggregation_method in aggregation:
                    for alpha in alphas:
                        for seed in seeds:
                            file_path = os.path.join(debiased_dir, f"{model}_{data}_{data_token}_{bias_type}_{split}_{num_examples[bias_type]}_{seed}", explanation_method, f"{aggregation_method}_{alpha}","fairness", f"fairness_{bias_type}_{split}_summary_stats.json")
                            if not os.path.exists(file_path):
                                print(f"File not found: {file_path}")
                                continue
                            with open(file_path, "r") as f:
                                explanation_data = json.load(f)
                            
                            for metric in ["accuracy", "f1"]:
                                fairness_dict['model'].append(model)
                                fairness_dict['bias_type'].append(bias_type)
                                fairness_dict['debiasing_method'].append(f"{explanation_method}_{aggregation_method}_{alpha}")
                                fairness_dict['training_data'].append(training_type)
                                fairness_dict['metrics'].append(f"task_{metric}")
                                fairness_dict['seed'].append(seed)
                                if metric in explanation_data['Metrics']['overall']:
                                    fairness_dict['score'].append(explanation_data['Metrics']['overall'][metric])
                                else:
                                    print(f"Metric {metric} not found in explanation data for {explanation_method} on {bias_type}")
                                    fairness_dict['score'].append(None)
                            for metric in fairness_metrics:
                                fairness_dict['model'].append(model)
                                fairness_dict['bias_type'].append(bias_type)
                                fairness_dict['debiasing_method'].append(f"{explanation_method}_{aggregation_method}_{alpha}")
                                fairness_dict['training_data'].append(training_type)
                                fairness_dict['seed'].append(seed)
                                if metric != "individual_fairness":
                                    fairness_dict['metric'].append(metric)
                                    fairness_dict['score'].append(sum([abs(fairness_data['Group_Fairness']["average"][group][metric]) for group in groups]))
                                else:
                                    fairness_dict['metric'].append("individual_fairness")
                                    fairness_dict['score'].append(fairness_data['Individual_Fairness']['overall']["predicted_class"]["abs_average"])




In [1]:
# convert to a pandas DataFrame
import pandas as pd
fairness_df = pd.DataFrame(fairness_dict)
# fairness_df

NameError: name 'fairness_dict' is not defined

In [7]:
# show for each debiasing method, what is there average fairness score (across model types and bias types) for each metric and training type, show in one table, where each row is a debiasing method, and each column is a metric and training type

for train_type in training_types:
    print(f"Average fairness score for training type {train_type}:")
    # show the difference in the average fairness score for each debiasing method compared to no debiasing
    avg_score = fairness_df[fairness_df['training_data'] == train_type].groupby(['debiasing_method', 'metrics'])['score'].mean().unstack().reset_index()
    avg_score = avg_score.set_index('debiasing_method')
    avg_score = avg_score.reindex(fairness_df['debiasing_method'].unique())
    print(avg_score)
    print('\n')

Average fairness score for training type one axis:
metrics              accuracy        f1
debiasing_method                       
no_debiasing          0.78375  0.751909
group_balance         0.79250  0.775724
group_class_balance   0.78000  0.762283
cda                   0.76825  0.744694
dropout               0.78525  0.755417
attention_entropy     0.79150  0.765320
causal_debias         0.78800  0.772503
Saliency_L1_1.0       0.76975  0.756139
Saliency_L1_0.1       0.76700  0.747938
Saliency_L1_0.01      0.77625  0.750052
Saliency_L1_0.001     0.77075  0.753470
Saliency_L1_0.0001    0.78425  0.763416
Saliency_L2_1.0       0.77950  0.757931
Saliency_L2_0.1       0.78275  0.764941
Saliency_L2_0.01      0.77800  0.757845
Saliency_L2_0.001     0.77100  0.747934
Saliency_L2_0.0001    0.77975  0.762604




In [8]:
# show for each debiasing method, what is there average fairness score (across model types and bias types) for each metric and training type, show in one table, where each row is a debiasing method, and each column is a metric and training type

for train_type in training_types:
    print(f"Average fairness score for training type {train_type}:")
    # show the difference in the average fairness score for each debiasing method compared to no debiasing
    avg_score = fairness_df[fairness_df['training_data'] == train_type].groupby(['debiasing_method', 'metrics'])['score'].mean().unstack().reset_index()
    avg_score = avg_score.set_index('debiasing_method')
    avg_score = avg_score.reindex(fairness_df['debiasing_method'].unique())
    # for each debiasing method and metric, calculate the difference from no debiasing of the same metric
    no_debiasing_scores = avg_score.loc['no_debiasing']
    for metric in avg_score.columns:
        if metric != 'debiasing_method':
            avg_score[metric] = avg_score[metric] - no_debiasing_scores[metric]
    print(avg_score)
    print('\n')

Average fairness score for training type one axis:
metrics              accuracy        f1
debiasing_method                       
no_debiasing          0.00000  0.000000
group_balance         0.00875  0.023814
group_class_balance  -0.00375  0.010374
cda                  -0.01550 -0.007215
dropout               0.00150  0.003508
attention_entropy     0.00775  0.013411
causal_debias         0.00425  0.020593
Saliency_L1_1.0      -0.01400  0.004230
Saliency_L1_0.1      -0.01675 -0.003971
Saliency_L1_0.01     -0.00750 -0.001857
Saliency_L1_0.001    -0.01300  0.001561
Saliency_L1_0.0001    0.00050  0.011507
Saliency_L2_1.0      -0.00425  0.006022
Saliency_L2_0.1      -0.00100  0.013032
Saliency_L2_0.01     -0.00575  0.005936
Saliency_L2_0.001    -0.01275 -0.003975
Saliency_L2_0.0001   -0.00400  0.010694


