In [5]:
import os
import json
from utils.vocabulary import *

data = "civil"
root_dir = f"/scratch/yifwang/fairness_x_explainability/encoder_results_{data}"


models = ["bert"] # ["bert", "roberta", "distilbert"]
bias_types = ["race", "gender", "religion"]  # ["race", "gender", "religion"]

debiasing_methods = ["no_debiasing", "group_balance", "group_class_balance", "cda", "dropout", "attention_entropy", "causal_debias"]

training_types = ["all axes", "one axis"]
if data == "civil":
    num_examples = {"race": 2000, "gender": 2000, "religion": 1000}
elif data == "jigsaw":
    num_examples = {"race": 400, "gender": 800, "religion": 200}
    
fairness_metrics = ["model_accuracy", "accuracy", "fpr", "fnr", "individual_fairness"]



In [14]:
fairness_dict = {"model": [], "bias_type": [], "debiasing_method": [], "training_data": [], "fairness_metric": [], "score": []}
for model in models:
    for bias_type in bias_types:
        groups = SOCIAL_GROUPS[bias_type]
        for training_type in training_types:
            data_token = "all" if training_type == "all axes" else bias_type
            for debiasing_method in debiasing_methods:
                
                file_path = os.path.join(root_dir, f"{model}_{data}_{data_token}_{bias_type}_test_{num_examples[bias_type]}", debiasing_method, "fairness", f"fairness_{bias_type}_test_summary_stats.json")
                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue
                with open(file_path, "r") as f:
                    fairness_data = json.load(f)
                
                for metric in fairness_metrics:
                    fairness_dict['model'].append(model)
                    fairness_dict['bias_type'].append(bias_type)
                    fairness_dict['debiasing_method'].append(debiasing_method)
                    fairness_dict['training_data'].append(training_type)
                    if metric != "individual_fairness" and metric != "model_accuracy":
                        fairness_dict['fairness_metric'].append(metric)
                        fairness_dict['score'].append(round(sum([abs(fairness_data['Group_Fairness']["average"][group][metric]) for group in groups])*100, 2))
                    elif metric == "individual_fairness":
                        fairness_dict['fairness_metric'].append("individual_fairness")
                        fairness_dict['score'].append(round(fairness_data['Individual_Fairness']['overall']["predicted_class"]["abs_average"]*100, 2))
                    elif metric == "model_accuracy":
                        fairness_dict['fairness_metric'].append("model_accuracy")
                        fairness_dict['score'].append(round(fairness_data["Metrics"]["overall"]["accuracy"] * 100, 2))
                    



In [15]:
# convert to a pandas DataFrame
import pandas as pd
fairness_df = pd.DataFrame(fairness_dict)
fairness_df

Unnamed: 0,model,bias_type,debiasing_method,training_data,fairness_metric,score
0,bert,race,no_debiasing,all axes,model_accuracy,78.30
1,bert,race,no_debiasing,all axes,accuracy,2.00
2,bert,race,no_debiasing,all axes,fpr,0.02
3,bert,race,no_debiasing,all axes,fnr,8.44
4,bert,race,no_debiasing,all axes,individual_fairness,3.99
...,...,...,...,...,...,...
205,bert,religion,causal_debias,one axis,model_accuracy,86.40
206,bert,religion,causal_debias,one axis,accuracy,16.40
207,bert,religion,causal_debias,one axis,fpr,8.82
208,bert,religion,causal_debias,one axis,fnr,30.46


In [16]:
# 用于显示所有结果
# show for each debiasing method, what is there average fairness score (across model types and bias types) for each metric and training type, show in one table, where each row is a debiasing method, and each column is a metric and training type

for train_type in training_types:
    for bias_type in bias_types:
        print(f"Average fairness score for training type {train_type}, {bias_type}:")
        # show the difference in the average fairness score for each debiasing method compared to no debiasing
        avg_score = fairness_df[(fairness_df['training_data'] == train_type) & (fairness_df['bias_type'] == bias_type)].groupby(['debiasing_method', 'fairness_metric'])['score'].mean().unstack().reset_index()
        avg_score = avg_score.set_index('debiasing_method')
        avg_score = avg_score.reindex(debiasing_methods)
        print(avg_score)
        print('\n')

Average fairness score for training type all axes, race:
fairness_metric      accuracy    fnr   fpr  individual_fairness  \
debiasing_method                                                  
no_debiasing             2.00   8.44  0.02                 3.99   
group_balance            3.50   8.83  1.72                 4.13   
group_class_balance      1.95   9.33  1.35                 4.83   
cda                      2.65  20.35  6.38                 0.60   
dropout                  2.45   9.99  0.30                 3.60   
attention_entropy        2.10   5.92  1.28                 4.98   
causal_debias            2.20  13.13  2.51                 3.54   

fairness_metric      model_accuracy  
debiasing_method                     
no_debiasing                  78.30  
group_balance                 79.05  
group_class_balance           78.17  
cda                           78.08  
dropout                       78.08  
attention_entropy             78.35  
causal_debias                 79.40

In [64]:
# 加color coding表示比较
# print results as metric1_race/metric1_gender/metric1_religion metric2_race/ ...
train_data = "one axis"
debiasing_method = "causal_debias"
reported_metrics = ["model_accuracy", "accuracy", "fpr", "fnr", "individual_fairness"]
bias_types_order = ["race", "gender", "religion"]
print_string = ""
for reported_metric in reported_metrics:
    for bias_type in bias_types_order:
        default_score = fairness_df[(fairness_df['training_data'] == train_data) & (fairness_df['debiasing_method'] == "no_debiasing") & (fairness_df['bias_type'] == bias_type) & (fairness_df['fairness_metric'] == reported_metric)]['score'].values[0]
        score = fairness_df[(fairness_df['training_data'] == train_data) & (fairness_df['debiasing_method'] == debiasing_method) & (fairness_df['bias_type'] == bias_type) & (fairness_df['fairness_metric'] == reported_metric)]['score'].values[0]
        if debiasing_method != "no_debiasing" and reported_metric != "model_accuracy":
            if score < default_score:
                print_string += "\\textcolor{forestgreen}{"
                print_string += f"{score:.2f}" + "}/"
            elif score > default_score:
                print_string += "\\textcolor{red}{"
                print_string += f"{score:.2f}" + "}/"
            else:
                print_string += f"{score:.2f}" + "/"
        else:
            print_string += f"{score:.2f}" + "/"
    print_string = print_string[:-1] + " & "

print(print_string[:-2])


78.80/86.17/86.40 & \textcolor{forestgreen}{0.00}/\textcolor{forestgreen}{2.65}/\textcolor{forestgreen}{16.40} & \textcolor{red}{3.90}/\textcolor{red}{0.46}/\textcolor{red}{8.82} & \textcolor{forestgreen}{7.98}/\textcolor{forestgreen}{10.67}/\textcolor{forestgreen}{30.46} & \textcolor{red}{3.83}/\textcolor{forestgreen}{0.48}/\textcolor{red}{2.10} 


In [4]:
# print out each metric for different debiasing methods, for all models and bias types
for metric in fairness_metrics:
    print(f"\nMetric: {metric}")
    for model in models:
        for bias_type in bias_types:
            print(f"\nMetric: {metric}, Model: {model}, Bias Type: {bias_type}")
            for debiasing_method in debiasing_methods:
                subset = fairness_df[(fairness_df['model'] == model) & (fairness_df['bias_type'] == bias_type) & (fairness_df['debiasing_method'] == debiasing_method) & (fairness_df['fairness_metric'] == metric)]
                if not subset.empty:
                    print(f"{debiasing_method}: {subset['score'].values[0]}")


Metric: accuracy

Metric: accuracy, Model: bert, Bias Type: race
no_debiasing: 0.012500000000000067
group_balance: 0.01750000000000007
group_class_balance: 0.012499999999999956
cda: 0.032500000000000084
dropout: 0.030000000000000027
attention_entropy: 0.0050000000000000044
causal_debias: 0.010000000000000009

Metric: accuracy, Model: bert, Bias Type: gender
no_debiasing: 0.011249999999999982
group_balance: 0.011249999999999982
group_class_balance: 0.008749999999999925
cda: 0.018750000000000044
dropout: 0.01375000000000004
attention_entropy: 0.02750000000000008
causal_debias: 0.01375000000000004

Metric: accuracy, Model: bert, Bias Type: religion
no_debiasing: 0.09333333333333327
group_balance: 0.09666666666666668
group_class_balance: 0.09999999999999987
cda: 0.07000000000000006
dropout: 0.07000000000000006
attention_entropy: 0.08000000000000007
causal_debias: 0.08999999999999986

Metric: accuracy, Model: roberta, Bias Type: race
no_debiasing: 0.0050000000000000044
group_balance: 0.017

In [5]:
# show the average fairness score for each bias type
for metric in fairness_metrics:
    print(f"Average fairness score for {metric}:")
    avg_score = fairness_df[fairness_df['fairness_metric'] == metric].groupby(['bias_type'])['score'].mean().reset_index()
    print(avg_score)


Average fairness score for accuracy:
  bias_type     score
0    gender  0.020357
1      race  0.014405
2  religion  0.081905
Average fairness score for f1:
  bias_type     score
0    gender  0.066184
1      race  0.029809
2  religion  0.170914
Average fairness score for fpr:
  bias_type     score
0    gender  0.015972
1      race  0.014296
2  religion  0.032984
Average fairness score for fnr:
  bias_type     score
0    gender  0.086337
1      race  0.058762
2  religion  0.332525
Average fairness score for individual_fairness:
  bias_type     score
0    gender  0.003858
1      race  0.019275
2  religion  0.008408


In [6]:
# show for each debiasing method, what is there average fairness score (across model types and bias types) for each metric and training type, show in one table, where each row is a debiasing method, and each column is a metric and training type

for train_type in training_types:
    print(f"Average fairness score for training type {train_type}:")
    # show the difference in the average fairness score for each debiasing method compared to no debiasing
    avg_score = fairness_df[fairness_df['training_data'] == train_type].groupby(['debiasing_method', 'fairness_metric'])['score'].mean().unstack().reset_index()
    avg_score = avg_score.set_index('debiasing_method')
    avg_score = avg_score.reindex(debiasing_methods)
    print(avg_score)
    print('\n')

Average fairness score for training type all axes:
fairness_metric      accuracy        f1       fnr       fpr  \
debiasing_method                                              
no_debiasing         0.038009  0.084849  0.161212  0.020708   
group_balance        0.044537  0.099717  0.173913  0.023810   
group_class_balance  0.041435  0.125013  0.236188  0.014794   
cda                  0.035880  0.080788  0.167265  0.018564   
dropout              0.037269  0.085589  0.169209  0.020820   
attention_entropy    0.037222  0.088371  0.161610  0.020243   
causal_debias        0.039954  0.089016  0.129075  0.025391   

fairness_metric      individual_fairness  
debiasing_method                          
no_debiasing                    0.011719  
group_balance                   0.012563  
group_class_balance             0.011691  
cda                             0.004212  
dropout                         0.009975  
attention_entropy               0.013964  
causal_debias                   0.011

In [7]:
# show for each debiasing method, what is there average fairness score (across model types and bias types) for each metric and training type, show in one table, where each row is a debiasing method, and each column is a metric and training type

for train_type in training_types:
    print(f"Average fairness score for training type {train_type}:")
    # show the difference in the average fairness score for each debiasing method compared to no debiasing
    avg_score = fairness_df[fairness_df['training_data'] == train_type].groupby(['debiasing_method', 'fairness_metric'])['score'].mean().unstack().reset_index()
    avg_score = avg_score.set_index('debiasing_method')
    avg_score = avg_score.reindex(debiasing_methods)
    # for each debiasing method and metric, calculate the difference from no debiasing of the same metric
    no_debiasing_scores = avg_score.loc['no_debiasing']
    for metric in avg_score.columns:
        if metric != 'debiasing_method':
            avg_score[metric] = avg_score[metric] - no_debiasing_scores[metric]
    print(avg_score)
    print('\n')

Average fairness score for training type all axes:
fairness_metric      accuracy        f1       fnr       fpr  \
debiasing_method                                              
no_debiasing         0.000000  0.000000  0.000000  0.000000   
group_balance        0.006528  0.014868  0.012700  0.003102   
group_class_balance  0.003426  0.040164  0.074975 -0.005915   
cda                 -0.002130 -0.004060  0.006052 -0.002144   
dropout             -0.000741  0.000740  0.007997  0.000112   
attention_entropy   -0.000787  0.003522  0.000398 -0.000466   
causal_debias        0.001944  0.004167 -0.032137  0.004682   

fairness_metric      individual_fairness  
debiasing_method                          
no_debiasing                    0.000000  
group_balance                   0.000844  
group_class_balance            -0.000028  
cda                            -0.007508  
dropout                        -0.001745  
attention_entropy               0.002244  
causal_debias                  -0.000