In [1]:
import os
import json
from utils.vocabulary import *

# Dataset whose decoder results are summarized. It selects both the results
# directory and the per-bias-type test-set sizes below.
data = "civil"
root_dir = f"/scratch/yifwang/fairness_x_explainability/decoder_results_{data}"

# Models to report. Alternative full list: ["llama_3b", "qwen_3b", "qwen3_4b"]
models = ['qwen3_4b']
bias_types = ["race", "gender", "religion"]
debiasing_methods = ["zero_shot", "few_shot", "fairness_imagination", "fairness_instruction"]

# Number of test examples per bias type for each supported dataset. The count
# is embedded in the per-run results directory name when summaries are loaded.
NUM_EXAMPLES_BY_DATASET = {
    "civil": {"race": 2000, "gender": 2000, "religion": 1000},
    "jigsaw": {"race": 400, "gender": 800, "religion": 200},
}
if data not in NUM_EXAMPLES_BY_DATASET:
    # Fail here with a clear message instead of leaving `num_examples`
    # undefined and hitting a NameError in a later cell.
    raise ValueError(
        f"Unknown dataset {data!r}; expected one of {sorted(NUM_EXAMPLES_BY_DATASET)}"
    )
# Copy so that later edits to `num_examples` never alter the lookup table.
num_examples = dict(NUM_EXAMPLES_BY_DATASET[data])

# Metrics reported downstream, in display order.
fairness_metrics = ["model_accuracy", "accuracy", "fpr", "fnr", "individual_fairness"]



In [2]:
# Long-format results stored as parallel column lists: one entry per
# (model, bias type, debiasing method, metric) combination whose summary
# file exists on disk.
fairness_dict = {"model": [], "bias_type": [], "debiasing_method": [], "fairness_metric": [], "score": []}


def _metric_score(fairness_data, metric, groups):
    """Return one metric's score, scaled by 100 and rounded to two decimals.

    - "model_accuracy": the overall accuracy stored under "Metrics".
    - "individual_fairness": the "abs_average" of "predicted_class" stored
      under "Individual_Fairness"/"overall".
    - any other metric: the sum over `groups` of the absolute per-group
      value stored under "Group_Fairness"/"average".
    """
    if metric == "model_accuracy":
        return round(fairness_data["Metrics"]["overall"]["accuracy"] * 100, 2)
    if metric == "individual_fairness":
        return round(fairness_data['Individual_Fairness']['overall']["predicted_class"]["abs_average"]*100, 2)
    return round(sum(abs(fairness_data['Group_Fairness']["average"][group][metric]) for group in groups)*100, 2)


for model in models:
    for bias_type in bias_types:
        groups = SOCIAL_GROUPS[bias_type]

        for debiasing_method in debiasing_methods:
            # <root>/<model>_<data>_<bias>_test_<n>/<method>/fairness/<summary>.json
            run_dir = f"{model}_{data}_{bias_type}_test_{num_examples[bias_type]}"
            summary_name = f"fairness_{bias_type}_test_summary_stats.json"
            file_path = os.path.join(root_dir, run_dir, debiasing_method, "fairness", summary_name)

            # Missing runs are reported and skipped rather than treated as errors.
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue
            with open(file_path, "r") as f:
                fairness_data = json.load(f)

            for metric in fairness_metrics:
                # Identifying columns are appended first, then the score.
                identifiers = (
                    ("model", model),
                    ("bias_type", bias_type),
                    ("debiasing_method", debiasing_method),
                    ("fairness_metric", metric),
                )
                for column, value in identifiers:
                    fairness_dict[column].append(value)
                fairness_dict["score"].append(_metric_score(fairness_data, metric, groups))
                    



In [3]:
# Turn the collected column lists into a long-format table with one row per
# (model, bias_type, debiasing_method, fairness_metric) and its score.
import pandas as pd

fairness_df = pd.DataFrame.from_dict(fairness_dict, orient="columns")
fairness_df

Unnamed: 0,model,bias_type,debiasing_method,fairness_metric,score
0,qwen3_4b,race,zero_shot,model_accuracy,69.55
1,qwen3_4b,race,zero_shot,accuracy,0.6
2,qwen3_4b,race,zero_shot,fpr,7.13
3,qwen3_4b,race,zero_shot,fnr,13.25
4,qwen3_4b,race,zero_shot,individual_fairness,2.55
5,qwen3_4b,race,few_shot,model_accuracy,63.0
6,qwen3_4b,race,few_shot,accuracy,1.9
7,qwen3_4b,race,few_shot,fpr,10.17
8,qwen3_4b,race,few_shot,fnr,8.04
9,qwen3_4b,race,few_shot,individual_fairness,3.3


In [7]:
# Add color coding to show how each score compares with the zero-shot baseline.
# Prints one LaTeX table row laid out as
#   metric1_race/metric1_gender/metric1_religion & metric2_race/... & ...
debiasing_method = "fairness_instruction"
reported_metrics = ["model_accuracy", "accuracy", "fpr", "fnr", "individual_fairness"]
bias_types_order = ["race", "gender", "religion"]


def _first_score(method, bias_type, metric):
    """Return the first score in `fairness_df` matching the given keys.

    Rows are not filtered by model, so when `fairness_df` holds several
    models the first matching row is used.

    Raises:
        IndexError: if no row matches; the message names the missing
            combination.
    """
    matches = fairness_df[
        (fairness_df['debiasing_method'] == method)
        & (fairness_df['bias_type'] == bias_type)
        & (fairness_df['fairness_metric'] == metric)
    ]
    if matches.empty:
        raise IndexError(
            f"No score for debiasing_method={method!r}, bias_type={bias_type!r}, "
            f"fairness_metric={metric!r}"
        )
    return matches['score'].values[0]


def _latex_cell(score, default_score, colorize):
    """Format `score` with two decimals, optionally wrapped in a LaTeX color.

    When `colorize` is true, a score below `default_score` is colored
    forestgreen and a score above it red; an equal score stays uncolored.
    """
    text = f"{score:.2f}"
    if colorize and score < default_score:
        return "\\textcolor{forestgreen}{" + text + "}"
    if colorize and score > default_score:
        return "\\textcolor{red}{" + text + "}"
    return text


row_cells = []
for reported_metric in reported_metrics:
    # The baseline itself and model_accuracy are never colored.
    colorize = debiasing_method != "zero_shot" and reported_metric != "model_accuracy"
    per_bias = []
    for bias_type in bias_types_order:
        default_score = _first_score("zero_shot", bias_type, reported_metric)
        score = _first_score(debiasing_method, bias_type, reported_metric)
        per_bias.append(_latex_cell(score, default_score, colorize))
    row_cells.append("/".join(per_bias))

# Joining avoids the append-then-slice bookkeeping, which left a trailing
# space on the printed row and would cut real content for empty lists.
print_string = " & ".join(row_cells)
print(print_string)

70.40/79.77/80.47 & 0.60/\textcolor{red}{1.35}/\textcolor{red}{19.33} & \textcolor{forestgreen}{4.30}/\textcolor{forestgreen}{0.39}/\textcolor{forestgreen}{4.67} & \textcolor{forestgreen}{11.11}/\textcolor{red}{5.24}/\textcolor{forestgreen}{5.08} & \textcolor{forestgreen}{2.02}/\textcolor{forestgreen}{1.83}/\textcolor{forestgreen}{1.71} 


In [4]:
# For every metric, list the score of each debiasing method, broken down by
# model and bias type. Combinations with no row in fairness_df are skipped.
for metric in fairness_metrics:
    print(f"\nMetric: {metric}")
    metric_mask = fairness_df['fairness_metric'] == metric
    for model in models:
        for bias_type in bias_types:
            print(f"\nMetric: {metric}, Model: {model}, Bias Type: {bias_type}")
            group_mask = metric_mask & (fairness_df['model'] == model) & (fairness_df['bias_type'] == bias_type)
            for debiasing_method in debiasing_methods:
                method_mask = group_mask & (fairness_df['debiasing_method'] == debiasing_method)
                scores = fairness_df.loc[method_mask, 'score']
                if len(scores) > 0:
                    print(f"{debiasing_method}: {scores.values[0]}")


Metric: accuracy

Metric: accuracy, Model: llama_3b, Bias Type: race
zero_shot: 0.014500000000000068
few_shot: 0.011999999999999955
fairness_imagination: 0.008000000000000007
fairness_instruction: 0.026000000000000023

Metric: accuracy, Model: llama_3b, Bias Type: gender
zero_shot: 0.023499999999999965
few_shot: 0.008999999999999952
fairness_imagination: 0.008499999999999952
fairness_instruction: 0.017000000000000015

Metric: accuracy, Model: llama_3b, Bias Type: religion
zero_shot: 0.2466666666666666
few_shot: 0.21266666666666667
fairness_imagination: 0.21866666666666668
fairness_instruction: 0.21533333333333315

Metric: accuracy, Model: qwen_3b, Bias Type: race
zero_shot: 0.006000000000000005
few_shot: 0.04799999999999993
fairness_imagination: 0.017000000000000015
fairness_instruction: 0.000500000000000056

Metric: accuracy, Model: qwen_3b, Bias Type: gender
zero_shot: 0.008499999999999952
few_shot: 0.0040000000000000036
fairness_imagination: 0.018500000000000072
fairness_instructio

In [5]:
# Mean score per bias type, reported separately for each metric. The mean is
# taken over every row of fairness_df that carries that metric.
for metric in fairness_metrics:
    print(f"Average fairness score for {metric}:")
    metric_rows = fairness_df.loc[fairness_df['fairness_metric'] == metric]
    avg_score = (
        metric_rows
        .groupby(['bias_type'])['score']
        .mean()
        .reset_index()
    )
    print(avg_score)

Average fairness score for accuracy:
  bias_type     score
0    gender  0.011875
1      race  0.014292
2  religion  0.203833
Average fairness score for f1:
  bias_type     score
0    gender  0.023567
1      race  0.024958
2  religion  0.130482
Average fairness score for fpr:
  bias_type     score
0    gender  0.022738
1      race  0.060277
2  religion  0.181168
Average fairness score for fnr:
  bias_type     score
0    gender  0.066597
1      race  0.086762
2  religion  0.070716
Average fairness score for individual_fairness:
  bias_type     score
0    gender  0.022446
1      race  0.019458
2  religion  0.024068


In [6]:
# For each bias type and model, print a table of mean scores with one row per
# debiasing method (ordered as in `debiasing_methods`) and one column per
# fairness metric.
for bias_type in bias_types:
    for model in models:
        print(f"\nModel: {model}, Bias Type: {bias_type}")
        selected = fairness_df.loc[
            (fairness_df['bias_type'] == bias_type) & (fairness_df['model'] == model)
        ]
        avg_score = (
            selected
            .groupby(['debiasing_method', 'fairness_metric'])['score']
            .mean()
            .unstack()  # index: debiasing_method, columns: fairness_metric
            .reindex(debiasing_methods)
        )
        print(avg_score)
        print('\n')


Model: llama_3b, Bias Type: race
fairness_metric       accuracy        f1       fnr       fpr  \
debiasing_method                                               
zero_shot               0.0145  0.015701  0.105374  0.110282   
few_shot                0.0120  0.012175  0.008027  0.038745   
fairness_imagination    0.0080  0.004217  0.094418  0.087010   
fairness_instruction    0.0260  0.008312  0.037876  0.018944   

fairness_metric       individual_fairness  
debiasing_method                           
zero_shot                        0.021278  
few_shot                         0.001059  
fairness_imagination             0.026466  
fairness_instruction             0.013469  



Model: qwen_3b, Bias Type: race
fairness_metric       accuracy        f1       fnr       fpr  \
debiasing_method                                               
zero_shot               0.0060  0.049901  0.130871  0.040550   
few_shot                0.0480  0.032762  0.032588  0.091688   
fairness_imagination    0.

In [7]:
# For each bias type and model, print how every debiasing method's mean score
# differs from the zero_shot row: rows are debiasing methods, columns are
# fairness metrics, and each entry is (method score - zero_shot score).
for bias_type in bias_types:
    for model in models:
        print(f"\nModel: {model}, Bias Type: {bias_type}")
        selected = fairness_df.loc[
            (fairness_df['bias_type'] == bias_type) & (fairness_df['model'] == model)
        ]
        method_by_metric = (
            selected
            .groupby(['debiasing_method', 'fairness_metric'])['score']
            .mean()
            .unstack()  # index: debiasing_method, columns: fairness_metric
            .reindex(debiasing_methods)
        )
        # Baseline row, subtracted from every row column by column.
        no_debiasing_scores = method_by_metric.loc['zero_shot']
        avg_score = method_by_metric.sub(no_debiasing_scores, axis='columns')
        print(avg_score)
        print('\n')



Model: llama_3b, Bias Type: race
fairness_metric       accuracy        f1       fnr       fpr  \
debiasing_method                                               
zero_shot               0.0000  0.000000  0.000000  0.000000   
few_shot               -0.0025 -0.003526 -0.097347 -0.071537   
fairness_imagination   -0.0065 -0.011484 -0.010956 -0.023272   
fairness_instruction    0.0115 -0.007390 -0.067497 -0.091338   

fairness_metric       individual_fairness  
debiasing_method                           
zero_shot                        0.000000  
few_shot                        -0.020220  
fairness_imagination             0.005187  
fairness_instruction            -0.007809  



Model: qwen_3b, Bias Type: race
fairness_metric       accuracy        f1       fnr       fpr  \
debiasing_method                                               
zero_shot               0.0000  0.000000  0.000000  0.000000   
few_shot                0.0420 -0.017138 -0.098283  0.051138   
fairness_imagination    0.

In [8]:
# Mean score of each debiasing method per fairness metric, pooled over every
# model and bias type in fairness_df. Rows follow the order of
# `debiasing_methods`; columns are the fairness metrics.
avg_score = (
    fairness_df
    .groupby(['debiasing_method', 'fairness_metric'])['score']
    .mean()
    .unstack()  # index: debiasing_method, columns: fairness_metric
    .reindex(debiasing_methods)
)
print(avg_score)
print('\n')

fairness_metric       accuracy        f1       fnr       fpr  \
debiasing_method                                               
zero_shot             0.075093  0.062425  0.070783  0.104174   
few_shot              0.080074  0.064278  0.058294  0.128084   
fairness_imagination  0.075907  0.059893  0.093008  0.083224   
fairness_instruction  0.075593  0.052080  0.076681  0.036761   

fairness_metric       individual_fairness  
debiasing_method                           
zero_shot                        0.025211  
few_shot                         0.015659  
fairness_imagination             0.028943  
fairness_instruction             0.018151  




In [9]:
# Difference between each debiasing method's pooled mean score and the
# zero_shot row, per fairness metric. Means are pooled over every model and
# bias type in fairness_df; each entry is (method score - zero_shot score).
method_by_metric = (
    fairness_df
    .groupby(['debiasing_method', 'fairness_metric'])['score']
    .mean()
    .unstack()  # index: debiasing_method, columns: fairness_metric
    .reindex(debiasing_methods)
)
# Baseline row, subtracted from every row column by column.
no_debiasing_scores = method_by_metric.loc['zero_shot']
avg_score = method_by_metric.sub(no_debiasing_scores, axis='columns')
print(avg_score)
print('\n')

fairness_metric       accuracy        f1       fnr       fpr  \
debiasing_method                                               
zero_shot             0.000000  0.000000  0.000000  0.000000   
few_shot              0.004981  0.001853 -0.012489  0.023910   
fairness_imagination  0.000815 -0.002532  0.022226 -0.020950   
fairness_instruction  0.000500 -0.010345  0.005899 -0.067413   

fairness_metric       individual_fairness  
debiasing_method                           
zero_shot                        0.000000  
few_shot                        -0.009552  
fairness_imagination             0.003732  
fairness_instruction            -0.007060  


