In [1]:
import os 
os.chdir('../../')
print("Current working directory is now: ", os.getcwd())

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt 

from utils.load_settings import load_settings
from utils.latex_helpers import df_to_latex
from utils.fairness_functions import reshape_general_violent_cond_auc_summaries

%matplotlib inline
# load parameters
settings = load_settings()

Current working directory is now:  C:\Users\Caroline Wang\OneDrive\Duke\Criminal Recidivism\psa-analysis


## AUC Fairness 

In this notebook, we check the AUC conditioned on the sensitive attributes of race and gender. 

In [2]:
# read in auc data for prediction problems of interest
auc_data = {}
for time_period in [
                    "six-month", 
                    "two-year"
                    ]:
    
    for problem in ['general', 'violent']:
        problem_path = "kentucky/KY AUC/" + time_period + "/" + problem
        dircontents = os.listdir()
        if os.path.isdir(problem_path):
            onlyfiles = [f for f in os.listdir(problem_path) if os.path.isfile(os.path.join(problem_path, f))]
        else: 
            continue
        for filename in onlyfiles:
            model_name = filename.split("_")[0]
            if model_name in ['ebm', 'riskslim', 'arnold']:
                auc_data["-".join((problem, time_period, model_name))] = pd.read_csv(problem_path + "/" + filename)

auc_data.keys()

dict_keys(['general-six-month-arnold', 'general-six-month-ebm', 'violent-six-month-arnold', 'violent-six-month-ebm', 'general-two-year-arnold', 'general-two-year-ebm', 'violent-two-year-arnold', 'violent-two-year-ebm'])

## AUC of Arnold NCA and Arnold NVCA

In [3]:
# Arnold NCA corresp to general-two-year-arnold
# Arnold NVCA corresp to violent-two-year-arnold
auc_data['general-two-year-arnold'].head()
auc_data['violent-two-year-arnold'].head()

Unnamed: 0,Attribute,Attribute Value,AUC,fold_num
0,race,African-American,0.773576,0
1,race,Caucasian,0.781312,0
2,race,Other,0.896,0
3,sex,male,0.780144,0
4,sex,female,0.770254,0


In [11]:
# table format for AUC
df = reshape_general_violent_cond_auc_summaries(general_auc=auc_data['general-two-year-ebm'],
                                           general_model_name="Arnold NCA",
                                           violent_auc=auc_data['violent-two-year-ebm'],
                                           violent_model_name="Arnold NVCA"
                                           )
df

Unnamed: 0,Label,African-American,Caucasian,Other,female,male,Model,race_range,sex_range
0,general\_two\_year,0.735302,0.742442,0.702144,0.734345,0.744601,Arnold NCA,0.040298,0.010256
1,violent\_two\_year,0.823364,0.808883,0.931942,0.805875,0.811197,Arnold NVCA,0.123059,0.005323


In [17]:
models = {'arnold': ['Arnold NCA', 'Arnold NVCA Raw'], 
          'ebm': ['EBM', 'EBM']}

res_dfs = []
for model_name, model_table_labels in models.items():
    viol_auc_name = 'violent-two-year-' + model_name
    res_df = reshape_general_violent_cond_auc_summaries(general_auc=auc_data['general-two-year-' + model_name],
                                                        general_model_name=model_table_labels[0],
                                                        violent_auc=auc_data['violent-two-year-' + model_name],
                                                        violent_model_name=model_table_labels[1]
                                                        )
    res_dfs.append(res_df)
    
auc_summary_table = (pd.concat(res_dfs, axis=0)
                        .rename(columns={"African-American": "Afr-Am.",
                                        "Other": "Other Race",
                                        "Caucasian": "Cauc.",
                                        "female": "Female", 
                                        "male": "Male"}))

auc_summary_table['Hisp.'] = "-"

auc_summary_table = auc_summary_table[["Model", "Label", 
                                       "Afr-Am.", "Cauc.", "Hisp.", "Other Race", "race_range",
                                       "Female", "Male", "sex_range"]]

for column_name in ["Afr-Am.", "Cauc.", "Other Race", "race_range",
                    "Female", "Male", "sex_range"]:
    auc_summary_table[column_name] = auc_summary_table[column_name].apply(lambda x: round(x, 3))
    
auc_summary_table

Unnamed: 0,Model,Label,Afr-Am.,Cauc.,Hisp.,Other Race,race_range,Female,Male,sex_range
0,Arnold NCA,general\_two\_year,0.666,0.689,-,0.631,0.057,0.687,0.684,0.003
1,Arnold NVCA Raw,violent\_two\_year,0.778,0.787,-,0.881,0.103,0.79,0.782,0.008
0,EBM,general\_two\_year,0.735,0.742,-,0.702,0.04,0.734,0.745,0.01
1,EBM,violent\_two\_year,0.823,0.809,-,0.932,0.123,0.806,0.811,0.005


In [18]:
# print as latex
from utils.latex_helpers import df_to_latex
df_to_latex(auc_summary_table)

\begin{array}{l | l | r | r | l | r | r | r | r | r} \hline
    \verb|     Model     | & \verb|      Label       | & \verb|Afr-Am.| & \verb|Cauc.| & \verb|Hisp.| & \verb|Other Race| & \verb|race_range| & \verb|Female| & \verb|Male | & \verb|sex_range| \\ \hline
    \hline
    \verb|Arnold NCA     | & general\_two\_year &   0.666 & 0.689 & \verb|-    | &      0.631 &      0.057 &  0.687 & 0.684 &     0.003 \\ \hline
    \verb|Arnold NVCA Raw| & violent\_two\_year &   0.778 & 0.787 & \verb|-    | &      0.881 &      0.103 &  0.790 & 0.782 &     0.008 \\ \hline
    EBM             & general\_two\_year &   0.735 & 0.742 & \verb|-    | &      0.702 &      0.040 &  0.734 & 0.745 &     0.010 \\ \hline
    EBM             & violent\_two\_year &   0.823 & 0.809 & \verb|-    | &      0.932 &      0.123 &  0.806 & 0.811 &     0.005 \\ \hline
\end{array}


In [19]:
## write result
auc_summary_table.to_csv("kentucky/logs/fairness_results/auc_visualizations/auc_summary_general_violence.csv")