In [1]:
import os 
os.chdir('../../../')
print("Current working directory is now: ", os.getcwd())

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt 

from utils.load_settings import load_settings
from utils.latex_helpers import df_to_latex
from utils.fairness_functions import reshape_general_violent_cond_auc_summaries

%matplotlib inline
# load parameters
settings = load_settings()

Current working directory is now:  C:\Users\Caroline Wang\OneDrive\Duke\Criminal Recidivism\psa_analysis_all\psa-analysis-public-repo


## AUC Fairness 

In this notebook, we check the AUC conditioned on the sensitive attributes of race and gender. 

In [15]:
# read in auc data for prediction problems of interest
auc_data = {}
for time_period in [
                    "six-month", 
                    "two-year"
                    ]:
    
    for problem in ['general', 'violent']:
        problem_path = "kentucky/fairness/race-auc/" + time_period + "/" + problem
        dircontents = os.listdir()
        if os.path.isdir(problem_path):
            onlyfiles = [f for f in os.listdir(problem_path) if os.path.isfile(os.path.join(problem_path, f))]
        else: 
            continue
        for filename in onlyfiles:
            model_name = filename.split("-")[0]
            if model_name in ['ebm', 'riskslim', 'arnold']:
                auc_data["-".join((problem, time_period, model_name))] = pd.read_csv(problem_path + "/" + filename)

auc_data.keys()

dict_keys(['general-six-month-arnold', 'general-six-month-ebm', 'general-six-month-riskslim', 'violent-six-month-arnold', 'violent-six-month-ebm', 'violent-six-month-riskslim', 'general-two-year-arnold', 'general-two-year-ebm', 'general-two-year-riskslim', 'violent-two-year-arnold', 'violent-two-year-ebm', 'violent-two-year-riskslim'])

## AUC of Arnold NCA and Arnold NVCA

In [16]:
# Arnold NCA corresp to general-two-year-arnold
# Arnold NVCA corresp to violent-two-year-arnold
auc_data['general-two-year-arnold'].head()
auc_data['violent-two-year-arnold'].head()

Unnamed: 0,Attribute,Attribute Value,AUC,fold_num
0,race,African-American,0.716755,0
1,race,Caucasian,0.741007,0
2,race,Other,0.909742,0
3,sex,male,0.733684,0
4,sex,female,0.729115,0


In [17]:
# table format for AUC
df = reshape_general_violent_cond_auc_summaries(general_auc=auc_data['general-two-year-ebm'],
                                           general_model_name="Arnold NCA",
                                           violent_auc=auc_data['violent-two-year-ebm'],
                                           violent_model_name="Arnold NVCA"
                                           )
df

Unnamed: 0,Label,African-American,Caucasian,Other,female,male,Model,race_range,sex_range
0,general\_two\_year,0.742298,0.750671,0.695851,0.744689,0.752645,Arnold NCA,0.05482,0.007956
1,violent\_two\_year,0.775129,0.769725,0.765827,0.744145,0.765857,Arnold NVCA,0.009302,0.021713


In [18]:
models = {'arnold': ['Arnold NCA Raw', 'Arnold NVCA Raw'], 
          'ebm': ['EBM', 'EBM'], 'riskslim': ['RiskSLIM', 'RiskSLIM']}

res_dfs = []
for model_name, model_table_labels in models.items():
    viol_auc_name = 'violent-two-year-' + model_name
    res_df = reshape_general_violent_cond_auc_summaries(general_auc=auc_data['general-two-year-' + model_name],
                                                        general_model_name=model_table_labels[0],
                                                        violent_auc=auc_data['violent-two-year-' + model_name],
                                                        violent_model_name=model_table_labels[1]
                                                        )
    res_dfs.append(res_df)
    
auc_summary_table = (pd.concat(res_dfs, axis=0)
                        .rename(columns={"African-American": "Afr-Am.",
                                        "Other": "Other Race",
                                        "Caucasian": "Cauc.",
                                        "female": "Female", 
                                        "male": "Male",
                                        }))


auc_summary_table = auc_summary_table[["Model", "Label", 
                                       "Afr-Am.", "Cauc.", "Other Race", "race_range",
                                       "Female", "Male", "sex_range"]]

for column_name in ["Afr-Am.", "Cauc.", "Other Race", "race_range",
                    "Female", "Male", "sex_range"]:
    auc_summary_table[column_name] = auc_summary_table[column_name].apply(lambda x: round(x, 3))
    
auc_summary_table

Unnamed: 0,Model,Label,Afr-Am.,Cauc.,Other Race,race_range,Female,Male,sex_range
0,Arnold NCA Raw,general\_two\_year,0.692,0.713,0.653,0.059,0.714,0.709,0.005
1,Arnold NVCA Raw,violent\_two\_year,0.728,0.74,0.767,0.039,0.728,0.734,0.006
0,EBM,general\_two\_year,0.742,0.751,0.696,0.055,0.745,0.753,0.008
1,EBM,violent\_two\_year,0.775,0.77,0.766,0.009,0.744,0.766,0.022
0,RiskSLIM,general\_two\_year,0.705,0.708,0.62,0.088,0.699,0.712,0.013
1,RiskSLIM,violent\_two\_year,0.744,0.736,0.68,0.063,0.706,0.73,0.024


In [19]:
# print as latex
df_to_latex(auc_summary_table[auc_summary_table["Label"] == "violent\_two\_year"])

\begin{array}{l | l | r | r | r | r | r | r | r} \hline
    \verb|     Model     | & \verb|      Label       | & \verb|Afr-Am.| & \verb|Cauc.| & \verb|Other Race| & \verb|race_range| & \verb|Female| & \verb|Male | & \verb|sex_range| \\ \hline
    \hline
    \verb|Arnold NVCA Raw| & violent\_two\_year &   0.728 & 0.740 &      0.767 &      0.039 &  0.728 & 0.734 &     0.006 \\ \hline
    EBM             & violent\_two\_year &   0.775 & 0.770 &      0.766 &      0.009 &  0.744 & 0.766 &     0.022 \\ \hline
    RiskSLIM        & violent\_two\_year &   0.744 & 0.736 &      0.680 &      0.063 &  0.706 & 0.730 &     0.024 \\ \hline
\end{array}


In [20]:
df_to_latex(auc_summary_table[auc_summary_table["Label"] == "general\_two\_year"])

\begin{array}{l | l | r | r | r | r | r | r | r} \hline
    \verb|    Model     | & \verb|      Label       | & \verb|Afr-Am.| & \verb|Cauc.| & \verb|Other Race| & \verb|race_range| & \verb|Female| & \verb|Male | & \verb|sex_range| \\ \hline
    \hline
    \verb|Arnold NCA Raw| & general\_two\_year &   0.692 & 0.713 &      0.653 &      0.059 &  0.714 & 0.709 &     0.005 \\ \hline
    EBM            & general\_two\_year &   0.742 & 0.751 &      0.696 &      0.055 &  0.745 & 0.753 &     0.008 \\ \hline
    RiskSLIM       & general\_two\_year &   0.705 & 0.708 &      0.620 &      0.088 &  0.699 & 0.712 &     0.013 \\ \hline
\end{array}


In [21]:
## write result
auc_summary_table.to_csv("kentucky/logs/fairness_results/auc_visualizations/auc_summary_general_violence.csv")