In [1]:
import os 
os.chdir('../../../')
print("Current working directory is now: ", os.getcwd())

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt 

from utils.load_settings import load_settings
from utils.latex_helpers import df_to_latex
from utils.fairness_functions import reshape_general_violent_cond_auc_summaries

%matplotlib inline
# load parameters
settings = load_settings()

Current working directory is now:  C:\Users\Caroline Wang\OneDrive\Duke\Criminal Recidivism\psa_analysis_all\psa-analysis-public-repo


## AUC Fairness 

In this notebook, we check the AUC conditioned on the sensitive attributes of race and gender. 

In [2]:
# read in auc data for prediction problems of interest
auc_data = {}
for time_period in [
                    "six-month", 
                    "two-year"
                    ]:
    
    for problem in ['general', 'violent']:
        problem_path = "kentucky/fairness/race-auc/" + time_period + "/" + problem
        dircontents = os.listdir()
        if os.path.isdir(problem_path):
            onlyfiles = [f for f in os.listdir(problem_path) if os.path.isfile(os.path.join(problem_path, f))]
        else: 
            continue
        for filename in onlyfiles:
            model_name = filename.split("-")[0]
            if model_name in ['ebm', 'riskslim', 'arnold']:
                auc_data["-".join((problem, time_period, model_name))] = pd.read_csv(problem_path + "/" + filename)

auc_data.keys()

dict_keys(['general-six-month-arnold', 'general-six-month-ebm', 'general-six-month-riskslim', 'violent-six-month-arnold', 'violent-six-month-ebm', 'violent-six-month-riskslim', 'general-two-year-arnold', 'general-two-year-ebm', 'general-two-year-riskslim', 'violent-two-year-arnold', 'violent-two-year-ebm', 'violent-two-year-riskslim'])

## AUC of Arnold NCA and Arnold NVCA

In [3]:
# Arnold NCA corresp to general-two-year-arnold
# Arnold NVCA corresp to violent-two-year-arnold
auc_data['general-two-year-arnold'].head()
auc_data['violent-two-year-arnold'].head()

Unnamed: 0,Attribute,Attribute Value,AUC,fold_num
0,race,Caucasian,0.738966,0
1,race,African-American,0.734826,0
2,race,Other,0.772494,0
3,sex,male,0.737504,0
4,sex,female,0.701145,0


In [4]:
# table format for AUC
df = reshape_general_violent_cond_auc_summaries(general_auc=auc_data['general-two-year-ebm'],
                                           general_model_name="Arnold NCA",
                                           violent_auc=auc_data['violent-two-year-ebm'],
                                           violent_model_name="Arnold NVCA"
                                           )
df

Unnamed: 0,Label,African-American,Caucasian,Other,female,male,Model,race\_range,sex\_range
0,general\_two\_year,0.73133,0.741527,0.692108,0.73141,0.744553,Arnold NCA,0.049419,0.013143
1,violent\_two\_year,0.774554,0.770887,0.796988,0.745797,0.766778,Arnold NVCA,0.026101,0.020981


In [9]:
models = {'arnold': ['Arnold NCA', 'Arnold NVCA Raw'], 
          'ebm': ['EBM', 'EBM'], 'riskslim': ['RiskSLIM', 'RiskSLIM']}

res_dfs = []
for model_name, model_table_labels in models.items():
    viol_auc_name = 'violent-two-year-' + model_name
    res_df = reshape_general_violent_cond_auc_summaries(general_auc=auc_data['general-two-year-' + model_name],
                                                        general_model_name=model_table_labels[0],
                                                        violent_auc=auc_data['violent-two-year-' + model_name],
                                                        violent_model_name=model_table_labels[1]
                                                        )
    res_dfs.append(res_df)
    
auc_summary_table = (pd.concat(res_dfs, axis=0)
                        .rename(columns={"African-American": "Afr-Am.",
                                        "Other": "Other Race",
                                        "Caucasian": "Cauc.",
                                        "female": "Female", 
                                        "male": "Male"}))


auc_summary_table = auc_summary_table[["Model", "Label", 
                                       "Afr-Am.", "Cauc.", "Other Race", "race\_range",
                                       "Female", "Male", "sex\_range"]]

for column_name in ["Afr-Am.", "Cauc.", "Other Race", "race\_range",
                    "Female", "Male", "sex\_range"]:
    auc_summary_table[column_name] = auc_summary_table[column_name].apply(lambda x: round(x, 3))
    
auc_summary_table

Unnamed: 0,Model,Label,Afr-Am.,Cauc.,Other Race,race\_range,Female,Male,sex\_range
0,Arnold NCA,general\_two\_year,0.673,0.694,0.634,0.061,0.692,0.692,0.0
1,Arnold NVCA Raw,violent\_two\_year,0.727,0.741,0.8,0.072,0.729,0.734,0.005
0,EBM,general\_two\_year,0.731,0.742,0.692,0.049,0.731,0.745,0.013
1,EBM,violent\_two\_year,0.775,0.771,0.797,0.026,0.746,0.767,0.021
0,RiskSLIM,general\_two\_year,0.693,0.697,0.611,0.086,0.685,0.702,0.017
1,RiskSLIM,violent\_two\_year,0.743,0.735,0.694,0.049,0.699,0.728,0.028


In [10]:
# print as latex
df_to_latex(auc_summary_table[auc_summary_table["Label"] == "violent\_two\_year"])

\begin{array}{l | l | r | r | r | r | r | r | r} \hline
    \verb|     Model     | & \verb|      Label       | & \verb|Afr-Am.| & \verb|Cauc.| & \verb|Other Race| & \verb|race\_range| & \verb|Female| & \verb|Male | & \verb|sex\_range| \\ \hline
    \hline
    \verb|Arnold NVCA Raw| & violent\_two\_year &   0.727 & 0.741 &      0.800 &       0.072 &  0.729 & 0.734 &      0.005 \\ \hline
    EBM             & violent\_two\_year &   0.775 & 0.771 &      0.797 &       0.026 &  0.746 & 0.767 &      0.021 \\ \hline
    RiskSLIM        & violent\_two\_year &   0.743 & 0.735 &      0.694 &       0.049 &  0.699 & 0.728 &      0.028 \\ \hline
\end{array}


In [7]:
df_to_latex(auc_summary_table[auc_summary_table["Label"] == "general\_two\_year"])

\begin{array}{l | l | r | r | r | r | r | r | r} \hline
    \verb|  Model   | & \verb|      Label       | & \verb|Afr-Am.| & \verb|Cauc.| & \verb|Other Race| & \verb|race\_range| & \verb|Female| & \verb|Male | & \verb|sex\_range| \\ \hline
    \hline
    \verb|Arnold NCA| & general\_two\_year &   0.673 & 0.694 &      0.634 &       0.061 &  0.692 & 0.692 &      0.000 \\ \hline
    EBM        & general\_two\_year &   0.731 & 0.742 &      0.692 &       0.049 &  0.731 & 0.745 &      0.013 \\ \hline
    RiskSLIM   & general\_two\_year &   0.693 & 0.697 &      0.611 &       0.086 &  0.685 & 0.702 &      0.017 \\ \hline
\end{array}


In [11]:
## write result
auc_summary_table.to_csv("kentucky/logs/fairness_results/auc_visualizations/auc_summary_general_violence.csv")