In [1]:
import os 
os.chdir('../../')
print("Current working directory is now: ", os.getcwd())

import pandas as pd 
import numpy as np 

from utils.load_settings import load_settings

# load parameters
settings = load_settings()

Current working directory is now:  C:\Users\Caroline Wang\OneDrive\Duke\Criminal Recidivism\psa-analysis


#### Read in data 



In [2]:
broward_data = pd.read_csv("broward/data/broward_data.csv")
kentucky_data = pd.read_csv("kentucky/data/processed/kentucky_data.csv")

for decoder_name, decoder_dict in settings['decoders'].items():
    broward_data = broward_data.replace({decoder_name: decoder_dict})
    kentucky_data = kentucky_data.replace({decoder_name: decoder_dict})

### Compute group balance 

In [3]:
## broward
broward_attrs = broward_data[['person_id', 'screening_date', 'sex', 'race']]
            
broward_attrs = (pd.melt(broward_attrs, id_vars =['person_id', 'screening_date'], value_vars=['sex', 'race'])
                .rename({'variable': 'Attribute',
                         'value': 'Attribute Value'},
                        axis=1))

broward_attrs = broward_attrs.groupby(["Attribute", "Attribute Value"]).size().to_frame('n_inds').reset_index()

# check totals are same for race and sex
assert (broward_attrs[broward_attrs['Attribute']=='race']['n_inds'].sum()) == (broward_attrs[broward_attrs['Attribute']=='sex']['n_inds'].sum())
total_inds = broward_attrs[broward_attrs['Attribute']=='race']['n_inds'].sum()

# add percent col
broward_attrs["% total"] = ((broward_attrs['n_inds'] / total_inds) * 100).apply(lambda x: round(x, 2))
broward_attrs

Unnamed: 0,Attribute,Attribute Value,n_inds,% total
0,race,African-American,1124,57.52
1,race,Asian,5,0.26
2,race,Caucasian,601,30.76
3,race,Hispanic,127,6.5
4,race,Native American,4,0.2
5,race,Other,93,4.76
6,sex,female,251,12.85
7,sex,male,1703,87.15


In [4]:
## kentucky
kentucky_attrs = kentucky_data[['person_id', 'screening_date', 'sex', 'race']]

kentucky_attrs = (pd.melt(kentucky_attrs, id_vars =['person_id', 'screening_date'], value_vars=['sex', 'race'])
                .rename({'variable': 'Attribute',
                         'value': 'Attribute Value'},
                        axis=1))

kentucky_attrs = kentucky_attrs.groupby(["Attribute", "Attribute Value"]).size().to_frame('n_inds').reset_index()

# check totals are same for race and sex
assert (kentucky_attrs[kentucky_attrs['Attribute']=='race']['n_inds'].sum()) == (kentucky_attrs[kentucky_attrs['Attribute']=='sex']['n_inds'].sum())
total_inds = kentucky_attrs[kentucky_attrs['Attribute']=='race']['n_inds'].sum()

# add percent col
kentucky_attrs["% total"] = ((kentucky_attrs['n_inds'] / total_inds) * 100).apply(lambda x: round(x, 2))
kentucky_attrs

Unnamed: 0,Attribute,Attribute Value,n_inds,% total
0,race,African-American,25367,17.37
1,race,Asian,416,0.28
2,race,Caucasian,117627,80.56
3,race,Indian,117,0.08
4,race,Other,2476,1.7
5,sex,female,44658,30.59
6,sex,male,101345,69.41


In [5]:
## write result
broward_attrs.to_csv("broward/logs/fairness_results/broward_racial_distr.csv")
kentucky_attrs.to_csv("kentucky/logs/fairness_results/kentucky_racial_distr.csv")

### Check for recidivism balance across all groups for all problems

i.e. $P(Y = 1 | group = g_1)$

In [5]:
# loop through problem 
# loop through sensitive attribute 
# need the 'person_id', 'screening_date', 'sex', 'race', recid 
for label in settings['broward_labels']:
    broward_attrs_label = broward_data[['person_id', 'screening_date', 'sex', 'race'] + [label]]
    

   person_id screening_date   sex              race  recid_two_year
0       1001     2014-02-03  male  African-American               0
1        101     2013-01-13  male          Hispanic               1
2        101     2014-02-02  male          Hispanic               0
3       1015     2014-01-22  male  African-American               0
4       1016     2013-04-15  male             Other               1
   person_id screening_date   sex              race  recid_six_month
0       1001     2014-02-03  male  African-American                0
1        101     2013-01-13  male          Hispanic                0
2        101     2014-02-02  male          Hispanic                0
3       1015     2014-01-22  male  African-American                0
4       1016     2013-04-15  male             Other                1
   person_id screening_date   sex              race  recid_drug2
0       1001     2014-02-03  male  African-American            0
1        101     2013-01-13  male          Hispa

In [6]:
kentucky_data.columns

Index(['person_id', 'sex', 'race', 'screening_date', 'age_at_current_charge',
       'p_arrest', 'p_charges', 'p_violence', 'p_felony', 'p_misdemeanor',
       'p_property', 'p_murder', 'p_assault', 'p_sex_offense', 'p_weapon',
       'p_felprop_viol', 'p_felassult', 'p_misdeassult', 'p_traffic', 'p_drug',
       'p_dui', 'p_stalking', 'p_voyeurism', 'p_fraud', 'p_stealing',
       'p_trespass', 'ADE', 'Treatment', 'p_prison', 'p_jail30',
       'p_fta_two_year', 'p_fta_two_year_plus', 'p_pending_charge',
       'p_probation', 'six_month', 'one_year', 'three_year', 'five_year',
       'current_violence', 'fta_risk_score_raw', 'nca_risk_score_raw',
       'pvf_risk_score_raw', 'fta_calc', 'nca_calc', 'pvf_calc',
       'recid_two_year', 'recid_drug_two_year', 'recid_traffic_two_year',
       'recid_violence_two_year', 'recid_F_two_year', 'recid_M_two_year',
       'recid_property_two_year', 'recid_six_month', 'recid_drug_six_month',
       'recid_traffic_six_month', 'recid_violence_six_