In [1]:
import os 
os.chdir('../../')
print("Current working directory is now: ", os.getcwd())

import numpy as np 
import pandas as pd 
from broward.models.advanced_models.six_month import stumps

Current working directory is now:  C:\Users\Caroline Wang\OneDrive\Duke\Criminal Recidivism\psa-analysis


Prepare data/make predictions. Currently, testing on the six-month violent felonies problem. 

In [2]:
## load whole data
data = pd.read_csv("broward/data/broward_stumps.csv")
X_stumps, Y_stumps = data.loc[:,:'five_year>=1'], data['recid_violent6'].values
Y_stumps[Y_stumps == -1] = 0
cols = X_stumps.columns[3:]

## load train & test data
train_stumps = pd.read_csv("broward/data/broward_train_stumps.csv")
test_stumps = pd.read_csv("broward/data/broward_test_stumps.csv")

X_train_stumps, Y_train_stumps = train_stumps.loc[:,:'five_year>=1'], train_stumps['recid_violent6'].values
X_test_stumps, Y_test_stumps = test_stumps.loc[:,:'five_year>=1'], test_stumps['recid_violent6'].values

Y_train_stumps[Y_train_stumps == -1] = 0
Y_test_stumps[Y_test_stumps == -1] = 0

In [3]:
stump_summary = stumps.stump_cv(X = X_stumps, 
                                Y = Y_stumps, 
                                columns=cols, 
                                c_grid={'C': [0.05]}, 
                                seed = 816)

model_id, score_thresholds 1 {'rank_abs': [136]}
get_disparity_predefined_group()


  fnr = fn / (fn + tp)


model_id, score_thresholds 1 {'rank_abs': [140]}
get_disparity_predefined_group()


  fnr = fn / (fn + tp)
  ppv = tp / (tp + fp)
  ratio = fn / fp


model_id, score_thresholds 1 {'rank_abs': [149]}
get_disparity_predefined_group()


  fnr = fn / (fn + tp)
  ppv = tp / (tp + fp)
  ratio = fn / fp


model_id, score_thresholds 1 {'rank_abs': [168]}
get_disparity_predefined_group()


  fnr = fn / (fn + tp)


model_id, score_thresholds 1 {'rank_abs': [160]}
get_disparity_predefined_group()


  fnr = fn / (fn + tp)
  ppv = tp / (tp + fp)
  ratio = fn / fp


In [4]:
stump_summary['confusion_matrix_stats']

Unnamed: 0,Attribute,Attribute Value,PPV,FPR,FNR,Accuracy,Treatment Equality,Individuals Evaluated On,fold_num
2,race,African-American,0.142857,0.328358,0.45,0.660633,0.136364,221,13
11,race,African-American,0.13253,0.373057,0.352941,0.628571,0.083333,210,14
18,race,African-American,0.127907,0.369458,0.352941,0.631818,0.08,220,16
25,race,African-American,0.142857,0.436364,0.111111,0.588235,0.020833,238,13
32,race,African-American,0.169811,0.451282,0.307692,0.565611,0.090909,221,18
7,race,Asian,0.0,1.0,,0.0,0.0,1,13
14,race,Asian,,0.0,,1.0,,1,14
22,race,Asian,,0.0,,1.0,,1,16
29,race,Asian,0.0,0.5,,0.5,0.0,2,13
5,race,Caucasian,0.075,0.318966,0.625,0.66129,0.135135,124,13


In [5]:
from sklearn.linear_model import LogisticRegression

def stump_preds(X_train, Y_train, X_test, Y_test, c, columns, seed):
        
    ## remove unused feature in modeling
    preds_df = X_test
    X_train = X_train.drop(['person_id', 'screening_date', 'race'], axis=1)
    X_test = X_test.drop(['person_id', 'screening_date', 'race'], axis=1)
    
    ## estimator
    lasso = LogisticRegression(class_weight = 'balanced', solver='liblinear', 
                               random_state=seed, penalty='l1', C = c).fit(X_train, Y_train)
    coefs = lasso.coef_[lasso.coef_ != 0]
    features = columns[lasso.coef_[0] != 0].tolist()
    intercept = round(lasso.intercept_[0],3)
     
    ## dictionary
    lasso_dict_rounding = {}
    for i in range(len(features)):
        lasso_dict_rounding.update({features[i]: round(round(coefs[i], 3)*100, 1)})
    
    ## prediction on test set
    prob = 0
    for k in features:
        test_values = X_test[k]*(lasso_dict_rounding[k]/100)
        prob += test_values
    
    holdout_prob = np.exp(prob)/(1+np.exp(prob))
    preds_df['prediction'] = holdout_prob 
    preds_df['label'] = Y_test
#     test_auc = roc_auc_score(Y_test, holdout_prob)
    
    return {'coefs': coefs, 
            'features': features, 
            'intercept': intercept, 
            'dictionary': lasso_dict_rounding, 
#             'test_auc': test_auc,
            'preds_df': preds_df[['person_id',  'screening_date', 'race', 'prediction', 'label']]}

In [6]:
# based on nested CV procedure, best c value was .05
# 163 columns, 372 rows)
best_stumps_summary = stump_preds(X_train_stumps, Y_train_stumps, 
                          X_test_stumps, Y_test_stumps, 
                          c=0.05, columns=cols, seed=816)
stumps_violence6 = best_stumps_summary['preds_df'].rename(columns={"race": "group"})
stumps_violence6 = stumps_violence6[(stumps_violence6['group']=="Caucasian") | (stumps_violence6['group']=="African-American")]
stumps_violence6.drop(labels = ["person_id","screening_date"], inplace=True, axis=1)
stumps_violence6['group'] = stumps_violence6[0].map({'Caucasian': 0, 'African-American': 1})

Unnamed: 0,group,prediction,label
2,African-American,0.565128,1
3,African-American,0.435118,0
4,African-American,0.450414,0
6,African-American,0.502250,0
7,African-American,0.497500,0
8,African-American,0.340290,0
9,African-American,0.398672,0
10,African-American,0.301114,0
11,African-American,0.368420,0
12,Caucasian,0.500000,0


## Enforce Equalized Odds (Hardt, Srebo)

In [7]:
from fairness_enforcers.equalized_odds.eq_odds import run_eq_odds

In [8]:
run_eq_odds(test_and_val_data=stumps_violence6)


  group_0_val_model = Model(group_0_val_data['prediction'].as_matrix(), group_0_val_data['label'].as_matrix())
  group_1_val_model = Model(group_1_val_data['prediction'].as_matrix(), group_1_val_data['label'].as_matrix())
  group_0_test_model = Model(group_0_test_data['prediction'].as_matrix(), group_0_test_data['label'].as_matrix())
  group_1_test_model = Model(group_1_test_data['prediction'].as_matrix(), group_1_test_data['label'].as_matrix())
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  spn_given_p = (sn2p * (sflip * sm_fn).mean() + sn2n * (sconst * sm_fn).mean()) / sbr + \
  (sp2p * (sconst * sm_tp).mean() + sp2n * (sflip * sm_tp).mean()) / sbr
  spp_given_n = (sp2n * (sflip * sm_fp).mean() + sp2p * (sconst * sm_fp).mean()) / (1 - sbr) + \
  (sn2p * (sflip * sm_tn).mean() + sn2n * (sconst * sm_tn).mean()) / (1 - sbr)
  opn_given_p = (on2p * (oflip * om_fn).mean() + on2n * (oconst * om_fn).mean()) / obr + \
  (op2p * (oconst * om_tp).mean() + op2n * (oflip * om_tp).me

ERROR in LDL_factor: Error in KKT matrix LDL factorization when computing the nonzero elements. The problem seems to be non-convex
ERROR in osqp_setup: KKT matrix factorization.
The problem seems to be non-convex.


ValueError: Workspace allocation error!