In [None]:
import os

# Move from the notebook's own folder up to the repository root so that the
# relative "broward/..." paths and the `broward` package import below resolve.
# The guard keeps the cell idempotent: once the `broward` folder is visible
# from the current directory, re-running the cell no longer climbs two more
# levels up the tree.
if not os.path.isdir("broward"):
    os.chdir('../../')
print("Current working directory is now: ", os.getcwd())

import numpy as np 
import pandas as pd 
from broward.models.advanced_models.six_month import stumps

Current working directory is now:  C:\Users\Caroline Wang\OneDrive\Duke\Criminal Recidivism\psa-analysis


Prepare the data and make predictions. Currently, we are testing on the six-month violent felonies problem (label column `recid_violent6`).

In [None]:
## column names shared by the three stumps CSV files
LABEL_COL = 'recid_violent6'
LAST_FEATURE_COL = 'five_year>=1'


def _split_features_labels(frame):
    """Split a stumps DataFrame into a feature frame and a 0/1-recoded label array.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the `LABEL_COL` column and a `LAST_FEATURE_COL` column.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        Every column from the first one up to and including `LAST_FEATURE_COL`
        (as an independent copy), and the label values with -1 replaced by 0.
    """
    X = frame.loc[:, :LAST_FEATURE_COL].copy()
    y = frame[LABEL_COL].to_numpy()
    # np.where builds a new array, so the recoding neither writes back into
    # `frame` nor fails when pandas hands out a read-only array.
    return X, np.where(y == -1, 0, y)


## load whole data
data = pd.read_csv("broward/data/broward_stumps.csv")
X_stumps, Y_stumps = _split_features_labels(data)
## skip the first three columns; assumes they are the identifier columns
## (person_id, screening_date, race) that stump_preds drops -- TODO confirm
cols = X_stumps.columns[3:]

## load train & test data
train_stumps = pd.read_csv("broward/data/broward_train_stumps.csv")
test_stumps = pd.read_csv("broward/data/broward_test_stumps.csv")

X_train_stumps, Y_train_stumps = _split_features_labels(train_stumps)
X_test_stumps, Y_test_stumps = _split_features_labels(test_stumps)

In [None]:
from sklearn.linear_model import LogisticRegression

def stump_preds(X_train, Y_train, X_test, Y_test, c, columns, seed):
    """Fit an L1-penalised logistic regression on stump features and score the test set.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Must contain 'person_id', 'screening_date' and 'race' plus the feature
        columns. The three identifier columns are dropped before modelling;
        neither input frame is modified.
    Y_train : array-like
        Training labels passed to ``fit``.
    Y_test : array-like
        Test labels, copied into the 'label' column of the output; must have
        one entry per row of ``X_test``.
    c : float
        Value passed to ``LogisticRegression`` as ``C``.
    columns : pandas.Index or sequence of str
        Feature names, in the same order as the columns of ``X_train`` once the
        identifier columns are removed; used to name the non-zero coefficients.
    seed : int
        Value passed to ``LogisticRegression`` as ``random_state``.

    Returns
    -------
    dict
        'coefs'      -- non-zero fitted coefficients (unrounded),
        'features'   -- names of the features with non-zero coefficients,
        'intercept'  -- fitted intercept rounded to 3 decimals,
        'dictionary' -- feature name -> coefficient rounded to 3 decimals, times 100,
        'preds_df'   -- DataFrame with columns person_id, screening_date, race,
                        prediction, label (same index as ``X_test``).
    """
    id_cols = ['person_id', 'screening_date', 'race']

    ## keep the identifiers in a separate copy so the new columns are never
    ## written into the caller's X_test
    preds_df = X_test[id_cols].copy()

    ## remove unused feature in modeling
    X_train = X_train.drop(id_cols, axis=1)
    X_test = X_test.drop(id_cols, axis=1)

    ## estimator
    lasso = LogisticRegression(class_weight='balanced', solver='liblinear',
                               random_state=seed, penalty='l1', C=c).fit(X_train, Y_train)
    nonzero = lasso.coef_[0] != 0
    coefs = lasso.coef_[0][nonzero]
    features = np.asarray(columns)[nonzero].tolist()
    intercept = round(lasso.intercept_[0], 3)

    ## dictionary: coefficients rounded to 3 decimals and expressed as points (x100)
    lasso_dict_rounding = {name: round(round(coef, 3) * 100, 1)
                           for name, coef in zip(features, coefs)}

    ## prediction on test set: the linear score starts from the intercept so
    ## that the logistic transform yields the fitted model's probability
    score = intercept
    for name in features:
        score = score + X_test[name] * (lasso_dict_rounding[name] / 100)

    ## 1/(1+exp(-s)) equals exp(s)/(1+exp(s)) but does not turn into NaN
    ## (inf/inf) when the score is large
    holdout_prob = 1.0 / (1.0 + np.exp(-score))
    preds_df['prediction'] = holdout_prob
    preds_df['label'] = Y_test

    return {'coefs': coefs,
            'features': features,
            'intercept': intercept,
            'dictionary': lasso_dict_rounding,
            'preds_df': preds_df}

In [None]:
# based on nested CV procedure, best c value was .05
# (163 columns, 372 rows)
best_stumps_summary = stump_preds(X_train_stumps, Y_train_stumps,
                                  X_test_stumps, Y_test_stumps,
                                  c=0.05, columns=cols, seed=816)

# Build the table handed to the fairness enforcers in one non-mutating chain:
# rename race -> group, keep only the two groups being compared, drop the
# identifier columns, and encode group as 0 (Caucasian) / 1 (African-American).
# Each step returns a new frame, so nothing is assigned into a filtered slice.
stumps_violence6 = (
    best_stumps_summary['preds_df']
    .rename(columns={"race": "group"})
    .loc[lambda df: df['group'].isin(["Caucasian", "African-American"])]
    .drop(columns=["person_id", "screening_date"])
    .assign(group=lambda df: df['group'].map({'Caucasian': 0, 'African-American': 1}))
)
stumps_violence6

## Enforce Equalized Odds (Hardt, Price & Srebro)

In [None]:
from fairness_enforcers.equalized_odds.eq_odds import run_eq_odds

In [None]:
# Hand the two-group stumps predictions table (columns: group, prediction, label)
# to the project's equalized-odds routine.
# NOTE(review): the keyword name suggests the routine splits the frame into
# validation and test parts itself -- confirm in fairness_enforcers.
run_eq_odds(test_and_val_data=stumps_violence6)


## Enforce Calibrated Equalized Odds

In [None]:
from fairness_enforcers.equalized_odds.calib_eq_odds import run_calib_eq_odds

In [None]:
# Hand the same two-group stumps predictions table (columns: group, prediction,
# label) to the project's calibrated equalized-odds routine.
# NOTE(review): the keyword name suggests the routine splits the frame into
# validation and test parts itself -- confirm in fairness_enforcers.
run_calib_eq_odds(test_and_val_data=stumps_violence6)