In [None]:
import os

# Move from the notebook's folder up to the repository root so that the
# `utils` imports and the "./broward/..." paths used below resolve.
# The guard keeps this cell idempotent: when the working directory already
# contains both folders (i.e. the cell was run before in this kernel),
# re-running must not climb three more levels.
if not (os.path.isdir('utils') and os.path.isdir('broward')):
    os.chdir('../../../')
print("Current working directory is now: ", os.getcwd())

import pandas as pd 
import numpy as np
import csv
import utils.interpretable_functions as interpret
import utils.RiskSLIM as slim
import utils.stumps as stumps
import utils.fairness_functions as fairness
import utils.risktool as risktool
from utils.load_settings import load_settings

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score

from pprint import pprint
from riskslim.helper_functions import load_data_from_csv, print_model

# restore saved variables
# %store -r summary_property6_FL_interpret

In [None]:
### CART & EBM

In [None]:
# Read the Broward feature table and sort it by person_id.
broward_csv = "./broward/data/broward_data.csv"
data = pd.read_csv(broward_csv)
data = data.sort_values('person_id')

# Target: the six-month property label.
# Features: every column up to and including 'five_year' (label slices are inclusive).
y = data['property_six_month'].values
x = data.loc[:, :'five_year']

In [None]:
#### CART
# Candidate tree depths and impurity thresholds passed to interpret.CART.
cart_grid = {
    'depth': [1, 2, 3, 4, 5],
    'impurity': [0.001, 0.002, 0.003, 0.004, 0.005],
}
cart_summary = interpret.CART(X=x, Y=y, seed=816, **cart_grid)

#### EBM
# Candidate settings passed to interpret.EBM (single learning rate, two holdout splits).
ebm_grid = {
    'learning_rate': [0.01],
    'depth': [1, 2, 3],
    'estimators': [40, 60, 80, 100],
    'holdout_split': [0.7, 0.9],
}
ebm_summary = interpret.EBM(X=x, Y=y, seed=816, **ebm_grid)

In [None]:
# Mean 'holdout_test_auc' and mean 'auc_diffs' for the two models fitted so far.
print("CART: ", np.mean(cart_summary['holdout_test_auc']), np.mean(cart_summary['auc_diffs']))
# Label previously read "EMB"; the model is EBM.
print("EBM: ", np.mean(ebm_summary['holdout_test_auc']), np.mean(ebm_summary['auc_diffs']))

In [None]:
---

In [None]:
### Lasso Stumps

In [None]:
## load stumps & original data
def _read_by_person(csv_path):
    """Read a CSV file and return it sorted by `person_id`."""
    return pd.read_csv(csv_path).sort_values('person_id')


def _property_labels(frame):
    """Return a copy of the `property_six_month` column with every -1 recoded to 0."""
    labels = frame['property_six_month'].values.copy()
    labels[labels == -1] = 0
    return labels


# Keep only the join keys plus the two raw columns carried over from the original table.
original_data = _read_by_person("./broward/data/broward_data.csv")
original_data = original_data.loc[:, ['person_id', 'screening_date', 'age_at_current_charge', 'p_charges']]
data = pd.merge(original_data,
                _read_by_person("./broward/data/broward_stumps.csv"),
                on=['person_id', 'screening_date'])

## split X & Y
X_stumps = data.loc[:, :'five_year1'].copy()
Y_stumps = _property_labels(data)
# Everything after the first five columns of the merged frame is treated as a model feature.
cols = X_stumps.columns[5:]

## load train & test stumps data
train_stumps = _read_by_person("./broward/data/broward_train_stumps.csv")
test_stumps = _read_by_person("./broward/data/broward_test_stumps.csv")
X_train_stumps, X_test_stumps = (
    frame.loc[:, :'five_year1'].copy() for frame in (train_stumps, test_stumps)
)
Y_train_stumps, Y_test_stumps = (
    _property_labels(frame) for frame in (train_stumps, test_stumps)
)

In [None]:
# Fit one stump model on the fixed train/test split with c=0.06.
stump_fit_options = dict(c=0.06, columns=cols, seed=816)
single_stump_model = stumps.stump_model(X_train_stumps,
                                        Y_train_stumps,
                                        X_test_stumps,
                                        Y_test_stumps,
                                        **stump_fit_options)

## unique original features
# Dropping every digit from a stump name leaves the name it was built from,
# so the number of distinct results is the number of distinct base features.
unique_stumps = [
    ''.join(ch for ch in stump_name if not ch.isdigit())
    for stump_name in single_stump_model['features']
]
print(len(set(unique_stumps)))

In [None]:
# Run stumps.stump_cv on the merged stumps table over six candidate C values.
c_candidates = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
stump_summary = stumps.stump_cv(
    X=X_stumps,
    Y=Y_stumps,
    columns=cols,
    c_grid={'C': c_candidates},
    seed=816,
)

In [None]:
# Mean 'holdout_test_auc' and mean 'auc_diffs' for the three models fitted so far.
print("CART: ", np.mean(cart_summary['holdout_test_auc']), np.mean(cart_summary['auc_diffs']))
# Label previously read "EMB"; the model is EBM.
print("EBM: ", np.mean(ebm_summary['holdout_test_auc']), np.mean(ebm_summary['auc_diffs']))
print("Additive:", np.mean(stump_summary['holdout_test_auc']), np.mean(stump_summary['auc_diffs']))

In [None]:
---

In [None]:
### RiskSLIM

In [None]:
## load stumps data
# Reload the un-merged stumps table; `data` is overwritten here.
stumps_csv = "./broward/data/broward_stumps.csv"
data = pd.read_csv(stumps_csv).sort_values('person_id')

# Refit the stump model on the same train/test split, this time with c=0.04.
split_inputs = (X_train_stumps, Y_train_stumps, X_test_stumps, Y_test_stumps)
single_stump_model = stumps.stump_model(*split_inputs,
                                        c=0.04,
                                        columns=cols,
                                        seed=816)
# Displayed as the cell output: how many features this model reports.
len(single_stump_model['features'])

In [None]:
### Subset features
stump_features = single_stump_model['features']

# `indicator` records whether 'sex1' is already among the model's features;
# when it is not, 'sex1' is added to the leading columns.
indicator = 1 if 'sex1' in stump_features else 0
leading_columns = ['property_six_month', 'person_id', 'screening_date', 'race']
if indicator == 0:
    leading_columns.append('sex1')
selected_features = leading_columns + stump_features

# Attach the columns kept from the original table, then split label (first column) from the rest.
sub_data = pd.merge(data[selected_features], original_data, on=['person_id', 'screening_date'])
sub_Y = sub_data.iloc[:, 0].values
sub_X = sub_data.iloc[:, 1:]
sub_X.insert(0, '(Intercept)', 1)

In [None]:
# Settings forwarded unchanged to slim.risk_nested_cv_constrain.
riskslim_options = {
    'indicator': indicator,
    'y_label': 'property_six_month',
    'max_coef': 5,
    'max_coef_number': 10,
    'max_runtime': 1000,
    'max_offset': 100,
    'c': 1e-3,
    'seed': 816,
}
riskslim_cs_summary = slim.risk_nested_cv_constrain(X=sub_X, Y=sub_Y, **riskslim_options)

In [None]:
# Mean 'holdout_test_auc' and mean 'auc_diffs' for CART, EBM and the stump model,
# followed by the mean RiskSLIM 'test_auc'.
print(np.mean(cart_summary['holdout_test_auc']), np.mean(cart_summary['auc_diffs']))
print(np.mean(ebm_summary['holdout_test_auc']), np.mean(ebm_summary['auc_diffs']))
print(np.mean(stump_summary['holdout_test_auc']), np.mean(stump_summary['auc_diffs']))
# The RiskSLIM result is stored as `riskslim_cs_summary`; `riskslim_summary`
# is never defined in this notebook and raised a NameError on a fresh kernel.
print(np.mean(riskslim_cs_summary['test_auc']))

In [None]:
#### Single RiskSLIM Model 

In [None]:
# Outcome column first, followed by the features reported by the stump model.
selected_features = ["property_six_month"] + single_stump_model['features']
sub_train_data, sub_test_data = train_stumps[selected_features], test_stumps[selected_features]

## split y
# First column is the outcome; kept as a column vector.
sub_train_Y = sub_train_data.iloc[:, 0].values.reshape(-1, 1)
sub_test_Y = sub_test_data.iloc[:, 0].values.reshape(-1, 1)

## split x
# Only the training matrix gets the leading '(Intercept)' column.
train_feature_frame = sub_train_data.iloc[:, 1:]
train_feature_frame.insert(0, '(Intercept)', 1)
sub_cols = train_feature_frame.columns.tolist()
sub_train_X = train_feature_frame.values
sub_test_X = sub_test_data.iloc[:, 1:].values

## sample weight
# One unit weight per training row.
sample_weights = np.repeat(1, len(sub_train_Y))

## new_train_data
new_train_data = dict(
    X=sub_train_X,
    Y=sub_train_Y,
    variable_names=sub_cols,
    outcome_name='property_six_month',
    sample_weights=sample_weights,
)

In [None]:
# Constraints forwarded unchanged to slim.risk_slim_constrain.
single_model_options = dict(max_coefficient=5,
                            max_L0_value=10,
                            c0_value=1e-3,
                            max_offset=100,
                            max_runtime=1000)
model_info, mip_info, lcpa_info = slim.risk_slim_constrain(new_train_data, **single_model_options)

# Print the fitted solution against the training data definition.
print_model(model_info['solution'], new_train_data)

In [None]:
# Evaluate the single RiskSLIM model on the train and test splits.
#
# Everything below works on fresh names instead of overwriting the inputs:
#  * re-slicing `sub_train_X` in place removed one more column on every
#    re-run of this cell;
#  * recoding `sub_train_Y` in place also rewrote `new_train_data['Y']`,
#    which refers to the same array.
train_features = sub_train_X[:, 1:]                  # drop the '(Intercept)' column
train_labels = np.where(sub_train_Y == -1, 0, sub_train_Y)
test_labels = np.where(sub_test_Y == -1, 0, sub_test_Y)

# Names of the columns actually present in the feature matrices
# (`sub_cols` without the leading '(Intercept)'), rather than the full
# stump column list `cols`, whose entries do not line up with these columns.
feature_names = np.array(sub_cols[1:])

train_scores = slim.riskslim_prediction(train_features, feature_names, model_info).reshape(-1, 1)
test_scores = slim.riskslim_prediction(sub_test_X, feature_names, model_info).reshape(-1, 1)

print("Train AUC: {}".format(roc_auc_score(train_labels, train_scores)))
print("Test AUC: {}".format(roc_auc_score(test_labels, test_scores)))

In [None]:
---

In [None]:
### Arnold PSA

In [None]:
### load data
arnold_csv = "./broward/data/broward_arnold.csv"
data = pd.read_csv(arnold_csv).sort_values('person_id')

# Score column first, followed by the demographic and identifier columns.
arnold_columns = ['arnold_nca', 'sex', 'race', 'person_id',
                  'screening_date', 'age_at_current_charge', 'p_charges']
X_arnold = data.loc[:, arnold_columns]
Y_arnold = data['property_six_month'].values

## test model
arnold = risktool.risktool(X_arnold, Y_arnold, label='arnold_nca')
mean_arnold_auc = np.mean(arnold['auc'])
print(mean_arnold_auc)

In [None]:
#### Single Arnold PSA

In [None]:
# AUC of the raw 'arnold_nca' score against the outcome on the Arnold test file.
test_data = pd.read_csv("./broward/data/broward_arnold_test.csv")
roc_auc_score(test_data['property_six_month'].values,
              test_data['arnold_nca'].values)

In [None]:
---

In [None]:
### COMPAS

In [None]:
### load data
# NOTE(review): the COMPAS decile score is read from the same broward_arnold.csv
# file as the Arnold score — confirm that this is the intended source.
data = pd.read_csv("./broward/data/broward_arnold.csv").sort_values('person_id')

compas_label = 'Risk of Recidivism_decile_score'
compas_columns = [compas_label, 'sex', 'race', 'person_id',
                  'screening_date', 'age_at_current_charge', 'p_charges']
X_compas = data.loc[:, compas_columns]
Y_compas = data['property_six_month'].values

## test model
compas = risktool.risktool(X_compas, Y_compas, label=compas_label)
print(np.mean(compas['auc']))

In [None]:
---

In [None]:
### Results

In [None]:
#### save results
# One entry per model / risk tool, keyed by its short name.
summary_property6_FL_interpret = dict(
    cart=cart_summary,
    ebm=ebm_summary,
    stumps=stump_summary,
    riskslim=riskslim_cs_summary,
    arnold=arnold,
    compas=compas,
)
# %store summary_property6_FL_interpret

In [None]:
# CART, EBM and stumps report mean holdout AUC plus mean 'auc_diffs'.
results = [
    [label, np.mean(summary['holdout_test_auc']), np.mean(summary['auc_diffs'])]
    for label, summary in (("cart", cart_summary),
                           ("ebm", ebm_summary),
                           ("stumps", stump_summary))
]
# RiskSLIM and the two risk tools only report a mean AUC.
results.append(['riskslim', np.mean(riskslim_cs_summary['test_auc'])])
results.extend(
    [label, np.mean(tool['auc'])]
    for label, tool in (('arnold', arnold), ('compas', compas))
)
results

In [None]:
# Mean test AUC of the four fitted models, in the order CART, EBM, stumps, RiskSLIM.
auc = [
    np.mean(summary[auc_key])
    for summary, auc_key in ((cart_summary, 'holdout_test_auc'),
                             (ebm_summary, 'holdout_test_auc'),
                             (stump_summary, 'holdout_test_auc'),
                             (riskslim_cs_summary, 'test_auc'))
]

In [None]:
path = "./broward/logs/interpretable/"


def _mean_std(values):
    """Format `values` as "<mean> (<std>)", each rounded to 3 decimals."""
    # Built-in `str` replaces `np.str`, which was only an alias for it and
    # no longer exists in current NumPy releases.
    return str(round(np.mean(values), 3)) + " (" + str(round(np.std(values), 3)) + ")"


# One row: label, mean (std) AUC for CART, EBM, stumps and RiskSLIM, the range
# (max - min) of the four mean AUCs in `auc`, then mean (std) AUC for the
# Arnold and COMPAS risk tools.
results = [["Property",
            _mean_std(cart_summary['holdout_test_auc']),
            _mean_std(ebm_summary['holdout_test_auc']),
            _mean_std(stump_summary['holdout_test_auc']),
            _mean_std(riskslim_cs_summary['test_auc']),
            round(np.max(auc) - np.min(auc), 3),
            _mean_std(arnold['auc']),
            _mean_std(compas['auc'])]]

# Append to the shared summary file. newline='' lets the csv writer control
# line endings itself, avoiding blank rows on platforms that translate '\n'.
with open(path + 'FL-six-month-interpretable-summary.csv', 'a', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)

In [None]:
### Save Fairness Matrix

In [None]:
# (summary key, output folder / file suffix) for every fairness table that gets exported.
fairness_tables = [
    ('confusion_matrix_stats', 'confusion'),
    ('calibration_stats', 'calibration'),
    ('race_auc', 'race-auc'),
    ('condition_pn', 'condition-pn'),
    ('no_condition_pn', 'no-condition-pn'),
]
matrix = [summary_key for summary_key, _ in fairness_tables]
name = [folder for _, folder in fairness_tables]

In [None]:
# Write one CSV per model for each fairness table listed in `matrix` / `name`.
for i, stat_key in enumerate(matrix):
    folder = name[i]
    path = './broward/fairness/' + folder + '/six-month/property/'

    ## confusion matrix and calibration only applies to cart, ebm, riskslim and stumps
    sources = [('cart', cart_summary),
               ('ebm', ebm_summary),
               ('riskslim', riskslim_cs_summary),
               ('stumps', stump_summary)]
    if i > 1:
        ## including arnold and compas now
        sources += [('arnold', arnold), ('compas', compas)]

    # Look every table up before writing anything, so a missing key fails
    # before any file for this table has been written.
    tables = [(label, summary[stat_key]) for label, summary in sources]

    ## save results
    for label, table in tables:
        table.to_csv(path + label + '-' + folder + '.csv', index=None, header=True)