The baseline model is simply what COMPAS predicted (stored in this notebook as the `COMPASS_determination` column).

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
import matplotlib.pyplot as plt

## Read training dataset from CSV

In [2]:
# Location of the dataset: a CSV in the What-If Tool resources bucket (fetched over HTTPS).
COMPAS_CSV_URL = 'https://storage.googleapis.com/what-if-tool-resources/computefest2019/cox-violent-parsed_filt.csv'

# Load the whole file into a DataFrame; later cells filter and extend `df`.
df = pd.read_csv(COMPAS_CSV_URL)

In [3]:
# Sanity check: (rows, columns) of the frame as loaded, before any filtering.
df.shape

(18316, 40)

In [4]:
# Peek at the first two rows to see which columns are available.
df.head(2)

Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,juv_fel_count,...,vr_charge_desc,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,priors_count.1,event
0,1.0,miguel hernandez,miguel,hernandez,Male,18/04/1947,69,Greater than 45,Other,0,...,,Risk of Recidivism,1,Low,14/08/2013,Risk of Violence,1,Low,0,0
1,2.0,miguel hernandez,miguel,hernandez,Male,18/04/1947,69,Greater than 45,Other,0,...,,Risk of Recidivism,1,Low,14/08/2013,Risk of Violence,1,Low,0,0


## Specify input columns and columns to predict

In [5]:
# Keep only rows that have both a recidivism outcome and a COMPASS decile score
# (-1 is the value filtered out as "missing" in each column).
# .copy() makes the result an independent frame, so the column assignments
# below do not trigger SettingWithCopyWarning on a filtered slice.
df = df[(df['is_recid'] != -1) & (df['decile_score'] != -1)].copy()

# Expose the recidivism flag under a more descriptive name (the original column is kept).
df['recidivism_within_2_years'] = df['is_recid']

# Make the COMPASS label column numeric (0 and 1), for use in our model:
# 'Low' -> 0, every other score_text value -> 1.
# NOTE(review): a missing score_text would also map to 1 — confirm none remain after filtering.
df['COMPASS_determination'] = np.where(df['score_text'] == 'Low', 0, 1)

# Set column to predict
label_column = 'COMPASS_determination'

# Get list of all columns from the dataset we will use for model input or output.
input_features = ['sex', 'age', 'race', 'priors_count', 'juv_fel_count', 'juv_misd_count', 'juv_other_count']
features_and_labels = input_features + [label_column]

# Columns to keep when exporting: the model inputs plus both the observed outcome
# and the COMPASS label.
features_for_file = input_features + ['recidivism_within_2_years', 'COMPASS_determination']

In [23]:
# List every column, including the two derived ones added in the cell above.
# NOTE(review): this cell's execution count (23) is out of sequence with its
# neighbours — presumably it was run late; re-run the notebook top to bottom.
df.columns

Index(['id', 'name', 'first', 'last', 'sex', 'dob', 'age', 'age_cat', 'race',
       'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count',
       'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
       'c_days_from_compas', 'c_charge_degree', 'c_charge_desc', 'is_recid',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'violent_recid', 'is_violent_recid',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'priors_count.1', 'event', 'recidivism_within_2_years',
       'COMPASS_determination'],
      dtype='object')

## Model Evaluation
-- Cinny's prelim eval of the baseline model

In [6]:
def get_data(df, truth_label='recidivism_within_2_years', pred_label='COMPASS_determination'):
    """Split ``df`` into the four confusion-matrix subsets.

    Args:
        df: DataFrame containing both label columns.
        truth_label: Name of the column holding the observed outcome (0 or 1).
        pred_label: Name of the column holding the predicted outcome (0 or 1).

    Returns:
        A tuple ``(tp, tn, fp, fn)`` of DataFrames: true positives, true
        negatives, false positives and false negatives. Rows whose labels are
        neither 0 nor 1 do not appear in any of the four subsets.
    """
    def _rows_where(truth_value, pred_value):
        # Rows whose observed and predicted labels match the requested pair.
        return df[(df[truth_label] == truth_value) & (df[pred_label] == pred_value)]

    return (
        _rows_where(1, 1),  # true positives
        _rows_where(0, 0),  # true negatives
        _rows_where(0, 1),  # false positives
        _rows_where(1, 0),  # false negatives
    )

In [7]:
def get_length(tp, tn, fp, fn):
    """Return the sizes of the four confusion-matrix subsets as ``(TP, TN, FP, FN)``."""
    return tuple(len(subset) for subset in (tp, tn, fp, fn))

In [8]:
def get_accuracy(df, truth_label='recidivism_within_2_years', pred_label='COMPASS_determination'):
    """Compute accuracy, ``(TP + TN) / (TP + FP + FN + TN)``.

    Args:
        df: DataFrame containing both label columns.
        truth_label: Name of the column holding the observed outcome (0 or 1).
        pred_label: Name of the column holding the predicted outcome (0 or 1).

    Returns:
        The fraction of counted rows whose prediction matches the outcome.

    Raises:
        ZeroDivisionError: If no row falls into any confusion-matrix cell.
    """
    # Forward the caller's column names so non-default labels are honoured.
    tp, tn, fp, fn = get_data(df, truth_label=truth_label, pred_label=pred_label)
    TP, TN, FP, FN = get_length(tp, tn, fp, fn)
    return (TP+TN)/(TP+FP+FN+TN)

In [9]:
def get_precision(df, truth_label='recidivism_within_2_years', pred_label='COMPASS_determination'):
    """Compute precision, ``TP / (TP + FP)``.

    Args:
        df: DataFrame containing both label columns.
        truth_label: Name of the column holding the observed outcome (0 or 1).
        pred_label: Name of the column holding the predicted outcome (0 or 1).

    Returns:
        The fraction of positive predictions that were actually positive.

    Raises:
        ZeroDivisionError: If there are no positive predictions (TP + FP == 0).
    """
    # Forward the caller's column names so non-default labels are honoured.
    tp, tn, fp, fn = get_data(df, truth_label=truth_label, pred_label=pred_label)
    TP, TN, FP, FN = get_length(tp, tn, fp, fn)
    return (TP)/(TP+FP)

In [10]:
def get_recall(df, truth_label='recidivism_within_2_years', pred_label='COMPASS_determination'):
    """Compute recall, ``TP / (TP + FN)``.

    Args:
        df: DataFrame containing both label columns.
        truth_label: Name of the column holding the observed outcome (0 or 1).
        pred_label: Name of the column holding the predicted outcome (0 or 1).

    Returns:
        The fraction of actual positives that were predicted positive.

    Raises:
        ZeroDivisionError: If there are no actual positives (TP + FN == 0).
    """
    # Forward the caller's column names so non-default labels are honoured.
    tp, tn, fp, fn = get_data(df, truth_label=truth_label, pred_label=pred_label)
    TP, TN, FP, FN = get_length(tp, tn, fp, fn)
    return (TP)/(TP+FN)

In [11]:
def get_f1(df, truth_label='recidivism_within_2_years', pred_label='COMPASS_determination'):
    """Compute the F1 score, ``2 * (P * R) / (P + R)``.

    Args:
        df: DataFrame containing both label columns.
        truth_label: Name of the column holding the observed outcome (0 or 1).
        pred_label: Name of the column holding the predicted outcome (0 or 1).

    Returns:
        The harmonic mean of precision and recall.

    Raises:
        ZeroDivisionError: If precision or recall is undefined, or if both are 0.
    """
    # Forward the caller's column names so both component metrics are computed
    # on the requested columns.
    P = get_precision(df, truth_label=truth_label, pred_label=pred_label)
    R = get_recall(df, truth_label=truth_label, pred_label=pred_label)
    return 2*(P*R)/(P+R)

### Accuracy
`Accuracy = (TP+TN)/(TP+FP+FN+TN)`

Accuracy is a valid choice of evaluation metric for classification problems that are well balanced, i.e. that have no significant class imbalance or skew.

In [12]:
# Accuracy of COMPASS_determination against recidivism_within_2_years (the default label columns).
get_accuracy(df)

0.6261799874134676

### Precision
`Precision = (TP)/(TP+FP)`

Precision is a valid choice of evaluation metric when we want to be very sure of our prediction.

In [13]:
# Precision of COMPASS_determination against recidivism_within_2_years (the default label columns).
get_precision(df)

0.6002136752136752

### Recall

`Recall = (TP)/(TP+FN)`

Recall is a valid choice of evaluation metric when we want to capture as many positives as possible.

In [14]:
# Recall of COMPASS_determination against recidivism_within_2_years (the default label columns).
get_recall(df)

0.6680142687277051

### F1 Score

`F1 = 2 * (precision * recall) / (precision + recall)`

The F1 score is a number between 0 (worst) and 1 (best). It is used when you want your model to have both good precision and recall.

### F_beta

`F_beta = (1 + beta^2) * (precision * recall) / ( (beta^2 * precision) + recall )`

The F1 score gives equal weight to precision and recall. In F_beta, `beta` controls that balance: recall is treated as `beta` times as important as precision.

In [15]:
# F1 score of COMPASS_determination against recidivism_within_2_years (the default label columns).
get_f1(df)

0.6323016319639843

## Bias Assessment

In [16]:
# Confusion-matrix cell counts per race: number of non-null `id` values for each
# (race, actual outcome, COMPASS determination) combination.
(
    df
    .groupby(['race', 'recidivism_within_2_years', 'COMPASS_determination'])['id']
    .count()
)

race              recidivism_within_2_years  COMPASS_determination
African-American  0                          0                        1212
                                             1                        1421
                  1                          0                         746
                                             1                        2264
Asian             0                          0                          23
                                             1                           6
                  1                          0                           2
                                             1                           8
Caucasian         0                          0                        1410
                                             1                         629
                  1                          0                         653
                                             1                         823
Hispanic          0              

In [17]:
# Derive the error counts from the data rather than copying them by hand from
# the groupby output, so they stay correct if the filtering or dataset changes.
_cell_counts = df.groupby(['race', 'recidivism_within_2_years', 'COMPASS_determination'])['id'].count()

# Index is (race, actual outcome, COMPASS determination):
#   false positive = actual 0, determination 1
#   false negative = actual 1, determination 0
# int() keeps these as plain Python ints, so the ratios computed from them
# display as ordinary floats.
cfp = int(_cell_counts.loc[('Caucasian', 0, 1)])
cfn = int(_cell_counts.loc[('Caucasian', 1, 0)])
aafp = int(_cell_counts.loc[('African-American', 0, 1)])
aafn = int(_cell_counts.loc[('African-American', 1, 0)])

In [18]:
# Share of Caucasian misclassifications that are false positives: FP / (FP + FN).
# Note this is not the false positive rate, which would be FP / (FP + TN).
cfp/(cfp+cfn)

0.4906396255850234

In [19]:
# Share of Caucasian misclassifications that are false negatives: FN / (FP + FN).
# Note this is not the false negative rate, which would be FN / (FN + TP).
cfn/(cfp+cfn)

0.5093603744149766

In [20]:
# Share of African-American misclassifications that are false positives: FP / (FP + FN).
# Note this is not the false positive rate, which would be FP / (FP + TN).
aafp/(aafp+aafn)

0.6557452699584679

In [21]:
# Share of African-American misclassifications that are false negatives: FN / (FP + FN).
# Note this is not the false negative rate, which would be FN / (FN + TP).
aafn/(aafp+aafn)

0.34425473004153206

### Duplicate ProPublica Analysis

Prediction Fails Differently for Black Defendants ([source](https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing))

|                                           | White | African American |
|-------------------------------------------|-------|------------------|
| Labeled Higher Risk, But Didn’t Re-Offend | 49%   | 66%              |
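| Labeled Lower Risk, Yet Did Re-Offend     | 51%   | 34%              |

**Note:** the percentages in this table are the values computed in this notebook, i.e. each error type's share of a group's misclassifications (`FP / (FP + FN)` and `FN / (FP + FN)`). ProPublica's published table instead reports false positive and false negative rates (`FP / (FP + TN)` and `FN / (FN + TP)`), so these figures are not directly comparable to the article's.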