In [1]:
import pandas as pd 
import numpy as np
import csv
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc

In [2]:
### configuration
# Location of the input CSV.
# NOTE(review): this is a user-specific home-directory path; point it at your
# own copy of the file before running the notebook.
DATA_PATH = "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_data.csv"
N_SPLITS = 5  # number of cross-validation folds
SEED = 816    # fixed seed so the shuffled fold assignment is reproducible

### load data
data = pd.read_csv(DATA_PATH)
# Precomputed score columns; later cells evaluate them against outcome labels.
X_raw = data['nca_risk_score_raw'].values
X_calc = data['nca_calc'].values

## set up cross validation
# The same splitter object is reused by every evaluation cell below.
cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

### Arnold PSA Raw

In [3]:
# Outcome columns that are scored against the raw score in X_raw.
labels = ['recid_six_month', 'recid_drug_six_month', 'recid_property_six_month', 'recid_F_six_month', 'recid_M_six_month']

# One row per label: [label, mean AUC over the folds, std of AUC over the folds].
raw_results = []
for label in labels:
    Y = data[label].values
    fold_aucs = []
    # No model is fitted here: the precomputed score is used directly as the
    # prediction, so only the held-out indices of each split are needed.
    for _train_idx, test_idx in cv.split(X_raw, Y):
        fpr, tpr, _thresholds = roc_curve(Y[test_idx], X_raw[test_idx])
        fold_aucs.append(auc(fpr, tpr))
    # float(...) stores plain Python floats so the list displays as 0.71 rather
    # than np.float64(0.71) under NumPy >= 2; the rounded values are unchanged.
    raw_results.append([
        label,
        float(round(np.mean(fold_aucs), 3)),
        float(round(np.std(fold_aucs), 3)),
    ])

In [4]:
# Show the accumulated rows; each row is [label, mean fold AUC, std of fold AUC].
raw_results

[['recid_six_month', 0.71, 0.004],
 ['recid_drug_six_month', 0.672, 0.008],
 ['recid_property_six_month', 0.736, 0.013],
 ['recid_F_six_month', 0.719, 0.005],
 ['recid_M_six_month', 0.704, 0.006]]

In [5]:
# Violence uses its own score column and outcome column.
violence_X_raw = data['pvf_risk_score_raw'].values
violence_Y = data['recid_violence_six_month'].values

# Per-fold AUC of the precomputed score on the held-out indices; nothing is
# trained, so the training indices of each split are ignored.
violence_raw = []
for _train_idx, test_idx in cv.split(violence_X_raw, violence_Y):
    fpr, tpr, _thresholds = roc_curve(violence_Y[test_idx], violence_X_raw[test_idx])
    violence_raw.append(auc(fpr, tpr))

# Drop any 'recid_violence' row left by a previous run of this cell (in place,
# so the list object is preserved) before appending; re-running the cell then
# cannot produce duplicate rows.
raw_results[:] = [row for row in raw_results if row[0] != 'recid_violence']
# float(...) keeps the stored values as plain Python floats (clean display
# under NumPy >= 2); the rounded values themselves are unchanged.
raw_results.append([
    'recid_violence',
    float(round(np.mean(violence_raw), 3)),
    float(round(np.std(violence_raw), 3)),
])

In [6]:
# Show the rows again, now including the 'recid_violence' row appended above.
raw_results

[['recid_six_month', 0.71, 0.004],
 ['recid_drug_six_month', 0.672, 0.008],
 ['recid_property_six_month', 0.736, 0.013],
 ['recid_F_six_month', 0.719, 0.005],
 ['recid_M_six_month', 0.704, 0.006],
 ['recid_violence', 0.85, 0.006]]

### Arnold PSA Calc

In [7]:
# Outcome columns that are scored against the score in X_calc.
labels = ['recid_six_month', 'recid_drug_six_month', 'recid_property_six_month', 'recid_F_six_month', 'recid_M_six_month']

# One row per label: [label, mean AUC over the folds, std of AUC over the folds].
calc_results = []
for label in labels:
    Y = data[label].values
    fold_aucs = []
    # The score column is evaluated as-is on each held-out fold; the training
    # indices returned by the splitter are not used.
    for _train_idx, test_idx in cv.split(X_calc, Y):
        fpr, tpr, _thresholds = roc_curve(Y[test_idx], X_calc[test_idx])
        fold_aucs.append(auc(fpr, tpr))
    # Convert to plain Python floats so the list repr stays 0.702 instead of
    # np.float64(0.702) under NumPy >= 2; rounding is done first, as before.
    calc_results.append([
        label,
        float(round(np.mean(fold_aucs), 3)),
        float(round(np.std(fold_aucs), 3)),
    ])

In [8]:
# Show the accumulated rows; each row is [label, mean fold AUC, std of fold AUC].
calc_results

[['recid_six_month', 0.702, 0.003],
 ['recid_drug_six_month', 0.671, 0.006],
 ['recid_property_six_month', 0.704, 0.02],
 ['recid_F_six_month', 0.709, 0.005],
 ['recid_M_six_month', 0.697, 0.004]]

In [9]:
# Violence uses its own score column and outcome column.
violence_calc_X = data['pvf_calc'].values
violence_Y = data['recid_violence_six_month'].values

# Per-fold AUC of the score on the held-out indices only; no fitting happens,
# so the training half of each split is discarded.
violence_calc = []
for _train_idx, test_idx in cv.split(violence_calc_X, violence_Y):
    fpr, tpr, _thresholds = roc_curve(violence_Y[test_idx], violence_calc_X[test_idx])
    violence_calc.append(auc(fpr, tpr))

# Make the cell idempotent: remove a 'recid_violence' row left by an earlier
# run (in place, keeping the same list object) before appending a fresh one.
calc_results[:] = [row for row in calc_results if row[0] != 'recid_violence']
# Plain Python floats display cleanly under NumPy >= 2; values are unchanged.
calc_results.append([
    'recid_violence',
    float(round(np.mean(violence_calc), 3)),
    float(round(np.std(violence_calc), 3)),
])

In [10]:
# Show the rows again, now including the 'recid_violence' row appended above.
calc_results

[['recid_six_month', 0.702, 0.003],
 ['recid_drug_six_month', 0.671, 0.006],
 ['recid_property_six_month', 0.704, 0.02],
 ['recid_F_six_month', 0.709, 0.005],
 ['recid_M_six_month', 0.697, 0.004],
 ['recid_violence', 0.849, 0.005]]