In [1]:
import pandas as pd 
import numpy as np
import csv
import advance_functions as advance
import stumps
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc

### AdaBoost & EBM

In [2]:
### Load the Kentucky data set, dropping the identifier and risk-score columns
### that must not be used as model features.
# NOTE(review): the path is tied to one user's home directory — consider a configurable data dir.
data = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_data.csv"
).drop(
    columns=['PersonID', 'screening_date',
             'fta_risk_score_raw', 'nca_risk_score_raw', 'pvf_risk_score_raw',
             'fta_calc', 'nca_calc', 'pvf_calc']
)

# Features: every remaining column up to and including 'current_violence'
# (label-based slices are inclusive on both ends).
X = data.loc[:, :'current_violence'].values

# Binary outcome used as the prediction target.
Y = data['recid_drug_two_year'].values

In [3]:
#### AdaBoost — single-point hyper-parameter grid.
# The trailing 816 is presumably the random seed (same value as the KFold
# random_state used further down) — confirm in advance_functions.
learning_rate, estimators = [1], [80]
ada_auc, ada_std, ada_auc_diff, ada_param = advance.Adaboost(
    X, Y, learning_rate, estimators, 816
)

#### EBM (labelled "GAM" in the original notes) — single-point grid as well.
learning_rate, depth, estimators = [0.3], [2], [80]
ebm_auc, ebm_std, ebm_auc_diff, ebm_param = advance.EBM(
    X, Y, learning_rate, depth, estimators, 816
)

### Lasso Stumps

In [None]:
## Load the binarised "stumps" data set and split it into features / label.
data = pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_stumps.csv")

# Features: every column up to and including 'current_violence>=1'
# (label-based slices are inclusive on both ends).
X_stumps = data.loc[:, :'current_violence>=1']

# Take an explicit copy of the label column before recoding it. `.values`
# hands back an array that can share memory with `data`: editing it in place
# either rewrites the DataFrame behind the reader's back or, with pandas
# Copy-on-Write enabled, fails because the array is read-only.
Y_stumps = data['recid_drug_two_year'].to_numpy(copy=True)

# Recode the label from {-1, 1} to {0, 1}.
Y_stumps[Y_stumps == -1] = 0

# Column names handed to the stumps model: everything except the last 14 columns.
# NOTE(review): presumably those 14 are label/score columns and `cols` matches
# X_stumps.columns — confirm against the CSV layout.
cols = data.columns[:-14]

In [None]:
### Lasso-on-stumps model — single-point grid for the regularisation strength.
# The trailing 816 is presumably the random seed — confirm in the stumps module.
alpha = [0.001]
Stump = stumps.stump_features(
    X_stumps,
    Y_stumps,
    cols,
    alpha,
    816,
)

### Arnold PSA

In [3]:
### Re-read the full Kentucky data: the NCA risk-score columns used here are
### the ones dropped from the modelling frame at the top of the notebook.
data = pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_data.csv")

# Raw score, calculated score and outcome, each as a NumPy array.
X_arnold_raw, X_arnold_calc, Y_arnold = (
    data[column].values
    for column in ('nca_risk_score_raw', 'nca_calc', 'recid_drug_two_year')
)

In [6]:
## Cross-validated AUC of the Arnold PSA scores.
## The scores are columns read straight from the data file, so nothing is
## fitted here: each fold's held-out rows are scored directly and the training
## indices produced by the splitter are not needed.
cv = KFold(n_splits=5, shuffle=True, random_state=816)
raw_auc = []
calc_auc = []

for _train_idx, test_idx in cv.split(X_arnold_raw, Y_arnold):
    y_test = Y_arnold[test_idx]

    ## raw score: ROC curve on the held-out fold, then area under it
    raw_fpr, raw_tpr = roc_curve(y_test, X_arnold_raw[test_idx])[:2]
    raw_auc.append(auc(raw_fpr, raw_tpr))

    ## calculated score: same evaluation on the same fold
    calc_fpr, calc_tpr = roc_curve(y_test, X_arnold_calc[test_idx])[:2]
    calc_auc.append(auc(calc_fpr, calc_tpr))

### Results

In [9]:
# Summary table, one row per model.
# NOTE(review): the rows are ragged — AdaBoost/EBM carry (auc, auc_diff, params)
# while the remaining rows carry (mean fold AUC, std of fold AUC), both rounded
# to 3 decimals. Confirm that mixing auc_diff and std in the third slot is intended.
results = [
    ["AdaBoost", ada_auc, ada_auc_diff, ada_param],
    ["EBM", ebm_auc, ebm_auc_diff, ebm_param],
] + [
    [label, round(np.mean(fold_aucs), 3), round(np.std(fold_aucs), 3)]
    for label, fold_aucs in (
        ('Lasso Stumps', Stump['test_auc']),
        ('Arnold PSA Raw', raw_auc),
        ('Arnold PSA', calc_auc),
    )
]

In [5]:
# Display the summary table.
# NOTE(review): the saved output below shows only the AdaBoost and EBM rows,
# whereas the cell that builds `results` produces five rows, and this cell's
# execution count is lower than that cell's — the output looks stale; re-run
# the notebook top to bottom to refresh it.
results

[['AdaBoost',
  0.7010605503585705,
  0.005001537717634563,
  {'learning_rate': 1, 'n_estimators': 80}],
 ['EBM',
  0.7009131349503854,
  0.006590818275568955,
  {'learning_rate': 0.3, 'max_tree_splits': 2, 'n_estimators': 80}]]

In [6]:
# Append this outcome's row to the shared two-year summary file.
# NOTE(review): this is an absolute Windows path while the data is read from
# "~/Documents/..." — consider deriving both from one configurable base directory.
path = "C:/Users/binha/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY Results/Models/Two Year/"

# One row: outcome label, then (mean AUC, std) for AdaBoost, EBM, Lasso Stumps,
# Arnold PSA raw and Arnold PSA calculated, all rounded to 3 decimals.
results = [[
    "Drug",
    round(ada_auc, 3), round(ada_std, 3),
    round(ebm_auc, 3), round(ebm_std, 3),
    round(np.mean(Stump['test_auc']), 3), round(np.std(Stump['test_auc']), 3),
    round(np.mean(raw_auc), 3), round(np.std(raw_auc), 3),
    round(np.mean(calc_auc), 3), round(np.std(calc_auc), 3),
]]

# newline='' lets the csv writer control line endings itself; without it every
# row is followed by a blank line on Windows. Mode 'a' appends, so re-running
# this cell adds another row and no header is written here.
with open(path + 'Two Year Models Summary.csv', 'a', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)