In [1]:
import numpy as np
import pandas as pd
import RiskSLIM as risk
import csv

from riskslim.helper_functions import load_data_from_csv, print_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.utils import shuffle

### Lasso Feature Selection

In [2]:
## Read the binarised ("stumps") Kentucky dataset and split it into features and label.
data = pd.read_csv("C:/Users/binha/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_stumps.csv")

## Features: every column from the first one up to and including 'current_violence>=1'
## (label-based .loc slices are inclusive at both ends).
X = data.loc[:, :'current_violence>=1']

## Label: six-month violent recidivism, as a plain NumPy array.
Y = data['recid_violence_six_month'].values

## Column labels of the full feature matrix.
cols = X.columns

In [3]:
## Lasso feature selection: keep the columns whose coefficient is non-zero.
## NOTE(review): the selector is fit on every row of X, i.e. before the
## cross-validation split performed later in the notebook -- confirm this is intended.
lasso = Lasso(random_state=816, alpha=0.001)
lasso.fit(X, Y)

nonzero_mask = lasso.coef_ != 0
selected_features = list(cols[nonzero_mask])

## Display: number of selected features and the in-sample AUC of the Lasso scores.
len(selected_features), roc_auc_score(Y, lasso.predict(X))

(23, 0.8748946648396015)

#### Subset Features

In [4]:
### Subset features
## Build the column list as a NEW list instead of inserting into `selected_features`.
## Mutating the list in place made this cell non-idempotent: running it a second
## time prepended the outcome again, so `iloc[:, 1:]` would have carried the label
## into the feature matrix.
outcome_column = 'recid_violence_six_month'
sub_data = data[[outcome_column] + selected_features]

## Column 0 is the outcome; the remaining columns are the selected features.
## Work on an independent copy of the feature columns before adding to them.
sub_X, sub_Y = sub_data.iloc[:, 1:].copy(), sub_data.iloc[:, 0].values

## Constant column of ones placed first and named '(Intercept)'.
sub_X.insert(0, '(Intercept)', 1)

### Cross Validation

In [5]:
## Inputs for the RiskSLIM data dictionary assembled in the cross-validation cell.
outcome_name = 'recid_violence_six_month'
variable_names = list(sub_X.columns)

## Feature matrix as a NumPy array; labels reshaped into a single column.
SLIM_X = sub_X.values
SLIM_Y = sub_Y.reshape(-1, 1)

## One weight of 1 per observation.
sample_weights = np.full(len(sub_Y), 1)

In [6]:
## 5-fold cross-validation; shuffling is seeded so the fold assignment is reproducible.
cv = KFold(n_splits=5, random_state=816, shuffle=True)
train_auc, test_auc = [], []

## Feature names aligned with the matrices scored below: the model is trained on
## `variable_names`, and the leading '(Intercept)' column is dropped before scoring.
## (`cols` labels every column of the full stump matrix X, not the selected subset,
## so it does not line up with these matrices.)
feature_names = pd.Index(variable_names[1:])

for train, test in cv.split(SLIM_X, SLIM_Y):

    ## subset train data & store test data
    ## (integer-array indexing returns copies, so the relabelling further down
    ## does not modify SLIM_Y between folds)
    X_train, Y_train = SLIM_X[train], SLIM_Y[train]
    X_test, Y_test = SLIM_X[test], SLIM_Y[test]
    sample_weights_train = sample_weights[train]

    ## create new data dictionary
    new_train_data = {
        'X': X_train,
        'Y': Y_train,
        'variable_names': variable_names,
        'outcome_name': outcome_name,
        'sample_weights': sample_weights_train
    }

    ## fit the model
    model_info, mip_info, lcpa_info = risk.risk_slim(new_train_data, max_coefficient=20, max_L0_value=10,
                                                c0_value=1e-5, max_runtime=400)
    print_model(model_info['solution'], new_train_data)

    ## change data format
    X_train, X_test = X_train[:,1:], X_test[:,1:] ## remove the first column, which is "intercept"
    Y_train[Y_train == -1] = 0 ## change -1 to 0
    Y_test[Y_test == -1] = 0

    ## predicted probabilities, as column vectors
    train_prob = risk.riskslim_prediction(X_train, feature_names, model_info).reshape(-1,1)
    test_prob = risk.riskslim_prediction(X_test, feature_names, model_info).reshape(-1,1)

    ## AUC
    train_auc.append(roc_auc_score(Y_train, train_prob))
    test_auc.append(roc_auc_score(Y_test, test_prob))

setting c0 = 0.0 to ensure that intercept is not penalized
08/17/19 @ 04:11 PM | 1021 rows in lookup table
08/17/19 @ 04:11 PM | ------------------------------------------------------------
08/17/19 @ 04:11 PM | runnning initialization procedure
08/17/19 @ 04:11 PM | ------------------------------------------------------------
08/17/19 @ 04:11 PM | CPA produced 2 cuts
08/17/19 @ 04:11 PM | running naive rounding on 97 solutions
08/17/19 @ 04:11 PM | best objective value: 0.0762
08/17/19 @ 04:11 PM | rounding produced 5 integer solutions
08/17/19 @ 04:11 PM | best objective value is 0.0779
08/17/19 @ 04:11 PM | running sequential rounding on 97 solutions
08/17/19 @ 04:11 PM | best objective value: 0.0762
08/17/19 @ 04:11 PM | sequential rounding produced 6 integer solutions
08/17/19 @ 04:11 PM | best objective value: 0.0774
08/17/19 @ 04:11 PM | polishing 11 solutions
08/17/19 @ 04:11 PM | best objective value: 0.0774
08/17/19 @ 04:11 PM | polishing produced 5 integer solutions
08/17/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:11 PM | adding 250 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.0774.
Tried aggregator 1 time.
Reduced MIP has 25 rows, 50 columns, and 95 nonzeros.
Reduced MIP has 23 binaries, 25 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.05 ticks)
Probing time = 0.00 sec. (0.01 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.03 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variabl

 191897 17592        cutoff              0.0772        0.0768  1108334    0.55%          rho_18 U 191897  32532     26
 196017 15230        cutoff              0.0772        0.0768  1127321    0.49%           rho_2 D 196017 185985     24
 200059 12545        cutoff              0.0772        0.0769  1145264    0.42%           rho_4 U 200059 179330     20
 204047  9552        cutoff              0.0772        0.0769  1161775    0.33%          rho_21 D 204047  20760     25
Elapsed time = 108.33 sec. (168167.69 ticks, tree = 4.59 MB, solutions = 5)
 207989  6281        cutoff              0.0772        0.0770  1176779    0.24%           rho_1 U 207989  61947     24
 211875  2700        cutoff              0.0772        0.0771  1190055    0.12%          rho_12 U 211875  19533     24

User cuts applied:  1309

Root node processing (before b&c):
  Real time             =    0.09 sec. (1.23 ticks)
Sequential b&c:
  Real time             =  113.38 sec. (178293.35 ticks)
                       



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:13 PM | adding 258 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.0766.
Tried aggregator 1 time.
Reduced MIP has 25 rows, 50 columns, and 95 nonzeros.
Reduced MIP has 23 binaries, 25 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.05 ticks)
Probing time = 0.00 sec. (0.01 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.03 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variabl

 154096 23684        0.0756     7        0.0761        0.0754   891725    0.88%          rho_22 D 154096 154094     21
 157551 22748        0.0759    16        0.0761        0.0755   909100    0.83%          rho_11 U 157551 150468     18
 160982 21756        0.0757    20        0.0761        0.0755   926016    0.79%          rho_15 U 160982 119127     16
 164415 20678        cutoff              0.0761        0.0755   942782    0.74%          rho_21 U 164415  18611     23
Elapsed time = 105.14 sec. (167542.81 ticks, tree = 9.15 MB, solutions = 6)
 167804 19541        0.0758     9        0.0761        0.0756   959197    0.69%           rho_7 D 167804 167802     20
 171157 18224        cutoff              0.0761        0.0756   975030    0.64%           rho_0 D 171157  82189     20
 174402 16762        cutoff              0.0761        0.0756   990560    0.59%          rho_23 D 174402 126656     13
 177687 15220        cutoff              0.0761        0.0757  1005415    0.54%          rh



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:16 PM | adding 255 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.0764.
Tried aggregator 1 time.
Reduced MIP has 25 rows, 50 columns, and 95 nonzeros.
Reduced MIP has 23 binaries, 25 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.02 sec. (0.05 ticks)
Probing time = 0.00 sec. (0.01 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.03 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variabl

 159202 26725        0.0754     5        0.0759        0.0752   916399    0.92%          rho_12 U 159202  23607     21
 163256 25844        0.0755     9        0.0759        0.0752   936210    0.87%          rho_13 D 163256 163255     31
 167280 24862        0.0753     9        0.0759        0.0753   955524    0.82%          rho_22 D 167280 120329     23
 171238 23671        cutoff              0.0759        0.0753   974635    0.77%          rho_20 U 171238 171236     20
 175171 22374        cutoff              0.0759        0.0753   993023    0.72%          rho_20 U 175171  13340     26
 178944 20865        cutoff              0.0759        0.0754  1010933    0.67%          rho_21 D 178944 178942     34
 182787 19309        cutoff              0.0759        0.0754  1028635    0.62%          rho_20 U 182787 147128     13
 186496 17501        0.0758     9        0.0759        0.0755  1045832    0.56%          rho_13 D 186496 121314     25
Elapsed time = 121.36 sec. (168160.66 ticks, tre



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:18 PM | adding 250 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.0751.
Tried aggregator 1 time.
Reduced MIP has 25 rows, 50 columns, and 95 nonzeros.
Reduced MIP has 23 binaries, 25 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.02 sec. (0.05 ticks)
Probing time = 0.00 sec. (0.01 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.03 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variabl

setting c0 = 0.0 to ensure that intercept is not penalized
08/17/19 @ 04:20 PM | 1021 rows in lookup table
08/17/19 @ 04:20 PM | ------------------------------------------------------------
08/17/19 @ 04:20 PM | runnning initialization procedure
08/17/19 @ 04:20 PM | ------------------------------------------------------------
08/17/19 @ 04:21 PM | CPA produced 2 cuts
08/17/19 @ 04:21 PM | running naive rounding on 84 solutions
08/17/19 @ 04:21 PM | best objective value: 0.0742
08/17/19 @ 04:21 PM | rounding produced 5 integer solutions
08/17/19 @ 04:21 PM | best objective value is 0.0754
08/17/19 @ 04:21 PM | running sequential rounding on 84 solutions
08/17/19 @ 04:21 PM | best objective value: 0.0742
08/17/19 @ 04:21 PM | sequential rounding produced 6 integer solutions
08/17/19 @ 04:21 PM | best objective value: 0.0754
08/17/19 @ 04:21 PM | polishing 11 solutions
08/17/19 @ 04:21 PM | best objective value: 0.0754
08/17/19 @ 04:21 PM | polishing produced 5 integer solutions
08/17/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:21 PM | adding 254 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.0754.
Tried aggregator 1 time.
Reduced MIP has 25 rows, 50 columns, and 95 nonzeros.
Reduced MIP has 23 binaries, 25 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.05 ticks)
Probing time = 0.00 sec. (0.01 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.03 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variabl

 145704 19956        cutoff              0.0750        0.0744   855406    0.79%          rho_14 U 145704  30634     23
 148950 18895        0.0747    11        0.0750        0.0744   871636    0.74%          rho_14 U 148950  41526     17
 152131 17751        0.0748     9        0.0750        0.0745   887579    0.69%           rho_7 U 152131 152130     19
 155266 16450        cutoff              0.0750        0.0745   903043    0.63%           rho_3 U 155266 155264     22
 158348 15047        0.0747    17        0.0750        0.0746   918026    0.58%           rho_5 D 158348  93081     17
Elapsed time = 162.45 sec. (167968.53 ticks, tree = 7.12 MB, solutions = 7)
 161418 13539        0.0749     7        0.0750        0.0746   932613    0.52%          rho_21 D 161418 161417     30
 164468 11835        0.0750    11        0.0750        0.0746   946767    0.46%           rho_7 U 164468 164467     30
 167497  9985        0.0748    12        0.0750        0.0747   960320    0.40%           r

In [7]:
## Mean AUC across the folds: (train, test).
tuple(np.mean(fold_scores) for fold_scores in (train_auc, test_auc))

(0.860687437497601, 0.859231174972621)

### Save Results

In [8]:
#log model results to the model performance folder, as per standards
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\KY Recidivism\\KY Results\\Models\\Six Month\\"

## One row: label, then mean and standard deviation of the train AUCs,
## then mean and standard deviation of the test AUCs.
results = [["Violence", np.mean(train_auc), np.std(train_auc), np.mean(test_auc), np.std(test_auc)]]

## Append to the shared results file.
## newline='' is what the csv module requires for files handed to csv.writer;
## without it, platforms that translate '\n' on write (Windows) emit '\r\r\n'
## and every record is followed by a blank row.
with open(path + 'Six Month RiskSLIM.csv', 'a', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)