In [1]:
import numpy as np
import pandas as pd
import RiskSLIM as risk
import csv

from riskslim.helper_functions import load_data_from_csv, print_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.utils import shuffle

### Lasso Feature Selection

In [2]:
## load stumps data
data = pd.read_csv("C:/Users/binha/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_stumps.csv")
X, Y = data.loc[:,:'current_violence>=1'], data['recid_M_two_year'].values
cols = X.columns

In [3]:
## lasso
lasso = Lasso(random_state=816, alpha=0.001).fit(X, Y)
selected_features = cols[lasso.coef_ != 0].tolist()
len(selected_features), roc_auc_score(Y, lasso.predict(X))

(44, 0.711163842640677)

#### subset features

In [4]:
### Subset features
selected_features.insert(0, 'recid_M_two_year')
sub_data = data[selected_features]
sub_X, sub_Y = sub_data.iloc[:,1:], sub_data.iloc[:,0].values
sub_X.insert(0, '(Intercept)', 1)

### Cross Validation

In [5]:
SLIM_X, SLIM_Y = sub_X.values, sub_Y.reshape(-1,1)
variable_names = sub_X.columns.tolist()
outcome_name = 'recid_M_two_year'
sample_weights = np.repeat(1, len(sub_Y))

In [6]:
#cv = KFold(n_splits=5, random_state=816, shuffle=True)
cv = KFold(n_splits=5, random_state=816, shuffle=True)
train_auc, test_auc = [], []

i = 0
for train, test in cv.split(SLIM_X, SLIM_Y):
    
    ## subset train data & store test data
    X_train, Y_train = SLIM_X[train], SLIM_Y[train]
    X_test, Y_test = SLIM_X[test], SLIM_Y[test]
    sample_weights_train, sample_weights_test = sample_weights[train], sample_weights[test]

    ## create new data dictionary
    new_train_data = {
        'X': X_train,
        'Y': Y_train,
        'variable_names': variable_names,
        'outcome_name': outcome_name,
        'sample_weights': sample_weights_train
    }
        
    ## fit the model
    model_info, mip_info, lcpa_info = risk.risk_slim(new_train_data, max_coefficient=20, max_L0_value=10, 
                                                c0_value=1e-5, max_runtime=400)
    print_model(model_info['solution'], new_train_data)
    
    ## change data format
    X_train, X_test = X_train[:,1:], X_test[:,1:] ## remove the first column, which is "intercept"
    Y_train[Y_train == -1] = 0 ## change -1 to 0
    Y_test[Y_test == -1] = 0
    
    ## probability & accuracy
    train_prob = risk.riskslim_prediction(X_train, cols, model_info).reshape(-1,1)
    test_prob = risk.riskslim_prediction(X_test, cols, model_info).reshape(-1,1)
    
    ## AUC
    train_auc.append(roc_auc_score(Y_train, train_prob))
    test_auc.append(roc_auc_score(Y_test, test_prob))

setting c0 = 0.0 to ensure that intercept is not penalized
08/16/19 @ 10:06 AM | 1861 rows in lookup table
08/16/19 @ 10:06 AM | ------------------------------------------------------------
08/16/19 @ 10:06 AM | runnning initialization procedure
08/16/19 @ 10:06 AM | ------------------------------------------------------------
08/16/19 @ 10:06 AM | CPA produced 2 cuts
08/16/19 @ 10:06 AM | running naive rounding on 65 solutions
08/16/19 @ 10:06 AM | best objective value: 0.4462
08/16/19 @ 10:06 AM | rounding produced 5 integer solutions
08/16/19 @ 10:06 AM | best objective value is 0.4526
08/16/19 @ 10:06 AM | running sequential rounding on 65 solutions
08/16/19 @ 10:06 AM | best objective value: 0.4462
08/16/19 @ 10:06 AM | sequential rounding produced 6 integer solutions
08/16/19 @ 10:06 AM | best objective value: 0.4526
08/16/19 @ 10:06 AM | polishing 11 solutions
08/16/19 @ 10:06 AM | best objective value: 0.4526
08/16/19 @ 10:06 AM | polishing produced 5 integer solutions
08/16/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:06 AM | adding 249 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.4526.
Tried aggregator 1 time.
Reduced MIP has 46 rows, 92 columns, and 179 nonzeros.
Reduced MIP has 44 binaries, 46 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.04 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 124875 49251        0.4506    11        0.4526        0.4423  1231771    2.28%          rho_43 D 124875 124873     31
 127360 49946        0.4497    11        0.4526        0.4424  1252406    2.27%           rho_3 D 127360 127358     21
Elapsed time = 243.97 sec. (168244.00 ticks, tree = 21.51 MB, solutions = 1)
 129808 50680        0.4450     9        0.4526        0.4425  1273369    2.25%          rho_31 D 129808 129807     29
 132263 51376        0.4523     3        0.4526        0.4425  1293741    2.23%          rho_27 D 132263 132261     39
 134750 52136        0.4490    14        0.4526        0.4426  1314292    2.22%          rho_40 U 134750  30918     11
 137201 52850        cutoff              0.4526        0.4427  1334667    2.20%          rho_27 U 137201 137200     44
 139606 53538        0.4513    16        0.4526        0.4427  1355235    2.19%           rho_5 U 139606  30769     22
 141947 54196        0.4479    15        0.4526        0.4428  1376201    2.17%          r



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:13 AM | adding 250 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.4536.
Tried aggregator 1 time.
Reduced MIP has 46 rows, 92 columns, and 179 nonzeros.
Reduced MIP has 44 binaries, 46 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.04 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 102447 43529        0.4514    13        0.4526        0.4408  1026120    2.59%           rho_2 D 102447 102446     23
 104576 44276        0.4435    19        0.4526        0.4409  1046300    2.58%          rho_29 U 104576  50267     25
 106652 45032        0.4508     9        0.4526        0.4410  1065897    2.56%          rho_20 U 106652 106650     26
 108728 45725        0.4447    20        0.4526        0.4410  1085473    2.55%           rho_7 U 108728 108727     16
 110804 46410        0.4414    21        0.4526        0.4411  1104519    2.53%          rho_29 D 110804  59367     24
Elapsed time = 192.09 sec. (168269.94 ticks, tree = 21.29 MB, solutions = 5)
 112813 47078        cutoff              0.4526        0.4412  1123458    2.52%          rho_28 U 112813 112812     31
 114904 47773        cutoff              0.4526        0.4412  1142262    2.51%          rho_39 D 114904 114902     39
 116970 48445        0.4514    21        0.4526        0.4413  1160927    2.50%          r

setting c0 = 0.0 to ensure that intercept is not penalized
08/16/19 @ 10:20 AM | 1861 rows in lookup table
08/16/19 @ 10:20 AM | ------------------------------------------------------------
08/16/19 @ 10:20 AM | runnning initialization procedure
08/16/19 @ 10:20 AM | ------------------------------------------------------------
08/16/19 @ 10:20 AM | CPA produced 2 cuts
08/16/19 @ 10:20 AM | running naive rounding on 54 solutions
08/16/19 @ 10:20 AM | best objective value: 0.4476
08/16/19 @ 10:20 AM | rounding produced 5 integer solutions
08/16/19 @ 10:20 AM | best objective value is 0.4716
08/16/19 @ 10:20 AM | running sequential rounding on 54 solutions
08/16/19 @ 10:20 AM | best objective value: 0.4476
08/16/19 @ 10:20 AM | sequential rounding produced 6 integer solutions
08/16/19 @ 10:20 AM | best objective value: 0.4546
08/16/19 @ 10:20 AM | polishing 11 solutions
08/16/19 @ 10:20 AM | best objective value: 0.4546
08/16/19 @ 10:20 AM | polishing produced 5 integer solutions
08/16/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:20 AM | adding 249 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.4544.
Tried aggregator 1 time.
Reduced MIP has 46 rows, 92 columns, and 179 nonzeros.
Reduced MIP has 44 binaries, 46 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.02 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.04 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 116926 51728        0.4503    15        0.4533        0.4405  1084015    2.81%          rho_29 D 116926 116925     37
 119097 52672        0.4472    18        0.4533        0.4405  1106428    2.81%          rho_40 D 119097 119095     30
 121194 53580        0.4469    19        0.4533        0.4405  1128472    2.81%          rho_22 D 121194 121193     24
 123387 54552        0.4509    16        0.4533        0.4405  1150626    2.81%           rho_7 D 123387 123386     25
 125561 55594        0.4405    14        0.4533        0.4405  1173227    2.81%           rho_0 D 125561 125542     21
Elapsed time = 158.00 sec. (168233.54 ticks, tree = 24.42 MB, solutions = 5)
 127719 56546        0.4437    23        0.4533        0.4405  1195163    2.81%           rho_1 U 127719 127718     19
 129726 57447        0.4417    21        0.4533        0.4405  1217921    2.81%           rho_1 D 129726 119343     24
 131966 58480        cutoff              0.4533        0.4405  1239816    2.81%          r



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:27 AM | adding 250 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.4548.
Tried aggregator 1 time.
Reduced MIP has 46 rows, 92 columns, and 179 nonzeros.
Reduced MIP has 44 binaries, 46 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.04 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 121987 55188        0.4505    23        0.4530        0.4410  1282693    2.63%           rho_0 D 121987 121986     32
 124057 56032        0.4506    15        0.4530        0.4410  1308125    2.63%          rho_28 D 124057 124055     25
 126189 56912        0.4436    19        0.4530        0.4410  1333323    2.63%           rho_0 U 126189  79733     20
 128230 57757        0.4524    14        0.4530        0.4410  1358959    2.63%          rho_29 D 128230 128229     32
Elapsed time = 83.11 sec. (168218.51 ticks, tree = 24.59 MB, solutions = 4)
 130257 58561        0.4516    11        0.4530        0.4410  1383995    2.63%          rho_40 D 130257 130256     36
 132270 59345        0.4482    17        0.4530        0.4410  1409264    2.63%          rho_23 D 132270 132268     22
 134348 60191        0.4460    16        0.4530        0.4410  1434177    2.63%           rho_6 D 134348 134346     30
 136383 61057        0.4444    19        0.4530        0.4410  1459260    2.63%           r

 385185 134241        0.4502     7        0.4529        0.4442  3836318    1.94%          rho_41 D 385185 385184     38
 387210 134684        0.4470    17        0.4529        0.4442  3854028    1.93%          rho_20 D 387210 387209     32
 389226 135165        cutoff              0.4529        0.4442  3871846    1.93%          rho_23 U 389226 389224     18
 391214 135599        cutoff              0.4529        0.4442  3890295    1.92%          rho_28 D 391214 391212     31
 393221 136026        cutoff              0.4529        0.4442  3908356    1.92%          rho_42 D 393221 393219     42
 395239 136527        0.4519    11        0.4529        0.4443  3926237    1.92%          rho_35 D 395239 395237     27
 397244 136960        0.4496    17        0.4529        0.4443  3943856    1.91%           rho_5 U 397244 397243     25
 399220 137372        0.4484    19        0.4529        0.4443  3962112    1.91%          rho_35 D 399220 399218     34
Elapsed time = 305.58 sec. (664303.59 ti



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:34 AM | adding 249 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.4531.
Tried aggregator 1 time.
Reduced MIP has 46 rows, 92 columns, and 179 nonzeros.
Reduced MIP has 44 binaries, 46 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.04 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 113413 48977        0.4446    17        0.4522        0.4408  1183684    2.52%          rho_26 U 113413  67797     20
 115831 49826        0.4425    26        0.4522        0.4409  1204806    2.50%           rho_2 U 115831  89515     28
 118328 50714        0.4424    14        0.4522        0.4410  1225595    2.49%          rho_25 U 118328  72413     32
Elapsed time = 81.09 sec. (168238.82 ticks, tree = 22.51 MB, solutions = 3)
 120694 51591        0.4519    13        0.4522        0.4410  1247139    2.47%          rho_10 D 120694 120693     25
 123129 52490        cutoff              0.4522        0.4411  1268476    2.46%           rho_5 D 123129 123127     24
 125575 53310        0.4507     9        0.4522        0.4411  1288886    2.45%          rho_42 D 125575 125574     33
 128016 54134        cutoff              0.4522        0.4412  1309125    2.43%          rho_17 U 128016 128015     29
 130420 55031        cutoff              0.4522        0.4413  1329723    2.42%           r

 401853 120891        0.4518    15        0.4522        0.4451  3510330    1.56%          rho_17 D 401853 401852     27
 404019 121312        0.4511    14        0.4522        0.4451  3526965    1.56%          rho_10 D 404019 404017     19
 406201 121686        0.4492     9        0.4522        0.4452  3543298    1.56%          rho_37 D 406201 406199     29
 408360 122045        0.4502    11        0.4522        0.4452  3559849    1.55%           rho_4 D 408360 408358     27
 410559 122432        0.4503    11        0.4522        0.4452  3575980    1.55%          rho_20 D 410559 410557     38
Elapsed time = 289.78 sec. (664319.47 ticks, tree = 57.44 MB, solutions = 3)
 412728 122834        0.4520    17        0.4522        0.4452  3592626    1.55%           rho_4 U 412728 412727     24
 414868 123165        0.4520     9        0.4522        0.4452  3609030    1.54%          rho_17 D 414868 414867     31
 417032 123528        0.4468    11        0.4522        0.4453  3625439    1.54%   

In [7]:
np.mean(train_auc), np.mean(test_auc)

(0.6680906962385309, 0.6679021727513085)

### Save Results

In [7]:
#log model results to the model performance folder, as per standards
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\KY Recidivism\\KY Results\\Models\\Two Year\\"                   
results = [["Misdemeanor", np.mean(train_auc), np.std(train_auc), np.mean(test_auc), np.std(test_auc)]]

with open(path + 'Two Year RiskSLIM.csv', 'a') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)