In [1]:
import numpy as np
import pandas as pd
import RiskSLIM as risk
import csv

from riskslim.helper_functions import load_data_from_csv, print_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.utils import shuffle

### Lasso Feature Selection

In [2]:
## load stumps data
# Read the stumps table from its CSV on disk.
data = pd.read_csv(
    "C:/Users/binha/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_stumps.csv"
)
# Features: every column from the first one through 'current_violence>=1'
# (label-based .loc slices include the end label).
X = data.loc[:, :'current_violence>=1']
# Outcome: the 'recid_F_two_year' column as a plain numpy array.
Y = data['recid_F_two_year'].values
# Keep the feature names so lasso coefficients can be mapped back to columns.
cols = X.columns

In [3]:
## lasso
# Fit an L1-penalised linear model on the full data set; features whose
# coefficient is exactly zero are treated as dropped.
lasso = Lasso(alpha=0.001, random_state=816)
lasso.fit(X, Y)
selected_features = cols[lasso.coef_ != 0].tolist()
# Display the number of surviving features and the AUC of the lasso scores
# on the same data the model was fitted on (in-sample, not held-out).
len(selected_features), roc_auc_score(Y, lasso.predict(X))

(36, 0.743877420048611)

#### subset features

In [4]:
### Subset features
# Put the outcome column first, followed by the lasso-selected features.
# The list is rebuilt instead of mutated with .insert(), so re-running this
# cell does not prepend the outcome a second time (which would duplicate the
# column and let the label slip into sub_X as a feature).
label_col = 'recid_F_two_year'
selected_features = [label_col] + [name for name in selected_features if name != label_col]
sub_data = data[selected_features]

# Column 0 is the outcome; everything after it is a feature.
# .copy() gives sub_X its own storage before a column is inserted into it.
sub_X = sub_data.iloc[:, 1:].copy()
sub_Y = sub_data.iloc[:, 0].values

# Constant column of ones at position 0, named '(Intercept)'.
sub_X.insert(0, '(Intercept)', 1)

### Cross Validation

In [5]:
# Inputs for the RiskSLIM data dictionary built in the cross-validation cell.
# Feature matrix as a numpy array (first column is '(Intercept)').
SLIM_X = sub_X.values
# Outcome as a column vector of shape (n, 1).
SLIM_Y = sub_Y.reshape(-1, 1)
variable_names = list(sub_X.columns)
outcome_name = 'recid_F_two_year'
# Every observation gets the same integer weight of 1.
sample_weights = np.ones(len(sub_Y), dtype=int)

In [6]:
## 5-fold cross validation: fit RiskSLIM on each training split and record
## the train / test AUC of the resulting scoring system.
cv = KFold(n_splits=5, random_state=816, shuffle=True)
train_auc, test_auc = [], []

## Names matching the columns of X_train / X_test once the leading
## '(Intercept)' column has been dropped. `cols` holds the names of all
## columns of X before lasso selection and does not line up with these arrays.
## NOTE(review): riskslim_prediction is defined outside this notebook; this
## passes the same type (pandas Index) that `cols` was -- confirm it only
## uses the names positionally.
feature_names = sub_X.columns[1:]

for train, test in cv.split(SLIM_X, SLIM_Y):

    ## subset train data & store test data
    ## (integer-array indexing returns copies, so the label edits below do
    ## not touch SLIM_Y)
    X_train, Y_train = SLIM_X[train], SLIM_Y[train]
    X_test, Y_test = SLIM_X[test], SLIM_Y[test]
    sample_weights_train, sample_weights_test = sample_weights[train], sample_weights[test]

    ## create new data dictionary
    new_train_data = {
        'X': X_train,
        'Y': Y_train,
        'variable_names': variable_names,
        'outcome_name': outcome_name,
        'sample_weights': sample_weights_train
    }

    ## fit the model
    model_info, mip_info, lcpa_info = risk.risk_slim(new_train_data, max_coefficient=20, max_L0_value=10,
                                                c0_value=1e-4, max_runtime=400)
    print_model(model_info['solution'], new_train_data)

    ## change data format
    X_train, X_test = X_train[:,1:], X_test[:,1:] ## remove the first column, which is "intercept"
    Y_train[Y_train == -1] = 0 ## change -1 to 0
    Y_test[Y_test == -1] = 0

    ## probability & accuracy
    train_prob = risk.riskslim_prediction(X_train, feature_names, model_info).reshape(-1,1)
    test_prob = risk.riskslim_prediction(X_test, feature_names, model_info).reshape(-1,1)

    ## AUC
    train_auc.append(roc_auc_score(Y_train, train_prob))
    test_auc.append(roc_auc_score(Y_test, test_prob))

setting c0 = 0.0 to ensure that intercept is not penalized
08/16/19 @ 10:06 AM | 1541 rows in lookup table
08/16/19 @ 10:06 AM | ------------------------------------------------------------
08/16/19 @ 10:06 AM | runnning initialization procedure
08/16/19 @ 10:06 AM | ------------------------------------------------------------
08/16/19 @ 10:06 AM | CPA produced 2 cuts
08/16/19 @ 10:06 AM | running naive rounding on 60 solutions
08/16/19 @ 10:06 AM | best objective value: 0.3739
08/16/19 @ 10:06 AM | rounding produced 5 integer solutions
08/16/19 @ 10:06 AM | best objective value is 0.3839
08/16/19 @ 10:06 AM | running sequential rounding on 60 solutions
08/16/19 @ 10:06 AM | best objective value: 0.3739
08/16/19 @ 10:06 AM | sequential rounding produced 6 integer solutions
08/16/19 @ 10:06 AM | best objective value: 0.3838
08/16/19 @ 10:06 AM | polishing 11 solutions
08/16/19 @ 10:06 AM | best objective value: 0.3838
08/16/19 @ 10:06 AM | polishing produced 5 integer solutions
08/16/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:06 AM | adding 250 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.3837.
Tried aggregator 1 time.
Reduced MIP has 38 rows, 76 columns, and 147 nonzeros.
Reduced MIP has 36 binaries, 38 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.09 ticks)
Probing time = 0.00 sec. (0.03 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.05 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 119292 33119        0.3813    16        0.3826        0.3759   907707    1.76%          rho_12 D 119292 119290     26
 121631 33378        cutoff              0.3826        0.3760   924081    1.73%           rho_0 D 121631 121630     20
 124010 33704        0.3817    13        0.3826        0.3761   940258    1.71%           rho_9 D 124010 124009     24
 126390 33924        0.3779    10        0.3826        0.3762   956522    1.68%          rho_16 U 126390  32081     20
Elapsed time = 231.11 sec. (164311.67 ticks, tree = 15.86 MB, solutions = 5)
 128807 34183        cutoff              0.3826        0.3763   972192    1.66%           rho_7 U 128807  11745     27
 131183 34388        0.3779    13        0.3826        0.3764   988066    1.64%           rho_0 U 131183  70392     13
 133570 34644        0.3821    17        0.3826        0.3765  1003839    1.61%          rho_35 D 133570 133568     28
 135928 34883        cutoff              0.3826        0.3766  1019698    1.59%          r

08/16/19 @ 10:13 AM | best objective value: 0.3830
08/16/19 @ 10:13 AM | ------------------------------------------------------------
08/16/19 @ 10:13 AM | completed initialization procedure
08/16/19 @ 10:13 AM | ------------------------------------------------------------
08/16/19 @ 10:13 AM | 1541 rows in lookup table
CPXPARAM_Read_DataCheck                          1
CPXPARAM_Threads                                 1
CPXPARAM_Parallel                                1
CPXPARAM_RandomSeed                              0
CPXPARAM_TimeLimit                               400
CPXPARAM_MIP_Tolerances_LowerCutoff              0.36926376333500893
CPXPARAM_MIP_Tolerances_UpperCutoff              0.38300440884321296




Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:13 AM | adding 254 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.3830.
Tried aggregator 1 time.
Reduced MIP has 38 rows, 76 columns, and 147 nonzeros.
Reduced MIP has 36 binaries, 38 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.02 sec. (0.09 ticks)
Probing time = 0.00 sec. (0.03 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.05 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 118242 41932        0.3765    13        0.3818        0.3729   925435    2.34%          rho_36 D 118242 118240     31
 120655 42514        0.3787    11        0.3818        0.3730   942210    2.32%          rho_19 D 120655 120654     24
 123003 43081        0.3793    17        0.3818        0.3731   958909    2.30%           rho_3 D 123003 123001     25
 125372 43665        0.3796    11        0.3818        0.3731   975737    2.28%          rho_23 U 125372 125370     37
 127729 44223        0.3742    15        0.3818        0.3732   992515    2.26%           rho_6 U 127729 127728     19
 130084 44726        cutoff              0.3818        0.3733  1008739    2.24%          rho_11 U 130084 130083     34
Elapsed time = 182.00 sec. (168210.23 ticks, tree = 20.32 MB, solutions = 6)
 132420 45334        0.3801    14        0.3818        0.3734  1025626    2.22%          rho_35 D 132420 132418     37
 134760 45856        0.3805     9        0.3818        0.3734  1042321    2.21%          r



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:20 AM | adding 262 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.3833.
Tried aggregator 1 time.
Reduced MIP has 38 rows, 76 columns, and 147 nonzeros.
Reduced MIP has 36 binaries, 38 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.09 ticks)
Probing time = 0.00 sec. (0.03 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.05 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

*118000+40472                            0.3816        0.3728             2.32%
 119484 40849        0.3753     9        0.3816        0.3729   990938    2.31%          rho_23 D 119484 119482     28
 121932 41404        0.3772    11        0.3816        0.3729  1008437    2.29%          rho_25 D 121932 121931     26
 124377 41987        0.3790     8        0.3816        0.3730  1025776    2.27%          rho_19 D 124377 124375     22
 126813 42555        0.3805    11        0.3816        0.3731  1042873    2.25%           rho_7 D 126813 126811     23
 129222 43089        0.3812     6        0.3816        0.3732  1060007    2.22%          rho_16 D 129222 129221     41
 131634 43625        0.3792    13        0.3816        0.3732  1076895    2.20%          rho_35 D 131634 131632     37
Elapsed time = 155.16 sec. (168204.81 ticks, tree = 19.55 MB, solutions = 6)
 134070 44204        0.3816    10        0.3816        0.3733  1093691    2.19%          rho_25 N 134070 134069     29
 136435 44



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:27 AM | adding 253 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.3819.
Tried aggregator 1 time.
Reduced MIP has 38 rows, 76 columns, and 147 nonzeros.
Reduced MIP has 36 binaries, 38 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.09 ticks)
Probing time = 0.00 sec. (0.03 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.05 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

ValueError: max() arg is an empty sequence

In [8]:
# Mean train AUC and mean test AUC over the folds recorded so far,
# displayed as a (train, test) pair.
tuple(np.mean(fold_scores) for fold_scores in (train_auc, test_auc))

(0.7100341955385824, 0.7108446919220622)

### Save Results

In [7]:
#log model results to the model performance folder, as per standards
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\KY Recidivism\\KY Results\\Models\\Two Year\\"
# One row: label, mean/std of train AUC, mean/std of test AUC across the
# folds currently stored in train_auc / test_auc.
results = [["Felony", np.mean(train_auc), np.std(train_auc), np.mean(test_auc), np.std(test_auc)]]

# Append to the results file. newline='' lets the csv writer control line
# endings itself; without it, Windows text mode turns each '\r\n' into
# '\r\r\n' and the file shows a blank row after every record.
with open(path + 'Two Year RiskSLIM.csv', 'a', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)