In [1]:
import numpy as np
import pandas as pd
import RiskSLIM as risk
import csv

from riskslim.helper_functions import load_data_from_csv, print_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.utils import shuffle

### Lasso Feature Selection

In [2]:
## read the stumps table from disk
data = pd.read_csv(
    "C:/Users/binha/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_stumps.csv"
)

## features: every column up to and including 'current_violence>=1'
## (label-based .loc slices include the end label)
X = data.loc[:, :'current_violence>=1']
## outcome: the 'recid_F_six_month' column as a numpy array
Y = data['recid_F_six_month'].values
cols = X.columns

In [3]:
## fit an L1-penalised linear model on all stump features and keep the
## columns whose coefficient is non-zero
## NOTE(review): the selection is fit on the full data set, i.e. before the
## cross-validation split further down -- confirm this is intended
lasso = Lasso(alpha=0.001, random_state=816)
lasso.fit(X, Y)
nonzero_mask = lasso.coef_ != 0
selected_features = cols[nonzero_mask].tolist()

## (number of selected features, in-sample AUC of the lasso predictions)
len(selected_features), roc_auc_score(Y, lasso.predict(X))

(32, 0.7909530091194694)

#### Subset Features

In [4]:
### Subset features
## Put the outcome column first, followed by the lasso-selected features.
## The list is rebuilt (not mutated with .insert) so that re-running this cell
## does not prepend 'recid_F_six_month' a second time and duplicate columns.
selected_features = ['recid_F_six_month'] + [
    name for name in selected_features if name != 'recid_F_six_month'
]
sub_data = data[selected_features]

## column 0 is the outcome, the remaining columns are the features;
## take an explicit copy so the intercept column is added to an independent
## frame rather than to a slice of sub_data
sub_X = sub_data.iloc[:, 1:].copy()
sub_Y = sub_data.iloc[:, 0].values

## constant column of ones in first position, named '(Intercept)'
sub_X.insert(0, '(Intercept)', 1)

### Cross Validation

In [5]:
## pieces of the data dictionary assembled inside the CV loop
outcome_name = 'recid_F_six_month'
variable_names = list(sub_X.columns)

## design matrix (intercept column first) and outcome as a column vector
SLIM_X = sub_X.values
SLIM_Y = sub_Y.reshape(-1, 1)

## one unit weight per observation
sample_weights = np.ones(len(sub_Y), dtype=int)

In [6]:
## 5-fold cross validation: fit RiskSLIM on each training fold and record the
## train / test AUC of the fitted model
cv = KFold(n_splits=5, random_state=816, shuffle=True)
train_auc, test_auc = [], []

## names of the columns that remain once the leading '(Intercept)' column is
## dropped inside the loop, so the names handed to riskslim_prediction line up
## with the columns of X_train / X_test (previously `cols`, the names of ALL
## stump columns, was passed even though the matrices only hold the
## lasso-selected subset)
feature_names = sub_X.columns[1:]

for train, test in cv.split(SLIM_X, SLIM_Y):
    
    ## subset train data & store test data
    ## (integer-array indexing returns copies, so the in-place label edits
    ## below do not touch SLIM_Y)
    X_train, Y_train = SLIM_X[train], SLIM_Y[train]
    X_test, Y_test = SLIM_X[test], SLIM_Y[test]
    sample_weights_train = sample_weights[train]

    ## create new data dictionary
    new_train_data = {
        'X': X_train,
        'Y': Y_train,
        'variable_names': variable_names,
        'outcome_name': outcome_name,
        'sample_weights': sample_weights_train
    }
        
    ## fit the model
    model_info, mip_info, lcpa_info = risk.risk_slim(new_train_data, max_coefficient=20, max_L0_value=10, 
                                                c0_value=1e-4, max_runtime=200)
    print_model(model_info['solution'], new_train_data)
    
    ## change data format
    X_train, X_test = X_train[:,1:], X_test[:,1:] ## remove the first column, which is "intercept"
    Y_train[Y_train == -1] = 0 ## change -1 to 0
    Y_test[Y_test == -1] = 0
    
    ## predicted probabilities, reshaped to column vectors
    train_prob = risk.riskslim_prediction(X_train, feature_names, model_info).reshape(-1,1)
    test_prob = risk.riskslim_prediction(X_test, feature_names, model_info).reshape(-1,1)
    
    ## AUC
    train_auc.append(roc_auc_score(Y_train, train_prob))
    test_auc.append(roc_auc_score(Y_test, test_prob))

setting c0 = 0.0 to ensure that intercept is not penalized
08/17/19 @ 04:16 PM | 1381 rows in lookup table
08/17/19 @ 04:16 PM | ------------------------------------------------------------
08/17/19 @ 04:16 PM | runnning initialization procedure
08/17/19 @ 04:16 PM | ------------------------------------------------------------
08/17/19 @ 04:16 PM | CPA produced 2 cuts
08/17/19 @ 04:16 PM | running naive rounding on 94 solutions
08/17/19 @ 04:16 PM | best objective value: 0.1564
08/17/19 @ 04:16 PM | rounding produced 5 integer solutions
08/17/19 @ 04:16 PM | best objective value is 0.1620
08/17/19 @ 04:16 PM | running sequential rounding on 94 solutions
08/17/19 @ 04:16 PM | best objective value: 0.1564
08/17/19 @ 04:16 PM | sequential rounding produced 6 integer solutions
08/17/19 @ 04:16 PM | best objective value: 0.1580
08/17/19 @ 04:16 PM | polishing 11 solutions
08/17/19 @ 04:16 PM | best objective value: 0.1580
08/17/19 @ 04:16 PM | polishing produced 5 integer solutions
08/17/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:17 PM | adding 256 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.1578.
Tried aggregator 1 time.
Reduced MIP has 34 rows, 68 columns, and 131 nonzeros.
Reduced MIP has 32 binaries, 34 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.08 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 214027 63808        0.1564    13        0.1578        0.1551  1396519    1.68%          rho_16 U 214027 110797     38
 218530 64472        0.1563    10        0.1578        0.1552  1421738    1.66%          rho_20 D 218530 218528     33
Elapsed time = 140.14 sec. (168163.04 ticks, tree = 24.01 MB, solutions = 1)
 222947 65251        0.1575     8        0.1578        0.1552  1447258    1.65%           rho_3 D 222947 222945     33
 227441 65979        cutoff              0.1578        0.1552  1471931    1.63%           rho_0 D 227441 227440     24
 231874 66663        cutoff              0.1578        0.1552  1497133    1.61%          rho_16 U 231874 231873     32
 236387 67344        cutoff              0.1578        0.1553  1522356    1.60%           rho_1 D 236387 236386     26
 240863 68030        0.1573     6        0.1578        0.1553  1547214    1.58%          rho_29 U 240863 240862     35
 245325 68644        cutoff              0.1578        0.1553  1572122    1.57%          r



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:20 PM | adding 251 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.1581.
Tried aggregator 1 time.
Reduced MIP has 34 rows, 68 columns, and 131 nonzeros.
Reduced MIP has 32 binaries, 34 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.08 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 183885 41473        0.1564    16        0.1578        0.1559  1204629    1.25%           rho_1 D 183885 162759     21
 187760 41824        0.1578    11        0.1578        0.1559  1226609    1.23%          rho_29 U 187760  46119     19
Elapsed time = 171.22 sec. (167187.43 ticks, tree = 17.02 MB, solutions = 3)
 191600 42205        0.1560    13        0.1578        0.1559  1248084    1.21%           rho_3 D 191600 191598     26
 195462 42491        0.1573     7        0.1578        0.1560  1269423    1.19%           rho_4 D 195462 195460     25
 199205 42797        0.1572    16        0.1578        0.1560  1290123    1.17%           rho_4 D 199205 199203     26
 202947 43104        cutoff              0.1578        0.1560  1310193    1.15%           rho_0 D 202947 202946     26
 206630 43315        0.1563    13        0.1578        0.1561  1330204    1.13%          rho_16 U 206630 102304     21
 210360 43485        0.1571    14        0.1578        0.1561  1350343    1.12%           



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:24 PM | adding 251 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.1578.
Tried aggregator 1 time.
Reduced MIP has 34 rows, 68 columns, and 131 nonzeros.
Reduced MIP has 32 binaries, 34 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.02 sec. (0.08 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 173841 46651        cutoff              0.1575        0.1550  1194048    1.61%           rho_4 U 173841 173838     25
 177428 47095        cutoff              0.1575        0.1550  1215779    1.59%           rho_2 U 177428 177427     26
 180976 47534        0.1558    17        0.1575        0.1550  1237198    1.58%           rho_6 D 180976  76934     18
Elapsed time = 178.91 sec. (168175.17 ticks, tree = 19.27 MB, solutions = 2)
 184478 47956        0.1561    15        0.1575        0.1551  1257442    1.56%           rho_2 D 184478 184476     22
 187881 48364        0.1555    20        0.1575        0.1551  1278099    1.54%          rho_23 D 187881  93960     19
 191300 48726        0.1563    14        0.1575        0.1551  1298395    1.52%           rho_4 U 191300 191299     24
 194755 49135        0.1562    15        0.1575        0.1552  1319085    1.51%           rho_0 U 194755  24322     24
 198218 49500        0.1556    11        0.1575        0.1552  1339234    1.49%          r



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:27 PM | adding 252 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.1568.
Tried aggregator 1 time.
Reduced MIP has 34 rows, 68 columns, and 131 nonzeros.
Reduced MIP has 32 binaries, 34 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.08 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

setting c0 = 0.0 to ensure that intercept is not penalized
08/17/19 @ 04:31 PM | 1381 rows in lookup table
08/17/19 @ 04:31 PM | ------------------------------------------------------------
08/17/19 @ 04:31 PM | runnning initialization procedure
08/17/19 @ 04:31 PM | ------------------------------------------------------------
08/17/19 @ 04:31 PM | CPA produced 2 cuts
08/17/19 @ 04:31 PM | running naive rounding on 75 solutions
08/17/19 @ 04:31 PM | best objective value: 0.1560
08/17/19 @ 04:31 PM | rounding produced 5 integer solutions
08/17/19 @ 04:31 PM | best objective value is 0.1604
08/17/19 @ 04:31 PM | running sequential rounding on 75 solutions
08/17/19 @ 04:31 PM | best objective value: 0.1560
08/17/19 @ 04:31 PM | sequential rounding produced 6 integer solutions
08/17/19 @ 04:31 PM | best objective value: 0.1589
08/17/19 @ 04:31 PM | polishing 11 solutions
08/17/19 @ 04:31 PM | best objective value: 0.1589
08/17/19 @ 04:31 PM | polishing produced 5 integer solutions
08/17/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/17/19 @ 04:31 PM | adding 249 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.1589.
Tried aggregator 1 time.
Reduced MIP has 34 rows, 68 columns, and 131 nonzeros.
Reduced MIP has 32 binaries, 34 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.02 sec. (0.08 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 180149 34957        cutoff              0.1575        0.1559  1065350    1.03%           rho_3 U 180149 180147     21
 183958 35069        cutoff              0.1575        0.1560  1084834    1.01%           rho_1 U 183958  30577     15
 187816 35125        cutoff              0.1575        0.1560  1104319    0.99%          rho_13 U 187816 187815     30
Elapsed time = 189.47 sec. (167127.49 ticks, tree = 14.95 MB, solutions = 6)
 191642 35226        0.1563    12        0.1575        0.1560  1123469    0.97%           rho_7 D 191642 191640     27
 195434 35273        0.1565     9        0.1575        0.1561  1142282    0.95%          rho_13 U 195434 195433     22
 199220 35290        cutoff              0.1575        0.1561  1161423    0.93%           rho_6 U 199220 199218     15

Gomory fractional cuts applied:  1
User cuts applied:  1191

Root node processing (before b&c):
  Real time             =    0.17 sec. (2.40 ticks)
Sequential b&c:
  Real time             =  199.84 sec. (1786

In [7]:
## (mean train AUC, mean test AUC) over the cross-validation folds
tuple(np.mean(fold_scores) for fold_scores in (train_auc, test_auc))

(0.7633839112530427, 0.7630731762737206)

### Save Results

In [8]:
#log model results to the model performance folder, as per standards
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\KY Recidivism\\KY Results\\Models\\Six Month\\"
## one row: label, then mean / std of the train AUCs and mean / std of the test AUCs
results = [["Felony", np.mean(train_auc), np.std(train_auc), np.mean(test_auc), np.std(test_auc)]]

## append to the results file; newline='' is what the csv module requires for
## files handed to csv.writer -- without it every row is followed by a blank
## line on Windows
with open(path + 'Six Month RiskSLIM.csv', 'a', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)