In [1]:
import numpy as np
import pandas as pd
import RiskSLIM as risk
import csv

from riskslim.helper_functions import load_data_from_csv, print_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.utils import shuffle

### Lasso Feature Selection

In [2]:
## load stumps data
data = pd.read_csv("C:/Users/binha/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_stumps.csv")
X, Y = data.loc[:,:'current_violence>=1'], data['recid_violence_two_year'].values
cols = X.columns

In [3]:
## lasso
lasso = Lasso(random_state=816, alpha=0.001).fit(X, Y)
selected_features = cols[lasso.coef_ != 0].tolist()
len(selected_features), roc_auc_score(Y, lasso.predict(X))

(27, 0.815834144950583)

#### subset features

In [4]:
### Subset features
selected_features.insert(0, 'recid_violence_two_year')
sub_data = data[selected_features]
sub_X, sub_Y = sub_data.iloc[:,1:], sub_data.iloc[:,0].values
sub_X.insert(0, '(Intercept)', 1)

### Cross Validation

In [5]:
SLIM_X, SLIM_Y = sub_X.values, sub_Y.reshape(-1,1)
variable_names = sub_X.columns.tolist()
outcome_name = 'recid_violence_two_year'
sample_weights = np.repeat(1, len(sub_Y))

In [6]:
#cv = KFold(n_splits=5, random_state=816, shuffle=True)
cv = KFold(n_splits=5, random_state=816, shuffle=True)
train_auc, test_auc = [], []

i = 0
for train, test in cv.split(SLIM_X, SLIM_Y):
    
    ## subset train data & store test data
    X_train, Y_train = SLIM_X[train], SLIM_Y[train]
    X_test, Y_test = SLIM_X[test], SLIM_Y[test]
    sample_weights_train, sample_weights_test = sample_weights[train], sample_weights[test]

    ## create new data dictionary
    new_train_data = {
        'X': X_train,
        'Y': Y_train,
        'variable_names': variable_names,
        'outcome_name': outcome_name,
        'sample_weights': sample_weights_train
    }
        
    ## fit the model
    model_info, mip_info, lcpa_info = risk.risk_slim(new_train_data, max_coefficient=20, max_L0_value=10, 
                                                c0_value=1e-5, max_runtime=400)
    print_model(model_info['solution'], new_train_data)
    
    ## change data format
    X_train, X_test = X_train[:,1:], X_test[:,1:] ## remove the first column, which is "intercept"
    Y_train[Y_train == -1] = 0 ## change -1 to 0
    Y_test[Y_test == -1] = 0
    
    ## probability & accuracy
    train_prob = risk.riskslim_prediction(X_train, cols, model_info).reshape(-1,1)
    test_prob = risk.riskslim_prediction(X_test, cols, model_info).reshape(-1,1)
    
    ## AUC
    train_auc.append(roc_auc_score(Y_train, train_prob))
    test_auc.append(roc_auc_score(Y_test, test_prob))

setting c0 = 0.0 to ensure that intercept is not penalized
08/16/19 @ 09:53 AM | 1181 rows in lookup table
08/16/19 @ 09:53 AM | ------------------------------------------------------------
08/16/19 @ 09:53 AM | runnning initialization procedure
08/16/19 @ 09:53 AM | ------------------------------------------------------------
08/16/19 @ 09:53 AM | CPA produced 2 cuts
08/16/19 @ 09:53 AM | running naive rounding on 75 solutions
08/16/19 @ 09:53 AM | best objective value: 0.2043
08/16/19 @ 09:53 AM | rounding produced 5 integer solutions
08/16/19 @ 09:53 AM | best objective value is 0.2105
08/16/19 @ 09:53 AM | running sequential rounding on 75 solutions
08/16/19 @ 09:53 AM | best objective value: 0.2043
08/16/19 @ 09:53 AM | sequential rounding produced 6 integer solutions
08/16/19 @ 09:53 AM | best objective value: 0.2071
08/16/19 @ 09:53 AM | polishing 11 solutions
08/16/19 @ 09:53 AM | best objective value: 0.2071
08/16/19 @ 09:53 AM | polishing produced 5 integer solutions
08/16/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 09:53 AM | adding 250 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.2071.
Tried aggregator 1 time.
Reduced MIP has 29 rows, 58 columns, and 111 nonzeros.
Reduced MIP has 27 binaries, 29 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.06 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 169854 34054        cutoff              0.2068        0.2043  1032177    1.20%           rho_1 U 169854 114459     31
 173383 34018        0.2060     8        0.2068        0.2044  1050565    1.18%          rho_24 U 173383  48660     23
 176936 34031        0.2059    13        0.2068        0.2044  1068760    1.15%          rho_12 D 176936 176934     21
 180430 33979        0.2061     9        0.2068        0.2045  1086847    1.13%          rho_24 D 180430 180428     25
Elapsed time = 116.52 sec. (168166.23 ticks, tree = 14.61 MB, solutions = 5)
 183958 33894        0.2049    14        0.2068        0.2045  1104943    1.11%           rho_8 D 183958  85428     24
 187527 33819        0.2062     7        0.2068        0.2046  1122840    1.09%          rho_14 D 187527 187526     34
 190988 33708        0.2065     7        0.2068        0.2046  1140681    1.06%           rho_5 D 190988 190986     28
 194497 33592        cutoff              0.2068        0.2047  1158272    1.04%          r

CPXPARAM_MIP_Tolerances_LowerCutoff              0.19996269154622398
CPXPARAM_MIP_Tolerances_UpperCutoff              0.20607084742918294




Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 09:57 AM | adding 251 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.2061.
Tried aggregator 1 time.
Reduced MIP has 29 rows, 58 columns, and 111 nonzeros.
Reduced MIP has 27 binaries, 29 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.06 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 172913 23140        0.2051    11        0.2053        0.2034   967147    0.89%          rho_19 U 172913 108241     25
 176593 22587        0.2050     9        0.2053        0.2035   984486    0.85%          rho_24 U 176593 117465     20
 180259 21970        0.2050    13        0.2053        0.2036  1002091    0.82%          rho_26 D 180259 180257     25
 183980 21334        cutoff              0.2053        0.2037  1019511    0.78%           rho_9 U 183980  93621     32
 187857 20460        0.2038    13        0.2053        0.2037  1037489    0.74%          rho_24 D 187857  88492     22
Elapsed time = 105.41 sec. (168167.26 ticks, tree = 8.37 MB, solutions = 7)
 191933 19562        0.2051     7        0.2053        0.2038  1056486    0.70%           rho_8 D 191933 191932     27
 196038 18557        0.2047    13        0.2053        0.2039  1075089    0.66%          rho_18 U 196038  49797     21
 200076 17356        cutoff              0.2053        0.2040  1093401    0.62%           r



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 09:59 AM | adding 249 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.2057.
Tried aggregator 1 time.
Reduced MIP has 29 rows, 58 columns, and 111 nonzeros.
Reduced MIP has 27 binaries, 29 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.06 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 181114 37801        0.2032    12        0.2048        0.2020  1046455    1.34%           rho_9 U 181114  80292     23
 185160 37918        0.2042     5        0.2048        0.2021  1066331    1.32%          rho_20 D 185160 185158     28
 189182 37976        0.2036     9        0.2048        0.2021  1085951    1.30%          rho_25 U 189182 150582     15
 193212 38022        0.2030    16        0.2048        0.2022  1105521    1.27%           rho_2 D 193212  51628     20
 197207 38081        cutoff              0.2048        0.2022  1125336    1.25%          rho_25 U 197207 197206     24
Elapsed time = 134.58 sec. (168169.39 ticks, tree = 15.58 MB, solutions = 6)
 201278 38109        cutoff              0.2048        0.2023  1144916    1.22%           rho_4 D 201278 201277     28
 205345 38056        cutoff              0.2048        0.2023  1164011    1.20%          rho_23 U 205345 106700     21
 209346 37955        cutoff              0.2048        0.2024  1183414    1.18%           

08/16/19 @ 10:03 AM | completed initialization procedure
08/16/19 @ 10:03 AM | ------------------------------------------------------------
08/16/19 @ 10:03 AM | 1181 rows in lookup table
CPXPARAM_Read_DataCheck                          1
CPXPARAM_Threads                                 1
CPXPARAM_Parallel                                1
CPXPARAM_RandomSeed                              0
CPXPARAM_TimeLimit                               400
CPXPARAM_MIP_Tolerances_LowerCutoff              0.19973543899734161
CPXPARAM_MIP_Tolerances_UpperCutoff              0.20562164967763277




Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:03 AM | adding 249 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.2056.
Tried aggregator 1 time.
Reduced MIP has 29 rows, 58 columns, and 111 nonzeros.
Reduced MIP has 27 binaries, 29 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.06 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 153281 28613        0.2044    12        0.2051        0.2027   945693    1.15%          rho_23 D 153281 153280     21
 156623 28452        0.2035    13        0.2051        0.2028   964020    1.12%           rho_0 U 156623  83455     19
 159994 28193        0.2050     7        0.2051        0.2029   982300    1.09%          rho_26 U 159994 119850     16
 163378 27921        cutoff              0.2051        0.2029  1000507    1.05%          rho_26 U 163378  91394     22
 166701 27648        cutoff              0.2051        0.2030  1018803    1.02%           rho_5 U 166701 166699     30
 170115 27369        0.2041     9        0.2051        0.2031  1036642    0.99%           rho_2 U 170115 170114     25
 173497 26982        0.2045     6        0.2051        0.2031  1054421    0.96%           rho_6 U 173497  26155     34
Elapsed time = 103.98 sec. (168170.79 ticks, tree = 10.89 MB, solutions = 8)
 176860 26622        cutoff              0.2051        0.2032  1072211    0.93%           



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:05 AM | adding 249 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.2059.
Tried aggregator 1 time.
Reduced MIP has 29 rows, 58 columns, and 111 nonzeros.
Reduced MIP has 27 binaries, 29 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.06 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.04 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 161120 26516        0.2034    10        0.2050        0.2029   982232    1.03%           rho_8 U 161120 161119     20
 164510 26211        0.2031    11        0.2050        0.2029  1000298    1.00%          rho_24 D 164510  93755     26
 167924 25856        cutoff              0.2050        0.2030  1018034    0.97%           rho_6 U 167924  22145     19
 171301 25429        0.2041    19        0.2050        0.2031  1035862    0.93%           rho_8 D 171301 171299     25
 174654 25067        0.2038    13        0.2050        0.2031  1053528    0.90%          rho_24 D 174654 174652     23
Elapsed time = 200.53 sec. (168185.19 ticks, tree = 10.20 MB, solutions = 6)
 177984 24567        cutoff              0.2050        0.2032  1070890    0.87%          rho_26 U 177984 177982     24
 181360 24109        cutoff              0.2050        0.2033  1087768    0.84%           rho_4 U 181360 181358     32
 184739 23655        0.2043    17        0.2050        0.2033  1104893    0.81%           

In [7]:
np.mean(train_auc), np.mean(test_auc)

(0.8012964025923515, 0.8012692435923807)

### Save Results

In [7]:
#log model results to the model performance folder, as per standards
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\KY Recidivism\\KY Results\\Models\\Two Year\\"                   
results = [["Violence", np.mean(train_auc), np.std(train_auc), np.mean(test_auc), np.std(test_auc)]]

with open(path + 'Two Year RiskSLIM.csv', 'a') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)