In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

from sklearn.linear_model import Lasso

from pprint import pprint
from riskslim.helper_functions import load_data_from_csv, print_model
from riskslim.setup_functions import get_conservative_offset
from riskslim.coefficient_set import CoefficientSet
from riskslim.lattice_cpa import run_lattice_cpa
from riskslim.lattice_cpa import setup_lattice_cpa, finish_lattice_cpa

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.utils import shuffle

### Modeling -- RiskSLIM

In [2]:
def risk_slim(data, max_coefficient, max_L0_value, c0_value, max_runtime = 120, w_pos = 1, max_offset=50):
    """
    Fit a RiskSLIM model on `data` with the lattice cutting-plane algorithm (LCPA).

    @parameters:

    data:             RiskSLIM data dictionary; `data['variable_names']` is read here and the
                      whole dictionary is forwarded to `run_lattice_cpa`
    max_coefficient:  upper bound applied to every coefficient (the lower bound is fixed at 0)
    max_L0_value:     maximum model size, passed to the solver as `L0_max`
    c0_value:         L0-penalty parameter such that c0_value > 0; larger values -> sparser models
    max_runtime:      max algorithm running time (optional)
    w_pos:            relative weight on examples with y = +1; w_neg = 1.00 (optional)
    max_offset:       maximum absolute value of the intercept; capped at the value returned by
                      `get_conservative_offset` (optional)

    @returns:

    model_info, mip_info, lcpa_info -- the three values produced by `run_lattice_cpa`
    """

    # every variable is first bounded to [0, max_coefficient]
    coefficient_bounds = CoefficientSet(
        variable_names=data['variable_names'],
        lb=0,
        ub=max_coefficient,
        sign=0,
    )

    # the intercept then gets its own symmetric range [-offset_bound, offset_bound]
    offset_bound = min(max_offset, get_conservative_offset(data, coefficient_bounds, max_L0_value))
    coefficient_bounds['(Intercept)'].ub = offset_bound
    coefficient_bounds['(Intercept)'].lb = -offset_bound

    constraints = dict(L0_min=0, L0_max=max_L0_value, coef_set=coefficient_bounds)

    # solver settings, assembled group by group
    settings = {}

    # Problem Parameters
    settings.update({
        'c0_value': c0_value,
        'w_pos': w_pos,
    })

    # LCPA Settings
    settings.update({
        'max_runtime': max_runtime,                 # max runtime for LCPA
        'max_tolerance': np.finfo('float').eps,     # tolerance to stop LCPA (0 would ask for a provably optimal solution)
        'display_cplex_progress': True,             # print CPLEX progress on screen
        'loss_computation': 'lookup',               # how to compute the loss function ('normal','fast','lookup')
    })

    # LCPA Improvements
    settings.update({
        'round_flag': False,                        # round continuous solutions with SeqRd
        'polish_flag': False,                       # polish integer feasible solutions with DCD
        'chained_updates_flag': False,              # use chained updates
        'add_cuts_at_heuristic_solutions': True,    # add cuts at integer feasible solutions found using polishing/rounding
    })

    # Initialization
    settings.update({
        'initialization_flag': True,                # use initialization procedure
        'init_max_runtime': 300.0,                  # max time to run CPA in initialization procedure
        'init_max_coefficient_gap': 0.49,
    })

    # CPLEX Solver Parameters
    settings.update({
        'cplex_randomseed': 0,                      # random seed
        'cplex_mipemphasis': 0,                     # cplex MIP strategy
    })

    # train model using lattice_cpa
    fitted_model, mip_summary, lcpa_summary = run_lattice_cpa(data, constraints, settings)

    return fitted_model, mip_summary, lcpa_summary

In [3]:
def riskslim_prediction(X, feature_name, model_info):
    
    """
    Predicted probabilities Pr(Y = +1) from a fitted RiskSLIM model.

    @parameters
    
    X: input features (array-like, one row per example). Column k must correspond to
       coefficient model_info['solution'][k + 1], i.e. X must NOT contain the intercept column.
    feature_name: feature names. Kept so existing callers keep working; the score is now
                  computed positionally and no longer depends on the names (the previous
                  name-keyed lookup gave wrong scores when two columns shared a name and
                  failed when a plain list was passed).
    model_info: output from RiskSLIM model; model_info['solution'][0] is used as the
                intercept and model_info['solution'][1:] as the per-column coefficients.

    @returns

    np.array with one probability per row of X: 1 / (1 + exp(-(intercept + score)))
    
    """
    
    ## split the solution vector into intercept and coefficients
    solution = np.asarray(model_info['solution'], dtype=float)
    intercept = solution[0]
    coefs = solution[1:]
    
    ## only columns with a nonzero coefficient contribute to the score
    index = np.flatnonzero(coefs)
    X_sub = np.asarray(X)[:, index]
    
    ## score = sum of (coefficient * feature value) per row; with no nonzero
    ## coefficients this is a vector of zeros, as before
    scores = X_sub.dot(coefs[index])
    
    ## logistic link
    prob = 1/(1+np.exp(-(scores + intercept)))
    
    return prob

In [4]:
def riskslim_accuracy(X, Y, feature_name, model_info, threshold=0.5):
    
    """
    Classification accuracy of a fitted RiskSLIM model at a probability threshold.

    @parameters
    
    X: input features, laid out as expected by riskslim_prediction
    Y: true labels; positives are values > 0, so both {0, 1} and {-1, +1} encodings work.
       May be 1-D or a column vector.
    feature_name: feature names, forwarded to riskslim_prediction
    model_info: output from RiskSLIM model
    threshold: an example is predicted positive when its probability is > threshold

    @returns

    fraction of examples whose predicted class equals the true class
    
    """
    
    prob = riskslim_prediction(X, feature_name, model_info)
    
    ## flatten both sides: comparing an (n,) prediction with an (n, 1) label column
    ## would broadcast to an (n, n) matrix and average the wrong thing
    predicted_positive = np.ravel(prob) > threshold
    actual_positive = np.ravel(Y) > 0
    
    return np.mean(predicted_positive == actual_positive)

### Lasso Feature Selection

In [5]:
## load stumps data
data = pd.read_csv("C:/Users/binha/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_stumps.csv")

## features: every column up to and including 'current_violence>=1'
X = data.loc[:, :'current_violence>=1']

## outcome as a numpy array
Y = data['recid_two_year'].values

## names of all candidate features
cols = X.columns

In [6]:
## lasso: keep the features whose coefficient is not exactly zero
lasso = Lasso(alpha=0.001, random_state=816)
lasso.fit(X, Y)
selected_features = [name for name, weight in zip(cols, lasso.coef_) if weight != 0]

## number of selected features and in-sample AUC of the lasso fit
(len(selected_features), roc_auc_score(Y, lasso.predict(X)))

(42, 0.7371445290995498)

#### Subset Features

In [7]:
### Subset features
## Put the outcome first, followed by the lasso-selected features.
## The guard keeps this cell idempotent: without it, re-running the cell inserts the
## outcome a second time and 'recid_two_year' ends up inside sub_X as a feature.
if selected_features[:1] != ['recid_two_year']:
    selected_features.insert(0, 'recid_two_year')
sub_data = data[selected_features]

## .copy() so the intercept column is added to an independent frame rather than to a
## slice of sub_data
sub_X, sub_Y = sub_data.iloc[:,1:].copy(), sub_data.iloc[:,0].values
sub_X.insert(0, '(Intercept)', 1)

### Cross Validation

In [8]:
## design matrix (sub_X carries the '(Intercept)' column first) and outcome as a column vector
SLIM_X = sub_X.values
SLIM_Y = sub_Y.reshape(-1,1)

## metadata that goes into the RiskSLIM data dictionary
variable_names = list(sub_X.columns)
outcome_name = 'recid_two_year'

## every example gets weight 1
sample_weights = np.ones(len(sub_Y), dtype=int)

In [9]:
## 5-fold cross validation of RiskSLIM on the lasso-selected features
cv = KFold(n_splits=5, random_state=816, shuffle=True)
train_auc, test_auc = [], []

## Names of the non-intercept columns of SLIM_X, in column order. These are the names
## aligned with model_info['solution'][1:]; `cols` (every column of the original feature
## frame, before the lasso subset) is not.
slim_feature_names = np.array(variable_names[1:])

for train, test in cv.split(SLIM_X, SLIM_Y):
    
    ## subset train data & store test data
    X_train, Y_train = SLIM_X[train], SLIM_Y[train]
    X_test, Y_test = SLIM_X[test], SLIM_Y[test]
    sample_weights_train, sample_weights_test = sample_weights[train], sample_weights[test]

    ## create new data dictionary
    new_train_data = {
        'X': X_train,
        'Y': Y_train,
        'variable_names': variable_names,
        'outcome_name': outcome_name,
        'sample_weights': sample_weights_train
    }
        
    ## fit the model
    model_info, mip_info, lcpa_info = risk_slim(new_train_data, max_coefficient=20, max_L0_value=10, 
                                                c0_value=1e-5, max_runtime=400)
    print_model(model_info['solution'], new_train_data)
    
    ## change data format
    X_train, X_test = X_train[:,1:], X_test[:,1:] ## remove the first column, which is "intercept"
    Y_train[Y_train == -1] = 0 ## change -1 to 0
    Y_test[Y_test == -1] = 0
    
    ## predicted probabilities
    train_prob = riskslim_prediction(X_train, slim_feature_names, model_info).reshape(-1,1)
    test_prob = riskslim_prediction(X_test, slim_feature_names, model_info).reshape(-1,1)
    
    ## AUC
    train_auc.append(roc_auc_score(Y_train, train_prob))
    test_auc.append(roc_auc_score(Y_test, test_prob))

setting c0 = 0.0 to ensure that intercept is not penalized
08/16/19 @ 09:53 AM | 1781 rows in lookup table
08/16/19 @ 09:53 AM | ------------------------------------------------------------
08/16/19 @ 09:53 AM | runnning initialization procedure
08/16/19 @ 09:53 AM | ------------------------------------------------------------
08/16/19 @ 09:53 AM | CPA produced 2 cuts
08/16/19 @ 09:53 AM | running naive rounding on 61 solutions
08/16/19 @ 09:53 AM | best objective value: 0.5174
08/16/19 @ 09:53 AM | rounding produced 5 integer solutions
08/16/19 @ 09:53 AM | best objective value is 0.5286
08/16/19 @ 09:53 AM | running sequential rounding on 61 solutions
08/16/19 @ 09:53 AM | best objective value: 0.5174
08/16/19 @ 09:53 AM | sequential rounding produced 6 integer solutions
08/16/19 @ 09:53 AM | best objective value: 0.5270
08/16/19 @ 09:53 AM | polishing 11 solutions
08/16/19 @ 09:53 AM | best objective value: 0.5270
08/16/19 @ 09:53 AM | polishing produced 4 integer solutions
08/16/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 09:53 AM | adding 253 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5270.
Tried aggregator 1 time.
Reduced MIP has 44 rows, 88 columns, and 171 nonzeros.
Reduced MIP has 42 binaries, 44 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.03 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 110308 41082        cutoff              0.5261        0.5129  1271806    2.50%          rho_22 U 110308 110307     31
 112353 41549        cutoff              0.5261        0.5130  1293393    2.48%           rho_6 U 112353 112351     26
 114455 42001        cutoff              0.5261        0.5131  1314557    2.46%           rho_7 U 114455 114454     21
Elapsed time = 135.20 sec. (168223.12 ticks, tree = 18.47 MB, solutions = 2)
 116480 42476        0.5203    21        0.5261        0.5132  1336053    2.44%           rho_9 D 116480  13056     22
 118551 42904        cutoff              0.5261        0.5133  1356749    2.43%          rho_22 U 118551 118550     24
 120566 43377        0.5257    14        0.5261        0.5134  1377976    2.41%           rho_7 D 120566 120564     26
*120700+42915                            0.5259        0.5134             2.38%
 122432 43313        0.5177    19        0.5259        0.5135  1398149    2.36%           rho_0 U 122432 122431     30
 124342 43



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:00 AM | adding 253 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5250.
Tried aggregator 1 time.
Reduced MIP has 44 rows, 88 columns, and 171 nonzeros.
Reduced MIP has 42 binaries, 44 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.03 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 118010 31977        0.5222    19        0.5240        0.5157  1144866    1.59%          rho_28 D 118010 118009     25
 120271 32189        cutoff              0.5240        0.5158  1165067    1.56%          rho_40 D 120271 120269     36
 122590 32390        0.5233     9        0.5240        0.5160  1184648    1.54%          rho_34 U 122590 122589     17
 124857 32699        0.5205    21        0.5240        0.5161  1204475    1.51%          rho_22 D 124857 124856     25
 127080 32874        0.5234    14        0.5240        0.5162  1224282    1.49%           rho_0 D 127080 127079     20
Elapsed time = 113.25 sec. (168227.89 ticks, tree = 14.78 MB, solutions = 5)
 129279 33039        cutoff              0.5240        0.5164  1243167    1.46%          rho_18 U 129279 129278     25
 131510 33258        0.5177    21        0.5240        0.5165  1261661    1.44%           rho_0 U 131510 131509     19
 133728 33466        0.5202    14        0.5240        0.5166  1280230    1.42%          r



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:05 AM | adding 256 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5246.
Tried aggregator 1 time.
Reduced MIP has 44 rows, 88 columns, and 171 nonzeros.
Reduced MIP has 42 binaries, 44 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.03 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 128135 44229        0.5236    17        0.5246        0.5139  1227099    2.03%          rho_14 D 128135 128133     34
 130429 44780        0.5159    15        0.5246        0.5140  1247234    2.01%          rho_14 D 130429  63167     40
 132688 45287        0.5154     5        0.5246        0.5141  1266948    1.99%          rho_17 D 132688  71035     38
Elapsed time = 218.50 sec. (168187.68 ticks, tree = 19.51 MB, solutions = 1)
 134975 45800        cutoff              0.5246        0.5142  1285674    1.97%          rho_28 U 134975 134974     26
 137214 46353        0.5193    18        0.5246        0.5143  1305520    1.95%           rho_4 D 137214 137212     22
 139408 46779        cutoff              0.5246        0.5144  1324964    1.93%          rho_18 D 139408 139406     26
 141714 47363        0.5216    21        0.5246        0.5145  1343750    1.92%          rho_20 U 141714 141713     18
 143930 47871        0.5171    25        0.5246        0.5146  1363026    1.90%           

CPXPARAM_RandomSeed                              0
CPXPARAM_TimeLimit                               400
CPXPARAM_MIP_Tolerances_LowerCutoff              0.51061833884904473
CPXPARAM_MIP_Tolerances_UpperCutoff              0.52655004682726658




Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:12 AM | adding 252 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5266.
Tried aggregator 1 time.
Reduced MIP has 44 rows, 88 columns, and 171 nonzeros.
Reduced MIP has 42 binaries, 44 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.03 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 138345 53817        cutoff              0.5245        0.5132  1195207    2.14%          rho_42 U 138345 138344     28
 141090 54620        0.5202    20        0.5245        0.5133  1217519    2.13%          rho_18 D 141090 141089     31
 143930 55448        0.5171     8        0.5245        0.5134  1239510    2.11%          rho_40 D 143930 143928     38
 146687 56247        0.5184    17        0.5245        0.5135  1262180    2.10%           rho_8 D 146687 146686     27
 149433 56943        0.5219    15        0.5245        0.5135  1283890    2.09%          rho_35 U 149433  82916     33
Elapsed time = 189.67 sec. (168178.28 ticks, tree = 23.35 MB, solutions = 4)
 152145 57635        0.5219    13        0.5245        0.5136  1305814    2.07%          rho_37 D 152145 152143     42
 154826 58339        0.5233    17        0.5245        0.5137  1327701    2.06%          rho_17 U 154826 154825     28
 157499 59053        0.5164    18        0.5245        0.5138  1349776    2.05%          r

                          ------------
Total (root+branch&cut) =  400.02 sec. (387275.38 ticks)
+----------------------------------------------+------------------+-----------+
| Pr(Y = +1) = 1.0/(1.0 + exp(-(-2 + score))   |                  |           |
| p_arrest>=2                                  |         1 points |   + ..... |
| p_arrest>=3                                  |         1 points |   + ..... |
| p_arrest>=6                                  |         1 points |   + ..... |
| ADD POINTS FROM ROWS 1 to 3                  |            SCORE |   = ..... |
+----------------------------------------------+------------------+-----------+
setting c0 = 0.0 to ensure that intercept is not penalized
08/16/19 @ 10:19 AM | 1781 rows in lookup table
08/16/19 @ 10:19 AM | ------------------------------------------------------------
08/16/19 @ 10:19 AM | runnning initialization procedure
08/16/19 @ 10:19 AM | ------------------------------------------------------------
08/16/19 @ 10:1



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/16/19 @ 10:19 AM | adding 251 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5239.
Tried aggregator 1 time.
Reduced MIP has 44 rows, 88 columns, and 171 nonzeros.
Reduced MIP has 42 binaries, 44 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.11 ticks)
Probing time = 0.00 sec. (0.03 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.06 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

 124642 32652        0.5223    15        0.5235        0.5156  1237709    1.51%          rho_40 D 124642 124640     32
 126941 32834        0.5194    23        0.5235        0.5157  1258547    1.49%          rho_12 U 126941 126938     26
 129292 33041        0.5194    13        0.5235        0.5158  1278995    1.47%          rho_26 D 129292 129290     35
Elapsed time = 164.17 sec. (167796.51 ticks, tree = 14.90 MB, solutions = 2)
 131612 33239        0.5217    11        0.5235        0.5159  1300026    1.44%          rho_40 D 131612 131611     31
 133936 33442        0.5234    15        0.5235        0.5160  1320421    1.42%          rho_35 D 133936 133935     34
 136210 33562        0.5170    23        0.5235        0.5162  1341209    1.40%          rho_20 D 136210  71364     21
 138547 33717        0.5194    11        0.5235        0.5163  1361464    1.38%          rho_40 D 138547 138545     36
 140789 33801        0.5196    13        0.5235        0.5164  1382384    1.36%           

In [215]:
## average AUC over the cross-validation folds: (train, test)
(np.mean(train_auc), np.mean(test_auc))

(0.7062410736887011, 0.7062727815017429)

### Save Results

In [10]:
#log model results to the model performance folder, as per standards
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\KY Recidivism\\KY Results\\Models\\Two Year\\"                   

# one header row and one row with mean / standard deviation of the fold AUCs
results = [["Model", "train_auc_mean", "train_auc_std","test_auc_mean", "test_auc_std"],
    ["Recidivism", np.mean(train_auc), np.std(train_auc), np.mean(test_auc), np.std(test_auc)]]

# newline='' hands line-ending control to the csv module; without it every row is
# followed by an empty row when the file is written on Windows
with open(path + 'Two Year RiskSLIM.csv', 'w', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)

## Appendix

#### Original Code

In [None]:
## split features and dependent variable
## NOTE(review): `train` and `test` are used as DataFrames with the outcome in column 0.
## No cell shown in this notebook creates them that way (the cross-validation cells bind
## the same names to fold indices) -- confirm their origin before re-running.
x_test, y_test = test.iloc[:,1:].values, test.iloc[:,0].values
x_train, y_train = train.iloc[:,1:].values, train.iloc[:,0].values
## recode -1 labels to 0, in place
## NOTE(review): `.values` may share memory with the frame, in which case this also
## rewrites the label column of `train` / `test` -- verify if the frames are reused
y_test[y_test == -1] = 0
y_train[y_train == -1] = 0

## column names (everything except the outcome column)
cols = train.columns[1:]

In [142]:
# data path: the file is looked up in the current working directory
data_name = "lasso_train_stumps.csv"          
data_csv_file = os.getcwd() + '/' + data_name 
sample_weights_csv_file = None  # no sample-weights file is supplied
# load data
# NOTE: this rebinds `data`, which earlier in the notebook holds the DataFrame read from
# kentucky_stumps.csv
data = load_data_from_csv(dataset_csv_file = data_csv_file, sample_weights_csv_file = sample_weights_csv_file)

  raw_data = df.as_matrix()


In [46]:
# fit one RiskSLIM model on the full `data` dictionary: coefficients capped at 20,
# at most 10 terms, 200 s runtime limit
model_info, mip_info, lcpa_info = risk_slim(data, max_coefficient=20, max_L0_value=10, c0_value=1e-5, max_runtime=200)

#model info contains key results
print_model(model_info['solution'], data)

setting c0 = 0.0 to ensure that intercept is not penalized
08/05/19 @ 02:29 PM | 2301 rows in lookup table
08/05/19 @ 02:29 PM | ------------------------------------------------------------
08/05/19 @ 02:29 PM | runnning initialization procedure
08/05/19 @ 02:29 PM | ------------------------------------------------------------
08/05/19 @ 02:29 PM | CPA produced 2 cuts
08/05/19 @ 02:29 PM | running naive rounding on 30 solutions
08/05/19 @ 02:29 PM | best objective value: 0.5371
08/05/19 @ 02:29 PM | rounding produced 5 integer solutions
08/05/19 @ 02:29 PM | best objective value is 0.5498
08/05/19 @ 02:29 PM | running sequential rounding on 30 solutions
08/05/19 @ 02:29 PM | best objective value: 0.5371
08/05/19 @ 02:29 PM | sequential rounding produced 6 integer solutions
08/05/19 @ 02:29 PM | best objective value: 0.5321
08/05/19 @ 02:29 PM | polishing 11 solutions
08/05/19 @ 02:29 PM | best objective value: 0.5321
08/05/19 @ 02:29 PM | polishing produced 4 integer solutions
08/05/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/05/19 @ 02:29 PM | adding 255 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5286.
Tried aggregator 1 time.
Reduced MIP has 57 rows, 114 columns, and 223 nonzeros.
Reduced MIP has 55 binaries, 57 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.14 ticks)
Probing time = 0.00 sec. (0.05 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.07 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Varia

 301544 102596        0.5103     3        0.5238        0.5103  1935064    2.58%           rho_0 D 301544 301542    107
 308686 104817        cutoff              0.5238        0.5103  1979034    2.58%          rho_12 D 308686 308685    116
 315615 106773        0.5155     2        0.5238        0.5103  2022903    2.58%           rho_3 U 315615 315488    101
 322327 108708        0.5103    12        0.5238        0.5103  2067096    2.58%          rho_12 D 322327 322326     91
 328936 110740        0.5103    21        0.5238        0.5103  2111868    2.58%          rho_25 D 328936 328935     87
*332000+111689                            0.5238        0.5103             2.58%
 335694 112754        0.5132    16        0.5238        0.5103  2156258    2.58%          rho_19 D 335694 335693    127
 342242 114742        0.5150     2        0.5238        0.5103  2202163    2.58%           rho_1 U 342242 342192    120
 349079 116838        0.5189    17        0.5238        0.5103  2246863    2.58

<prettytable.PrettyTable at 0x1e4008de0b8>

In [58]:
# hand-computed score: sum of three prior-arrest stump columns of `test`
# NOTE(review): presumably mirrors the scorecard printed by print_model for the model
# fitted above -- confirm; `test` must be a DataFrame here
y_score = test['p_arrest>=2'] + test['p_arrest>=3'] + test['p_arrest>=6']

In [59]:
# logistic transform of the hand-computed score with an offset of -2
y_pred = 1/(1+np.exp(-(y_score - 2)))

In [60]:
# AUC of the hand-computed probabilities
# NOTE(review): the variable names say "train", but the inputs are y_test / y_pred (built
# from `test`), so this value is a test-set AUC
train_fpr, train_tpr, train_thresholds = roc_curve(y_test, y_pred)
holdout_train_auc = auc(train_fpr, train_tpr)
holdout_train_auc

0.7036793673430766

In [65]:
# training-set AUC of the fitted model, via riskslim_prediction and the ROC curve
train_prob = riskslim_prediction(x_train, cols, model_info)
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, train_prob)
holdout_train_auc = auc(train_fpr, train_tpr)
holdout_train_auc

0.6965010392251154

In [48]:
# test-set AUC of the fitted model
# NOTE(review): `logistic_features` is not defined by any cell shown in this notebook,
# and the training-set cell passes `cols` instead -- confirm which names are intended
test_prob = riskslim_prediction(x_test, logistic_features, model_info)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, test_prob)
holdout_test_auc = auc(test_fpr, test_tpr)
holdout_test_auc

0.6949495704964834

### Cross Validation

In [32]:
# data path: the file is looked up in the current working directory
data_name = "new_train.csv"          
data_csv_file = os.getcwd() + '/' + data_name 
sample_weights_csv_file = None  # no sample-weights file is supplied

# load data
# NOTE: rebinds `data`, `X`, `Y`, `variable_names`, `outcome_name` and `sample_weights`,
# all of which are also assigned by earlier cells of this notebook
data = load_data_from_csv(dataset_csv_file = data_csv_file, sample_weights_csv_file = sample_weights_csv_file)
X, Y = data['X'], data['Y']
variable_names = data['variable_names']
outcome_name = data['outcome_name']
sample_weights = data['sample_weights']

In [34]:
# 5-fold stratified cross-validation of RiskSLIM on the data loaded above.
# Per-fold accuracy and AUC are collected for both the fitting and the
# held-out part of every fold.
cv = StratifiedKFold(n_splits=5, random_state=816, shuffle=True)
train_acc, test_acc = [], []
train_auc, test_auc = [], []

# The fold indices are named train_idx / test_idx so that the loop does not
# rebind the notebook-level `train` / `test` names (the score cell indexes
# `test` by column name).
for train_idx, test_idx in cv.split(X, Y):

    ## subset train data & store test data
    # (assumes X, Y and sample_weights are NumPy arrays, so index-array
    #  selection yields per-fold copies -- TODO confirm)
    X_train, Y_train = X[train_idx], Y[train_idx]
    X_test, Y_test = X[test_idx], Y[test_idx]
    sample_weights_train, sample_weights_test = sample_weights[train_idx], sample_weights[test_idx]

    ## create new data dictionary
    new_train_data = {
        'X': X_train,
        'Y': Y_train,
        'variable_names': variable_names,
        'outcome_name': outcome_name,
        'sample_weights': sample_weights_train
    }

    ## fit the model
    # NOTE: this rebinds the notebook-level model_info on every fold; after the
    # loop it refers to the model of the last fold.
    model_info, mip_info, lcpa_info = risk_slim(new_train_data,
                                                max_coefficient=5,
                                                max_L0_value=10,
                                                c0_value=1e-5, max_runtime=400)
    print_model(model_info['solution'], new_train_data)

    ## change data format
    X_train, X_test = X_train[:,1:], X_test[:,1:] ## remove the first column, which is "intercept"
    Y_train[Y_train == -1] = 0 ## change -1 to 0
    Y_test[Y_test == -1] = 0

    ## probability & accuracy
    # Predictions are reshaped to a column before being compared with the labels.
    train_prob = riskslim_prediction(X_train, cols, model_info).reshape(-1,1)
    test_prob = riskslim_prediction(X_test, cols, model_info).reshape(-1,1)

    train_acc.append(np.mean((train_prob > 0.5) == Y_train))
    test_acc.append(np.mean((test_prob > 0.5) == Y_test))

    ## AUC
    train_fpr, train_tpr, train_thresholds = roc_curve(Y_train, train_prob)
    test_fpr, test_tpr, test_thresholds = roc_curve(Y_test, test_prob)
    train_auc.append(auc(train_fpr, train_tpr))
    test_auc.append(auc(test_fpr, test_tpr))

setting c0 = 0.0 to ensure that intercept is not penalized
08/02/19 @ 02:39 PM | 1321 rows in lookup table
08/02/19 @ 02:39 PM | ------------------------------------------------------------
08/02/19 @ 02:39 PM | runnning initialization procedure
08/02/19 @ 02:39 PM | ------------------------------------------------------------
08/02/19 @ 02:39 PM | CPA produced 2 cuts
08/02/19 @ 02:39 PM | running naive rounding on 37 solutions
08/02/19 @ 02:39 PM | best objective value: 0.5302
08/02/19 @ 02:39 PM | rounding produced 5 integer solutions
08/02/19 @ 02:39 PM | best objective value is 0.5468
08/02/19 @ 02:39 PM | running sequential rounding on 37 solutions
08/02/19 @ 02:39 PM | best objective value: 0.5302
08/02/19 @ 02:39 PM | sequential rounding produced 6 integer solutions
08/02/19 @ 02:39 PM | best objective value: 0.5290
08/02/19 @ 02:39 PM | polishing 11 solutions
08/02/19 @ 02:39 PM | best objective value: 0.5290
08/02/19 @ 02:39 PM | polishing produced 5 integer solutions
08/02/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/02/19 @ 02:39 PM | adding 303 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5252.
Tried aggregator 1 time.
Reduced MIP has 125 rows, 250 columns, and 495 nonzeros.
Reduced MIP has 123 binaries, 125 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.24 ticks)
Probing time = 0.00 sec. (0.19 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.16 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Va

 101036 57235        cutoff              0.5241        0.5100  1161905    2.70%          rho_79 D 101036 101035     75
 102939 58140        0.5189    26        0.5241        0.5100  1191567    2.70%         rho_120 D 102939 102937     64
 105182 59327        0.5100     9        0.5241        0.5100  1221057    2.70%          rho_18 D 105182 104555     47
 107378 60539        0.5198    24        0.5241        0.5100  1244239    2.70%          rho_27 D 107378 107377     57
 109299 61569        0.5224    24        0.5241        0.5100  1265992    2.70%          rho_33 D 109299 109297     90
Elapsed time = 84.42 sec. (168257.78 ticks, tree = 26.86 MB, solutions = 5)
 111112 62383        cutoff              0.5241        0.5100  1290182    2.70%           rho_0 U 111112 111110     67
 113347 63387        0.5231    17        0.5241        0.5100  1317097    2.70%           rho_6 D 113347 113346     88
 115634 64449        0.5100    11        0.5241        0.5100  1344773    2.70%          rh

Elapsed time = 289.67 sec. (626306.13 ticks, tree = 86.47 MB, solutions = 10)
 325334 186376        0.5127    30        0.5239        0.5100  3906824    2.66%          rho_43 D 325334 325333     79
 327229 187523        0.5216    18        0.5239        0.5100  3925218    2.66%           rho_0 U 327229 327228     51
 328760 188250        0.5140    32        0.5239        0.5100  3945624    2.66%           rho_0 D 328760 328758     55
 330162 188752        cutoff              0.5239        0.5100  3969383    2.66%          rho_66 U 330162 330161     68
 331739 189395        cutoff              0.5239        0.5100  3993814    2.66%          rho_58 D 331739 331738     66
 333460 190195        0.5233    10        0.5239        0.5100  4017155    2.66%          rho_93 D 333460 333459     55
 335184 191026        cutoff              0.5239        0.5100  4040104    2.66%           rho_0 U 335184 335183     49
 337200 192321        0.5100     4        0.5239        0.5100  4059249    2.66%  

setting c0 = 0.0 to ensure that intercept is not penalized
08/02/19 @ 02:46 PM | 1321 rows in lookup table
08/02/19 @ 02:46 PM | ------------------------------------------------------------
08/02/19 @ 02:46 PM | runnning initialization procedure
08/02/19 @ 02:46 PM | ------------------------------------------------------------
08/02/19 @ 02:46 PM | CPA produced 2 cuts
08/02/19 @ 02:46 PM | running naive rounding on 45 solutions
08/02/19 @ 02:46 PM | best objective value: 0.5201
08/02/19 @ 02:46 PM | rounding produced 5 integer solutions
08/02/19 @ 02:46 PM | best objective value is 0.5279
08/02/19 @ 02:46 PM | running sequential rounding on 45 solutions
08/02/19 @ 02:46 PM | best objective value: 0.5201
08/02/19 @ 02:46 PM | sequential rounding produced 6 integer solutions
08/02/19 @ 02:46 PM | best objective value: 0.5255
08/02/19 @ 02:46 PM | polishing 11 solutions
08/02/19 @ 02:46 PM | best objective value: 0.5255
08/02/19 @ 02:46 PM | polishing produced 5 integer solutions
08/02/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/02/19 @ 02:46 PM | adding 299 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5254.
Tried aggregator 1 time.
Reduced MIP has 125 rows, 250 columns, and 495 nonzeros.
Reduced MIP has 123 binaries, 125 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.24 ticks)
Probing time = 0.00 sec. (0.19 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.16 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Va

 119630 70962        0.5097    16        0.5242        0.5097  1030758    2.77%          rho_17 D 119630 119629     45
 121890 72440        0.5221    21        0.5242        0.5097  1051409    2.77%           rho_3 U 121890 121888     63
 124167 73975        cutoff              0.5242        0.5097  1072239    2.77%          rho_93 U 124167 124165     87
 126359 75476        0.5185    17        0.5242        0.5097  1094309    2.77%          rho_30 U 126359 126358     62
 128758 77166        0.5116    29        0.5242        0.5097  1117286    2.77%          rho_72 D 128758 128757     61
 131060 78798        0.5195    17        0.5242        0.5097  1140260    2.77%          rho_43 D 131060 131057     77
 133227 80271        0.5097    27        0.5242        0.5097  1162711    2.77%          rho_16 U 133227 133225     48
Elapsed time = 90.97 sec. (168257.66 ticks, tree = 36.80 MB, solutions = 8)
 135417 81668        0.5097    12        0.5242        0.5097  1185240    2.77%           r

Elapsed time = 318.05 sec. (626300.56 ticks, tree = 95.96 MB, solutions = 9)
 349365 201595        0.5112    35        0.5242        0.5097  3550123    2.77%          rho_63 D 349365 349364     68
 350900 202542        0.5131    21        0.5242        0.5097  3567932    2.77%           rho_9 D 350900 350899     56
 352469 203531        0.5219    19        0.5242        0.5097  3585255    2.77%         rho_101 D 352469 352468     75
 354060 204491        0.5228    22        0.5242        0.5097  3602819    2.77%         rho_101 D 354060 354059     90
 355560 205366        0.5214     9        0.5242        0.5097  3621135    2.77%           rho_0 U 355560 355559     50
 357012 206215        0.5127    34        0.5242        0.5097  3640370    2.77%           rho_9 U 357012 357010     47
 358460 206929        0.5214    13        0.5242        0.5097  3658097    2.77%          rho_68 D 358460 358459    101
 359895 207595        0.5141    38        0.5242        0.5097  3675987    2.77%   



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/02/19 @ 02:53 PM | adding 296 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5257.
Tried aggregator 1 time.
Reduced MIP has 125 rows, 250 columns, and 495 nonzeros.
Reduced MIP has 123 binaries, 125 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.24 ticks)
Probing time = 0.00 sec. (0.19 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.16 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Va

 115030 78134        0.5177    13        0.5247        0.5097  1031924    2.85%           rho_5 U 115030 115029     66
 117168 79646        0.5097    18        0.5247        0.5097  1054327    2.85%          rho_76 D 117168 117167     62
 119294 81108        0.5124    40        0.5247        0.5097  1076850    2.85%          rho_51 D 119294 119293     49
 121405 82607        0.5150    20        0.5247        0.5097  1100243    2.85%          rho_30 D 121405 121404     80
 123610 84129        0.5209    14        0.5247        0.5097  1122843    2.85%         rho_104 D 123610 123608    112
*125800+85610                            0.5247        0.5097             2.85%
 125900 85684        0.5097     2        0.5247        0.5097  1143113    2.85%           rho_0 D 125900 125898     43
Elapsed time = 91.72 sec. (168280.56 ticks, tree = 39.54 MB, solutions = 6)
 127868 86973        0.5216    18        0.5247        0.5097  1165074    2.85%          rho_52 D 127868 127867     95
 130292 886

 354290 225140        0.5221    14        0.5239        0.5097  3437589    2.71%         rho_119 D 354290 354288     79
 355827 225846        0.5171    23        0.5239        0.5097  3453310    2.71%         rho_119 D 355827 355825     77
 357431 226611        0.5195    25        0.5239        0.5097  3470293    2.71%         rho_120 D 357431 357429     83
 358963 227379        0.5125    21        0.5239        0.5097  3486524    2.71%           rho_0 U 358963 358962     52
 360421 228087        0.5100    20        0.5239        0.5097  3502762    2.71%          rho_19 D 360421 360420     45
 361867 228978        0.5105    30        0.5239        0.5097  3522224    2.71%           rho_7 D 361867 361866     56
Elapsed time = 317.50 sec. (623944.28 ticks, tree = 110.26 MB, solutions = 17)
 363440 230033        0.5160    20        0.5239        0.5097  3540722    2.71%          rho_43 D 363440 363438     77
 364875 230912        0.5224     6        0.5239        0.5097  3558619    2.71% 

setting c0 = 0.0 to ensure that intercept is not penalized
08/02/19 @ 03:00 PM | 1321 rows in lookup table
08/02/19 @ 03:00 PM | ------------------------------------------------------------
08/02/19 @ 03:00 PM | runnning initialization procedure
08/02/19 @ 03:00 PM | ------------------------------------------------------------
08/02/19 @ 03:00 PM | CPA produced 2 cuts
08/02/19 @ 03:00 PM | running naive rounding on 41 solutions
08/02/19 @ 03:00 PM | best objective value: 0.5187
08/02/19 @ 03:00 PM | rounding produced 5 integer solutions
08/02/19 @ 03:00 PM | best objective value is 0.5341
08/02/19 @ 03:00 PM | running sequential rounding on 41 solutions
08/02/19 @ 03:00 PM | best objective value: 0.5187
08/02/19 @ 03:00 PM | sequential rounding produced 6 integer solutions
08/02/19 @ 03:00 PM | best objective value: 0.5245
08/02/19 @ 03:00 PM | polishing 11 solutions
08/02/19 @ 03:00 PM | best objective value: 0.5245
08/02/19 @ 03:00 PM | polishing produced 5 integer solutions
08/02/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/02/19 @ 03:00 PM | adding 297 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5245.
Tried aggregator 1 time.
Reduced MIP has 125 rows, 250 columns, and 495 nonzeros.
Reduced MIP has 123 binaries, 125 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.02 sec. (0.24 ticks)
Probing time = 0.00 sec. (0.19 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.16 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Va

 132783 75535        cutoff              0.5245        0.5100  1198454    2.77%          rho_33 U 132783 132782     69
 135128 76809        0.5162    24        0.5245        0.5100  1226367    2.77%         rho_120 D 135128 135126     70
 137350 77878        0.5100     7        0.5245        0.5100  1253779    2.77%          rho_18 D 137350 137349     41
Elapsed time = 79.05 sec. (168258.22 ticks, tree = 36.22 MB, solutions = 1)
 139646 79355        0.5189    15        0.5245        0.5100  1282317    2.77%           rho_0 U 139646 139645     51
 142000 80722        0.5122    21        0.5245        0.5100  1309474    2.77%           rho_9 D 142000 141999     53
 144159 81877        0.5190    20        0.5245        0.5100  1337096    2.77%          rho_83 D 144159 144157     71
 146465 83234        0.5139    25        0.5245        0.5100  1364813    2.77%         rho_119 D 146465 146464     68
 148932 84845        0.5138    27        0.5245        0.5100  1394289    2.77%          rh

 438346 244692        0.5100    12        0.5245        0.5100  4530627    2.77%           rho_0 D 438346 438344     61
 440624 245815        0.5100    31        0.5245        0.5100  4555090    2.77%          rho_83 D 440624 440623     67
 442890 246931        0.5100     3        0.5245        0.5100  4579050    2.77%           rho_0 D 442890 442888     63
 445230 248185        cutoff              0.5245        0.5100  4604218    2.77%           rho_6 U 445230 445228     71
 447488 249342        0.5174    26        0.5245        0.5100  4628671    2.77%          rho_97 D 447488 447486     84
 449790 250516        0.5151    28        0.5245        0.5100  4653305    2.77%          rho_11 D 449790 449789     73
Elapsed time = 299.63 sec. (664439.70 ticks, tree = 116.00 MB, solutions = 1)
 452170 251707        0.5120    25        0.5245        0.5100  4677614    2.77%          rho_51 D 452170 452169     71
 454800 252964        0.5212    18        0.5245        0.5100  4705943    2.77%  



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
08/02/19 @ 03:07 PM | adding 255 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5358.
Tried aggregator 1 time.
Reduced MIP has 125 rows, 250 columns, and 495 nonzeros.
Reduced MIP has 123 binaries, 125 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.24 ticks)
Probing time = 0.00 sec. (0.19 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.16 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Va

 142679 92864        0.5238     9        0.5242        0.5100  1011025    2.71%          rho_34 D 142679 142677    104
 146570 95305        0.5124    17        0.5242        0.5100  1037989    2.71%           rho_6 D 146570 146569     76
 150450 97714        0.5110    22        0.5242        0.5100  1064460    2.71%          rho_93 D 150450 150449     69
 154330 100188        0.5225    14        0.5242        0.5100  1091212    2.71%          rho_29 D 154330 154328    106
 158098 102320        0.5127    15        0.5242        0.5100  1118009    2.71%         rho_122 D 158098 158096     73
 161700 103997        0.5112    21        0.5242        0.5100  1142273    2.71%          rho_63 D 161700 161699     49
 165749 105786        0.5221    16        0.5242        0.5100  1170196    2.71%          rho_69 D 165749 165747     76
 169463 108014        0.5100     9        0.5242        0.5100  1198827    2.71%          rho_19 D 169463 168623     46
 172701 109580        0.5113    20        0

 598530 360120        0.5135    23        0.5242        0.5100  4142320    2.71%         rho_119 D 598530 598529     84
 602040 361908        0.5100    15        0.5242        0.5100  4162153    2.71%          rho_88 D 602040 602039     65
Elapsed time = 274.84 sec. (626191.01 ticks, tree = 162.01 MB, solutions = 12)
 605719 363924        0.5206    16        0.5242        0.5100  4183560    2.71%          rho_66 D 605719 605718     96
 609100 365856        0.5100    13        0.5242        0.5100  4206013    2.71%           rho_7 D 609100 609099     62
 612370 367948        0.5185    20        0.5242        0.5100  4230927    2.71%          rho_59 D 612370 612369    103
 615840 369988        0.5174    15        0.5242        0.5100  4253649    2.71%          rho_28 D 615840 615839     89
 619470 372013        0.5134     1        0.5242        0.5100  4275581    2.71%           rho_0 D 619470 619469     71
 622985 373923        0.5100    19        0.5242        0.5100  4296789    2.71% 

In [35]:
# Mean and standard deviation of the per-fold AUCs: train first, then test.
tuple(
    summary(fold_aucs)
    for fold_aucs in (train_auc, test_auc)
    for summary in (np.mean, np.std)
)

(0.7063402250318509,
 0.0008498903748643056,
 0.7063254149498768,
 0.0015880118410952582)