In [3]:
import pandas as pd 
import numpy as np
import csv
import advance_functions as advance
import RiskSLIM as slim
import stumps
from sklearn.linear_model import LogisticRegression
from riskslim.helper_functions import load_data_from_csv, print_model

# restore saved variables
# (IPython `%store -r` reloads `summary_general6_fl_inter_model` if an earlier
# session stored it; the same name is rebuilt from scratch and stored again in
# the "save results" cell near the end of this notebook)
%store -r summary_general6_fl_inter_model

no stored variable summary_general6_fl_inter_model


### EBM & CART

In [4]:
## read the KY_cleaned and FL_cleaned tables
KY_data = pd.read_csv("~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/data preparation/KY_cleaned")
FL_data = pd.read_csv("~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/data preparation/FL_cleaned")

## features: every column up to and including 'current_violence' (label slices are inclusive)
## label:    the 'recid_six_month' column as a NumPy array
KY_X, KY_Y = KY_data.loc[:, :'current_violence'], KY_data['recid_six_month'].values
FL_X, FL_Y = FL_data.loc[:, :'current_violence'], FL_data['recid_six_month'].values

In [5]:
#### CART
## the three lists below are passed positionally to advance.CART in the order
## (depth, split, impurity); presumably they are the candidate values it
## searches over -- TODO confirm against advance_functions
depth = [1,2]
impurity = [0.001, 0.003]
split = [2,3,4]
cart_summary = advance.CART(KY_X, KY_Y, FL_X, FL_Y, depth, split, impurity, seed=816)

### EBM
## passed positionally to advance.EBM as (learning_rate, depth, estimators, holdout_split)
## NOTE: `depth` is rebound here, so the CART value above is gone once this cell has run
estimators = [40,60,80]
depth = [1]
learning_rate = [0.05]
holdout_split = [0.7, 0.9]
ebm_summary = advance.EBM(KY_X, KY_Y, FL_X, FL_Y, learning_rate, depth, estimators, holdout_split, seed=816)

In [15]:
## show the 'KY_score' entries of the CART and EBM summaries side by side
cart_summary['KY_score'], ebm_summary['KY_score']

(0.554246143512394, 0.6513989450017987)

### Lasso Stumps

In [6]:
## read the KY_stumps and FL_stumps tables
KY_stumps = pd.read_csv("~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/data preparation/KY_stumps")
FL_stumps = pd.read_csv("~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/data preparation/FL_stumps")

## features: every column up to and including 'current_violence>=1'
## label:    the 'recid_six_month' column as a NumPy array
## (KY_X / KY_Y / FL_X / FL_Y are rebound here and now refer to the stumps tables)
KY_X, KY_Y = KY_stumps.loc[:, :'current_violence>=1'], KY_stumps['recid_six_month'].values
FL_X, FL_Y = FL_stumps.loc[:, :'current_violence>=1'], FL_stumps['recid_six_month'].values

## feature names, taken from the KY feature frame
cols = KY_X.columns

In [11]:
## values of C handed to stumps.stump_cv
c_grid = {
    'C': [0.09, 0.11, 0.13, 0.15],
}
stumps_summary = stumps.stump_cv(
    KY_X, KY_Y,
    FL_X, FL_Y,
    cols, c_grid,
    seed=816,
)

In [12]:
## display the summary dict returned by stumps.stump_cv
stumps_summary

{'best_auc': 0.6115419971048976,
 'best_params': {'C': 0.13},
 'auc_diffs': 0.044790668976145476,
 'KY_score': 0.6408958820477442}

### RiskSLIM

In [13]:
## train on best param chosen by Lasso Stumps from above
## (C is read from the search result rather than hand-copied, so this cell
## cannot silently drift from `stumps_summary` when the grid or data change)
best_c = stumps_summary['best_params']['C']
lasso = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1',
                           C=best_c, random_state=816).fit(FL_X, FL_Y)
## keep the stump columns whose fitted coefficient is non-zero
selected_features = cols[lasso.coef_[0] != 0].tolist()
len(selected_features)

25

In [14]:
### Subset features
## KY keeps only the lasso-selected stump columns ...
sub_KY_X = KY_stumps.loc[:, selected_features]
## ... FL keeps the same columns and gets a constant '(Intercept)' column
## prepended in place (KY does not receive one in this cell)
sub_FL_X = FL_stumps.loc[:, selected_features]
sub_FL_X.insert(loc=0, column='(Intercept)', value=1)

In [15]:
## run slim.risk_cv on the lasso-selected stumps; the FL frame carries the
## '(Intercept)' column added above, the KY frame does not
## NOTE(review): y_label is 'general2' while the label column used in this notebook is
## 'recid_six_month' and the results are stored under a 'general6' name -- confirm this is intended
## NOTE(review): max_runtime=200 is presumably a solver time limit (units not visible here) -- TODO confirm
riskslim_summary = slim.risk_cv(sub_KY_X, KY_Y, sub_FL_X, FL_Y, 
                                y_label = 'general2', 
                                max_coef = 20, 
                                max_coef_number = 10, 
                                max_runtime=200, 
                                c=1e-3, 
                                seed=816)

setting c0 = 0.0 to ensure that intercept is not penalized
09/17/19 @ 01:18 PM | 1101 rows in lookup table
09/17/19 @ 01:18 PM | ------------------------------------------------------------
09/17/19 @ 01:18 PM | runnning initialization procedure
09/17/19 @ 01:18 PM | ------------------------------------------------------------
09/17/19 @ 01:18 PM | CPA produced 2 cuts
09/17/19 @ 01:18 PM | running naive rounding on 60 solutions
09/17/19 @ 01:18 PM | best objective value: 0.5025
09/17/19 @ 01:18 PM | rounding produced 4 integer solutions
09/17/19 @ 01:18 PM | best objective value is 0.5355
09/17/19 @ 01:18 PM | running sequential rounding on 60 solutions
09/17/19 @ 01:18 PM | best objective value: 0.5025
09/17/19 @ 01:18 PM | sequential rounding produced 6 integer solutions
09/17/19 @ 01:18 PM | best objective value: 0.5193
09/17/19 @ 01:18 PM | polishing 10 solutions
09/17/19 @ 01:18 PM | best objective value: 0.5193
09/17/19 @ 01:18 PM | polishing produced 5 integer solutions
09/17/19



Lazy constraint(s) or lazy constraint callback is present.
    Disabling dual reductions (CPX_PARAM_REDUCE) in presolve.
    Disabling non-linear reductions (CPX_PARAM_PRELINEAR) in presolve.
         Disabling repeat represolve because of lazy constraint/incumbent callback.
09/17/19 @ 01:18 PM | adding 234 initial cuts
1 of 1 MIP starts provided solutions.
MIP start 'mip_start_0' defined initial solution with objective 0.5184.
Tried aggregator 1 time.
Reduced MIP has 27 rows, 54 columns, and 103 nonzeros.
Reduced MIP has 25 binaries, 27 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.00 sec. (0.06 ticks)
Probing time = 0.00 sec. (0.02 ticks)
MIP emphasis: balance optimality and feasibility.
MIP search method: traditional branch-and-cut.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 0.00 sec. (0.03 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap         Variab

In [16]:
## 'KY_auc' entry of the RiskSLIM summary; it is a list, and its first element
## is the value written to the results CSV at the end of the notebook
riskslim_summary['KY_auc']

[0.6319598294137343]

In [17]:
#### save results
summary_general6_fl_inter_model = {"CART": cart_summary,
                                    "EBM": ebm_summary,
                                    "Stumps": stumps_summary,
                                    "RiskSLIM": riskslim_summary}
%store summary_general6_fl_inter_model

Stored 'summary_general6_fl_inter_model' (dict)


In [18]:
## export the KY scores of the four models as a two-row CSV (header row + "General" row)
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\KY Recidivism\\KY Results\\KY_FL\\FL Model\\interpretable\\Six Month\\"
results = [["", "CART", "EBM", "Lasso Stumps", "RiskSLIM"],
           ["General",             
            round(cart_summary['KY_score'],3), 
            round(ebm_summary['KY_score'], 3), 
            round(stumps_summary['KY_score'],3), 
            round(riskslim_summary['KY_auc'][0], 3)]]
## newline='' is what the csv module requires for files handed to csv.writer:
## without it the writer's '\r\n' row terminator is translated a second time on
## Windows and every data row is followed by a blank line
with open(path + 'Six Month FL_inter_model.csv', 'w', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)