In [1]:
from ortools.linear_solver import pywraplp
import os
import time
import pickle
import pandas as pd
import numpy as np
import slim_or 
import slim_python as slim_cplex

This Notebook can run two versions of SLIM with a CPLEX and CBC (Google OR-tools) solver respectively
Results of several runs can be found in the SLIM comparison Excel file

In [2]:
cd ..

C:\Users\danie\Documents\StageDaniel\research


In [3]:
cd research

[WinError 2] Het systeem kan het opgegeven bestand niet vinden: 'research'
C:\Users\danie\Documents\StageDaniel\research


Function for loading in the data set

In [4]:
def load_data(name='breastcancer'):
# requirements for CSV data file
# - outcome variable in first column
# - outcome variable values should be [-1, 1] or [0, 1]
# - first row contains names for the outcome variable + input variables
# - no empty cells
    data_name = name
    data_dir = os.getcwd() + '/data/'
    data_csv_file = data_dir + data_name + '_processed.csv'

    # load data file from csv
    df = pd.read_csv(data_csv_file, sep = ',')
    data = df.as_matrix()
    data_headers = list(df.columns.values)
    N = data.shape[0]

    # setup Y vector and Y_name
    Y_col_idx = [0]
    Y = data[:, Y_col_idx]
    Y_name = [data_headers[j] for j in Y_col_idx]
    Y[Y == 0] = -1

    # setup X and X_names
    X_col_idx = [j for j in range(data.shape[1]) if j not in Y_col_idx]
    X = data[:, X_col_idx]
    X_names = [data_headers[j] for j in X_col_idx]

    # insert a column of ones to X for the intercept
    X = np.insert(arr = X, obj = 0, values = np.ones(N), axis = 1)
    X_names.insert(0, '(Intercept)')

    # run sanity checks
    slim_cplex.check_data(X = X, Y = Y, X_names = X_names)
    
    return (X, X_names, Y, Y_name)

Auxiliary functions for saving and loading of results

In [9]:
# auxiliary function for saving results  
def save_results(results, name ):
    with open('results/'+ name + '.pkl', 'wb') as f:
        pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)

def load_results(name ):
    with open('results/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

Functions to train SLIM using either CPLEX or OR Tools

In [10]:
def train_slim_cplex(data_info, timelimit=20, C_0= 0.01, slim_settings=None):
      
    (X, X_names, Y, Y_name) = data_info
    #### TRAIN SCORING SYSTEM USING SLIM ####
    # setup SLIM coefficient set
    coef_constraints = slim_cplex.SLIMCoefficientConstraints(variable_names = X_names, ub = 5, lb = -5)
    #choose upper and lower bounds for the intercept coefficient
    #to ensure that there will be no regularization due to the intercept, choose
    #
    #intercept_ub < min_i(min_score_i)
    #intercept_lb > max_i(max_score_i)
    #
    #where min_score_i = min((Y*X) * \rho) for rho in \Lset
    #where max_score_i = max((Y*X) * \rho) for rho in \Lset
    #
    #setting intercept_ub and intercept_lb in this way ensures that we can always
    # classify every point as positive and negative
    scores_at_ub = (Y * X) * coef_constraints.ub
    scores_at_lb = (Y * X) * coef_constraints.lb
    non_intercept_ind = np.array([n != '(Intercept)' for n in X_names])
    scores_at_ub = scores_at_ub[:, non_intercept_ind]
    scores_at_lb = scores_at_lb[:, non_intercept_ind]
    max_scores = np.fmax(scores_at_ub, scores_at_lb)
    min_scores = np.fmin(scores_at_ub, scores_at_lb)
    max_scores = np.sum(max_scores, 1)
    min_scores = np.sum(min_scores, 1)

    intercept_ub = -min(min_scores) + 1
    intercept_lb = -max(max_scores) + 1
    coef_constraints.set_field('ub', '(Intercept)', intercept_ub)
    coef_constraints.set_field('lb', '(Intercept)', intercept_lb)
        
    #create SLIM IP
    if slim_settings != None:
        slim_input = slim_settings
    else:
        slim_input = {
            'X': X,
            'X_names': X_names,
            'Y': Y,
            'C_0': C_0,
            'w_pos': 1.0,
            'w_neg': 1.0,
            'L0_min': 0,
            'L0_max': float('inf'),
            'err_min': 0,
            'err_max': 1.0,
            'pos_err_min': 0,
            'pos_err_max': 1.0,
            'neg_err_min': 0,
            'neg_err_max': 1.0,
            'coef_constraints': coef_constraints
        }
    

    slim_IP, slim_info = slim_cplex.create_slim_IP(slim_input)
    
    slim_IP.set_log_stream(None)
    slim_IP.set_error_stream(None)
    slim_IP.set_warning_stream(None)
    slim_IP.set_results_stream(None)
    
    # setup SLIM IP parameters
    # see docs/usrccplex.pdf for more about these parameters
    slim_IP.parameters.timelimit.set(timelimit) #set runtime here
    #TODO: add these default settings to create_slim_IP
    slim_IP.parameters.randomseed.set(0)
    slim_IP.parameters.threads.set(1)
    slim_IP.parameters.parallel.set(1)
    slim_IP.parameters.output.clonelog.set(0)
    slim_IP.parameters.mip.tolerances.mipgap.set(np.finfo(np.float).eps)
    slim_IP.parameters.mip.tolerances.absmipgap.set(np.finfo(np.float).eps)
    slim_IP.parameters.mip.tolerances.integrality.set(np.finfo(np.float).eps)
    slim_IP.parameters.emphasis.mip.set(1)

    # solve SLIM IP
    slim_IP.solve()

    # run quick and dirty tests to make sure that IP output is correct
    slim_cplex.check_slim_IP_output(slim_IP, slim_info, X, Y, coef_constraints)
    
    return (slim_IP, slim_info)

In [11]:
def train_slim_or(data_info, timelimit=20, C_0= 0.01, slim_settings=None):
        
    (X, X_names, Y, Y_name) = data_info
    #### TRAIN SCORING SYSTEM USING SLIM ####
    # setup SLIM coefficient set
    coef_constraints = slim_or.SLIMCoefficientConstraints(variable_names = X_names, ub = 5, lb = -5)
    #choose upper and lower bounds for the intercept coefficient
    #to ensure that there will be no regularization due to the intercept, choose
    #
    #intercept_ub < min_i(min_score_i)
    #intercept_lb > max_i(max_score_i)
    #
    #where min_score_i = min((Y*X) * \rho) for rho in \Lset
    #where max_score_i = max((Y*X) * \rho) for rho in \Lset
    #
    #setting intercept_ub and intercept_lb in this way ensures that we can always
    # classify every point as positive and negative
    scores_at_ub = (Y * X) * coef_constraints.ub
    scores_at_lb = (Y * X) * coef_constraints.lb
    non_intercept_ind = np.array([n != '(Intercept)' for n in X_names])
    scores_at_ub = scores_at_ub[:, non_intercept_ind]
    scores_at_lb = scores_at_lb[:, non_intercept_ind]
    max_scores = np.fmax(scores_at_ub, scores_at_lb)
    min_scores = np.fmin(scores_at_ub, scores_at_lb)
    max_scores = np.sum(max_scores, 1)
    min_scores = np.sum(min_scores, 1)

    intercept_ub = -min(min_scores) + 1
    intercept_lb = -max(max_scores) + 1
    coef_constraints.set_field('ub', '(Intercept)', intercept_ub)
    coef_constraints.set_field('lb', '(Intercept)', intercept_lb)

    
    #create SLIM IP
    if slim_settings != None:
        slim_input = slim_settings
    else:
        slim_input = {
            'X': X,
            'X_names': X_names,
            'Y': Y,
            'C_0': C_0,
            'w_pos': 1.0,
            'w_neg': 1.0,
            'L0_min': 0,
            'L0_max': float('inf'),
            'err_min': 0,
            'err_max': 1.0,
            'pos_err_min': 0,
            'pos_err_max': 1.0,
            'neg_err_min': 0,
            'neg_err_max': 1.0,
            'coef_constraints': coef_constraints
        }
    

    slim_IP, slim_info = slim_or.create_slim_IP(slim_input)
    
    # setup SLIM IP parameters
    # see docs/usrccplex.pdf for more about these parameters
    slim_IP.SetTimeLimit(timelimit*1000) #set runtime here
    

    # solve SLIM IP
    satus = slim_IP.Solve()
    
    # run quick and dirty tests to make sure that IP output is correct
    #     slim_os.check_slim_IP_output(slim_IP, slim_info, X, Y, coef_constraints)
    
    return (slim_IP, slim_info)

In [18]:
#  Selecting data set and setting hyperparameters
dataset = 'adult'
timelimit = 600
C_0 = 0.001
(X, X_names, Y, Y_name) = load_data(name=dataset)

  del sys.path[0]


In [19]:
# Solve SLIM using CPLEX
(cplex_model, cplex_info) = train_slim_cplex((X, X_names, Y, Y_name), timelimit, C_0)

In [20]:
# Solve SLIM using OR-tools
(or_model, or_info) = train_slim_or((X, X_names, Y, Y_name), timelimit, C_0)

In [22]:
or_results = slim_or.get_slim_summary(or_model, or_info, X, Y)

print("C = ", C_0, or_results['rho'])
# or_results['rho']

C =  0.001 [ 0.  0.  0.  0.  0.  0. -1.  0. -1.  0. -1. -1.  0.  0.  0.  0.  0.  0.
  0. -1.  0.  1.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0. -1.
  0.]


In [24]:
# Extract results and prepare for saving to pickle
cplex_results = slim_cplex.get_slim_summary(cplex_model, cplex_info, X, Y)
or_results = slim_or.get_slim_summary(or_model, or_info, X, Y)
del cplex_results['pretty_model']
del or_results['pretty_model']

In [27]:
# Saves the results to a pickle file
save_results(cplex_results, 'slim_comparison/cplex_'+dataset+'_timelimit_'+str(timelimit)+'_C0_'+str(C_0))
save_results(or_results, 'slim_comparison/or_'+dataset+'_timelimit_'+str(timelimit)+'_C0_'+str(C_0))

In [29]:
# Automated function for training and storing results for both solvers given 
# a certain data set and parameters

def test_run(dataset, timelimit, C_0):
#     print("Started test run at:          ", time.ctime())
#     print("Will finish approximately:    ", time.ctime(time.time()+2*timelimit))
    (X, X_names, Y, Y_name) = load_data(name=dataset)
    
    (cplex_model, cplex_info) = train_slim_cplex((X, X_names, Y, Y_name), timelimit, C_0)
    (or_model, or_info) = train_slim_or((X, X_names, Y, Y_name), timelimit, C_0)
    
    cplex_results = slim_cplex.get_slim_summary(cplex_model, cplex_info, X, Y)
    or_results = slim_or.get_slim_summary(or_model, or_info, X, Y)
    del cplex_results['pretty_model']
    del or_results['pretty_model']
    
    save_results(cplex_results, 'cplex_'+dataset+'_timelimit_'+str(timelimit)+'_C0_'+str(C_0))
    save_results(or_results, 'or_'+dataset+'_timelimit_'+str(timelimit)+'_C0_'+str(C_0))

In [36]:
# cs = [0.00001, 0.000001, 0.0000001, 0.00000001]
# for c in cs:
#         test_run('mammo', 1800, c)
test_run('adult', 3600, 0.01)

  del sys.path[0]


In [35]:
name='or_adult_timelimit_1800_C0_0.01'
res = load_results(name)
print(res['string_model'])
res

+-------------------------------+-------------------+-----------+
| PREDICT O IF SCORE >= 0       |                   |           |
| (Intercept)                   |         -1 points |   + ..... |
| ADD POINTS FROM ROWS 1 to 1   |             SCORE |   = ..... |
+-------------------------------+-------------------+-----------+


{'objective_value': 7841.0,
 'simplex_iterations': 546548,
 'rho': array([-1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 'true_positives': 0,
 'true_negatives': 24720,
 'false_positives': 0,
 'false_negatives': 7841,
 'mistakes': 7841,
 'error_rate': 0.2408095574460244,
 'true_positive_rate': 0.0,
 'false_positive_rate': 0.0,
 'L0_norm': nan}