In [1]:
! python --version
# Should be 2.7

Python 2.7.11


In [2]:
import os
import numpy as np
import pandas as pd
import cplex as cp
# import slim_python as slim

In [3]:
import slim_python as slim

Check working directory is Stagedaniel

In [4]:
cd ..


C:\Users\danie\Documents\StageDaniel


In [5]:
os.getcwd()

'C:\\Users\\danie\\Documents\\StageDaniel'

#### LOAD DATA ####

In [6]:

# requirements for CSV data file
# - outcome variable in first column
# - outcome variable values should be [-1, 1] or [0, 1]
# - first row contains names for the outcome variable + input variables
# - no empty cells
data_name = 'bignormpoland'
data_dir = os.getcwd() + '/data/'
data_csv_file = data_dir + data_name + '_processed.csv'

# load data file from csv
df = pd.read_csv(data_csv_file, sep = ',')
data = df.as_matrix()
data_headers = list(df.columns.values)
N = data.shape[0]

# setup Y vector and Y_name
Y_col_idx = [0]
Y = data[:, Y_col_idx]
Y_name = [data_headers[j] for j in Y_col_idx]
Y[Y == 0] = -1

# setup X and X_names
X_col_idx = [j for j in range(data.shape[1]) if j not in Y_col_idx]
X = data[:, X_col_idx]
X_names = [data_headers[j] for j in X_col_idx]

# insert a column of ones to X for the intercept
X = np.insert(arr = X, obj = 0, values = np.ones(N), axis = 1)
X_names.insert(0, '(Intercept)')

# run sanity checks
slim.check_data(X = X, Y = Y, X_names = X_names)

  del sys.path[0]


#### TRAIN SCORING SYSTEM USING SLIM ####

In [7]:
#### TRAIN SCORING SYSTEM USING SLIM ####
# setup SLIM coefficient set
coef_constraints = slim.SLIMCoefficientConstraints(variable_names = X_names, ub = 100, lb = -100)
# coef_constraints.view()

In [8]:
#choose upper and lower bounds for the intercept coefficient
#to ensure that there will be no regularization due to the intercept, choose
#
#intercept_ub < min_i(min_score_i)
#intercept_lb > max_i(max_score_i)
#
#where min_score_i = min((Y*X) * \rho) for rho in \Lset
#where max_score_i = max((Y*X) * \rho) for rho in \Lset
#
#setting intercept_ub and intercept_lb in this way ensures that we can always
# classify every point as positive and negative
scores_at_ub = (Y * X) * coef_constraints.ub
scores_at_lb = (Y * X) * coef_constraints.lb
non_intercept_ind = np.array([n != '(Intercept)' for n in X_names])
scores_at_ub = scores_at_ub[:, non_intercept_ind]
scores_at_lb = scores_at_lb[:, non_intercept_ind]
max_scores = np.fmax(scores_at_ub, scores_at_lb)
min_scores = np.fmin(scores_at_ub, scores_at_lb)
max_scores = np.sum(max_scores, 1)
min_scores = np.sum(min_scores, 1)

intercept_ub = -min(min_scores) + 1
intercept_lb = -max(max_scores) + 1
coef_constraints.set_field('ub', '(Intercept)', intercept_ub)
coef_constraints.set_field('lb', '(Intercept)', intercept_lb)
# coef_constraints.view()


In [9]:
#create SLIM IP
slim_input = {
    'X': X,
    'X_names': X_names,
    'Y': Y,
    'C_0': 0.01,
    'w_pos': 42705./(2*2023),      # N/(2*number of positive datapoints)
    'w_neg': 42705./(2*40682),      # N/(2*number of negative datapoints)
    'L0_min': 0,
    'L0_max': float('inf'),
    'err_min': 0,
    'err_max': 1.0,
    'pos_err_min': 0,
    'pos_err_max': 1.0,
    'neg_err_min': 0,
    'neg_err_max': 1.0,
    'coef_constraints': coef_constraints
}

slim_IP, slim_info = slim.create_slim_IP(slim_input)

In [13]:
# slim_input

In [11]:
# %% capture
# setup SLIM IP parameters
# see docs/usrccplex.pdf for more about these parameters
slim_IP.parameters.timelimit.set(10.0) #set runtime here
#TODO: add these default settings to create_slim_IP
slim_IP.parameters.randomseed.set(0)
slim_IP.parameters.threads.set(1)
slim_IP.parameters.parallel.set(1)
slim_IP.parameters.output.clonelog.set(0)
slim_IP.parameters.mip.tolerances.mipgap.set(np.finfo(np.float).eps)
slim_IP.parameters.mip.tolerances.absmipgap.set(np.finfo(np.float).eps)
slim_IP.parameters.mip.tolerances.integrality.set(np.finfo(np.float).eps)
slim_IP.parameters.emphasis.mip.set(1)


# solve SLIM IP
slim_IP.solve()

# run quick and dirty tests to make sure that IP output is correct
slim.check_slim_IP_output(slim_IP, slim_info, X, Y, coef_constraints)

Found incumbent of value 4088.350384 after 0.20 sec. (46.18 ticks)




Tried aggregator 2 times.
MIP Presolve modified 1932 coefficients.
Aggregator did 4 substitutions.
Reduced MIP has 40951 rows, 40899 columns, and 2241069 nonzeros.
Reduced MIP has 40792 binaries, 54 generals, 0 SOSs, and 0 indicators.
Presolve time = 1.20 sec. (651.82 ticks)
Probing time = 0.06 sec. (55.92 ticks)
Tried aggregator 1 time.
MIP Presolve modified 38807 coefficients.
Reduced MIP has 40951 rows, 40899 columns, and 2241069 nonzeros.
Reduced MIP has 40792 binaries, 107 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.92 sec. (473.88 ticks)
Probing time = 0.08 sec. (55.92 ticks)
MIP emphasis: integer feasibility.
MIP search method: dynamic search.
Parallel mode: none, using 1 thread.
Root relaxation solution time = 6.95 sec. (4082.55 ticks)

        Nodes                                         Cuts/
   Node  Left     Objective  IInf  Best Integer    Best Bound    ItCnt     Gap

*     0+    0                         4088.3504        0.0000           100.00%
      0     0 -

In [14]:
#### CHECK RESULTS ####
slim_results = slim.get_slim_summary(slim_IP, slim_info, X, Y)
# print(slim_results)

# print model
print(slim_results['string_model'])

# print coefficient vector
print(slim_results['rho'])

# print accuracy metrics
print('error_rate: %1.2f%%' % (100*slim_results['error_rate']))
print('TPR: %1.2f%%' % (100*slim_results['true_positive_rate']))
print('FPR: %1.2f%%' % (100*slim_results['false_positive_rate']))
print('true_positives: %d' % slim_results['true_positives'])
print('false_positives: %d' % slim_results['false_positives'])
print('true_negatives: %d' % slim_results['true_negatives'])
print('false_negatives: %d' % slim_results['false_negatives'])


+------------------------------------------+--------------------+-----------+
| PREDICT O IF SCORE >= -251               |                    |           |
| X02_total_liabilities_div_total_assets   |         -23 points |   + ..... |
| ADD POINTS FROM ROWS 1 to 1              |              SCORE |   = ..... |
+------------------------------------------+--------------------+-----------+
[-251.    0.  -23.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.]
error_rate: 0.00%
TPR: 0.00%
FPR: 0.00%
true_positives: 0
false_positives: 0
true_negatives: 38807
false_negatives: 1932


In the results error/tp/fp rates are reported zero due to Python 2 integer division

In [88]:
err = np.array(slim_IP.solution.get_values(slim_info['error_idx']))
sum(err)

330.0

In [100]:
pred = X.dot(slim_results['rho'])>0
pred
Y>0

array([[ True],
       [ True],
       [ True],
       ...,
       [False],
       [False],
       [False]])

In [116]:
sum((Y*X).dot(slim_results['rho'])<slim_info['epsilon'])

326

In [122]:
len(slim_info['error_idx'])

4601

In [128]:
slim_IP.solution.get_values(3)

1.0

In [63]:
print(slim.get_rho_summary(slim_results['rho'], slim_info, X, Y))

        0.,  0.,  0.,  4.,  1.,  0., -1.,  0.,  1.,  2.,  3.,  4., -4.,
       -1., -4., -4., -4.,  0., -2., -1., -1.,  0., -4.,  2., -2., -2.,
       -4.,  0., -4., -4., -4., -5., -2., -4.,  0.,  0., -4., -1., -3.,
        5.,  5.,  0.,  0.,  0.,  0.]), 'true_positives': 1689, 'false_positives': 213, 'pretty_model': <prettytable.PrettyTable object at 0x0000000019C149E8>, 'L0_norm': -15.0}
