In [1]:
# Report the version of the interpreter that is actually running this kernel.
# A shell call to `python` reports whichever executable is first on PATH,
# which is not necessarily the kernel's interpreter.
import platform
print('Python ' + platform.python_version())

Python 2.7.11


In [2]:
import os
import numpy as np
import pandas as pd
import cplex as cp
import slim_python as slim

from sklearn.metrics import classification_report, roc_auc_score, zero_one_loss, accuracy_score

In [3]:
# Move up one level so that load_data() finds its data/ folder under the
# working directory. The guard makes the cell safe to re-run: once data/ is
# visible from the current directory we stay put instead of climbing again.
# NOTE(review): assumes the notebook's own folder has no data/ subfolder.
if not os.path.isdir(os.path.join(os.getcwd(), 'data')):
    os.chdir(os.pardir)
print(os.getcwd())

C:\Users\danie\Documents\StageDaniel


In [4]:
# Display the working directory; load_data() resolves its data/ folder against it.
os.getcwd()

'C:\\Users\\danie\\Documents\\StageDaniel'

#### LOAD DATA ####

In [6]:
def load_data(name='breastcancer', data_dir=None):
    """Load a processed CSV data set and arrange it for SLIM.

    Requirements for the CSV data file:
      - outcome variable in the first column
      - outcome variable values should be [-1, 1] or [0, 1]
        (zeros are recoded to -1 here)
      - first row contains names for the outcome variable + input variables
      - no empty cells

    Parameters
    ----------
    name : str
        Data set name; the file read is ``<data_dir>/<name>_processed.csv``.
    data_dir : str or None
        Folder that holds the CSV file. When None (the default) the ``data``
        folder under the current working directory is used.

    Returns
    -------
    tuple
        ``(X, X_names, Y, Y_name)``: X is the feature matrix with a leading
        column of ones, X_names the matching column names starting with
        '(Intercept)', Y the outcome as a single-column array, and Y_name a
        one-element list with the outcome's header.
    """
    if data_dir is None:
        data_dir = os.path.join(os.getcwd(), 'data')
    data_csv_file = os.path.join(data_dir, name + '_processed.csv')

    # load data file from csv
    df = pd.read_csv(data_csv_file, sep=',')
    # .values replaces the deprecated DataFrame.as_matrix() (gone in pandas 1.0)
    data = df.values
    data_headers = list(df.columns.values)
    N = data.shape[0]

    # setup Y vector and Y_name; indexing with a list yields a copy, so the
    # recoding below does not touch `data`
    Y_col_idx = [0]
    Y = data[:, Y_col_idx]
    Y_name = [data_headers[j] for j in Y_col_idx]
    Y[Y == 0] = -1

    # setup X and X_names from every column that is not the outcome
    X_col_idx = [j for j in range(data.shape[1]) if j not in Y_col_idx]
    X = data[:, X_col_idx]
    X_names = [data_headers[j] for j in X_col_idx]

    # insert a column of ones to X for the intercept
    X = np.insert(arr=X, obj=0, values=np.ones(N), axis=1)
    X_names.insert(0, '(Intercept)')

    # run sanity checks
    slim.check_data(X=X, Y=Y, X_names=X_names)

    return (X, X_names, Y, Y_name)

In [11]:
# Load the breast cancer data set; data_info is the (X, X_names, Y, Y_name) tuple.
data_info = load_data("breastcancer")

  del sys.path[0]


#### TRAIN SCORING SYSTEM USING SLIM ####

In [12]:
def train_slim(data_info, slim_settings=None, timelimit=200.0):
    """Build and solve a SLIM integer program for the given data.

    Parameters
    ----------
    data_info : tuple
        ``(X, X_names, Y, Y_name)`` as returned by ``load_data``.
    slim_settings : dict or None
        Complete input for ``slim.create_slim_IP``. When None, a default
        configuration is built from the data and the coefficient constraints
        created in this function.
        NOTE(review): when a custom dict is passed, the constraints built here
        are still the ones handed to ``slim.check_slim_IP_output`` -- confirm
        that this is intended.
    timelimit : float
        CPLEX time limit passed to ``parameters.timelimit`` (default 200.0).

    Returns
    -------
    tuple
        ``(slim_IP, slim_info, X, Y)``: the solved IP object, the info object
        returned by ``slim.create_slim_IP``, and the data that was used.
    """
    (X, X_names, Y, Y_name) = data_info

    # setup SLIM coefficient set: every coefficient starts in [-5, 5]
    coef_constraints = slim.SLIMCoefficientConstraints(variable_names=X_names, ub=5, lb=-5)

    # Choose bounds for the intercept coefficient from the range of scores the
    # other coefficients can produce. For each row, the score contribution of
    # every non-intercept column is evaluated at the coefficient's upper and
    # lower bound; the element-wise max/min are summed per row to get the
    # largest and smallest attainable score of that row. Per the original
    # notes, the intent is that the intercept can always be set so that every
    # point is classified as positive or as negative.
    scores_at_ub = (Y * X) * coef_constraints.ub
    scores_at_lb = (Y * X) * coef_constraints.lb
    non_intercept_ind = np.array([n != '(Intercept)' for n in X_names])
    scores_at_ub = scores_at_ub[:, non_intercept_ind]
    scores_at_lb = scores_at_lb[:, non_intercept_ind]
    max_scores = np.sum(np.fmax(scores_at_ub, scores_at_lb), 1)
    min_scores = np.sum(np.fmin(scores_at_ub, scores_at_lb), 1)

    intercept_ub = -min(min_scores) + 1
    intercept_lb = -max(max_scores) + 1
    coef_constraints.set_field('ub', '(Intercept)', intercept_ub)
    coef_constraints.set_field('lb', '(Intercept)', intercept_lb)
    coef_constraints.view()

    # create SLIM IP input; identity test rather than `!= None`
    if slim_settings is not None:
        slim_input = slim_settings
    else:
        slim_input = {
            'X': X,
            'X_names': X_names,
            'Y': Y,
            'C_0': 0.001,
            'w_pos': 1.0,
            'w_neg': 1.0,
            'L0_min': 0,
            'L0_max': float('inf'),
            'err_min': 0,
            'err_max': 1.0,
            'pos_err_min': 0,
            'pos_err_max': 1.0,
            'neg_err_min': 0,
            'neg_err_max': 1.0,
            'coef_constraints': coef_constraints
        }

    slim_IP, slim_info = slim.create_slim_IP(slim_input)

    # setup SLIM IP parameters
    # see docs/usrccplex.pdf for more about these parameters
    slim_IP.parameters.timelimit.set(timelimit)  # set runtime here
    # TODO: add these default settings to create_slim_IP
    slim_IP.parameters.randomseed.set(0)
    slim_IP.parameters.threads.set(1)
    slim_IP.parameters.parallel.set(1)
    slim_IP.parameters.output.clonelog.set(0)
    # machine epsilon of the builtin float; `np.float` was only a deprecated
    # alias of it and no longer exists in recent NumPy releases
    eps = np.finfo(float).eps
    slim_IP.parameters.mip.tolerances.mipgap.set(eps)
    slim_IP.parameters.mip.tolerances.absmipgap.set(eps)
    slim_IP.parameters.mip.tolerances.integrality.set(eps)
    slim_IP.parameters.emphasis.mip.set(1)

    # solve SLIM IP
    slim_IP.solve()

    # run quick and dirty tests to make sure that IP output is correct
    slim.check_slim_IP_output(slim_IP, slim_info, X, Y, coef_constraints)

    return (slim_IP, slim_info, X, Y)

In [13]:
# Train with the default settings; this also prints the coefficient-constraint table.
(slim_IP, slim_info, X, Y) = train_slim(data_info)

+--------------------------+-------+------+--------+-------+------+
|      variable_name       | vtype | sign |   lb   |   ub  | C_0j |
+--------------------------+-------+------+--------+-------+------+
|       (Intercept)        |   I   | nan  | -419.0 | 421.0 | 0.0  |
|      ClumpThickness      |   I   | nan  |  -5.0  |  5.0  | nan  |
|   UniformityOfCellSize   |   I   | nan  |  -5.0  |  5.0  | nan  |
|  UniformityOfCellShape   |   I   | nan  |  -5.0  |  5.0  | nan  |
|     MarginalAdhesion     |   I   | nan  |  -5.0  |  5.0  | nan  |
| SingleEpithelialCellSize |   I   | nan  |  -5.0  |  5.0  | nan  |
|        BareNuclei        |   I   | nan  |  -5.0  |  5.0  | nan  |
|      BlandChromatin      |   I   | nan  |  -5.0  |  5.0  | nan  |
|      NormalNucleoli      |   I   | nan  |  -5.0  |  5.0  | nan  |
|         Mitoses          |   I   | nan  |  -5.0  |  5.0  | nan  |
+--------------------------+-------+------+--------+-------+------+
Found incumbent of value 230.053397 after 0.00 s

In [15]:
#### CHECK RESULTS ####
# Collect the summary of the solved model (scoring table, coefficients, metrics).
slim_results = slim.get_slim_summary(slim_IP, slim_info, X, Y)
# print(slim_results)  # uncomment to dump the whole summary

# print model
print(slim_results['string_model'])

# print coefficient vector
print(slim_results['rho'])

# print accuracy metrics
# NOTE(review): in the recorded output the three rates below print as 0.00%
# although the counts are nonzero -- possibly integer division under Python 2
# inside slim; verify before relying on the rates.
print('error_rate: %1.2f%%' % (100*slim_results['error_rate']))
print('TPR: %1.2f%%' % (100*slim_results['true_positive_rate']))
print('FPR: %1.2f%%' % (100*slim_results['false_positive_rate']))
print('true_positives: %d' % slim_results['true_positives'])
print('false_positives: %d' % slim_results['false_positives'])
print('true_negatives: %d' % slim_results['true_negatives'])
print('false_negatives: %d' % slim_results['false_negatives'])


+-------------------------------+------------------+-----------+
| PREDICT O IF SCORE >= -60     |                  |           |
| UniformityOfCellSize          |         4 points |   + ..... |
| UniformityOfCellShape         |         4 points |   + ..... |
| BareNuclei                    |         4 points |   + ..... |
| NormalNucleoli                |         3 points |   + ..... |
| ClumpThickness                |         2 points |   + ..... |
| SingleEpithelialCellSize      |         2 points |   + ..... |
| ADD POINTS FROM ROWS 1 to 6   |            SCORE |   = ..... |
+-------------------------------+------------------+-----------+
[-60.   2.   4.   4.   0.   2.   4.   0.   3.   0.]
error_rate: 0.00%
TPR: 0.00%
FPR: 0.00%
true_positives: 237
false_positives: 11
true_negatives: 433
false_negatives: 2


In [16]:
# Print slim's summary for the fitted coefficient vector.
# NOTE(review): the recorded output for this cell is empty -- confirm the call prints what is expected.
print(slim.get_rho_summary(slim_results['rho'], slim_info, X, Y))



In [17]:
# Keep the fitted coefficient vector (intercept first) for the manual checks below.
rho = slim_results['rho']

In [18]:
# Boolean predictions: True wherever the score X.dot(rho) is strictly positive.
yhat = X.dot(rho) > 0

In [19]:
    # Recompute the confusion-matrix counts by hand from the fitted coefficients.
    # The builtin float is used as dtype; `np.float` was only a deprecated alias
    # of it and no longer exists in recent NumPy releases.
    y = np.array(Y.flatten(), dtype = float)
    pos_ind = y == 1
    neg_ind = ~pos_ind
    N = len(Y)
    N_pos = np.sum(pos_ind)
    N_neg = N - N_pos

    # get predictions as +1 / -1 floats: +1 where the score is strictly positive
    yhat = np.where(X.dot(rho) > 0, 1.0, -1.0)

    true_positives = np.sum(yhat[pos_ind] == 1)
    false_positives = np.sum(yhat[neg_ind] == 1)
    true_negatives = np.sum(yhat[neg_ind] == -1)
    false_negatives = np.sum(yhat[pos_ind] == -1)

In [20]:
# Accuracy = 1 - misclassified share; float(N) forces true division under Python 2.
1- (false_positives + false_negatives)/ float(N)

0.9809663250366032

In [21]:
# Per-class precision / recall / F1 from scikit-learn, using the +1 / -1 labels in Y and yhat.
print(classification_report(Y, yhat))

              precision    recall  f1-score   support

          -1       1.00      0.98      0.99       444
           1       0.96      0.99      0.97       239

   micro avg       0.98      0.98      0.98       683
   macro avg       0.98      0.98      0.98       683
weighted avg       0.98      0.98      0.98       683

