# Legacy script #
Different processing for the adult data set
 
 _Daniël de Bondt - Viqtor Davis NL_

Importing modules

In [1]:
## import standard modules for data handling and visualization
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

## import model specific modules
import cplex as cp
import slim_python as slim
import shap
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

## import additional functionalities
from interpret import show
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, zero_one_loss, accuracy_score

pd.set_option('display.max_columns', None)

Auxiliary Functions

In [2]:
## SLIM requires the data in a specific non dataframe format, this code prepares that
def prep_data(X_train, y_train, X_test, y_test):
    # requirements for slim data file
    # - outcome variable in first column
    # - outcome variable values should be [-1, 1] or [0, 1]
    # - first row contains names for the outcome variable + input variables
    # - no empty cells
    
    N_train = X_train.values.shape[0]
    N_test = X_test.values.shape[0]

    # setup Y vector and Y_name
    y_name = y_test.name
    y_train_slim = y_train.copy().values.reshape(len(y_train),1)
    y_train_slim[y_train_slim == 0] = -1
    y_test_slim = y_test.copy().values.reshape(len(y_test),1)
    y_test_slim[y_test_slim == 0] = -1

    # setup X and X_names
    X_names = list(X_train.columns.values)
    X_train_slim = X_train.values
    X_test_slim = X_test.values
    

    # insert a column of ones to X for the intercept
    X_train_slim = np.insert(arr = X_train_slim, obj = 0, values = np.ones(N_train), axis = 1)
    X_test_slim = np.insert(arr = X_test_slim, obj = 0, values = np.ones(N_test), axis = 1)
    X_names.insert(0, '(Intercept)')

    # run sanity checks
    slim.check_data(X = X_train_slim, Y = y_train_slim, X_names = X_names)      
    
    return (X_train_slim, X_test_slim, y_train_slim, y_test_slim, X_names, y_name)

## Simple function for getting predictions for a SLIM scoring system
def pred_slim(X, rho):
    return (X.dot(rho[1:])+rho[0]>=0)*1

def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [3]:
# help(ExplainableBoostingClassifier)

# 1. All models were trained with their default parameters. EBM’s current default parameters are chosen
# for computational speed, to enable ease of experimentation. For the best accuracy and interpretability,
# we recommend using reference parameters: 100 inner bags, 100 outer bags, 5000 epochs, and a learning
# rate of 0.01.


Models

In [4]:
def train_slim(X, X_names, Y, y_name, params={}, timelimit=20.0, silent=True):
    #### TRAIN SCORING SYSTEM USING SLIM ####
    # setup SLIM coefficient set
    coef_constraints = slim.SLIMCoefficientConstraints(variable_names = X_names, ub = 5, lb = -5)
    #choose upper and lower bounds for the intercept coefficient
    #to ensure that there will be no regularization due to the intercept, choose
    #
    #intercept_ub < min_i(min_score_i)
    #intercept_lb > max_i(max_score_i)
    #
    #where min_score_i = min((Y*X) * \rho) for rho in \Lset
    #where max_score_i = max((Y*X) * \rho) for rho in \Lset
    #
    #setting intercept_ub and intercept_lb in this way ensures that we can always
    # classify every point as positive and negative
    scores_at_ub = (Y * X) * coef_constraints.ub
    scores_at_lb = (Y * X) * coef_constraints.lb
    non_intercept_ind = np.array([n != '(Intercept)' for n in X_names])
    scores_at_ub = scores_at_ub[:, non_intercept_ind]
    scores_at_lb = scores_at_lb[:, non_intercept_ind]
    max_scores = np.fmax(scores_at_ub, scores_at_lb)
    min_scores = np.fmin(scores_at_ub, scores_at_lb)
    max_scores = np.sum(max_scores, 1)
    min_scores = np.sum(min_scores, 1)

    intercept_ub = -min(min_scores) + 1
    intercept_lb = -max(max_scores) + 1
    coef_constraints.set_field('ub', '(Intercept)', intercept_ub)
    coef_constraints.set_field('lb', '(Intercept)', intercept_lb)
#     coef_constraints.view()
    
    #create SLIM IP
    if not params:
        slim_input = {
            'X': X,
            'X_names': X_names,
            'Y': Y,
            'C_0': 0.001,
            'w_pos': 1.0,
            'w_neg': 1.0,
            'L0_min': 0,
            'L0_max': float('inf'),
            'err_min': 0,
            'err_max': 1.0,
            'pos_err_min': 0,
            'pos_err_max': 1.0,
            'neg_err_min': 0,
            'neg_err_max': 1.0,
            'coef_constraints': coef_constraints
        }
    else:
        slim_input = params
        
    

    slim_IP, slim_info = slim.create_slim_IP(slim_input)
    
    if silent:
        slim_IP.set_log_stream(None)
        slim_IP.set_error_stream(None)
        slim_IP.set_warning_stream(None)
        slim_IP.set_results_stream(None)
    
    # setup SLIM IP parameters
    # see docs/usrccplex.pdf for more about these parameters
    slim_IP.parameters.timelimit.set(timelimit) #set runtime here
    #TODO: add these default settings to create_slim_IP
    slim_IP.parameters.randomseed.set(0)
    slim_IP.parameters.threads.set(1)
    slim_IP.parameters.parallel.set(1)
    slim_IP.parameters.output.clonelog.set(0)
    slim_IP.parameters.mip.tolerances.mipgap.set(np.finfo(np.float).eps)
    slim_IP.parameters.mip.tolerances.absmipgap.set(np.finfo(np.float).eps)
    slim_IP.parameters.mip.tolerances.integrality.set(np.finfo(np.float).eps)
    slim_IP.parameters.emphasis.mip.set(1)


    # solve SLIM IP
    slim_IP.solve()

    # run quick and dirty tests to make sure that IP output is correct
    slim.check_slim_IP_output(slim_IP, slim_info, X, Y, coef_constraints)
    
    return slim_IP, slim_info
        
    
def train_ebm(X, y, params={}):
    
    if not params:
        n_estimators = 100
        learning_rate = 1.0
        random_state=0
    else:
        n_estimators=params['n_estimators']
        learning_rate = params['learning_rate']
        random_state = params['random_state']
        
    ebm = ExplainableBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state).fit(X, y)
    return ebm

def train_shap(X, y, params={}):

    if not params:
        gamma = 0.5
        n_estimators = 600
        objective = 'binary:logistic'
        silent=True
        nthread=1
    else:
        gamma = params['gamma']
        n_estimators = params['n_estimators']
        objective = params['objective']
        silent = params['silent']
        nthread = params['nthread']
        
    model = XGBClassifier(max_depth=4,
        gamma=gamma,
        n_estimators=n_estimators,
        objective=objective,
        silent=silent,
        nthread=nthread)
    model.fit(X, y)
    return model

def train_logit(X, y, params={}):
    if not params:
        penalty='l1'
        solver = 'liblinear'
        C = 0.01
    else:
        penalty = params['penalty']
        solver = params['solver']
        C = params['C']
    logit = LogisticRegression(penalty=penalty, solver=solver, C=C).fit(X, y)
    return logit

def run_all(X_train, y_train, X_test, y_test, params=None, timelimit=20):
    ## runs all three models in consecutive order
    
    ## SLIM needs specially prepared data
    X_train_slim, X_test_slim, y_train_slim, y_test_slim, X_names, y_name = prep_data(X_train, y_train, X_test, y_test)
    print('-----------------------------------------')
    print("Running Supersparse Linear Integer Model")
    print("Max runtime: ", timelimit, 'seconds')
    slim_model, slim_info = train_slim(X_train_slim, X_names, y_train_slim, y_name, timelimit=timelimit)
    slim_results = slim.get_slim_summary(slim_model, slim_info, X_train_slim, y_train_slim)
    slim_results_test = slim.get_slim_summary(slim_model, slim_info, X_test_slim, y_test_slim)
    # print(slim_results['string_model'])
    print('train accuracy: ', (1-slim_results['error_rate']))
    print('test accuracy: ',  (1-slim_results_test['error_rate']))
    slim_object = (slim_info, slim_results)
       
    print('-----------------------------------------')
    print("Running Explainable Boosting Machine")
    ebm = train_ebm(X_train, y_train)
    print('train accuracy: ', ebm.score(X_train, y_train))
    print('test accuracy: ', ebm.score(X_test, y_test))
    
    print('-----------------------------------------')
    print("Running SHAP explained XGBoost")
    shap = train_shap(X_train, y_train)
    print('train accuracy: ', accuracy_score(y_train, shap.predict(X_train)))
    print('test accuracy: ', accuracy_score(y_test, shap.predict(X_test)))
    
    print('-----------------------------------------')
    print("Running Logit")
    logit = train_logit(X_train, y_train, params[0])
    print('train accuracy: ', logit.score(X_train, y_train))
    print('test accuracy: ', logit.score(X_test, y_test))
    
    return (slim_object, ebm, shap, logit)

Use this cell to change the working directory to access the data

In [5]:
cd ..

C:\Users\danie\Documents\StageDaniel


In [6]:
cd StageDaniel

[WinError 2] Het systeem kan het opgegeven bestand niet vinden: 'StageDaniel'
C:\Users\danie\Documents\StageDaniel


In [7]:
# Should be C:\Users\danie\Documents\StageDaniel
os.getcwd()

'C:\\Users\\danie\\Documents\\StageDaniel'

In [47]:
# datasets
# 'mammo' or 'adult' for binary features
# 'breastcancer' for discrete features
# 'spambase' for continuous features

dataset = 'adult'
dataframe = pd.read_csv(os.getcwd() + '/data/'+ dataset + '_processed.csv', sep = ',')
adult_origin = pd.read_csv(os.getcwd() + '/data/'+ dataset + '.csv', sep = ',', names=list(dataframe.columns.values))
dataframe.head()

Unnamed: 0,Over50K,Age_leq_21,Age_22_to_29,Age_30_to_44,Age_45_to_59,Age_geq_60,EmployedInPrivateSector,EmployedInPublicSector,SelfEmployedNotInc,SelfEmployedInc,HSDiploma,ProfVocOrAS,Bachelors,Graduate,NoHS,JobManagerial,JobAdministrative,JobArmedForces,JobService,JobSkilledSpecialty,JobAgriculture,Married,DivorcedOrSeparated,Widowed,NeverMarried,WorkHrsPerWeek_lt_40,WorkHrsPerWeek_40_to_50,WorkHrsPerWeek_geq_50,Male,Female,White,Black,OtherRace,NativeUSorCanada,NativeImmigrant,AnyCapitalGains,AnyCapitalLoss
0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,1,0
1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0
2,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0
3,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0
4,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0


In [48]:
dataframe['Age_leq_21']=1-dataframe['Age_leq_21']


In [49]:
dataframe['Age'] = adult_origin['Over50K']/adult_origin['Over50K'].max()
cols = [c for c in dataframe.columns if c.lower()[:4] == 'age_']
dataframe = dataframe.drop(columns=cols)

dataframe.head()

Unnamed: 0,Over50K,EmployedInPrivateSector,EmployedInPublicSector,SelfEmployedNotInc,SelfEmployedInc,HSDiploma,ProfVocOrAS,Bachelors,Graduate,NoHS,JobManagerial,JobAdministrative,JobArmedForces,JobService,JobSkilledSpecialty,JobAgriculture,Married,DivorcedOrSeparated,Widowed,NeverMarried,WorkHrsPerWeek_lt_40,WorkHrsPerWeek_40_to_50,WorkHrsPerWeek_geq_50,Male,Female,White,Black,OtherRace,NativeUSorCanada,NativeImmigrant,AnyCapitalGains,AnyCapitalLoss,Age
0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,1,0,0.433333
1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0.555556
2,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0.422222
3,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0.588889
4,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0.311111


In [50]:
dataframe.to_csv(os.getcwd() +'/data/adult_new_processed.csv', index=False)