In [1]:
# -- Imports -- 
import pandas as pd
import numpy as np
from SVM_model import SVM_model
from Functions import separate_bins_feature

In [2]:
# Load the working data set and pull the raw values out as a NumPy array.
df = pd.read_csv("working_data_full.csv")
vals = df.values
# X takes every column from index 2 onward and y takes column 1;
# column 0 is not used in this cell.
# NOTE(review): presumably X is the feature matrix and y the target - confirm.
X = vals[:,2:]
y = vals[:,1]

no_samples, no_features = X.shape

# NOTE(review): SVM_model is a project class; it is handed the CSV path again
# rather than X / y, so it presumably loads the file itself - confirm.
svm_model = SVM_model(None,"working_data_full.csv")
# NOTE(review): the meaning of the 0.001 argument is defined in SVM_model - verify there.
svm_model.train_model(0.001)
svm_model.test_model()

Training Accuracy: 72.64 %
Test Accuracy: 74.68 %


In [3]:
def divide_data_bins(data, special=[]):
    """Bin every feature column of ``data`` with ``separate_bins_feature``.

    Parameters
    ----------
    data : 2-D array
        One column per feature.
    special : sequence of int, optional
        Column indices for which ``separate_bins_feature`` is called with its
        second argument set to True. The default list is never mutated.

    Returns
    -------
    bins_centred : numpy.ndarray
        One row per feature: the first value returned by
        ``separate_bins_feature`` for that column.
    X_pos_array : numpy.ndarray
        The second value returned by ``separate_bins_feature`` for each
        column, stacked and transposed so that axis 1 indexes features.
    """
    no_feat = data.shape[1]
    bins_centred = []
    X_pos_array = []

    for i in range(no_feat):
        # Bin the column of the array that was passed in (not a notebook
        # global); `i in special` flags the special-case columns.
        column = data[:, i].flatten()
        bins, new_col = separate_bins_feature(column, (i in special))[:2]

        bins_centred.append(bins)
        X_pos_array.append(new_col)

    # Convert to numpy arrays; transpose so features run along axis 1.
    bins_centred = np.array(bins_centred)
    X_pos_array = np.array(X_pos_array).transpose()

    return bins_centred, X_pos_array

In [4]:
# Bin every feature column of X; columns 9 and 10 are passed as the
# "special" columns handled differently by separate_bins_feature.
bins_centred, X_pos_array = divide_data_bins(X,[9,10])

print(bins_centred)
# Index 9 along the first axis of the transposed position array.
# NOTE(review): presumably this is sample 9, not feature column 9 - confirm.
print(X_pos_array[9])

[[  54.   58.   62.   66.   70.   74.   78.   82.   86.   90.]
 [  25.   63.  101.  139.  177.  215.  253.  291.  329.  367.]
 [   1.    4.    7.   10.   13.   16.   19.   22.   25.   28.]
 [  17.   30.   43.   56.   69.   82.   95.  108.  121.  134.]
 [   2.    6.   10.   14.   18.   22.   26.   30.   34.   38.]
 [   0.    1.    2.    3.   -1.   -1.   -1.   -1.   -1.   -1.]
 [   0.    1.    2.   -1.   -1.   -1.   -1.   -1.   -1.   -1.]
 [  71.   75.   79.   83.   87.   91.   95.   99.  103.  107.]
 [  10.   31.   52.   73.   94.  115.  136.  157.  178.  199.]
 [   0.    1.    2.    3.    4.    5.    6.    7.   -1.   -1.]
 [   0.    1.    2.    3.    4.    5.    6.    7.   -1.   -1.]
 [   2.    6.   10.   14.   18.   22.   26.   30.   34.   38.]
 [   0.    1.    2.    3.    4.    5.    6.   -1.   -1.   -1.]
 [   3.   10.   17.   24.   31.   38.   45.   52.   59.   66.]
 [   7.   21.   35.   49.   63.   77.   91.  105.  119.  133.]
 [   0.    1.    2.    3.    4.    5.    6.   -1.   -1.

In [30]:
def evaluate_data_set(data):
    """Compute the mean and standard deviation of each column of ``data``.

    Returns a pair of Python lists ``(avg_list, std_list)`` with one entry
    per column, in column order.
    """
    flat_columns = [data[:, col].flatten() for col in range(data.shape[1])]
    avg_list = [np.mean(values) for values in flat_columns]
    std_list = [np.std(values) for values in flat_columns]
    return avg_list, std_list


def perturb_special(min_val,max_val,avg,std,no_val):
    """Draw ``no_val`` normal samples, clamp them to a range and round them.

    Samples come from ``np.random.normal(avg, std, no_val)``; values below
    ``min_val`` become ``min_val``, values above ``max_val`` become
    ``max_val``, and the result is rounded to zero decimals.
    """
    samples = np.random.normal(avg, std, no_val)
    clamped = np.clip(samples, min_val, max_val)
    return np.round(clamped, 0)
    

def find_anchors(model, data_set, sample, no_val, special_cols=(9, 10), treshold=0.95, iterations=4):
    """Greedily search for a set of features that pins the model's decision.

    Each round tries every not-yet-locked feature: that feature and all locked
    ones are held at the sample's values, every other feature is resampled
    from a normal distribution fitted to ``data_set``, and the fraction of
    perturbed rows whose prediction equals the sample's decision is measured.
    The feature with the highest fraction is locked. The search stops as soon
    as that fraction reaches ``treshold``.

    Parameters
    ----------
    model : object
        Must provide ``run_model(sample)`` and ``run_model_data(rows)``; the
        latter receives an array of shape ``(no_val, n_features)``.
    data_set : 2-D array
        Data used to estimate each feature's mean and standard deviation.
    sample : 1-D array
        The instance being explained.
    no_val : int
        Number of perturbed rows generated per candidate feature.
    special_cols : sequence of int, optional
        Columns resampled with ``perturb_special`` (clamped to 0..7 and
        rounded) instead of a plain normal draw.
    treshold : float, optional
        Precision required to accept the locked set as anchors.
    iterations : int, optional
        Maximum number of features that may be locked.

    Returns
    -------
    numpy.ndarray or None
        A 0/1 mask over the features (1 = part of the anchor), or ``None``
        when the threshold was not reached within ``iterations`` rounds.
    """
    features = sample.shape[0]
    avg_list, std_list = evaluate_data_set(data_set)

    # Identify original result from sample
    initial_percentage = model.run_model(sample)
    decision = np.round(initial_percentage, 0)

    # Create empty mask
    mask = np.zeros(features)

    # Allows tracking the path
    locked = []

    while iterations > 0:
        # Retains best result and the corresponding index. Starting below any
        # possible precision guarantees a real candidate is always selected.
        best_acc = -1.0
        best_col = None

        for test_col in range(features):
            # A locked column is already fixed; re-testing it would only
            # re-evaluate the current anchor set and could lock a duplicate.
            if test_col in locked:
                continue

            new_data = np.empty([features, no_val])

            # Perturb data: fixed columns repeat the sample value, the rest
            # are resampled.
            for ind in range(features):
                if ind == test_col or ind in locked:
                    new_data[ind] = np.repeat(sample[ind], no_val)
                elif ind in special_cols:
                    new_data[ind] = perturb_special(0, 7, avg_list[ind], std_list[ind], no_val)
                else:
                    new_data[ind] = np.random.normal(avg_list[ind], std_list[ind], no_val)

            # Run Model on rows = perturbed samples, columns = features
            pred = model.run_model_data(new_data.transpose())
            acc = np.mean(pred == decision)

            if acc > best_acc:
                best_acc = acc
                best_col = test_col

        # Every feature is already locked: nothing left to try.
        if best_col is None:
            break

        locked.append(best_col)
        mask[best_col] = 1

        if best_acc >= treshold:
            print("Anchors Mask:")
            return mask
        iterations -= 1

    print("!!! No anchors found !!!")
    return None


def perturb_row_feature(model, row, row_idx, feat_idx, current_bins, X_bin_pos, mean_bins, mono_arr, improve):
    """Move one feature of ``row`` by a single bin and return the new row.

    Parameters
    ----------
    model : object
        Must provide ``run_model(row)``; only called when the direction for
        the feature is undetermined (-1).
    row : 1-D array
        Current feature values.
    row_idx, X_bin_pos :
        Kept for interface compatibility; the step is computed from
        ``current_bins`` so repeated moves stay consistent.
    feat_idx : int
        Index of the feature to move.
    current_bins : 1-D array
        Current bin index of every feature in ``row``.
    mean_bins : 2-D array
        ``mean_bins[f][b]`` is the value written for bin ``b`` of feature
        ``f``; trailing ``-1`` entries mark bins that do not exist.
    mono_arr : 1-D array
        Per-feature direction: 1 = move up, 0 = move down, -1 = try both.
    improve : bool
        When the direction is -1: True picks the neighbour with the higher
        model output, False picks the one with the lower output.

    Returns
    -------
    (row, bins) : tuple
        The perturbed row and an updated copy of ``current_bins``. When no
        move is possible the input ``row`` is returned unchanged.
    """
    c_current_bins = np.copy(current_bins)
    direction = mono_arr[feat_idx]
    current_bin = int(c_current_bins[feat_idx])
    feature_bins = mean_bins[feat_idx]
    last_bin = len(feature_bins) - 1

    # Neighbouring bin values, both relative to the CURRENT bin.
    next_value = feature_bins[current_bin + 1] if current_bin != last_bin else None
    prev_value = feature_bins[current_bin - 1] if current_bin != 0 else None

    can_move_up = current_bin != last_bin and next_value != -1
    can_move_down = current_bin != 0

    # An undetermined direction collapses to the only possible one at a boundary.
    if direction == -1:
        if not can_move_down:
            direction = 1
        elif not can_move_up:
            direction = 0

    # Check if in boundary and return the same row
    if direction == 1 and not can_move_up:
        return (row, c_current_bins)
    if direction == 0 and not can_move_down:
        return (row, c_current_bins)

    # Decide direction in special case: evaluate both neighbours.
    if direction == -1:
        row_up = np.copy(row)
        row_down = np.copy(row)
        row_up[feat_idx] = next_value
        row_down[feat_idx] = prev_value
        percent_up = model.run_model(row_up)
        percent_down = model.run_model(row_down)

        # Improving follows the higher output, otherwise the lower one;
        # ties go up when improving and down when not.
        up_scores_higher = percent_up >= percent_down
        move_up = up_scores_higher if improve else not up_scores_higher

        if move_up:
            c_current_bins[feat_idx] += 1
            return (row_up, c_current_bins)
        c_current_bins[feat_idx] -= 1
        return (row_down, c_current_bins)

    p_row = np.copy(row)
    if direction == 1:
        c_current_bins[feat_idx] += 1
        p_row[feat_idx] = next_value
    elif direction == 0:
        c_current_bins[feat_idx] -= 1
        p_row[feat_idx] = prev_value

    return (p_row, c_current_bins)
    
    
def percent_cond(improve, percent):
    """Tell whether the search still has to continue.

    Returns True while ``percent`` is still on the starting side of 0.5:
    at or below 0.5 when improving, strictly above 0.5 otherwise.
    """
    if improve:
        return bool(percent <= 0.5)
    return bool(percent > 0.5)
    
    
def find_MSC(model, data, k_row, row_idx, X_bin_pos, mean_bins):
    """Greedily move features bin by bin until the model's decision flips.

    Starting from ``k_row``, each round perturbs every feature except
    feature 0 by one bin (see ``perturb_row_feature``), keeps the single
    perturbation that pushes the model output furthest towards the other
    side of 0.5, and repeats. The search stops when the decision flips,
    when 5 distinct features have been moved, or when one feature has been
    moved 5 times.

    Parameters
    ----------
    model : object
        Must provide ``run_model(row)``.
    data :
        Unused; kept for interface compatibility.
    k_row : 1-D array
        The instance to explain (it is copied, not modified).
    row_idx : int
        Index of the instance's row in ``X_bin_pos``.
    X_bin_pos : 2-D array
        Bin index of every feature for every sample.
    mean_bins : 2-D array
        Per-feature bin values passed on to ``perturb_row_feature``.

    Returns
    -------
    tuple
        ``(change_vector, row)`` on success, where ``change_vector`` holds
        the per-feature bin displacement and ``row`` the modified instance;
        ``("Decision can't be moved within thresholds:", None)`` otherwise.
    """
    row = np.copy(k_row)
    percent = model.run_model(row)

    # Bookkeeping sized from the instance rather than a fixed feature count.
    no_features = len(row)
    features_moved = np.zeros(no_features)
    times_moved = np.zeros(no_features)

    original_bins = np.copy(X_bin_pos[row_idx])
    current_bins = np.copy(X_bin_pos[row_idx])

    # Decides class to change into
    improve = not (percent >= .5)

    # Hardcodes the constraints for the direction in which to move
    # 1: Move up to improve
    # 0: Move down to improve
    # -1: Needs check
    # NOTE: this table has 23 entries, so rows with more features are not supported.
    monotonicity_arr = np.array([1,1,1,1,1,0,0,1,1,1,1,-1,0,-1,1,0,0,0,0,-1,-1,0,-1])
    if not improve:
        # Moving away from the positive class swaps the fixed directions;
        # undetermined (-1) entries stay as they are.
        flipped = np.copy(monotonicity_arr)
        flipped[monotonicity_arr == 1] = 0
        flipped[monotonicity_arr == 0] = 1
        monotonicity_arr = flipped

    # Avoids moving ExternalScore (feature 0)
    candidate_features = list(range(1, no_features))

    while percent_cond(improve, percent) and (features_moved == 1).sum() < 5 and max(times_moved) < 5:

        print(percent)

        new_percents = []
        pert_rows = []
        new_curr_bins = []

        for i in candidate_features:
            t_row, t_current_bins = perturb_row_feature(model, row, row_idx, i, current_bins, X_bin_pos, mean_bins, monotonicity_arr, improve)
            pert_rows.append(t_row)
            new_curr_bins.append(t_current_bins)
            new_percents.append(model.run_model(t_row))

        new_percents = np.array(new_percents)

        if improve:
            idx = np.argmax(new_percents)
        else:
            idx = np.argmin(new_percents)

        row = pert_rows[idx]
        percent = new_percents[idx]
        current_bins = new_curr_bins[idx]

        # idx is a position in the candidate list; map it back to the real
        # feature index before recording the move.
        moved_feature = candidate_features[idx]
        features_moved[moved_feature] = 1
        times_moved[moved_feature] += 1

    change_vector = np.asarray(current_bins - original_bins, dtype=float)

    if not percent_cond(improve, percent):
        print(percent)
        return change_vector, row
    else:
        return "Decision can't be moved within thresholds:", None

def instance_explanation(model, data, k_row, row_idx, X_bin_pos, mean_bins):
    """Print an explanation report for one instance.

    Prints, in order: the model output for ``k_row``, the first element of
    the ``find_MSC`` result, and the result of ``find_anchors`` run with
    100 perturbed samples. Always returns ``(None, None)``.
    """
    print("Initial %", model.run_model(k_row))

    print("Change vector: ")
    msc_result = find_MSC(model, data, k_row, row_idx, X_bin_pos, mean_bins)
    print(msc_result[0])

    anchors_mask = find_anchors(model, data, k_row, 100)
    print(anchors_mask)

    return (None, None)


In [31]:
# Explain a single instance (row 1 of X).
i = 1
# instance_explanation prints its report itself and returns (None, None),
# so this outer print only adds a trailing "None" line.
print(instance_explanation(svm_model, X, X[i], i, X_pos_array, bins_centred)[0])

Initial % 0.123501554987
Change vector: 
0.123501554987
0.150279500932
0.166155936758
0.180704495323
0.196170247685
Decision can't be moved within thresholds:
Anchors Mask:
[ 1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.]
None


In [33]:
# Explain rows 15..49 of X, one report per row, separated by a divider line.
for i in range(15,50):
    # The report is printed inside instance_explanation; the value printed
    # here is the first element of its (None, None) return value.
    print(instance_explanation(svm_model, X, X[i], i, X_pos_array, bins_centred)[0])
    print("\n==================\n")
    

Initial % 0.175849107052
Change vector: 
0.175849107052
0.195970212179
0.21729203183
0.240104512149
0.260114612279
Decision can't be moved within thresholds:
Anchors Mask:
[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  1.  0.]
None


Initial % 0.0699947278886
Change vector: 
0.0699947278886
0.0789562731643
0.0888950491454
0.0990423810139
0.108444174439
Decision can't be moved within thresholds:
Anchors Mask:
[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.]
None


Initial % 0.377406216875
Change vector: 
0.377406216875
0.407623649434
0.433305170882
0.458184454527
0.480191500694
0.5
0.520979293436
[ 0.  0.  0.  3.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0. -1.  0.]
!!! No anchors found !!!
None
None


Initial % 0.256279216058
Change vector: 
0.256279216058
0.281862517663
0.307075228724
0.33207740297
0.350829877334
0.370295332431
0.390453135181
Decision can't be moved within thr

!!! No anchors found !!!
None
None


Initial % 0.0612622538755
Change vector: 
0.0612622538755
0.0705600008229
0.0795877257997
0.089598255254
0.0982012594367
Decision can't be moved within thresholds:
Anchors Mask:
[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.]
None


Initial % 0.536007306692
Change vector: 
0.536007306692
0.5
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  1.  0.]
!!! No anchors found !!!
None
None


Initial % 0.568240819194
Change vector: 
0.568240819194
0.535890637855
0.5
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  2.  0.]
!!! No anchors found !!!
None
None


Initial % 0.3331905277
Change vector: 
0.3331905277
0.361092306796
0.388004148601
0.408990757145
0.430260108088
0.450255790813
0.470491958196
0.490858041738
0.511260225178
[ 0.  0.  0.  5.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0. -1.
  0.  0.  0. -1.  0.]
Anchors Mask:
[ 1