# Welcome to the Shapley Value Notebook!
In this notebook, we begin to examine Shapley Values and algorithms to approximate expected contributions for subgroup-hypothesis pairs in order to find an optimal ordering.

A few imports that will be helpful

In [1]:
import numpy as np
import pandas as pd
import copy
import sklearn as sk
import torch
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression

import model
import verifier
import cscUpdater
import updater

import importlib as imp

import itertools
import time
import dill as pickle

from hummingbird.ml import convert

import warnings
warnings.filterwarnings('ignore')

import acsDataParallel



Import data set

In [2]:
test_size = 0.2 #train-test split

acs_task = 'income' # options: employment, income, public_coverage, mobility, and travel_time.
acs_year = 2018 #must be >= 2014. Upper bound unknown.
acs_states = ['CA']
acs_horizon='1-Year' #1-Year or 5-Year
acs_survey='person' #'person' or 'household'

# for subsampling rows: can specify first and last of data to be pulled. currently pulling everything.
row_start = 0
row_end = 30000

# for subsampling columns. note: can only subsample consecutive columns with current implementation
col_start=0
col_end=-1

[train_x, train_y, test_x, test_y, demo_group_functions, demo_group_indicators, min_age, mid_age] = acsDataParallel.get_data(test_size, acs_task, acs_year, acs_states,acs_horizon=acs_horizon, acs_survey=acs_survey, row_start = row_start,row_end = row_end, col_start=col_start, col_end=col_end)

Define 10 subgroup-hypothesis pairs that are accepted when introduced in increasing order

In [69]:
def g1(X):
    return ((X['WKHP'] == 40))

truth_series = g1(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]

clf1 = sk.ensemble.RandomForestClassifier(n_estimators=100, max_depth=11)
clf1.fit(X_train,y_train)
clf1GPU = convert(clf1, 'pytorch')
clf1GPU.to('cuda')

def h1(x):
    return clf1GPU.predict(x)

def g2(X):
    return ((X['WKHP'] <= 20))

truth_series = g2(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]

clf2 = sk.ensemble.RandomForestClassifier(n_estimators=100, max_depth=11)
clf2.fit(X_train,y_train)
clf2GPU = convert(clf2, 'pytorch')
clf2GPU.to('cuda')
def h2(x):
    return clf2GPU.predict(x)

def g3(X):
    return ((X['RAC1P'] >= 3))

truth_series = g3(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]

clf3 = sk.ensemble.RandomForestClassifier(n_estimators=200, max_depth=13)
clf3.fit(X_train,y_train)
clf3GPU = convert(clf3, 'pytorch')
clf3GPU.to('cuda')
def h3(x):
    return clf3GPU.predict(x)

def g4(X):
    return ((X['RAC1P'] == 1))

truth_series = g4(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]

clf4 = sk.ensemble.RandomForestClassifier(n_estimators=200, max_depth=15)
clf4.fit(X_train,y_train)
clf4GPU = convert(clf4, 'pytorch')
clf4GPU.to('cuda')
def h4(x):
    return clf4GPU.predict(x)

def g5(X):
    return (X['SCHL'] <= 12)

truth_series = g5(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]
clf5 = sk.ensemble.RandomForestClassifier(n_estimators=100, max_depth=14)
clf5.fit(X_train,y_train)
clf5GPU = convert(clf5, 'pytorch')
clf5GPU.to('cuda')
def h5(x):
    return clf5GPU.predict(x)

def g6(X):
    return (X['SCHL'] >= 16)

truth_series = g6(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]

clf6 = sk.ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=200,max_depth = 3,random_state=0)
clf6.fit(X_train,y_train)
clf6GPU = convert(clf6, 'pytorch')
clf6GPU.to('cuda')
def h6(x):
    return clf6GPU.predict(x)

def g7(X):
    return (X['AGEP'] <=30)

truth_series = g7(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]
clf7 = ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100,max_depth = 4,random_state=0)
clf7.fit(X_train,y_train)
clf7GPU = convert(clf7, 'pytorch')
clf7GPU.to('cuda')
def h7(x):
    return clf7GPU.predict(x)

def g8(X):
    return (X['COW'] == 1)

truth_series = g8(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]

clf8 = ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=500,max_depth = 3,random_state=0)
clf8.fit(X_train,y_train)
clf8GPU = convert(clf8, 'pytorch')
clf8GPU.to('cuda')

def h8(x):
    return clf8GPU.predict(x)

def g9(X):
    return ((X['POBP'] <= 20) & (X['SEX'] == 2))

truth_series = g9(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]

clf9 = sk.ensemble.RandomForestClassifier(n_estimators=200, max_depth=15)
clf9.fit(X_train,y_train)
clf9GPU = convert(clf9, 'pytorch')
clf9GPU.to('cuda')
def h9(x):
    return clf9GPU.predict(x)

def g10(X):
    return ((X['OCCP'] <= 100))

truth_series = g10(train_x)
X_train = train_x[truth_series]
y_train = train_y[truth_series]

clf10 = ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=30,max_depth = 4,random_state=0)
clf10.fit(X_train,y_train)
clf10GPU = convert(clf10, 'pytorch')
clf10GPU.to('cuda')
def h10(x):
    return clf10GPU.predict(x)

In [24]:
def verify_size(x, group):
# helper function that checks that the discovered group isn't too small to run on
    g_indices = group(x) == 1
    g_xs = x[g_indices]
    if len(g_xs) == 0:
        return False
    else:
        return True

In [70]:
initial_model = DecisionTreeClassifier(max_depth = 1, random_state=0)
initial_model.fit(train_x, train_y);

mod = model.PointerDecisionList(initial_model.predict, [])
mod.test_errors.append(cscUpdater.measure_group_errors(mod, test_x, test_y))
mod.train_errors.append(cscUpdater.measure_group_errors(mod, train_x, train_y))

In [293]:
def fractional_shapley_value(mod,permutation, group_list, predicate_list):
    
    contribution_array = np.zeros(len(permutation))
    
    file = open('pdl.pkl','wb')
    pickle.dump(mod,file)
    
    for value in permutation:
        improvement_check = verifier.is_proposed_group_good_csc(mod, test_x, test_y, predicate_list[value],group_list[value])
        if improvement_check:
            # run the update
            cscUpdater.iterative_update(mod, predicate_list[value], group_list[value], train_x, train_y, test_x, test_y, 'g'+str(value))
            
    permutation_error = sk.metrics.zero_one_loss(test_y,np.array(mod.predict(test_x),dtype=bool))
    

    for value in permutation:
        #load file
        file = open('pdl.pkl','rb')
        mod = pickle.load(file)
        #final error of permuation
        for sub_value in permutation[permutation.index(value)+1:]:
            improvement_check = verifier.is_proposed_group_good_csc(mod, test_x, test_y, predicate_list[sub_value],group_list[sub_value])
            if improvement_check:
            # run the update
                cscUpdater.iterative_update(mod, predicate_list[sub_value], group_list[sub_value], train_x, train_y, test_x, test_y, 'g'+str(sub_value))
        error_without_group = sk.metrics.zero_one_loss(test_y,np.array(mod.predict(test_x),dtype=bool))
        
        contribution_array[value] += (error_without_group - permutation_error)
        
        #reload file
        file = open('pdl.pkl','rb')
        mod = pickle.load(file)
        
        improvement_check = verifier.is_proposed_group_good_csc(mod, test_x, test_y, predicate_list[value],group_list[value])
        if improvement_check:
            # run the update
            cscUpdater.iterative_update(mod, predicate_list[value], group_list[value], train_x, train_y, test_x, test_y, 'g'+str(value))
        
        file = open('pdl.pkl','wb')
        pickle.dump(mod,file)
    return contribution_array
        

In [294]:
group_list = [g1,g2,g3,g4,g5,g6,g7,g8,g9,g10]
predicate_list = [h1,h2,h3,h4,h5,h6,h7,h8,h9,h10]

In [295]:
initial_model = DecisionTreeClassifier(max_depth = 1, random_state=0)
initial_model.fit(train_x, train_y);

mod = model.PointerDecisionList(initial_model.predict, [])
mod.test_errors.append(cscUpdater.measure_group_errors(mod, test_x, test_y))
mod.train_errors.append(cscUpdater.measure_group_errors(mod, train_x, train_y))
contribution = fractional_shapley_value(mod,[0,1,2,3,4,5,7,6], group_list, predicate_list)

In [297]:
contribution

array([-0.00016667,  0.002     ,  0.00033333,  0.00016667,  0.0005    ,
        0.00266667,  0.        ,  0.00116667])

In [291]:
mod = model.PointerDecisionList(initial_model.predict, [])
mod.test_errors.append(cscUpdater.measure_group_errors(mod, test_x, test_y))
mod.train_errors.append(cscUpdater.measure_group_errors(mod, train_x, train_y))

for value in [0,1,2,3,4,5,7,6]:
    improvement_check = verifier.is_proposed_group_good_csc(mod, test_x, test_y, predicate_list[value],group_list[value])
    if improvement_check:
        # run the update
        cscUpdater.iterative_update(mod, predicate_list[value], group_list[value], train_x, train_y, test_x, test_y, 'g'+str(value))
print(f' Error incluiding g1: {mod.test_errors[-1][0]}')

mod = model.PointerDecisionList(initial_model.predict, [])
mod.test_errors.append(cscUpdater.measure_group_errors(mod, test_x, test_y))
mod.train_errors.append(cscUpdater.measure_group_errors(mod, train_x, train_y))

for value in [1,2,3,4,5,7,6]:
    improvement_check = verifier.is_proposed_group_good_csc(mod, test_x, test_y, predicate_list[value],group_list[value])
    if improvement_check:
        # run the update
        cscUpdater.iterative_update(mod, predicate_list[value], group_list[value], train_x, train_y, test_x, test_y, 'g'+str(value))
print(f' Error not incluiding g1: {mod.test_errors[-1][0]}')     
print('Incluiding g1 in this permutation hurts the overall accuracy!')

 Error incluiding g1: 0.15533333333333332
 Error not incluiding g1: 0.15516666666666667


Now, we define a permutation with the Shapley Values in increasing order, but we force negative values to be at the end of the permutation since we know them to hurt more than they help. We want to add those groups which may only contribute a small amount on average first so they are not forgotten in later rounds.

In [322]:
def get_shapley_ordering(contribution):
#     for index in range(len(contribution)):
#         if contribution[index] < 0:
#             contribution[index] = 1
    return np.flip(np.argsort(contribution))

In [312]:
def output_pdl_error(ordering):
    mod = model.PointerDecisionList(initial_model.predict, [])
    mod.test_errors.append(cscUpdater.measure_group_errors(mod, test_x, test_y))
    mod.train_errors.append(cscUpdater.measure_group_errors(mod, train_x, train_y))
    for value in ordering:
        improvement_check = verifier.is_proposed_group_good_csc(mod, test_x, test_y, predicate_list[value],group_list[value])
        if improvement_check:
            # run the update
            cscUpdater.iterative_update(mod, predicate_list[value], group_list[value], train_x, train_y, test_x, test_y, 'g'+str(value))
    print(f' Shapley Ordering Error: {mod.test_errors[-1][0]}')      

We now attempt to take the average Shapley Value over many permutations to see of there is correlation with this metric. Using 8 groups, we will take permutations and output the ordering 

In [None]:
#define groups used for permutations
groups=[0,1,2,3,4,5,6,7]

#running shapley value contribution totals
shapley_array = np.zeros(len(groups))

#all possible permutations
permutations_list = list(itertools.permutations(groups))

#define which permutations we will take
selection = np.random.choice(len(permutations_list), size=10000, replace=False, p=None)

mod = model.PointerDecisionList(initial_model.predict, [])
mod.test_errors.append(cscUpdater.measure_group_errors(mod, test_x, test_y))
mod.train_errors.append(cscUpdater.measure_group_errors(mod, train_x, train_y))

#save base PDL
file = open('basepdl.pkl','wb')
pickle.dump(mod,file)
   
#intialize df to record shapley values
shapley_df = pd.DataFrame(columns= groups)

#initialize counter
counter = 1

for index in selection:
    
    #reload base PDL  
    file = open('basepdl.pkl','rb')
    mod = pickle.load(file)
    
    #define ordering
    permutation = permutations_list[index]
    
    #get fractional contributions
    fractional_contribution = fractional_shapley_value(mod,permutation, group_list, predicate_list)

    #add fractional contribution
    shapley_array += fractional_contribution
    
    if counter%10 == 0:
        #prints the error of the PDL using Shapley Values to determine ordering
        output_pdl_error(get_shapley_ordering(shapley_array))
        
    shapley_df.loc[counter] = fractional_contribution
    counter += 1
    print(counter)

2
3
4
5
6
7
8
9
10
 Shapley Ordering Error: 0.15549999999999997
11
12
13
14
15
16
17
18
19
20
 Shapley Ordering Error: 0.15549999999999997
21
22
23
24
25
26
27
28
29
30
 Shapley Ordering Error: 0.15549999999999997
31
32
33
34
35
36
37
38
39
40
 Shapley Ordering Error: 0.15549999999999997
41
42
43
44
45
46
47
48
49
50
 Shapley Ordering Error: 0.15549999999999997
51
52
53
54
55
56
57
58
59
60
 Shapley Ordering Error: 0.15549999999999997
61
62
63
64
65
66
67
68
69
70
 Shapley Ordering Error: 0.15549999999999997
71
72
73
74
75
76
77
78
79
80
 Shapley Ordering Error: 0.15549999999999997
81
82
83
84
85
86
87
88
89
90
 Shapley Ordering Error: 0.15549999999999997
91
92
93
94
95
96
97
98
99
100
 Shapley Ordering Error: 0.15549999999999997
101
102
103
104
105
106
107
108
109
110
 Shapley Ordering Error: 0.15549999999999997
111
112
113
114
115
116
117
118
119
120
 Shapley Ordering Error: 0.15549999999999997
121
122
123
124
125
126
127
128
129
130
 Shapley Ordering Error: 0.15549999999999997
131
1

In [323]:
output_pdl_error(get_shapley_ordering(shapley_array))

 Shapley Ordering Error: 0.15533333333333332


In [326]:
shapley_df

Unnamed: 0,0,1,2,3,4,5,6,7
1,0.0025,0.0,0.0,0.0,0.001833,0.003,0.000833,0.001833
2,0.000667,0.0,-0.000833,0.0,0.000667,0.001833,0.0,0.001
3,0.0,0.0,0.0,-0.002667,0.0005,0.0005,0.0,0.000333
4,0.0,0.0,-0.000333,-0.001667,0.000167,0.001833,0.000333,0.0
5,0.0,0.0,0.0,0.0,0.000667,0.002333,0.001,0.002167
6,0.000667,0.000333,0.0,0.0,0.000667,0.003333,0.000833,0.0015
7,0.0,0.0,0.0,0.0005,0.0005,0.0035,0.0025,0.002
8,-0.001333,-0.001167,-0.001167,-0.003667,0.000333,0.0,0.000167,-0.000833
9,0.0,0.0,0.000167,-0.001,0.000167,0.002,0.000333,0.0
10,0.0,0.0,0.0,0.0,0.000667,0.002167,0.001,0.002167


In [7]:
selection = np.random.choice(len(permutations), size=100, replace=False, p=None)

# Greedy Approach

Instead of randomly choosing permutations and computing Shapley Values for arbitrary places, could we greedily build our Pointer Decision List by the best current update? I.e., if given G, compute an expected Shapley Value given that the update has to be the first update. Compute over all groups and then choose the best from that way. Makes sense that if a group makes a big impact, you probably want to pick it then vice later.

In [27]:
def build_pdl(initial_mod, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y):
    for index in permutation:
        improvement_check = verifier.is_proposed_group_good_csc(initial_mod, test_x, test_y, predicate_list[index], group_list[index])
        if improvement_check:
            cscUpdater.iterative_update(initial_mod, predicate_list[index], group_list[index], train_x, train_y, test_x, test_y, 'g'+str(index))
    return initial_mod.test_errors[-1][0]

In [58]:
#number of permutations to use to estimate Shapley Values
estimation_instances = 10


#dummies
g0 = None
h0 = None

#define initial group list
# group_list = [g0,g1,g2,g3,g4,g5,g6,g7,g8,g9,g10]
# predicate_list = [h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10]

group_list = [g0,g1,g2,g3,g4,g5,g6,g7]
predicate_list = [h0,h1,h2,h3,h4,h5,h6,h7]

#initialize model
current_mod = model.PointerDecisionList(initial_model.predict, [])
current_mod.test_errors.append(cscUpdater.measure_group_errors(current_mod, test_x, test_y))
current_mod.train_errors.append(cscUpdater.measure_group_errors(current_mod, train_x, train_y))
#store
file = open('pdl.pkl','wb')
pickle.dump(current_mod,file)

#use intermediaries
remaining_group_indices = list(range(1,len(group_list)))

#define current shapley contributions relative to position
shapley_by_position = np.zeros(len(remaining_group_indices))

#working group list
position = 0
for group_index in remaining_group_indices:
    print(group_index)
    working_copy = copy.copy(remaining_group_indices)
    
    working_copy.pop(position)
    
    print(working_copy)
    
    possible_orderings = list(itertools.permutations(working_copy))
    
    if estimation_instances > len(possible_orderings):
        estimation_counter = len(possible_orderings)
    else:
        estimation_counter = estimation_instances
    
    subset_indices = np.random.choice(len(possible_orderings), size=estimation_counter, replace=False, p=None) 
    random_permutations = []
    for selection in subset_indices:
        random_permutations.append(possible_orderings[selection])
    for permutation in random_permutations:
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        error_without_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_without_group)
        
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        improvement_check = verifier.is_proposed_group_good_csc(current_model, test_x, test_y, predicate_list[group_index], group_list[group_index])
        if improvement_check:
            cscUpdater.iterative_update(current_model, predicate_list[group_index], group_list[group_index], train_x, train_y, test_x, test_y, 'g'+str(group_index))
        error_with_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_with_group)
        print(error_without_group)
        shapley_contribution = error_without_group - error_with_group
        
        shapley_by_position[group_index-1] += shapley_contribution
    position += 1                                     
best = np.argmax(shapley_by_position)
print(best)

1
[2, 3, 4, 5, 6, 7]
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21130000000000004
0.21130000000000004
0.21130000000000004
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
2
[1, 3, 4, 5, 6, 7]
0.21245000000000003
0.21109999999999995
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21075
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000

In [62]:
shapley_by_position

array([ 0.00000000e+00,  6.70000000e-03,  1.15000000e-03, -3.33066907e-16,
        0.00000000e+00, -5.00000000e-05,  1.70000000e-03])

In [63]:
#number of permutations to use to estimate Shapley Values
estimation_instances = 10


#dummies
g0 = None
h0 = None

#define initial group list
# group_list = [g0,g1,g2,g3,g4,g5,g6,g7,g8,g9,g10]
# predicate_list = [h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10]

group_list = [g0,g1,g3,g4,g5,g6,g7]
predicate_list = [h0,h1,h3,h4,h5,h6,h7]

#initialize model
current_mod = model.PointerDecisionList(initial_model.predict, [])
current_mod.test_errors.append(cscUpdater.measure_group_errors(current_mod, test_x, test_y))
current_mod.train_errors.append(cscUpdater.measure_group_errors(current_mod, train_x, train_y))
cscUpdater.iterative_update(current_model, h2, g2, train_x, train_y, test_x, test_y, 'g2')
#store
file = open('pdl.pkl','wb')
pickle.dump(current_mod,file)

#use intermediaries
remaining_group_indices = list(range(1,len(group_list)))

#define current shapley contributions relative to position
shapley_by_position = np.zeros(len(remaining_group_indices))

#working group list
position = 0
for group_index in remaining_group_indices:
    print(group_index)
    working_copy = copy.copy(remaining_group_indices)
    
    working_copy.pop(position)
    
    print(working_copy)
    
    possible_orderings = list(itertools.permutations(working_copy))
    
    if estimation_instances > len(possible_orderings):
        estimation_counter = len(possible_orderings)
    else:
        estimation_counter = estimation_instances
    
    subset_indices = np.random.choice(len(possible_orderings), size=estimation_counter, replace=False, p=None) 
    random_permutations = []
    for selection in subset_indices:
        random_permutations.append(possible_orderings[selection])
    for permutation in random_permutations:
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        error_without_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_without_group)
        
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        improvement_check = verifier.is_proposed_group_good_csc(current_model, test_x, test_y, predicate_list[group_index], group_list[group_index])
        if improvement_check:
            cscUpdater.iterative_update(current_model, predicate_list[group_index], group_list[group_index], train_x, train_y, test_x, test_y, 'g'+str(group_index))
        error_with_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_with_group)
        print(error_without_group)
        shapley_contribution = error_without_group - error_with_group
        
        shapley_by_position[group_index-1] += shapley_contribution
    position += 1                                     
best = np.argmax(shapley_by_position)
print(best)

1
[2, 3, 4, 5, 6]
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
0.21245000000000003
2
[1, 3, 4, 5, 6]
0.21275
0.21245000000000003
0.21275
0.21275
0.21245000000000003
0.21275
0.21275
0.21245000000000003
0.21275
0.21275
0.21245000000000003
0.21275
0.21275
0.21245000000000003
0.21275
0.21275
0.21245000000000003
0.21275
0.21275
0.21245000000000003
0.21275
0.21275
0.21245000000000003
0.21275
0.21275
0.21245000000000003
0.21275
0.21275
0.21245000000000003
0.21275
3
[1

In [64]:
shapley_by_position

array([0.   , 0.003, 0.004, 0.   , 0.005, 0.   ])

In [66]:
#number of permutations to use to estimate Shapley Values
estimation_instances = 10


#dummies
g0 = None
h0 = None

#define initial group list
# group_list = [g0,g1,g2,g3,g4,g5,g6,g7,g8,g9,g10]
# predicate_list = [h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10]

group_list = [g0,g1,g3,g4,g5,g7]
predicate_list = [h0,h1,h3,h4,h5,h7]

#initialize model
current_mod = model.PointerDecisionList(initial_model.predict, [])
current_mod.test_errors.append(cscUpdater.measure_group_errors(current_mod, test_x, test_y))
current_mod.train_errors.append(cscUpdater.measure_group_errors(current_mod, train_x, train_y))
cscUpdater.iterative_update(current_model, h2, g2, train_x, train_y, test_x, test_y, 'g2')
cscUpdater.iterative_update(current_model, h6, g6, train_x, train_y, test_x, test_y, 'g6')

#store
file = open('pdl.pkl','wb')
pickle.dump(current_mod,file)

#use intermediaries
remaining_group_indices = list(range(1,len(group_list)))

#define current shapley contributions relative to position
shapley_by_position = np.zeros(len(remaining_group_indices))

#working group list
position = 0
for group_index in remaining_group_indices:
    print(group_index)
    working_copy = copy.copy(remaining_group_indices)
    
    working_copy.pop(position)
    
    print(working_copy)
    
    possible_orderings = list(itertools.permutations(working_copy))
    
    if estimation_instances > len(possible_orderings):
        estimation_counter = len(possible_orderings)
    else:
        estimation_counter = estimation_instances
    
    subset_indices = np.random.choice(len(possible_orderings), size=estimation_counter, replace=False, p=None) 
    random_permutations = []
    for selection in subset_indices:
        random_permutations.append(possible_orderings[selection])
    for permutation in random_permutations:
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        error_without_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_without_group)
        
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        improvement_check = verifier.is_proposed_group_good_csc(current_model, test_x, test_y, predicate_list[group_index], group_list[group_index])
        if improvement_check:
            cscUpdater.iterative_update(current_model, predicate_list[group_index], group_list[group_index], train_x, train_y, test_x, test_y, 'g'+str(group_index))
        error_with_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_with_group)
        print(error_without_group)
        shapley_contribution = error_without_group - error_with_group
        
        shapley_by_position[group_index-1] += shapley_contribution
    position += 1                                     
best = np.argmax(shapley_by_position)
print(best)

1
[2, 3, 4, 5]
0.21484999999999999
0.21294999999999997
0.21484999999999999
0.21484999999999999
0.21294999999999997
0.21484999999999999
0.21484999999999999
0.21294999999999997
0.21484999999999999
0.21484999999999999
0.21294999999999997
0.21484999999999999
0.21484999999999999
0.21294999999999997
0.21484999999999999
0.21484999999999999
0.21294999999999997
0.21484999999999999
0.21484999999999999
0.21294999999999997
0.21484999999999999
0.21484999999999999
0.21294999999999997
0.21484999999999999
0.21484999999999999
0.21294999999999997
0.21484999999999999
0.21484999999999999
0.21294999999999997
0.21484999999999999
2
[1, 3, 4, 5]
0.21630000000000005
0.21294999999999997
0.21630000000000005
0.21630000000000005
0.21294999999999997
0.21630000000000005
0.21630000000000005
0.21294999999999997
0.21630000000000005
0.21630000000000005
0.21294999999999997
0.21630000000000005
0.21630000000000005
0.21294999999999997
0.21630000000000005
0.21630000000000005
0.21294999999999997
0.21630000000000005
0.21630000

In [67]:
shapley_by_position

array([0.019 , 0.0335, 0.1535, 0.    , 0.0085])

In [68]:
#number of permutations to use to estimate Shapley Values
estimation_instances = 10


#dummies
g0 = None
h0 = None

#define initial group list
# group_list = [g0,g1,g2,g3,g4,g5,g6,g7,g8,g9,g10]
# predicate_list = [h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10]

group_list = [g0,g1,g3,g5,g7]
predicate_list = [h0,h1,h3,h5,h7]

#initialize model
current_mod = model.PointerDecisionList(initial_model.predict, [])
current_mod.test_errors.append(cscUpdater.measure_group_errors(current_mod, test_x, test_y))
current_mod.train_errors.append(cscUpdater.measure_group_errors(current_mod, train_x, train_y))
cscUpdater.iterative_update(current_model, h2, g2, train_x, train_y, test_x, test_y, 'g2')
cscUpdater.iterative_update(current_model, h6, g6, train_x, train_y, test_x, test_y, 'g6')
cscUpdater.iterative_update(current_model, h4, g4, train_x, train_y, test_x, test_y, 'g4')

#store
file = open('pdl.pkl','wb')
pickle.dump(current_mod,file)

#use intermediaries
remaining_group_indices = list(range(1,len(group_list)))

#define current shapley contributions relative to position
shapley_by_position = np.zeros(len(remaining_group_indices))

#working group list
position = 0
for group_index in remaining_group_indices:
    print(group_index)
    working_copy = copy.copy(remaining_group_indices)
    
    working_copy.pop(position)
    
    print(working_copy)
    
    possible_orderings = list(itertools.permutations(working_copy))
    
    if estimation_instances > len(possible_orderings):
        estimation_counter = len(possible_orderings)
    else:
        estimation_counter = estimation_instances
    
    subset_indices = np.random.choice(len(possible_orderings), size=estimation_counter, replace=False, p=None) 
    random_permutations = []
    for selection in subset_indices:
        random_permutations.append(possible_orderings[selection])
    for permutation in random_permutations:
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        error_without_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_without_group)
        
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        improvement_check = verifier.is_proposed_group_good_csc(current_model, test_x, test_y, predicate_list[group_index], group_list[group_index])
        if improvement_check:
            cscUpdater.iterative_update(current_model, predicate_list[group_index], group_list[group_index], train_x, train_y, test_x, test_y, 'g'+str(group_index))
        error_with_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_with_group)
        print(error_without_group)
        shapley_contribution = error_without_group - error_with_group
        
        shapley_by_position[group_index-1] += shapley_contribution
    position += 1                                     
best = np.argmax(shapley_by_position)
print(best)

1
[2, 3, 4]
0.248
0.22829999999999995
0.248
0.248
0.22829999999999995
0.248
0.248
0.22829999999999995
0.248
0.248
0.22829999999999995
0.248
0.248
0.22829999999999995
0.248
0.248
0.22829999999999995
0.248
2
[1, 3, 4]
0.23165000000000002
0.22829999999999995
0.23165000000000002
0.23165000000000002
0.22829999999999995
0.23165000000000002
0.23165000000000002
0.22829999999999995
0.23165000000000002
0.23165000000000002
0.22829999999999995
0.23165000000000002
0.23165000000000002
0.22829999999999995
0.23165000000000002
0.23165000000000002
0.22829999999999995
0.23165000000000002
3
[1, 2, 4]
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
0.22829999999999995
4
[1, 2, 3]
0.23860000000000003
0.22829999999999995


In [69]:
shapley_by_position

array([0.1182, 0.0201, 0.    , 0.0618])

In [70]:
#number of permutations to use to estimate Shapley Values
estimation_instances = 10


#dummies
g0 = None
h0 = None

#define initial group list
# group_list = [g0,g1,g2,g3,g4,g5,g6,g7,g8,g9,g10]
# predicate_list = [h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10]

group_list = [g0,g3,g5,g7]
predicate_list = [h0,h3,h5,h7]

#initialize model
current_mod = model.PointerDecisionList(initial_model.predict, [])
current_mod.test_errors.append(cscUpdater.measure_group_errors(current_mod, test_x, test_y))
current_mod.train_errors.append(cscUpdater.measure_group_errors(current_mod, train_x, train_y))
cscUpdater.iterative_update(current_model, h2, g2, train_x, train_y, test_x, test_y, 'g2')
cscUpdater.iterative_update(current_model, h6, g6, train_x, train_y, test_x, test_y, 'g6')
cscUpdater.iterative_update(current_model, h4, g4, train_x, train_y, test_x, test_y, 'g4')
cscUpdater.iterative_update(current_model, h1, g1, train_x, train_y, test_x, test_y, 'g4')
#store
file = open('pdl.pkl','wb')
pickle.dump(current_mod,file)

#use intermediaries
remaining_group_indices = list(range(1,len(group_list)))

#define current shapley contributions relative to position
shapley_by_position = np.zeros(len(remaining_group_indices))

#working group list
position = 0
for group_index in remaining_group_indices:
    print(group_index)
    working_copy = copy.copy(remaining_group_indices)
    
    working_copy.pop(position)
    
    print(working_copy)
    
    possible_orderings = list(itertools.permutations(working_copy))
    
    if estimation_instances > len(possible_orderings):
        estimation_counter = len(possible_orderings)
    else:
        estimation_counter = estimation_instances
    
    subset_indices = np.random.choice(len(possible_orderings), size=estimation_counter, replace=False, p=None) 
    random_permutations = []
    for selection in subset_indices:
        random_permutations.append(possible_orderings[selection])
    for permutation in random_permutations:
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        error_without_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_without_group)
        
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        improvement_check = verifier.is_proposed_group_good_csc(current_model, test_x, test_y, predicate_list[group_index], group_list[group_index])
        if improvement_check:
            cscUpdater.iterative_update(current_model, predicate_list[group_index], group_list[group_index], train_x, train_y, test_x, test_y, 'g'+str(group_index))
        error_with_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_with_group)
        print(error_without_group)
        shapley_contribution = error_without_group - error_with_group
        
        shapley_by_position[group_index-1] += shapley_contribution
    position += 1                                     
best = np.argmax(shapley_by_position)
print(best)

1
[2, 3]
0.25675000000000003
0.248
0.25675000000000003
0.25675000000000003
0.248
0.25675000000000003
2
[1, 3]
0.248
0.248
0.248
0.248
0.248
0.248
3
[1, 2]
0.26249999999999996
0.248
0.26249999999999996
0.26249999999999996
0.248
0.26249999999999996
2


In [71]:
#number of permutations to use to estimate Shapley Values
estimation_instances = 10


#dummies
g0 = None
h0 = None

#define initial group list
# group_list = [g0,g1,g2,g3,g4,g5,g6,g7,g8,g9,g10]
# predicate_list = [h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10]

group_list = [g0,g3,g5]
predicate_list = [h0,h3,h5]

#initialize model
current_mod = model.PointerDecisionList(initial_model.predict, [])
current_mod.test_errors.append(cscUpdater.measure_group_errors(current_mod, test_x, test_y))
current_mod.train_errors.append(cscUpdater.measure_group_errors(current_mod, train_x, train_y))
cscUpdater.iterative_update(current_model, h2, g2, train_x, train_y, test_x, test_y, 'g2')
cscUpdater.iterative_update(current_model, h6, g6, train_x, train_y, test_x, test_y, 'g6')
cscUpdater.iterative_update(current_model, h4, g4, train_x, train_y, test_x, test_y, 'g4')
cscUpdater.iterative_update(current_model, h1, g1, train_x, train_y, test_x, test_y, 'g1')
cscUpdater.iterative_update(current_model, h7, g7, train_x, train_y, test_x, test_y, 'g7')

#store
file = open('pdl.pkl','wb')
pickle.dump(current_mod,file)

#use intermediaries
remaining_group_indices = list(range(1,len(group_list)))

#define current shapley contributions relative to position
shapley_by_position = np.zeros(len(remaining_group_indices))

#working group list
position = 0
for group_index in remaining_group_indices:
    print(group_index)
    working_copy = copy.copy(remaining_group_indices)
    
    working_copy.pop(position)
    
    print(working_copy)
    
    possible_orderings = list(itertools.permutations(working_copy))
    
    if estimation_instances > len(possible_orderings):
        estimation_counter = len(possible_orderings)
    else:
        estimation_counter = estimation_instances
    
    subset_indices = np.random.choice(len(possible_orderings), size=estimation_counter, replace=False, p=None) 
    random_permutations = []
    for selection in subset_indices:
        random_permutations.append(possible_orderings[selection])
    for permutation in random_permutations:
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        error_without_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_without_group)
        
        file = open('pdl.pkl','rb')
        current_model = pickle.load(file)
        improvement_check = verifier.is_proposed_group_good_csc(current_model, test_x, test_y, predicate_list[group_index], group_list[group_index])
        if improvement_check:
            cscUpdater.iterative_update(current_model, predicate_list[group_index], group_list[group_index], train_x, train_y, test_x, test_y, 'g'+str(group_index))
        error_with_group = build_pdl(current_model, group_list, predicate_list, permutation, train_x, train_y, test_x, test_y)
        print(error_with_group)
        print(error_without_group)
        shapley_contribution = error_without_group - error_with_group
        
        shapley_by_position[group_index-1] += shapley_contribution
    position += 1                                     
best = np.argmax(shapley_by_position)
print(best)

1
[2]
0.28085000000000004
0.26249999999999996
0.28085000000000004
2
[1]
0.26249999999999996
0.26249999999999996
0.26249999999999996
0


In [72]:
current_mod = model.PointerDecisionList(initial_model.predict, [])
current_mod.test_errors.append(cscUpdater.measure_group_errors(current_mod, test_x, test_y))
current_mod.train_errors.append(cscUpdater.measure_group_errors(current_mod, train_x, train_y))
cscUpdater.iterative_update(current_model, h2, g2, train_x, train_y, test_x, test_y, 'g2')
cscUpdater.iterative_update(current_model, h6, g6, train_x, train_y, test_x, test_y, 'g6')
cscUpdater.iterative_update(current_model, h4, g4, train_x, train_y, test_x, test_y, 'g4')
cscUpdater.iterative_update(current_model, h1, g1, train_x, train_y, test_x, test_y, 'g1')
cscUpdater.iterative_update(current_model, h7, g7, train_x, train_y, test_x, test_y, 'g7')
cscUpdater.iterative_update(current_model, h3, g3, train_x, train_y, test_x, test_y, 'g3')
cscUpdater.iterative_update(current_model, h5, g5, train_x, train_y, test_x, test_y, 'g5')
current_model.test_errors[-1][0]

0.21245000000000003

In [73]:
current_mod = model.PointerDecisionList(initial_model.predict, [])
current_mod.test_errors.append(cscUpdater.measure_group_errors(current_mod, test_x, test_y))
current_mod.train_errors.append(cscUpdater.measure_group_errors(current_mod, train_x, train_y))
cscUpdater.iterative_update(current_model, h1, g1, train_x, train_y, test_x, test_y, 'g2')
cscUpdater.iterative_update(current_model, h2, g2, train_x, train_y, test_x, test_y, 'g6')
cscUpdater.iterative_update(current_model, h3, g3, train_x, train_y, test_x, test_y, 'g4')
cscUpdater.iterative_update(current_model, h4, g4, train_x, train_y, test_x, test_y, 'g1')
cscUpdater.iterative_update(current_model, h5, g5, train_x, train_y, test_x, test_y, 'g7')
cscUpdater.iterative_update(current_model, h6, g6, train_x, train_y, test_x, test_y, 'g3')
cscUpdater.iterative_update(current_model, h7, g7, train_x, train_y, test_x, test_y, 'g5')
current_model.test_errors[-1][0]

0.21145000000000003

In [77]:
current_mod = model.PointerDecisionList(initial_model.predict, [])
current_mod.test_errors.append(cscUpdater.measure_group_errors(current_mod, test_x, test_y))
current_mod.train_errors.append(cscUpdater.measure_group_errors(current_mod, train_x, train_y))
group_list = [g0,g1,g2,g3,g4,g5,g6,g7]
predicate_list = [h0,h1,h2,h3,h4,h5,h6,h7]
for i in [6,4,1,5,3,7,2]:
    cscUpdater.iterative_update(current_model, predicate_list[i], group_list[i], train_x, train_y, test_x, test_y, 'g5')
current_model.test_errors[-1][0]

0.21145000000000003