# humanPolicy.py
from __future__ import print_function, division
import numpy as np
import tensorflow as tf
import gym
from gym.utils import seeding
from gym import spaces
from sandbox.rocky.tf.spaces.discrete import Discrete  # supersedes the gym spaces
from assistive_bandits.envs.utils import softmax


class HumanPolicy:
    """Abstract class for human policies"""

    def __init__(self, env):
        self.env = env
        self.seed()
        self.reset()

    def get_action(self, obs):
        raise NotImplementedError

    def learn(self, old_obs, act, rew, new_obs, done):
        raise NotImplementedError

    def reset(self):
        raise NotImplementedError

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def get_state(self):
        """ Returns a dict of the human's internal state """
        raise NotImplementedError

    def get_flat_state(self):
        """ Returns a numpy vector """
        raise NotImplementedError

    # Functions for MCMC
    def get_initial_state(self):
        """ Returns a dict of the human's initial internal state """
        raise NotImplementedError

    def get_action_from_state(self, state, obs):
        raise NotImplementedError

    def update_state(self, state, old_obs, act, rew, new_obs):
        raise NotImplementedError

    def log_likelihood(self, state, obs, act):
        """ Computes the log likelihood of taking the action given the state. """
        raise NotImplementedError

    def act_probs_from_counts(self, counts, **kwargs):
        """ Returns the act probabilities given the current counts. """
        raise NotImplementedError


class RandomPolicy(HumanPolicy):
    """Uniformly random policy over the environment's actions."""

    def __init__(self, env):
        self.env = env
        self.seed()
        self.reset()

    def get_action(self, obs):
        return self.env.action_space.sample()

    def learn(self, old_obs, act, rew, new_obs, done):
        return

    def reset(self):
        return

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def get_state(self):
        return {}

    def get_initial_state(self, **kwargs):
        return {}

    def get_action_from_state(self, state, obs):
        return np.random.choice(range(self.env.nA))

    def update_state(self, state, old_obs, act, rew, new_obs):
        return {}

    def log_likelihood(self, state, obs, act):
        return np.log(self.likelihood(state, obs, act))

    def likelihood(self, state, obs, act):
        """ Computes the likelihood of taking the action given the state. """
        return 1.0/self.env.n_arms

    def act_probs_from_counts(self, counts, **kwargs):
        return [1.0/self.env.n_arms for _ in range(self.env.n_arms)]


class QLearningBanditPolicy(HumanPolicy):
    """
    Tabular Q-learning policy for bandit problems with softmax exploration.
    """

    def __init__(self, env, softmax_temp=0.2, alpha=0.5, initial_q_value=0., discount=0.99):
        assert (isinstance(env.observation_space, Discrete)
                or isinstance(env.observation_space, spaces.Discrete))
        assert (isinstance(env.action_space, Discrete)
                or isinstance(env.action_space, spaces.Discrete))
        self.softmax_temp = softmax_temp
        self.alpha = alpha
        self.initial_q_value = initial_q_value
        self.discount = discount
        super(QLearningBanditPolicy, self).__init__(env)

    def reset(self):
        # reset Q table
        self.Q_human = np.full((self.env.nS, self.env.nA),
                               self.initial_q_value, dtype=np.float32)

    def get_action(self, obs):
        act_dist = softmax(self.Q_human[obs]/self.softmax_temp)
        act = self.np_random.choice(range(self.env.nA), p=act_dist)
        return act

    def learn(self, old_obs, act, rew, new_obs, done):
        # Standard Q-learning update towards rew + discount * max_a' Q(new_obs, a')
        Q_next = np.max(self.Q_human[new_obs])
        Q_sa = self.Q_human[old_obs][act]
        self.Q_human[old_obs][act] = (self.alpha * (rew + self.discount*Q_next - Q_sa) + Q_sa)

    def get_state(self):
        return {'Q_human': self.Q_human}

    # For MCMC-based methods
    def get_initial_state(self, **kwargs):
        Q_human = np.full((self.env.nS, self.env.nA),
                          self.initial_q_value, dtype=np.float32)
        return {'Q_human': Q_human}

    def get_action_from_state(self, state, obs):
        act_dist = softmax(state['Q_human'][obs]/self.softmax_temp)
        act = np.random.choice(range(self.env.nA), p=act_dist)
        return act

    def update_state(self, state, old_obs, act, rew, new_obs):
        Q_human = state['Q_human'].copy()
        Q_next = np.max(Q_human[new_obs])
        Q_sa = Q_human[old_obs][act]
        Q_human[old_obs][act] = (self.alpha * (rew + self.discount*Q_next - Q_sa) + Q_sa)
        return {'Q_human': Q_human}

    def likelihood(self, state, obs, act):
        """ Computes the likelihood of taking the action given the state. """
        return softmax(state['Q_human'][obs]/self.softmax_temp)[act]

    def log_likelihood(self, state, obs, act):
        return np.log(self.likelihood(state, obs, act))
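

# --- Illustrative sketch (not part of the original file) ----------------------
# The MCMC-style interface above (get_initial_state / update_state /
# log_likelihood) keeps the model state explicit, so an interaction history can
# be scored without mutating a policy instance. A minimal sketch, assuming a
# history of (obs, act, rew, new_obs) tuples; the helper name is hypothetical
# and not used elsewhere in the codebase.
def _history_log_likelihood(policy, history):
    state = policy.get_initial_state()
    total = 0.0
    for obs, act, rew, new_obs in history:
        total += policy.log_likelihood(state, obs, act)
        state = policy.update_state(state, obs, act, rew, new_obs)
    return total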


class EpsGreedyBanditPolicy(HumanPolicy):
    """
    Tabular epsilon-greedy policy for discrete problems.
    Maintains the empirical mean of each action, then chooses an action with the highest mean
    with probability 1-epsilon and a random action with probability epsilon.
    """

    def __init__(self, env, epsilon=0.1):
        assert (isinstance(env.action_space, Discrete)
                or isinstance(env.action_space, spaces.Discrete))
        self.epsilon = epsilon
        super(EpsGreedyBanditPolicy, self).__init__(env)

    def reset(self):
        # reset empirical mean and count tables
        self.means = np.zeros((self.env.observation_space.n, self.env.action_space.n),
                              dtype=np.float32)
        self.counts = np.zeros((self.env.observation_space.n, self.env.action_space.n),
                               dtype=np.int32)

    def get_action(self, obs):
        if self.np_random.random_sample() < self.epsilon:
            return self.env.action_space.sample()
        # break ties between equally good arms uniformly at random
        return self.np_random.choice(np.where(np.isclose(self.means[obs],
                                                         self.means[obs].max()))[0])

    def learn(self, old_obs, act, rew, new_obs, done):
        count = self.counts[old_obs][act] = self.counts[old_obs][act] + 1
        self.means[old_obs][act] = (count-1)*self.means[old_obs][act]/count + rew/count

    def get_state(self):
        return {'means': self.means, 'counts': self.counts}

    # Functions for MCMC
    def get_initial_state(self, **kwargs):
        means = np.zeros((self.env.observation_space.n, self.env.action_space.n),
                         dtype=np.float32)
        counts = np.zeros((self.env.observation_space.n, self.env.action_space.n),
                          dtype=np.int32)
        return {'means': means, 'counts': counts}

    def get_action_from_state(self, state, obs):
        if np.random.random_sample() < self.epsilon:
            return self.np_random.choice(range(self.env.nA))
        return self.np_random.choice(np.where(np.isclose(state['means'][obs],
                                                         state['means'][obs].max()))[0])

    def update_state(self, state, old_obs, act, rew, new_obs):
        means = state['means'].copy()
        counts = state['counts'].copy()
        count = counts[old_obs][act] = counts[old_obs][act] + 1
        means[old_obs][act] = (count-1)*means[old_obs][act]/count + rew/count
        return {'means': means, 'counts': counts}

    def likelihood(self, state, obs, act):
        """ Computes the likelihood of taking the action given the state. """
        greedy_arms = np.where(np.isclose(state['means'][obs], state['means'][obs].max()))[0]
        if act in greedy_arms:
            return (1-self.epsilon)/len(greedy_arms) + self.epsilon/self.env.n_arms
        return self.epsilon/self.env.n_arms

    def log_likelihood(self, state, obs, act):
        return np.log(self.likelihood(state, obs, act))

    # Used for RTDP. `counts` is assumed to be a flat vector laid out as
    # [successes_0, failures_0, successes_1, failures_1, ...], with each entry
    # initialised at 1 (hence the -1/-2 corrections below).
    def act_probs_from_counts(self, counts, **kwargs):
        means = np.array([(counts[2*i]-1)/(counts[2*i] + counts[2*i+1]-2)
                          if counts[2*i] + counts[2*i+1]-2 > 0 else 0
                          for i in range(self.env.n_arms)])
        act_probs = np.full(self.env.n_arms, self.epsilon/self.env.n_arms)
        greedy_arms = np.where(np.isclose(means, means.max()))[0]
        act_probs[greedy_arms] += (1-self.epsilon)/len(greedy_arms)
        return act_probs


class GreedyBanditPolicy(EpsGreedyBanditPolicy):
    """
    Greedy policy for bandit problems: just an eps-greedy policy with epsilon set to zero.
    """

    def __init__(self, env):
        super(GreedyBanditPolicy, self).__init__(env, epsilon=0)


class WSLSBanditPolicy(HumanPolicy):
    """
    Win-stay-lose-shift policy. If the arm's reward is at least the running average, stay.
    If the arm's reward is lower than average, switch to a random other arm.
    """

    def __init__(self, env):
        assert (isinstance(env.action_space, Discrete)
                or isinstance(env.action_space, spaces.Discrete))
        super(WSLSBanditPolicy, self).__init__(env)

    def reset(self):
        self.curr_arm = 0
        # running reward mean, initialised as if 8 observations averaging 0.5 had been seen
        self.mean = 0.5
        self.count = 8

    def get_action(self, obs):
        return self.curr_arm

    def learn(self, old_obs, act, rew, new_obs, done):
        if rew < self.mean:  # lose: select another arm uniformly at random
            arm = self.np_random.randint(0, self.env.nA-1)
            if arm >= self.curr_arm:
                arm += 1
            self.curr_arm = arm
        else:  # win: stay on the arm just pulled
            self.curr_arm = act
        count = self.count = self.count + 1
        self.mean = (count - 1) * self.mean / count + rew / count

    def get_state(self):
        return {'curr_arm': self.curr_arm, 'mean': self.mean, 'count': self.count}

    def get_initial_state(self, **kwargs):
        curr_arm = 0
        mean = 0.5
        count = 8
        return {'curr_arm': curr_arm, 'mean': mean, 'count': count}

    def update_state(self, state, old_obs, act, rew, new_obs):
        mean = state['mean']
        curr_arm = state['curr_arm']
        if rew < mean:  # lose: select another arm uniformly at random
            arm = np.random.randint(0, self.env.nA-1)
            if arm >= curr_arm:
                arm += 1
            curr_arm = arm
        else:
            curr_arm = act
        count = state['count'] + 1
        mean = (count - 1) * mean / count + rew / count
        return {'curr_arm': curr_arm, 'mean': mean, 'count': count}

    def get_action_from_state(self, state, obs):
        return state['curr_arm']

    def likelihood(self, state, obs, act):
        return 1 if act == state['curr_arm'] else 1e-10

    def log_likelihood(self, state, obs, act):
        """ Computes the log likelihood of taking the action given the state. """
        return 0 if act == state['curr_arm'] else -1e10

    def act_probs_from_counts(self, counts, last_arm, last_rew, **kwargs):
        successes = sum(counts[2*i] for i in range(self.env.n_arms))
        failures = sum(counts[2*i+1] for i in range(self.env.n_arms))
        mean = successes/(successes + failures)
        if last_rew > mean:
            act_probs = np.zeros(self.env.n_arms)
            act_probs[last_arm] = 1.
        else:
            act_probs = np.full(self.env.n_arms, 1/(self.env.n_arms - 1))
            act_probs[last_arm] = 0.
        return act_probs
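

# --- Illustrative usage sketch (not part of the original file) ----------------
# Shows how one of these policies might be driven in a gym-style rollout loop.
# `make_bandit_env` is a hypothetical factory: any env exposing Discrete
# observation/action spaces plus the nS, nA and n_arms attributes assumed
# throughout this file would work.
if __name__ == "__main__":
    env = make_bandit_env()  # hypothetical; not defined in this file
    policy = QLearningBanditPolicy(env)
    obs = env.reset()
    done = False
    while not done:
        act = policy.get_action(obs)
        new_obs, rew, done, _ = env.step(act)
        policy.learn(obs, act, rew, new_obs, done)
        obs = new_obs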