batch.py · 347 lines (316 loc) · 13.5 KB
(Repository archived by the owner on May 23, 2023; now read-only.)
import numpy as np
import torch
import torch.utils.data as data
from episode import Episode
class Batch:
"""
    A batch of samples, collected into a list of episodes
"""
def __init__(self, weights=None):
self.weights = weights
self.episodes = []
self.size = 0
def add_episode(self, episode) -> None:
"""
        Add an episode to the batch
:param episode: the added episode
:return: nothing
"""
self.episodes.append(episode)
self.size += 1
def copy_batch(self):
"""
Make a copy of the current batch
:return: the copied batch
"""
b2 = Batch()
for i in range(self.size):
ep = Episode()
sep = self.episodes[i]
for j in range(self.episodes[i].len):
ep.add(sep.state_pool[j], sep.action_pool[j], sep.reward_pool[j],
sep.done_pool[j], sep.next_state_pool[j])
b2.add_episode(ep)
return b2
def add_sample(self, state, action, reward, done, next_state) -> None:
"""
Add a sample to the current episode
:param state: the current state
:param action: the taken action
:param reward: the resulting reward
:param done: whether the episode is over
:param next_state: the resulting next state
:return: nothing
"""
        # the current episode is the last one appended to the batch
        self.episodes[-1].add(state, action, reward, done, next_state)
def discounted_sum_rewards(self, gamma) -> None:
"""
Apply a discounted sum of rewards to all samples of all episodes
:param gamma: the discount factor
:return: nothing
"""
for i in range(self.size):
self.episodes[i].discounted_sum_rewards(gamma)
def sum_rewards(self) -> None:
"""
Apply a sum of rewards to all samples of all episodes
:return: nothing
"""
for i in range(self.size):
self.episodes[i].sum_rewards()
def substract_baseline(self, critic) -> None:
"""
        Subtracts a baseline from the rewards of all samples of all episodes
        :param critic: the baseline critic whose value estimate is subtracted
:return: nothing
"""
for i in range(self.size):
self.episodes[i].substract_baseline(critic)
def nstep_return(self, n, gamma, critic) -> None:
"""
Apply Bellman backup n-step return to all rewards of all samples of all episodes
:param n: the number of steps in n-step
:param gamma: the discount factor
:param critic: the critic used to perform Bellman backups
:return: nothing
"""
for i in range(self.size):
self.episodes[i].nstep_return(n, gamma, critic)
def normalize_rewards(self, gamma) -> None:
"""
Apply a normalized and discounted sum of rewards to all samples of all episodes
:param gamma: the discount factor
:return: nothing
"""
reward_pool = []
for i in range(self.size):
self.episodes[i].discounted_sum_rewards(gamma)
reward_pool += self.episodes[i].reward_pool
reward_std = np.std(reward_pool)
if reward_std > 0:
reward_mean = np.mean(reward_pool)
# print("normalize_rewards : ", reward_std, "mean=", reward_mean)
for i in range(self.size):
self.episodes[i].normalize_discounted_rewards(gamma, reward_mean, reward_std)
else:
reward_mean = np.mean(reward_pool)
print("normalize_rewards : std=0, mean=", reward_mean)
for i in range(self.size):
self.episodes[i].normalize_discounted_rewards(gamma, reward_mean, 1.0)
def exponentiate_rewards(self, beta) -> None:
"""
Apply an exponentiation factor to the rewards of all samples of all episodes
:param beta: the exponentiation factor
:return: nothing
"""
for i in range(self.size):
self.episodes[i].exponentiate_rewards(beta)
def train_policy_td(self, policy):
"""
Trains a policy through a temporal difference method from a batch of data
:param policy: the trained policy
:return: the average loss over the batch
"""
do_print = False
losses = []
# gradient_vect = [policy.get_weights()]
# gradient_angles = []
if do_print: print("training data :")
for j in range(self.size):
episode = self.episodes[j]
state = np.array(episode.state_pool)
action = np.array(episode.action_pool)
reward = np.array(episode.reward_pool)
if do_print: print("state", state)
if do_print: print("action", action)
if do_print: print("reward", reward)
policy_loss = policy.train_pg(state, action, reward)
# gradient = policy.get_grads()
# gradient_vect.append(gradient)
# if j >=1:
# angle = self.get_angle_with_grad(gradient_vect)
# gradient_angles.append(angle)
if do_print: print("loss", policy_loss)
policy_loss = policy_loss.data.numpy()
mean_loss = policy_loss.mean()
losses.append(mean_loss)
if do_print: print("end of training data :")
        return np.array(losses).mean()  # , gradient_angles
    def get_angle_with_grad(self, gradient_vect):
        """
        Returns the cosine of the angle between the two most recent gradient update directions
        """
        unit_vector_1 = (gradient_vect[-2] - gradient_vect[-3]) / np.linalg.norm(gradient_vect[-2] - gradient_vect[-3])
        unit_vector_2 = (gradient_vect[-1] - gradient_vect[-2]) / np.linalg.norm(gradient_vect[-1] - gradient_vect[-2])
        return np.dot(unit_vector_1, unit_vector_2)
def train_policy_cem(self, policy, bests_frac):
"""
Trains a policy through a CEM from a batch of data
:param policy: the trained policy
:return: the average loss over the batch
"""
do_print = False
rewards = []
if do_print: print("training data :")
for j in range(self.size):
episode = self.episodes[j]
            reward = np.sum(episode.reward_pool)
            # reward = episode.reward_pool[-1]
            if do_print: print("reward", reward)
rewards.append(reward)
rewards = np.array(rewards)
bests_nb = int(bests_frac * self.size)
bests_idxs = rewards.argsort()[-bests_nb:]
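        # note: if bests_frac * size < 1, bests_nb is 0 and the slice [-0:] keeps every episode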
bests_rewards = [rewards[i] for i in bests_idxs]
#print(rewards)
average_reward = np.mean(bests_rewards)
#average_reward = np.mean(rewards)
#print(average_reward)
if do_print: print("end of training data :")
return average_reward
def train_policy_through_regress(self, policy):
"""
Trains a policy through regression from a batch of data
        Moves the policy closer to reproducing the stored action in each stored state
:param policy: the trained policy
:return: the average loss over the batch
"""
losses = []
for j in range(self.size):
episode = self.episodes[j]
state = np.array(episode.state_pool)
action = np.array(episode.action_pool)
policy_loss = policy.train_regress(state, action)
loss = policy_loss.data.numpy()
mean_loss = loss.mean()
losses.append(mean_loss)
return np.array(losses).mean()
def train_critic_td(self, gamma, policy, critic, train):
"""
Trains a critic through a temporal difference method
:param gamma: the discount factor
        :param policy: the policy used to select the next action for the bootstrap target
        :param critic: the trained critic
:param train: True to train, False to compute a validation loss
:return: the average critic loss
"""
losses = []
for j in range(self.size):
episode = self.episodes[j]
state = np.array(episode.state_pool)
action = np.array(episode.action_pool)
reward = np.array(episode.reward_pool)
done = np.array(episode.done_pool)
next_state = np.array(episode.next_state_pool)
next_action = policy.select_action(next_state)
target = critic.compute_bootstrap_target(reward, done, next_state, next_action, gamma)
target = torch.FloatTensor(target).unsqueeze(1)
critic_loss = critic.compute_loss_to_target(state, action, target)
if train:
critic.update(critic_loss)
critic_loss = critic_loss.data.numpy()
losses.append(critic_loss)
mean_loss = np.array(losses).mean()
return mean_loss
def train_critic_mc(self, gamma, critic, n, train):
"""
Trains a critic through a Monte Carlo method. Also used to perform n-step training
:param gamma: the discount factor
:param critic: the trained critic
:param n: the n in n-step training
:param train: True to train, False to just compute a validation loss
:return: the average critic loss
"""
if n == 0:
self.discounted_sum_rewards(gamma)
else:
self.nstep_return(n, gamma, critic)
losses = []
targets = []
for j in range(self.size):
episode = self.episodes[j]
state = np.array(episode.state_pool)
action = np.array(episode.action_pool)
reward = np.array(episode.reward_pool)
target = torch.FloatTensor(reward).unsqueeze(1)
targets.append(target.mean().data.numpy())
critic_loss = critic.compute_loss_to_target(state, action, target)
if train:
critic.update(critic_loss)
critic_loss = critic_loss.data.numpy()
losses.append(critic_loss)
mean_loss = np.array(losses).mean()
return mean_loss
def prepare_dataset_mc(self, gamma):
"""
Computes the dataset of samples to allow for immediate update of the critic.
The dataset contains the list of states, of actions, and the target value V(s) or Q(s,a)
The computation of the target value depends on the critic update method.
:param gamma: the discount factor
:return: the dataset corresponding to the content of the replay buffer
"""
list_targets = []
list_states = []
list_actions = []
# prepare reward data for the mc case
self.discounted_sum_rewards(gamma)
for j in range(self.size):
episode = self.episodes[j]
state = episode.state_pool
action = episode.action_pool
            #### MODIF: wrap each action in a list; without this, the TensorDataset conversion
            #### crashes on environments with scalar discrete actions such as MountainCar and CartPole
            action_cp = []
            for i in range(len(action)):
                action_cp.append([int(action[i])])
            action = action_cp
####
target = episode.reward_pool
list_targets = np.concatenate((list_targets, target))
list_states = list_states + state
list_actions = list_actions + action
t_target = torch.Tensor(list_targets).unsqueeze(1)
dataset = data.TensorDataset(torch.Tensor(list_states), torch.Tensor(list_actions), t_target)
return dataset
def prepare_dataset_td(self, params, policy, critic):
"""
Computes the dataset of samples to allow for immediate update of the critic.
The dataset contains the list of states, of actions, and the target value V(s) or Q(s,a)
The computation of the target value depends on the critic update method.
        :param params: the hyper-parameters (at least critic_estim_method, nstep and gamma)
:param policy: the actor, useful to determine the next action
:param critic: the critic to be updated (useful to compute the target value)
:return: the dataset corresponding to the content of the replay buffer
"""
list_targets = []
list_states = []
list_actions = []
# prepare reward data for the td and n-step case
if params.critic_estim_method == "nstep":
self.nstep_return(params.nstep, params.gamma, critic)
        elif params.critic_estim_method != "td":
            print("batch prepare_dataset_td: unknown estimation method:", params.critic_estim_method)
for j in range(self.size):
episode = self.episodes[j]
state = episode.state_pool
action = episode.action_pool
            #### MODIF: wrap each action in a list; without this, the TensorDataset conversion
            #### crashes on environments with scalar discrete actions such as MountainCar and CartPole
            action_cp = []
            for i in range(len(action)):
                try:
                    action_cp.append([int(action[i])])
                except ValueError:
                    # fall back to a dummy action index when the action cannot be cast to int
                    action_cp.append([0])
action = action_cp
####
reward = episode.reward_pool
if params.critic_estim_method == "td":
done = np.array(episode.done_pool)
next_state = np.array(episode.next_state_pool)
next_action = policy.select_action(next_state)
target = critic.compute_bootstrap_target(reward, done, next_state, next_action, params.gamma)
else:
target = reward
list_targets = np.concatenate((list_targets, target))
list_states = list_states + state
list_actions = list_actions + action
t_target = torch.Tensor(list_targets).unsqueeze(1)
dataset = data.TensorDataset(torch.Tensor(list_states), torch.Tensor(list_actions), t_target)
return dataset
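
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file): builds a small
# batch of random transitions and applies one of the reward transforms above.
# It only relies on the Episode interface already used in this file
# (a no-argument constructor and Episode.add(state, action, reward, done,
# next_state)); policies and critics are left out because their interfaces
# live in other modules of this repository.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    batch = Batch()
    for _ in range(2):  # two short episodes of five transitions each
        ep = Episode()
        state = rng.normal(size=4)
        for t in range(5):
            action = int(rng.integers(0, 2))
            reward = float(rng.normal())
            next_state = rng.normal(size=4)
            done = (t == 4)
            ep.add(state, action, reward, done, next_state)
            state = next_state
        batch.add_episode(ep)
    # replace raw rewards with normalized discounted returns, as a policy
    # gradient pipeline would typically do before training a policy
    batch.normalize_rewards(gamma=0.99)
    print("batch size:", batch.size)
    print("first episode targets:", batch.episodes[0].reward_pool)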