-
Notifications
You must be signed in to change notification settings - Fork 4.8k
/
runner.py
76 lines (64 loc) · 3.17 KB
/
runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import numpy as np
from baselines.a2c.utils import discount_with_dones
from baselines.common.runners import AbstractEnvRunner
class Runner(AbstractEnvRunner):
"""
We use this class to generate batches of experiences
__init__:
- Initialize the runner
run():
- Make a mini batch of experiences
"""
def __init__(self, env, model, nsteps=5, gamma=0.99):
super().__init__(env=env, model=model, nsteps=nsteps)
self.gamma = gamma
self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()]
self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype
def run(self):
# We initialize the lists that will contain the mb of experiences
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
mb_states = self.states
epinfos = []
for n in range(self.nsteps):
# Given observations, take action and value (V(s))
# We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
# Append the experiences
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
mb_values.append(values)
mb_dones.append(self.dones)
# Take actions in env and look the results
obs, rewards, dones, infos = self.env.step(actions)
for info in infos:
maybeepinfo = info.get('episode')
if maybeepinfo: epinfos.append(maybeepinfo)
self.states = states
self.dones = dones
self.obs = obs
mb_rewards.append(rewards)
mb_dones.append(self.dones)
# Batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
mb_masks = mb_dones[:, :-1]
mb_dones = mb_dones[:, 1:]
if self.gamma > 0.0:
# Discount/bootstrap off value fn
last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
rewards = rewards.tolist()
dones = dones.tolist()
if dones[-1] == 0:
rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
else:
rewards = discount_with_dones(rewards, dones, self.gamma)
mb_rewards[n] = rewards
mb_actions = mb_actions.reshape(self.batch_action_shape)
mb_rewards = mb_rewards.flatten()
mb_values = mb_values.flatten()
mb_masks = mb_masks.flatten()
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos