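"""Training utilities for multi-agent DDQN on the Harvest commons environment.

`train_agents` trains one double-DQN agent per player in the environment,
logging the social metrics (efficiency, equality, sustainability, peace) to
TensorBoard and periodically recording episode videos and saved policies.
`gen_episode_video` reloads saved policies and renders a full rollout to video.
"""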
# Standard library imports
import os
import sys  # Used by the (commented-out) console-output redirect below
import shutil
import time
from multiprocessing import Process  # Used by the (commented-out) parallel launcher below

# Third-party imports
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Local application imports
from game_environment.commons_env import HarvestCommonsEnv, MAP
from game_environment.utils import utility_funcs
from DDQN import DDQNAgent, DeepQNet
# Training hyperparameters
EPISODES = 1000
STEPS = 1000
BATCH_SIZE = 8
LR = 0.0015
GAMMA = 0.99                  # Discount factor
EPSILON = 0.15                # Initial exploration rate
EPSILON_DECAY = 0.995         # Multiplicative epsilon decay per episode
N_AGENTS = 1
REPLAY_BUFFER_SIZE = 200
TARGET_UPDATE_ITERATION = 20  # Steps between target-network syncs
EPISODE_RECORD_FREQ = 10      # Record frames/videos/policies every N episodes
START_LEARNING = 100          # Steps of experience collected before learning starts
KERNEL_INITIALIZER = 'glorot_uniform'
AGENT_VIEW_RANGE = 5          # Agents observe a (2*5+1) x (2*5+1) RGB window
def train_agents(n_agents=4, map_type="small", logs_path="logs", n_episodes=EPISODES, n_steps=STEPS,
                 batch_size=BATCH_SIZE, lr=0.0015, gamma=0.99, epsilon=0.10, epsilon_decay=0.995, log=True):
    # Configure experiment logs
    logdir = logs_path + "/MAP=%s-AGENTS=%d-lr=%.5f-e=%.2f-ed=%.3f-g=%.2f-b=%d" % (map_type, n_agents, lr, epsilon,
                                                                                   epsilon_decay, gamma, batch_size)
    os.makedirs(logdir, exist_ok=True)
    # sys.stdout = open(os.path.join(logdir, "console_output.out"), "w+")
    social_metrics_writer = tf.summary.create_file_writer(logdir + "/social_metrics")

    env = HarvestCommonsEnv(ascii_map=MAP[map_type], num_agents=n_agents, render=True,
                            agent_view_range=AGENT_VIEW_RANGE)
    obs = env.reset()

    # Instantiate one DDQN agent (online and target Q-networks) per environment agent
    ddqn_models = {}
    for agent_id, agent in env.agents.items():
        obs_shape = (2 * AGENT_VIEW_RANGE + 1, 2 * AGENT_VIEW_RANGE + 1, 3)
        q_net_model = DeepQNet(num_actions=env.action_space.n, input_shape=obs_shape,
                               kernel_initializer=KERNEL_INITIALIZER)
        q_net_target = DeepQNet(num_actions=env.action_space.n, input_shape=obs_shape,
                                kernel_initializer=KERNEL_INITIALIZER)
        ddqn_models[agent_id] = DDQNAgent(model=q_net_model, target_model=q_net_target, obs_shape=obs_shape,
                                          env=env, buffer_size=REPLAY_BUFFER_SIZE, learning_rate=lr, epsilon=epsilon,
                                          epsilon_decay=epsilon_decay, gamma=gamma, batch_size=batch_size)

    for episode in range(n_episodes + 1):
        start_t = time.time()
        print("- A:%d Episode %d" % (n_agents, episode))
        episode_path = logdir + "/episodes/episode=%04d" % episode
        models_path = logdir + "/model/episode=%04d" % episode
        if episode % EPISODE_RECORD_FREQ == 0:
            os.makedirs(episode_path, exist_ok=True)

        obs = env.reset()
        for agent_id in env.agents.keys():
            # Scale observation images from uint8 [0, 255] to float32 [0, 1]
            obs[agent_id] = (obs[agent_id] / 255.).astype(np.float32)
            # Reset the replay buffer and decay the exploration rate
            ddqn_models[agent_id].reset_replay_buffer()
            ddqn_models[agent_id].e_decay()

        for t in tqdm(range(1, n_steps), position=0, leave=True):
            # Select actions following an epsilon-greedy policy on the Q(s, a) approximator
            actions = {}
            for agent_id, agent in env.agents.items():
                best_action, q_values = ddqn_models[agent_id].model.action_value(tf.expand_dims(obs[agent_id], axis=0))
                actions[agent_id] = ddqn_models[agent_id].get_action(best_action)

            # Apply the agents' actions to the environment
            next_obs, rewards, dones, info = env.step(actions)

            # Store the transition in each agent's replay buffer
            for agent_id in env.agents.keys():
                next_obs[agent_id] = (next_obs[agent_id] / 255.).astype(np.float32)
                ddqn_models[agent_id].store_transition(obs[agent_id], actions[agent_id], rewards[agent_id],
                                                       next_obs[agent_id], dones[agent_id])
                ddqn_models[agent_id].num_in_buffer = min(ddqn_models[agent_id].num_in_buffer + 1,
                                                          REPLAY_BUFFER_SIZE)

            # Once enough experience is collected, start online learning
            if t > START_LEARNING:
                losses = []
                for agent_id in env.agents.keys():
                    loss = ddqn_models[agent_id].train_step()
                    losses.append(loss)
                if t % TARGET_UPDATE_ITERATION == 0:  # Sync the target network with the online network
                    for agent_id in env.agents.keys():
                        ddqn_models[agent_id].update_target_model()
                # with social_metrics_writer.as_default():
                #     tf.summary.histogram('mse_Q', data=losses, step=t + (n_steps * episode))

            # Update the current state observations
            obs = next_obs
            # Update the environment social metrics
            env.update_social_metrics(rewards)
            if episode % EPISODE_RECORD_FREQ == 0:
                env.render(episode_path + "/t=%04d.png" % t, title="t=%04d" % t)

        # Log social metrics to TensorBoard
        social_metrics = env.get_social_metrics(episode_steps=n_steps)
        efficiency, equality, sustainability, peace = social_metrics
        with social_metrics_writer.as_default():
            tf.summary.scalar('efficiency', data=efficiency, step=episode)
            tf.summary.scalar('equality', data=equality, step=episode)
            tf.summary.scalar('sustainability', data=sustainability, step=episode)
            tf.summary.scalar('peace', data=peace, step=episode)
            # Log the distribution of accumulated reward across agents
            agent_rewards = [np.sum(rewards) for rewards in env.rewards_record.values()]
            tf.summary.histogram('accumulated_reward', agent_rewards, step=episode)

        if episode % EPISODE_RECORD_FREQ == 0:
            # Make a video of the recorded episode, then delete the frame images
            utility_funcs.make_video_from_image_dir(vid_path=logdir + "/episodes",
                                                    img_folder=episode_path,
                                                    video_name="episode=%04d" % episode,
                                                    fps=10)
            shutil.rmtree(episode_path, ignore_errors=True)
            # Save each agent's Q-network (policy) function approximator
            for agent_id in env.agents.keys():
                ddqn_models[agent_id].save_policy(path=models_path + "/%s" % agent_id)

        print("- A:%d Episode %d - DONE in: %.3f min" % (n_agents, episode, (time.time() - start_t) / 60))
def gen_episode_video(models_path, map_type, n_agents, video_path):
    os.makedirs(video_path + "/imgs", exist_ok=True)
    n_steps = 1000
    selected_map = MAP[map_type]

    # Create the environment
    env = HarvestCommonsEnv(ascii_map=selected_map, num_agents=n_agents, render=True,
                            agent_view_range=AGENT_VIEW_RANGE)

    # Instantiate the DDQN agents from their saved policies
    ddqn_models = {}
    for agent_id, agent in env.agents.items():
        obs_shape = (2 * AGENT_VIEW_RANGE + 1, 2 * AGENT_VIEW_RANGE + 1, 3)
        model_path = os.path.join(models_path, agent_id)
        ddqn_models[agent_id] = DDQNAgent.from_trained_policy(path_to_model=model_path, obs_shape=obs_shape,
                                                              env=env)

    obs = env.reset()
    for agent_id in env.agents.keys():
        # Scale observation images from uint8 [0, 255] to float32 [0, 1]
        obs[agent_id] = (obs[agent_id] / 255.).astype(np.float32)

    for t in tqdm(range(1, n_steps), desc="Steps", position=0, leave=True):
        # Select actions following the learned policy on the Q(s, a) approximator
        actions = {}
        for agent_id, agent in env.agents.items():
            best_action, q_values = ddqn_models[agent_id].model.action_value(tf.expand_dims(obs[agent_id], axis=0))
            actions[agent_id] = ddqn_models[agent_id].get_action(best_action)

        # Apply the agents' actions to the environment
        next_obs, rewards, dones, info = env.step(actions)

        # Scale the next observations too, so the networks keep receiving [0, 1] floats
        for agent_id in env.agents.keys():
            next_obs[agent_id] = (next_obs[agent_id] / 255.).astype(np.float32)

        # Update the current state observations and the environment social metrics
        obs = next_obs
        env.update_social_metrics(rewards)
        env.render(video_path + "/imgs/t=%04d.png" % t, title="t=%04d" % t)

    # Make a video of the episode, then delete the frame images
    utility_funcs.make_video_from_image_dir(vid_path=video_path,
                                            img_folder=video_path + "/imgs",
                                            video_name="learned_policy",
                                            fps=10)
    shutil.rmtree(video_path + "/imgs", ignore_errors=True)
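# Example usage (hypothetical paths; point these at an actual training run's output):
# gen_episode_video(models_path="logs/MAP=small-AGENTS=1-.../model/episode=0990",
#                   map_type="small", n_agents=1, video_path="videos/learned_policy")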
if __name__ == "__main__":
    logs_path = "logs"
    params1 = {"n_agents": 1, "map_type": "small", "logs_path": logs_path, "n_episodes": EPISODES, "n_steps": STEPS,
               "batch_size": BATCH_SIZE, "lr": 0.0005, "gamma": 0.99, "epsilon": 0.15, "epsilon_decay": 0.999,
               "log": True}

    # params2 = {"n_agents": 2, "map_type": "small", "logs_path": logs_path, "n_episodes": EPISODES, "n_steps": STEPS,
    #            "batch_size": BATCH_SIZE, "lr": 0.0015, "gamma": 0.99, "epsilon": 0.15, "epsilon_decay": 0.999,
    #            "log": False}

    train_agents(**params1)

    # To launch both experiments in parallel processes instead:
    # processes = []
    # for i, params in enumerate([params1, params2]):
    #     p = Process(target=train_agents, kwargs=params, name="Exp%d" % i)
    #     p.start()
    #     processes.append(p)
    #     print(p.name)
    # for p in processes:
    #     p.join()