vpg.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 23 14:55:34 2019
@author: clytie
"""
import numpy as np
import tensorflow as tf
from algorithms.base import Base
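
# Note (added): this module is written against the TensorFlow 1.x API
# (tf.placeholder, tf.layers, tf.contrib). Running it under TensorFlow 2.x
# would likely require tf.compat.v1 (with eager execution disabled) and a
# replacement for tf.contrib.layers.flatten, e.g. tf.layers.flatten.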

class VanillaPolicyGradient(Base):
    """Vanilla policy gradient with a learned state-value baseline and GAE."""

    def __init__(self,
                 n_action, dim_ob_image,
                 rnd=1,
                 temperature=1.0,
                 discount=0.99,
                 lr=2.5e-4,
                 gae=0.95,
                 entropy_coef=0.1,  # policy gradient may need more exploration
                 critic_coef=1.0,
                 max_grad_norm=0.5,
                 save_path="./vpg_log"):
        self.n_action = n_action
        self.dim_ob_image = dim_ob_image
        self.entropy_coef = entropy_coef
        self.critic_coef = critic_coef
        self.tau = gae
        self.temperature = temperature
        self.max_grad_norm = max_grad_norm
        self.discount = discount
        self.lr = lr
        super().__init__(save_path=save_path, rnd=rnd)

    def _build_network(self):
        # Image observation placeholder; pixels are cast to float and scaled to [0, 1].
        self.ob_image = tf.placeholder(
            tf.uint8, [None, *self.dim_ob_image], name="image_observation")
        x = tf.divide(tf.cast(self.ob_image, tf.float32), 255.0)
        # Convolutional encoder followed by a fully connected layer.
        x = tf.layers.conv2d(x, 32, 8, 4, activation=tf.nn.relu)
        x = tf.layers.conv2d(x, 64, 4, 2, activation=tf.nn.relu)
        x = tf.layers.conv2d(x, 64, 3, 1, activation=tf.nn.relu)
        x = tf.contrib.layers.flatten(x)
        x = tf.layers.dense(x, 512, activation=tf.nn.relu)
        # Policy head (action logits) and value head share the encoder.
        self.logit_action_probability = tf.layers.dense(
            x, self.n_action,
            kernel_initializer=tf.truncated_normal_initializer(0.0, 0.01))
        self.state_value = tf.squeeze(tf.layers.dense(
            x, 1, kernel_initializer=tf.truncated_normal_initializer()))

    def _build_algorithm(self):
        self.optimizer = tf.train.AdamOptimizer(self.lr, epsilon=1e-5)
        self.action = tf.placeholder(tf.int32, [None], name="action")
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.target_state_value = tf.placeholder(tf.float32, [None], name="target_state_value")

        # Index of the selected action for each sample in the batch.
        batch_size = tf.shape(self.ob_image)[0]
        selected_action_index = tf.stack([tf.range(batch_size), self.action], axis=1)

        log_prob = tf.nn.log_softmax(self.logit_action_probability)
        prob = tf.nn.softmax(log_prob)  # softmax is shift-invariant, so this recovers the action probabilities
        # Entropy bonus: mean of -p_i * log(p_i) over actions and batch, reduced to a
        # scalar so that total_loss below is a scalar (the original per-example vector
        # would be summed implicitly by tf.gradients).
        entropy = -tf.reduce_mean(log_prob * prob)

        # Policy gradient loss: -E[log pi(a|s) * advantage].
        log_act = tf.gather_nd(log_prob, selected_action_index)
        target = -tf.reduce_mean(log_act * self.advantage)
        # Critic loss: squared error between predicted and target state values.
        critic_loss = tf.reduce_mean(tf.squared_difference(self.state_value, self.target_state_value))
        total_loss = target - self.entropy_coef * entropy + self.critic_coef * critic_loss

        # Clip gradients by global norm and apply them.
        grads = tf.gradients(total_loss, tf.trainable_variables())
        clipped_grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        self._train_op = self.optimizer.apply_gradients(
            zip(clipped_grads, tf.trainable_variables()), global_step=tf.train.get_global_step())

    def get_action(self, obs):
        """Sample actions from the temperature-scaled softmax policy."""
        batch_size = obs.shape[0]
        logit = self.sess.run(self.logit_action_probability,
                              feed_dict={self.ob_image: obs})
        # Subtract the per-row max for numerical stability, then apply a
        # temperature-scaled softmax and sample one action per observation.
        logit = logit - np.max(logit, axis=1, keepdims=True)
        exp_logit = np.exp(logit / self.temperature)
        prob = exp_logit / np.sum(exp_logit, axis=1, keepdims=True)
        action = [np.random.choice(self.n_action, p=prob[i, :]) for i in range(batch_size)]
        return action

    def update(self, s_batch, a_batch, r_batch, d_batch):
        """Run one policy-gradient update from a batch of trajectories.

        Each trajectory i provides T+1 stacked observations in s_batch[i] and
        T actions, rewards and done flags; advantages are computed with GAE.
        """
        advantage_batch, target_value_batch = [], []
        for i in range(len(d_batch)):
            traj_size = len(d_batch[i])
            adv = np.empty(traj_size, dtype=np.float32)
            state_value = self.sess.run(
                self.state_value,
                feed_dict={self.ob_image: s_batch[i]})
            # TD errors: delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t).
            delta_value = r_batch[i] + self.discount * (1 - d_batch[i]) * state_value[1:] - state_value[:-1]
            # Generalized advantage estimation, accumulated backwards along the
            # trajectory: A_t = delta_t + gamma * tau * (1 - done_t) * A_{t+1}.
            last_advantage = 0
            for t in reversed(range(traj_size)):
                adv[t] = delta_value[t] + self.discount * self.tau * (1 - d_batch[i][t]) * last_advantage
                last_advantage = adv[t]
            # Value targets are the baseline plus the advantage.
            target_value_batch.append(state_value[:-1] + adv)
            # Collect advantages.
            advantage_batch.append(adv)

        # Flatten the per-trajectory values into one batch.
        advantage_batch = np.concatenate(advantage_batch, axis=0)
        target_value_batch = np.concatenate(target_value_batch, axis=0)
        all_step = sum(len(dones) for dones in d_batch)
        s_batch = np.concatenate([s[:-1] for s in s_batch], axis=0)
        a_batch = np.concatenate(a_batch, axis=0)
        advantage_batch = advantage_batch.reshape(all_step)
        target_value_batch = target_value_batch.reshape(all_step)
        # Normalize advantages.
        advantage_batch = (advantage_batch - advantage_batch.mean()) / (advantage_batch.std() + 1e-5)

        # Run one gradient step.
        self.sess.run(self._train_op,
                      feed_dict={self.ob_image: s_batch,
                                 self.action: a_batch,
                                 self.advantage: advantage_batch,
                                 self.target_state_value: target_value_batch})
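

# ---------------------------------------------------------------------------
# Minimal usage sketch (added, not part of the original module). It assumes
# that algorithms.base.Base builds the graph (calling _build_network and
# _build_algorithm) and creates self.sess during __init__, and that a batch is
# a list of per-trajectory arrays with T+1 stacked observations and T actions,
# rewards and done flags, matching update() above. Shapes are illustrative.
if __name__ == "__main__":
    agent = VanillaPolicyGradient(n_action=4, dim_ob_image=(84, 84, 4))

    # One fake trajectory of length T with T+1 observations.
    T = 8
    obs = np.random.randint(0, 256, size=(T + 1, 84, 84, 4), dtype=np.uint8)
    actions = np.asarray(agent.get_action(obs[:-1]), dtype=np.int32)
    rewards = np.random.rand(T).astype(np.float32)
    dones = np.zeros(T, dtype=np.float32)

    # Each argument is a list with one entry per trajectory.
    agent.update([obs], [actions], [rewards], [dones])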