541 changes: 336 additions & 205 deletions Lab3-policy-gradient.ipynb

Large diffs are not rendered by default.

Binary file added image/p3-1.png
Binary file added image/p3-2.png
Binary file added image/p4-1.png
Binary file added image/p4-2.png
Binary file added image/p6-1.png
Binary file added image/p6-2.png
1 change: 1 addition & 0 deletions image/readme.md
@@ -0,0 +1 @@
image
36 changes: 33 additions & 3 deletions policy_gradient/policy.py
@@ -31,7 +31,35 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
"""
# YOUR CODE HERE >>>>>>
# <<<<<<<<



# fc1: hidden layer with tanh activation
layer1 = tf.layers.dense(
    inputs=self._observations,
    units=hidden_dim,
    activation=tf.nn.tanh,  # tanh activation
    kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
    bias_initializer=tf.constant_initializer(0.1),
    name='fc1'
)
# fc2: output layer, softmax over the action space
probs = tf.layers.dense(
    inputs=layer1,
    units=out_dim,
    activation=tf.nn.softmax,
    kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
    bias_initializer=tf.constant_initializer(0.1),
    name='fc2'
)

# Not used: softmax applied to separate logits (leftover from the tutorial this was adapted from)
#probs = tf.nn.softmax(all_act, name='act_prob')

'''
Equivalent implementation with tf.contrib.layers:
nn_activations = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
probs = tf.contrib.layers.fully_connected(nn_activations, out_dim, activation_fn=tf.nn.softmax)
'''

# "Just when there seems to be no way out past the mountains and rivers, a village appears beyond the shadowy willows and bright flowers." (it finally worked)
# --------------------------------------------------
# This operation (variable) is used when choosing action during data sampling phase
# Shape of probs: [1, n_actions]
@@ -72,8 +100,10 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 1~3 lines.
"""
# YOUR CODE HERE >>>>>>

# Reference: Morvan Zhou's policy-gradient tutorial, lesson 5.2 (around 4:23)
surr_loss = -tf.reduce_mean(log_prob * self._advantages)
# <<<<<<<<

grads_and_vars = self._opt.compute_gradients(surr_loss)
train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")

6 changes: 5 additions & 1 deletion policy_gradient/util.py
@@ -32,8 +32,12 @@ def discount_bootstrap(x, discount_rate, b):
Sample code should be about 3 lines
"""
# YOUR CODE >>>>>>>>>>>>>>>>>>>
# One-step bootstrapped target: y[t] = x[t] + discount_rate * b[t+1], with the value after the final step taken as 0
b = np.append(b[1:], 0)
y = x + discount_rate * b
return y

# <<<<<<<<<<<<<<<<<<<<<<<<<<<<

def plot_curve(data, key, filename=None):
# plot the surrogate loss curve
x = np.arange(len(data))
70 changes: 70 additions & 0 deletions report.md
@@ -1,3 +1,73 @@
# Homework3-Policy-Gradient report

TA: try to elaborate the algorithms that you implemented and any details worth mentioning.

## Problem 1: Construct a neural network to represent policy

```python
# fc1
layer1 = tf.layers.dense(
    inputs=self._observations,
    units=hidden_dim,
    activation=tf.nn.tanh,  # tanh activation
    kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
    bias_initializer=tf.constant_initializer(0.1),
    name='fc1'
)
# fc2
probs = tf.layers.dense(
    inputs=layer1,
    units=out_dim,
    activation=tf.nn.softmax,
    kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
    bias_initializer=tf.constant_initializer(0.1),
    name='fc2'
)
```

> The policy is two fully-connected layers: a tanh hidden layer (fc1) and a softmax output layer (fc2) that maps observations to action probabilities.<br>
> Reference video: Morvan Zhou (莫煩)<br>
> https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/5-1-policy-gradient-softmax1/
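
During data sampling, `probs` (shape `[1, n_actions]`) is evaluated for the current observation and an action is drawn from that distribution. A minimal sketch of this step, assuming hypothetical names `probs_op` for the `probs` tensor and `observations_ph` for the observation placeholder (the homework class wires this up internally):

```python
import numpy as np

def sample_action(sess, probs_op, observations_ph, ob):
    # Evaluate the softmax output for a single observation -> shape [1, n_actions].
    p = sess.run(probs_op, feed_dict={observations_ph: ob[np.newaxis, :]})[0]
    # Draw a stochastic action according to the current policy distribution.
    return np.random.choice(len(p), p=p)
```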

## Problem 2: Compute the surrogate loss
```python
surr_loss = -tf.reduce_mean(log_prob * self._advantages)
```
<p align="left"><img src="https://morvanzhou.github.io/static/results/reinforcement-learning/5-1-1.png" height="200"/></p>

## Problem 3: Use baseline to reduce the variance of our gradient estimate
```python
"""
1. b: values predicted by our baseline
2. use the baseline to reduce variance
"""
a = r - b
```

<p align="left"><img src="image/p3-1.png" height="250"/><img src="image/p3-2.png" height="250"/></p>
> The baseline is subtracted from the returns to adjust the reward signal (converges after 63 iterations).
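
The baseline predicts a state value `b` from the observations; any predictor works for variance reduction as long as it does not depend on the sampled actions. As an illustration only (not the baseline class actually used in this lab), a least-squares value baseline could look like this:

```python
import numpy as np

class LeastSquaresBaseline(object):
    """Illustrative baseline: fit discounted returns from simple observation features."""

    def __init__(self, reg=1e-5):
        self.w = None
        self.reg = reg

    def _features(self, obs):
        # obs: array of shape [T, obs_dim]; add squared terms and a bias column.
        return np.concatenate([obs, obs ** 2, np.ones((len(obs), 1))], axis=1)

    def fit(self, obs, returns):
        # Ridge-regularized least squares: (F^T F + reg*I) w = F^T returns.
        f = self._features(obs)
        self.w = np.linalg.solve(f.T.dot(f) + self.reg * np.eye(f.shape[1]),
                                 f.T.dot(returns))

    def predict(self, obs):
        if self.w is None:
            return np.zeros(len(obs))
        return self._features(obs).dot(self.w)
```

Because the baseline depends only on the state, subtracting it in `a = r - b` leaves the expected policy gradient unchanged while lowering its variance.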

## Problem 4: Replace baseline with None
```python
baseline = None
```
<p align="left"><img src="image/p4-1.png" height="250"/><img src="image/p4-2.png" height="250"/></p>
> Without the baseline adjusting the loss, I expected the results to get worse, but they turned out about the same (converges after 59 iterations). This is consistent with the baseline only reducing variance without changing the expected gradient, so on a simple task the difference can be small.

## Problem 5: Actor-Critic algorithm (with bootstrapping)
```python
def discount_bootstrap(x, discount_rate, b):
    b = np.append(b[1:], 0)      # shift the value estimates one step; value after the last step is 0
    y = x + discount_rate * b    # one-step bootstrapped target: r_t + gamma * V(s_{t+1})
    return y
```
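
A quick sanity check of `discount_bootstrap` on a toy rollout (all numbers made up): each entry of the result is the one-step target r_t + discount_rate * b_{t+1}, with the value after the final step taken as 0.

```python
import numpy as np

rewards = np.array([1.0, 1.0, 1.0])
values = np.array([0.5, 0.4, 0.3])  # baseline predictions b_t (toy numbers)

print(discount_bootstrap(rewards, 0.99, values))
# [1 + 0.99*0.4, 1 + 0.99*0.3, 1 + 0.99*0.0] = [1.396, 1.297, 1.0]
```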

## Problem 6: Generalized Advantage Estimation
```python
r = util.discount_bootstrap(p["rewards"], self.discount_rate, b)   # one-step targets: r_t + gamma * b_{t+1}
target_v = util.discount_cumsum(p["rewards"], self.discount_rate)  # discounted cumulative returns (value targets)
a = r - b                                                          # TD residuals delta_t
a = util.discount(a, self.discount_rate * LAMBDA)                  # GAE: discount residuals by gamma * lambda
```
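
Here `r - b` is the one-step TD residual delta_t = r_t + gamma * b_{t+1} - b_t, and discounting the residuals by gamma * LAMBDA gives the GAE advantage A_t = sum_l (gamma * lambda)^l * delta_{t+l}. A self-contained NumPy sketch of the same computation (toy inputs; the helper below is illustrative, not the lab's `util` code):

```python
import numpy as np

def gae_advantages(rewards, values, gamma, lam):
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), with V after the last step = 0.
    next_values = np.append(values[1:], 0.0)
    deltas = rewards + gamma * next_values - values
    # GAE: reverse discounted cumulative sum of the residuals with factor gamma * lam.
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv

print(gae_advantages(np.array([1.0, 1.0, 1.0]),
                     np.array([0.5, 0.4, 0.3]), gamma=0.99, lam=0.98))
```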
<p align="left"><img src="image/p6-1.png" height="250"/><img src="image/p6-2.png" height="250"/></p>
> With GAE, convergence takes longer here (converges after 95 iterations).