CartPole.py
import numpy as np
import cPickle as pickle
import tensorflow as tf
import matplotlib.pyplot as plt
import math
import gym
env = gym.make('CartPole-v0')

# Hyperparameters
H = 10  # number of hidden layer neurons
batch_size = 5  # number of episodes per policy update
learning_rate = 1e-2  # Adam learning rate
gamma = 0.99  # discount factor for rewards
D = 4  # input dimensionality (the CartPole observation is a 4-vector)

tf.reset_default_graph()
# Placeholders. Each holds a variable-length batch (one row per timestep of an episode),
# which is why the leading dimension is None.
input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")  # "Fake label": defined below as
# y = 1 if action == 0 else 0, i.e. the opposite of the action taken, so the loss can be written
# directly in terms of the sigmoid output.
advantages = tf.placeholder(tf.float32, name="reward_signal")  # discounted, normalized rewards
observations = tf.placeholder(tf.float32, [None, D], name="input_x")  # environment observations
# Weights: created with tf.get_variable so we can use the Xavier initializer and pick them up in tvars.
W1 = tf.get_variable("W1", shape=[D, H], initializer=tf.contrib.layers.xavier_initializer())
W2 = tf.get_variable("W2", shape=[H, 1], initializer=tf.contrib.layers.xavier_initializer())

# Network: a single ReLU hidden layer with a sigmoid on the output.
layer1 = tf.nn.relu(tf.matmul(observations, W1))
score = tf.matmul(layer1, W2)
probability = tf.nn.sigmoid(score)  # probability of taking action 1
# Loss. The goal is to push the weights in the direction that makes actions with a good advantage
# (discounted reward over time) more likely, and actions with a poor advantage less likely.
tvars = tf.trainable_variables()  # the trainable variables (W1 and W2)
loglik = tf.log(input_y * (input_y - probability) + (1 - input_y) * (input_y + probability))
# Since input_y = 1 - action, this evaluates to log(probability) when the action was 1 and
# log(1 - probability) when the action was 0, i.e. the log-probability of the action actually taken.
loss = -tf.reduce_mean(loglik * advantages)
newGrads = tf.gradients(loss, tvars)  # gradients of the loss w.r.t. W1 and W2
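# This is the REINFORCE / policy-gradient objective: loss = -mean(log pi(action | observation) * advantage),
# so minimizing it nudges the policy toward actions that received a large advantage.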
# Optimizer
adam = tf.train.AdamOptimizer(learning_rate=learning_rate) # Adam optimizer
W1Grad = tf.placeholder(tf.float32, name="batch_grad1")
W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
batchGrad = [W1Grad, W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))  # apply the accumulated batch gradients to W1 and W2
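# Rather than calling adam.minimize(loss) after every episode, the per-episode gradients (newGrads)
# are summed into a buffer for batch_size episodes and only then fed back in through W1Grad/W2Grad
# and applied, so each policy update aggregates several episodes' worth of experience.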
def discount_rewards(r):
    """Take a 1D array of per-timestep rewards and return the discounted return at each timestep."""
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
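# For example, with gamma = 0.99 a reward sequence [1, 1, 1] becomes [2.9701, 1.99, 1.0]:
# each entry is that step's reward plus the discounted sum of all rewards that follow it.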
xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []  # per-episode histories (observations, fake labels, rewards, ...)
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()
reward_stuff = []  # per-batch average reward, kept for plotting at the end
# Launch the graph and run the training loop.
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()  # initial observation of the environment
    # The gradient buffer starts as zeroed copies of the trainable variables and collects
    # per-episode gradients until the policy network is updated.
    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    while episode_number <= total_episodes:
        # Rendering slows training down, so only render once the average reward is already high.
        if reward_sum / batch_size > 200 or rendering:
            env.render()
            rendering = True
            break  # stop the training loop once this threshold has been reached
        # Reshape the observation into a 1 x D row the network can consume.
        x = np.reshape(observation, [1, D])
        # Run the policy network and sample an action from the resulting probability.
        tfprob = sess.run(probability, feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0
        xs.append(x)  # record the observation
        y = 1 if action == 0 else 0  # the "fake label": the opposite of the action taken
        ys.append(y)
        # Step the environment and record the reward for the action just taken.
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        drs.append(reward)
        if done:
            episode_number += 1
            # Stack this episode's observations, fake labels and rewards.
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            tfp = tfps
            xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []  # reset the episode memory
            # Discount the rewards back through time, then normalize them.
            discounted_epr = discount_rewards(epr)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)
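            # Normalizing the returns to zero mean and unit variance acts as a simple baseline:
            # better-than-average timesteps get a positive advantage (made more likely) and
            # worse-than-average ones a negative advantage, which keeps the gradient estimate stable.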
            # Compute this episode's gradients and add them to the buffer.
            tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad
            # Every batch_size episodes, apply the accumulated gradients and reset the buffer.
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                reward_stuff.append(reward_sum / batch_size)
                print 'Average reward for episode %f. Total average reward %f.' % (reward_sum / batch_size, running_reward / batch_size)
                if reward_sum / batch_size >= 200:
                    print "Task solved in", episode_number, 'episodes!'
                    break
                reward_sum = 0
            observation = env.reset()

print episode_number, 'Episodes completed.'
def low_pass(in_array, strength):
    """Smooth the reward curve in place for plotting."""
    for i in range(len(in_array) - strength * 2):
        running_total = 0
        for j in range(strength + 1):
            running_total += in_array[i + j]
        in_array[i + strength] = running_total / (strength + 1)
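# low_pass replaces each point with the mean of itself and the preceding `strength` (already-smoothed)
# values, leaving the first and last `strength` points untouched. For example, with strength = 1,
# [0.0, 2.0, 4.0, 6.0] becomes [0.0, 1.0, 2.5, 6.0].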
low_pass(reward_stuff, 5)
plt.plot(reward_stuff)
plt.show()