In [15]:
from IPython.display import HTML
# Youtube
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/5SEEwqRH8_c?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

[Game Rules](https://github.com/openai/gym/wiki/CartPole-v0)

In [20]:
import numpy as np
import pickle as pickle
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import math

In [4]:
import gym
env = gym.make('CartPole-v0')

[2017-04-10 03:56:41,683] Making new env: CartPole-v0


In [22]:
env.reset()
random_episodes = 0
reward_sum = 0
while random_episodes < 10:
    env.render()
    observation, reward, done, _ = env.step(np.random.randint(0,2))
    reward_sum += reward
    if done:
        random_episodes += 1
        print ("Reward for this episode was:",reward_sum)
        reward_sum = 0
        env.reset()

Reward for this episode was: 16.0
Reward for this episode was: 27.0
Reward for this episode was: 19.0
Reward for this episode was: 13.0
Reward for this episode was: 17.0
Reward for this episode was: 26.0
Reward for this episode was: 12.0
Reward for this episode was: 11.0
Reward for this episode was: 42.0
Reward for this episode was: 19.0


In [7]:
# hyperparameters
H = 10 # number of hidden layer neurons
batch_size = 5 # every how many episodes to do a param update?
learning_rate = 1e-2 # feel free to play with this to train faster or more stably.
gamma = 0.99 # discount factor for reward

D = 4 # input dimensionality

In [8]:
tf.reset_default_graph()

#This defines the network as it goes from taking an observation of the environment to 
#giving a probability of chosing to the action of moving left or right.
observations = tf.placeholder(tf.float32, [None,D] , name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
           initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations,W1))
W2 = tf.get_variable("W2", shape=[H, 1],
           initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1,W2)
probability = tf.nn.sigmoid(score)

#From here we define the parts of the network needed for learning a good policy.
tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
advantages = tf.placeholder(tf.float32,name="reward_signal")

# The loss function. This sends the weights in the direction of making actions 
# that gave good advantage (reward over time) more likely, and actions that didn't less likely.
loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
loss = -tf.reduce_mean(loglik * advantages) 
newGrads = tf.gradients(loss,tvars)

# Once we have collected a series of gradients from multiple episodes, we apply them.
# We don't just apply gradeients after every episode in order to account for noise in the reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate) # Our optimizer
W1Grad = tf.placeholder(tf.float32,name="batch_grad1") # Placeholders to send the final gradients through when we update.
W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
batchGrad = [W1Grad,W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad,tvars))

In [9]:
def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

In [None]:
xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
#init = tf.initialize_all_variables()
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset() # Obtain an initial observation of the environment
    # Reset the gradient placeholder. We will collect gradients in 
    # gradBuffer until we are ready to update our policy network. 
    gradBuffer = sess.run(tvars)
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    
    while episode_number <= total_episodes:
        
        # Rendering the environment slows things down, 
        # so let's only look at it once our agent is doing a good job.
        if reward_sum/batch_size > 100 and rendering == True : 
            env.render()
            rendering = True
            
        # Make sure the observation is in a shape the network can handle.
        x = np.reshape(observation,[1,D])
        
        # Run the policy network and get an action to take. 
        tfprob = sess.run(probability,feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0
        
        xs.append(x) # observation
        y = 1 if action == 0 else 0 # a "fake label"
        ys.append(y)

        # step the environment and get new measurements
        observation, reward, done, info = env.step(action)
        reward_sum += reward

        drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

        if done: 
            episode_number += 1
            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            tfp = tfps
            xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[] # reset array memory

            # compute the discounted reward backwards through time
            discounted_epr = discount_rewards(epr)
            # size the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)
            
            # Get the gradient for this episode, and save it in the gradBuffer
            tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            for ix,grad in enumerate(tGrad):
                gradBuffer[ix] += grad
                
            # If we have completed enough episodes, then update the policy network with our gradients.
            if episode_number % batch_size == 0: 
                sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
                for ix,grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0
                
                # Give a summary of how well our network is doing for each batch of episodes.
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print ('Episode %d - Average reward: %f,Total average reward: %f.' % (episode_number,reward_sum/batch_size, running_reward/batch_size))
                
                if reward_sum/batch_size > 10000: 
                    print('Task solved in {} episodes!"'.format(episode_number))
                    break
                    
                reward_sum = 0
            
            observation = env.reset()
        
print (episode_number,'Episodes completed.')

Episode 5 - Average reward: 11.600000,Total average reward: 11.600000.
Episode 10 - Average reward: 25.200000,Total average reward: 11.736000.
Episode 15 - Average reward: 18.400000,Total average reward: 11.802640.
Episode 20 - Average reward: 28.400000,Total average reward: 11.968614.
Episode 25 - Average reward: 14.800000,Total average reward: 11.996927.
Episode 30 - Average reward: 35.400000,Total average reward: 12.230958.
Episode 35 - Average reward: 29.400000,Total average reward: 12.402649.
Episode 40 - Average reward: 29.200000,Total average reward: 12.570622.
Episode 45 - Average reward: 26.600000,Total average reward: 12.710916.
Episode 50 - Average reward: 22.400000,Total average reward: 12.807807.
Episode 55 - Average reward: 18.000000,Total average reward: 12.859729.
Episode 60 - Average reward: 21.800000,Total average reward: 12.949131.
Episode 65 - Average reward: 32.200000,Total average reward: 13.141640.
Episode 70 - Average reward: 22.800000,Total average reward: 13.2

Episode 570 - Average reward: 64.200000,Total average reward: 32.447534.
Episode 575 - Average reward: 99.400000,Total average reward: 33.117059.
Episode 580 - Average reward: 68.800000,Total average reward: 33.473888.
Episode 585 - Average reward: 70.800000,Total average reward: 33.847149.
Episode 590 - Average reward: 54.200000,Total average reward: 34.050678.
Episode 595 - Average reward: 71.600000,Total average reward: 34.426171.
Episode 600 - Average reward: 58.600000,Total average reward: 34.667909.
Episode 605 - Average reward: 77.400000,Total average reward: 35.095230.
Episode 610 - Average reward: 61.000000,Total average reward: 35.354278.
Episode 615 - Average reward: 69.400000,Total average reward: 35.694735.
Episode 620 - Average reward: 37.000000,Total average reward: 35.707788.
Episode 625 - Average reward: 69.600000,Total average reward: 36.046710.
Episode 630 - Average reward: 59.600000,Total average reward: 36.282243.
Episode 635 - Average reward: 49.200000,Total avera

Episode 1125 - Average reward: 191.800000,Total average reward: 101.872560.
Episode 1130 - Average reward: 188.800000,Total average reward: 102.741834.
Episode 1135 - Average reward: 175.400000,Total average reward: 103.468416.
Episode 1140 - Average reward: 198.200000,Total average reward: 104.415731.
Episode 1145 - Average reward: 160.400000,Total average reward: 104.975574.
Episode 1150 - Average reward: 178.200000,Total average reward: 105.707818.
Episode 1155 - Average reward: 174.000000,Total average reward: 106.390740.
Episode 1160 - Average reward: 198.000000,Total average reward: 107.306833.
Episode 1165 - Average reward: 176.600000,Total average reward: 107.999765.
Episode 1170 - Average reward: 150.200000,Total average reward: 108.421767.
Episode 1175 - Average reward: 183.800000,Total average reward: 109.175549.
Episode 1180 - Average reward: 161.600000,Total average reward: 109.699794.
Episode 1185 - Average reward: 167.800000,Total average reward: 110.280796.
Episode 1190

Episode 1665 - Average reward: 188.200000,Total average reward: 159.616013.
Episode 1670 - Average reward: 190.000000,Total average reward: 159.919853.
Episode 1675 - Average reward: 200.000000,Total average reward: 160.320654.
Episode 1680 - Average reward: 200.000000,Total average reward: 160.717448.
Episode 1685 - Average reward: 198.000000,Total average reward: 161.090273.
Episode 1690 - Average reward: 200.000000,Total average reward: 161.479370.
Episode 1695 - Average reward: 200.000000,Total average reward: 161.864577.
Episode 1700 - Average reward: 152.200000,Total average reward: 161.767931.
Episode 1705 - Average reward: 199.800000,Total average reward: 162.148252.
Episode 1710 - Average reward: 200.000000,Total average reward: 162.526769.
Episode 1715 - Average reward: 200.000000,Total average reward: 162.901501.
Episode 1720 - Average reward: 200.000000,Total average reward: 163.272486.
Episode 1725 - Average reward: 200.000000,Total average reward: 163.639762.
Episode 1730

Episode 2205 - Average reward: 194.800000,Total average reward: 184.287273.
Episode 2210 - Average reward: 200.000000,Total average reward: 184.444400.
Episode 2215 - Average reward: 186.400000,Total average reward: 184.463956.
Episode 2220 - Average reward: 200.000000,Total average reward: 184.619316.
Episode 2225 - Average reward: 197.800000,Total average reward: 184.751123.
Episode 2230 - Average reward: 188.200000,Total average reward: 184.785612.
Episode 2235 - Average reward: 196.000000,Total average reward: 184.897756.
Episode 2240 - Average reward: 200.000000,Total average reward: 185.048778.
Episode 2245 - Average reward: 200.000000,Total average reward: 185.198291.
Episode 2250 - Average reward: 200.000000,Total average reward: 185.346308.
Episode 2255 - Average reward: 200.000000,Total average reward: 185.492845.
Episode 2260 - Average reward: 200.000000,Total average reward: 185.637916.
Episode 2265 - Average reward: 200.000000,Total average reward: 185.781537.
Episode 2270

Episode 2745 - Average reward: 200.000000,Total average reward: 193.632216.
Episode 2750 - Average reward: 200.000000,Total average reward: 193.695894.
Episode 2755 - Average reward: 200.000000,Total average reward: 193.758935.
Episode 2760 - Average reward: 193.000000,Total average reward: 193.751345.
Episode 2765 - Average reward: 200.000000,Total average reward: 193.813832.
Episode 2770 - Average reward: 200.000000,Total average reward: 193.875694.
Episode 2775 - Average reward: 200.000000,Total average reward: 193.936937.
Episode 2780 - Average reward: 200.000000,Total average reward: 193.997567.
Episode 2785 - Average reward: 200.000000,Total average reward: 194.057592.
Episode 2790 - Average reward: 188.400000,Total average reward: 194.001016.
Episode 2795 - Average reward: 200.000000,Total average reward: 194.061006.
Episode 2800 - Average reward: 200.000000,Total average reward: 194.120396.
Episode 2805 - Average reward: 200.000000,Total average reward: 194.179192.
Episode 2810

Episode 3285 - Average reward: 200.000000,Total average reward: 196.941251.
Episode 3290 - Average reward: 200.000000,Total average reward: 196.971839.
Episode 3295 - Average reward: 200.000000,Total average reward: 197.002121.
Episode 3300 - Average reward: 200.000000,Total average reward: 197.032099.
Episode 3305 - Average reward: 200.000000,Total average reward: 197.061778.
Episode 3310 - Average reward: 200.000000,Total average reward: 197.091161.
Episode 3315 - Average reward: 200.000000,Total average reward: 197.120249.
Episode 3320 - Average reward: 200.000000,Total average reward: 197.149047.
Episode 3325 - Average reward: 191.400000,Total average reward: 197.091556.
Episode 3330 - Average reward: 200.000000,Total average reward: 197.120640.
Episode 3335 - Average reward: 191.000000,Total average reward: 197.059434.
Episode 3340 - Average reward: 200.000000,Total average reward: 197.088840.
Episode 3345 - Average reward: 200.000000,Total average reward: 197.117951.
Episode 3350

Episode 3825 - Average reward: 200.000000,Total average reward: 197.882846.
Episode 3830 - Average reward: 200.000000,Total average reward: 197.904018.
Episode 3835 - Average reward: 200.000000,Total average reward: 197.924978.
Episode 3840 - Average reward: 200.000000,Total average reward: 197.945728.
Episode 3845 - Average reward: 200.000000,Total average reward: 197.966271.
Episode 3850 - Average reward: 200.000000,Total average reward: 197.986608.
Episode 3855 - Average reward: 200.000000,Total average reward: 198.006742.
Episode 3860 - Average reward: 200.000000,Total average reward: 198.026674.
Episode 3865 - Average reward: 200.000000,Total average reward: 198.046408.
Episode 3870 - Average reward: 200.000000,Total average reward: 198.065943.
Episode 3875 - Average reward: 200.000000,Total average reward: 198.085284.
Episode 3880 - Average reward: 200.000000,Total average reward: 198.104431.
Episode 3885 - Average reward: 200.000000,Total average reward: 198.123387.
Episode 3890

Episode 4365 - Average reward: 200.000000,Total average reward: 198.584901.
Episode 4370 - Average reward: 200.000000,Total average reward: 198.599052.
Episode 4375 - Average reward: 200.000000,Total average reward: 198.613062.
Episode 4380 - Average reward: 200.000000,Total average reward: 198.626931.
Episode 4385 - Average reward: 200.000000,Total average reward: 198.640662.
Episode 4390 - Average reward: 200.000000,Total average reward: 198.654255.
Episode 4395 - Average reward: 200.000000,Total average reward: 198.667713.
Episode 4400 - Average reward: 200.000000,Total average reward: 198.681035.
Episode 4405 - Average reward: 200.000000,Total average reward: 198.694225.
Episode 4410 - Average reward: 200.000000,Total average reward: 198.707283.
Episode 4415 - Average reward: 200.000000,Total average reward: 198.720210.
Episode 4420 - Average reward: 200.000000,Total average reward: 198.733008.
Episode 4425 - Average reward: 190.400000,Total average reward: 198.649678.
Episode 4430

Episode 4905 - Average reward: 200.000000,Total average reward: 198.087653.
Episode 4910 - Average reward: 200.000000,Total average reward: 198.106777.
Episode 4915 - Average reward: 200.000000,Total average reward: 198.125709.
Episode 4920 - Average reward: 200.000000,Total average reward: 198.144452.
Episode 4925 - Average reward: 200.000000,Total average reward: 198.163007.
Episode 4930 - Average reward: 191.800000,Total average reward: 198.099377.
Episode 4935 - Average reward: 193.600000,Total average reward: 198.054383.
Episode 4940 - Average reward: 200.000000,Total average reward: 198.073840.
Episode 4945 - Average reward: 200.000000,Total average reward: 198.093101.
Episode 4950 - Average reward: 189.000000,Total average reward: 198.002170.
Episode 4955 - Average reward: 186.200000,Total average reward: 197.884148.
Episode 4960 - Average reward: 200.000000,Total average reward: 197.905307.
Episode 4965 - Average reward: 200.000000,Total average reward: 197.926254.
Episode 4970

Episode 5445 - Average reward: 200.000000,Total average reward: 198.799129.
Episode 5450 - Average reward: 200.000000,Total average reward: 198.811137.
Episode 5455 - Average reward: 200.000000,Total average reward: 198.823026.
Episode 5460 - Average reward: 200.000000,Total average reward: 198.834796.
Episode 5465 - Average reward: 196.800000,Total average reward: 198.814448.
Episode 5470 - Average reward: 200.000000,Total average reward: 198.826303.
Episode 5475 - Average reward: 200.000000,Total average reward: 198.838040.
Episode 5480 - Average reward: 200.000000,Total average reward: 198.849660.
Episode 5485 - Average reward: 200.000000,Total average reward: 198.861163.
Episode 5490 - Average reward: 200.000000,Total average reward: 198.872552.
Episode 5495 - Average reward: 200.000000,Total average reward: 198.883826.
Episode 5500 - Average reward: 200.000000,Total average reward: 198.894988.
Episode 5505 - Average reward: 200.000000,Total average reward: 198.906038.
Episode 5510

Episode 5985 - Average reward: 200.000000,Total average reward: 199.514870.
Episode 5990 - Average reward: 200.000000,Total average reward: 199.519721.
Episode 5995 - Average reward: 200.000000,Total average reward: 199.524524.
Episode 6000 - Average reward: 200.000000,Total average reward: 199.529278.
Episode 6005 - Average reward: 200.000000,Total average reward: 199.533986.
Episode 6010 - Average reward: 200.000000,Total average reward: 199.538646.
Episode 6015 - Average reward: 200.000000,Total average reward: 199.543259.
Episode 6020 - Average reward: 200.000000,Total average reward: 199.547827.
Episode 6025 - Average reward: 200.000000,Total average reward: 199.552348.
Episode 6030 - Average reward: 200.000000,Total average reward: 199.556825.
Episode 6035 - Average reward: 200.000000,Total average reward: 199.561257.
Episode 6040 - Average reward: 195.200000,Total average reward: 199.517644.
Episode 6045 - Average reward: 200.000000,Total average reward: 199.522468.
Episode 6050

Episode 6525 - Average reward: 200.000000,Total average reward: 199.136436.
Episode 6530 - Average reward: 200.000000,Total average reward: 199.145071.
Episode 6535 - Average reward: 200.000000,Total average reward: 199.153621.
Episode 6540 - Average reward: 196.000000,Total average reward: 199.122084.
Episode 6545 - Average reward: 200.000000,Total average reward: 199.130863.
Episode 6550 - Average reward: 200.000000,Total average reward: 199.139555.
Episode 6555 - Average reward: 200.000000,Total average reward: 199.148159.
Episode 6560 - Average reward: 200.000000,Total average reward: 199.156678.
Episode 6565 - Average reward: 200.000000,Total average reward: 199.165111.
