# Learning with Policy Gradient

The policy network in this notebook is built with Keras layers (via `tf.contrib.keras`) instead of tf-slim.

In [4]:
import tensorflow as tf
from tensorflow.contrib.keras.api.keras.layers import Input, Dense
from tensorflow.contrib.keras.api.keras.models import Model

import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Create the CartPole-v0 environment; the training cell below calls
# env.reset() and env.step() on this shared instance.
env = gym.make('CartPole-v0')

[2017-11-28 16:04:01,418] Making new env: CartPole-v0


### Reward function

In [6]:
# Discount factor applied to future rewards.
gamma = 0.99

def discounted_rewards(r):
    """Return the discounted return for every step of one episode.

    For each index t the result holds
    r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...

    Args:
        r: sequence of per-step rewards (list or array, any numeric dtype).

    Returns:
        A float64 numpy array with the same length as `r`.
    """
    # Always accumulate in floating point: np.zeros_like on an integer input
    # would otherwise create an integer array and truncate every discounted value.
    rewards = np.asarray(r, dtype=np.float64)
    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step's return is its own reward plus the
    # discounted return of the step that follows it.
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

### Agent

In [33]:
class Agent:
    """Policy-gradient agent with a one-hidden-layer softmax policy network.

    Building an instance adds the network, the loss, the gradient ops and the
    optimizer update op to the default TensorFlow graph.

    Attributes used by the training code:
        input: placeholder for a batch of states, shape [None, state_size].
        output: softmax action probabilities, one row per state.
        reward_holder: placeholder for the (discounted) reward of each step.
        action_holder: placeholder for the index of the action taken at each step.
        loss: negative mean of log(p(chosen action)) * reward.
        gradients: gradients of `loss` w.r.t. every trainable variable.
        gradient_holders: one placeholder per trainable variable, used to feed
            externally accumulated gradients.
        update_batch: op that applies the fed gradients with Adam.
    """

    def __init__(self, learning_rate, state_size, action_size, neurons):
        """Build the policy network and its training ops.

        Args:
            learning_rate: learning rate passed to the Adam optimizer.
            state_size: number of features in one state vector.
            action_size: number of discrete actions (size of the softmax).
            neurons: number of units in the hidden ReLU layer.
        """
        # Policy network mapping state -> action probabilities.
        self.input = tf.placeholder(shape=[None, state_size],dtype=tf.float32)
        hidden = Dense(neurons, activation='relu')(self.input)
        self.output = Dense(action_size, activation='softmax')(hidden)
        self.chosen_action = tf.argmax(self.output, 1)

        # Set up the training procedure for the network.
        # Because this is RL we need to define the loss manually.
        # Reward holder receives the discounted reward for each taken action.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        # Action holder receives the index of the action taken at each step.
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

        # Position of each selected action inside the flattened output:
        # row_index * nbr_actions + selected_action
        self.indicies = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder

        # Flatten output into 1D
        flat_output = tf.reshape(self.output, [-1])

        # Gather the probability the network assigned to each selected action.
        self.responisble_outputs = tf.gather(flat_output, self.indicies)

        # Loss is -mean(log(p(action)) * reward). Clip the probability before
        # taking the log so that a softmax output that underflows to 0 cannot
        # produce -inf in the loss and NaN gradients.
        safe_probabilities = tf.clip_by_value(self.responisble_outputs, 1e-10, 1.0)
        weighted_log_prob = tf.log(safe_probabilities) * self.reward_holder
        self.loss = -tf.reduce_mean(weighted_log_prob)

        # All trainable variables currently in the default graph.
        # NOTE(review): this assumes the agent is the only model in the graph
        # (the training cell resets the graph before constructing it).
        trainable = tf.trainable_variables()

        # One placeholder per trainable variable, so gradients accumulated
        # outside the graph can be fed back in for the update step.
        self.gradient_holders = []
        for idx, _ in enumerate(trainable):
            placeholder = tf.placeholder(dtype=tf.float32, name='%d_holder' % idx)
            self.gradient_holders.append(placeholder)

        # `gradients` computes per-batch gradients; `update_batch` applies
        # whatever is fed into the gradient holders.
        self.gradients = tf.gradients(self.loss, trainable)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, trainable))
        
        

In [44]:
# Clear tensorflow graph so re-running this cell builds a fresh network
tf.reset_default_graph()

agent = Agent(1e-2, 4, 2, 8) #Load the agent.
total_episodes = 5000
max_ep = 999
update_frequency = 5 # Batch size: number of episodes per weight update

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_length = []

    # One zero-filled accumulator per trainable variable, matching its shape and dtype.
    gradBuffer = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]

    while i < total_episodes:
        s = env.reset()
        running_reward = 0
        # Keep states, actions and rewards in separate lists; packing state
        # vectors and scalars into a single np.array creates a ragged array,
        # which newer NumPy versions reject.
        states, actions, rewards = [], [], []

        for j in range(max_ep):
            # Sample an action from the policy's probability distribution
            action_probs = sess.run(agent.output, feed_dict={agent.input: [s]})[0]
            a_idx = np.random.choice(len(action_probs), p=action_probs)

            s1, r, done, _ = env.step(a_idx)
            states.append(s)
            actions.append(a_idx)
            rewards.append(r)
            s = s1
            running_reward += r

            if done:
                # Compute this episode's gradients from its discounted rewards
                feed_dict = {
                    agent.reward_holder: discounted_rewards(np.asarray(rewards, dtype=np.float64)),
                    agent.action_holder: actions,
                    agent.input: np.vstack(states)
                }

                gradients = sess.run(agent.gradients, feed_dict=feed_dict)
                for idx, grad in enumerate(gradients):
                    gradBuffer[idx] += grad

                # Apply the accumulated gradients after every `update_frequency`
                # episodes. (i + 1) is the number of episodes finished so far,
                # so every batch, including the first, has the same size.
                if (i + 1) % update_frequency == 0:
                    feed_dict = dict(zip(agent.gradient_holders, gradBuffer))
                    sess.run(agent.update_batch, feed_dict=feed_dict)

                    # Zero the accumulators in place so they keep shape and dtype
                    for accumulated in gradBuffer:
                        accumulated.fill(0)

                total_reward.append(running_reward)
                # j is the zero-based index of the last step, so the length is j + 1
                total_length.append(j + 1)
                break

        if i % 100 == 0:
            print('Mean reward: ', np.mean(total_reward[-100:]))
        i += 1
                    
    

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Mean reward:  22.0
Mean reward:  22.38
Mean reward:  27.61
Mean reward:  35.96
Mean reward:  52.26
Mean reward:  65.25
Mean reward:  94.88
Mean reward:  128.69
Mean reward:  177.42
Mean reward:  182.45
Mean reward:  155.45
Mean reward:  171.28
Mean reward:  115.6
Mean reward:  113.93
Mean reward:  167.25
Mean reward:  198.9
Mean reward:  190.94
Mean reward:  171.21


KeyboardInterrupt: 