In [None]:
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
from collections import deque
import random

# Seed every source of randomness used in this notebook so runs are repeatable:
# NumPy generates the exploration noise, the stdlib `random` module picks the
# replay-buffer samples, and TensorFlow initialises the network weights.
np.random.seed(0)
random.seed(0)
tf.reset_default_graph()
# The graph-level seed is attached to the default graph, so set it after the reset.
tf.set_random_seed(0)

# Experience Replay

Training RL agents using only the latest set of state transitions S, A, R, S' generally leads to a problem known as "catastrophic forgetting", in which agents get too caught up on recent observations and overwrite their network weights with gradients generated from very specific, recent trajectories. The end result is that the agent learns, then swiftly forgets, how to do a task.

To break up that temporal correlation we use memory replay to buffer the last ```N``` state transitions and then randomly sample the buffer during training.

In [None]:
class Experience(object):
    """Fixed-capacity replay buffer of (S, A, R, S', not_terminal) transitions.

    Once the buffer is full, appending a new transition discards the oldest
    one (this is the behaviour of a ``deque`` with ``maxlen`` set).
    """

    def __init__(self, bufferSize=1e5):
        """Create an empty buffer.

        Args:
            bufferSize: maximum number of transitions kept. May be given as a
                float such as ``1e5``; it is truncated to an integer.
        """
        # deque only accepts an integer maxlen; the default (and callers that
        # use scientific notation) pass a float, which would raise TypeError.
        self.buffer = deque([], int(bufferSize))

    def recall(self, batchSize=1024):
        """Draw a random batch of stored transitions without replacement.

        Args:
            batchSize: requested batch size; capped at the number of stored
                transitions.

        Returns:
            A tuple ``(S, A, R, S_dash, not_terminal)`` of numpy arrays. With
            ``n`` the effective batch size, ``S``, ``A`` and ``S_dash`` are
            reshaped to ``(n, -1)`` and ``R`` and ``not_terminal`` to ``(n,)``.
        """
        batchSize = min(len(self.buffer), batchSize)

        batch = random.sample(self.buffer, batchSize)

        S = np.asarray([sample[0] for sample in batch]).reshape(batchSize, -1)
        A = np.asarray([sample[1] for sample in batch]).reshape(batchSize, -1)
        R = np.asarray([sample[2] for sample in batch]).reshape(batchSize)
        S_dash = np.asarray([sample[3] for sample in batch]).reshape(batchSize, -1)
        not_terminal = np.asarray([sample[4] for sample in batch]).reshape(batchSize)

        return S, A, R, S_dash, not_terminal

    def store(self, state, action, reward, nextState, not_terminal):
        """Append one transition to the buffer."""
        self.buffer.append([state, action, reward, nextState, not_terminal])

# Ornstein–Uhlenbeck Process

DDPG generates a deterministic policy, whereas it's usually better to allow for some stochasticity during training so that the agent explores its environment more effectively. DDPG is off-policy, so technically we could use any sufficiently exploratory policy we like, but in accordance with the DeepMind paper, we'll just overlay an Ornstein–Uhlenbeck process on top of the current deterministic policy.

In [None]:
class OU(object):
    """Discrete-time Ornstein–Uhlenbeck noise process.

    Each call to ``get_noise`` advances the state ``x`` by one step:

        x <- x + theta * (mu - x) + sigma * N(0, I)

    so consecutive samples are correlated and drift back towards ``mu``.
    """

    def __init__(self, dim, mu, theta, sigma):
        """Start the process at zero.

        Args:
            dim: number of independent noise components.
            mu: value the process reverts to.
            theta: strength of the pull towards ``mu`` per step.
            sigma: scale of the Gaussian perturbation added per step.
        """
        self.dim = dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.noise_process = np.zeros(dim)

    def get_noise(self):
        """Advance the process by one step and return the new state."""
        drift = self.theta * (self.mu - self.noise_process)
        diffusion = self.sigma * np.random.randn(self.dim)
        # The increment must be added to the current state; assigning the
        # increment alone discards the state and loses the temporal
        # correlation that defines the process. A new array is created (no
        # in-place +=) so arrays returned by earlier calls are not modified.
        self.noise_process = self.noise_process + drift + diffusion
        return self.noise_process

# Setup the Tensorflow Networks

## Network

DDPG uses four networks; an actor, a critic and two target networks. The Network class generates the tf forward pass operation for a fully connected multilayer network using a spec dictionary to define the number of units and activation for each layer.

## Target Networks

Directly using the critic to generate target values during training causes the learning process to destabilise. To introduce some regularisation, DDPG uses slow moving copies of the actor and critic called target networks. The ```TargetNetwork``` class below implements these networks and defines their update operations.

In [None]:
class Network(object):
    """Fully connected multilayer network described by a list of layer specs.

    Each entry of ``spec`` is a dict with the keys ``'units'`` and
    ``'activation'``; one dense layer is created per entry, in order.
    """

    def __init__(self, input_shape, spec, scope, trainable):
        """Create the network's variables inside ``scope``.

        Args:
            input_shape: shape of the placeholder used to build the layers.
            spec: list of layer dicts (``'units'``, ``'activation'``).
            scope: variable scope name that owns the layers.
            trainable: forwarded to every dense layer.
        """
        self.spec = spec
        self.scope = scope
        self.trainable = trainable

        # Run one forward pass on a throwaway placeholder with reuse disabled
        # so the variables exist; later passes share them via reuse=True.
        build_input = tf.placeholder(dtype=tf.float32, shape=input_shape)
        self.get_forward_pass_op(build_input, False)
        self.vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope)

    def get_forward_pass_op(self, inputs, reuse=True):
        """Return the op that feeds ``inputs`` through every layer in ``spec``."""
        outputs = inputs
        with tf.variable_scope(self.scope, reuse=reuse):
            for layer in self.spec:
                outputs = tf.layers.dense(
                    outputs,
                    layer['units'],
                    activation=layer['activation'],
                    trainable=self.trainable,
                )

        return outputs

    def sum_weights(self):
        """Return the sum of ``tf.nn.l2_loss`` over variables not named as biases."""
        penalties = []
        for var in self.vars:
            if 'bias' in var.name:
                continue
            penalties.append(tf.nn.l2_loss(var))
        return tf.add_n(penalties)
    
class TargetNetwork(Network):
    """Network whose variables are moved slowly towards those of another network."""

    def __init__(self, input_shape, spec, scope, trainable):
        """Build the network exactly as ``Network`` does."""
        super(TargetNetwork, self).__init__(input_shape, spec, scope, trainable)

    def get_target_train_op(self, target_network, tau):
        """Return one op that blends every variable towards ``target_network``.

        Each variable ``v`` of this network is assigned
        ``tau * w + (1 - tau) * v``, where ``w`` is the variable at the same
        index in ``target_network.vars``.

        Args:
            target_network: the network being tracked. Despite the parameter
                name, the visible callers pass the fast actor/critic here.
            tau: blending factor applied to the tracked network's variables.
        """
        tracked_vars = target_network.vars
        blend_ops = [
            own_var.assign(tau * tracked_vars[idx] + (1 - tau) * own_var)
            for idx, own_var in enumerate(self.vars)
        ]

        return tf.group(*blend_ops)

# Define an Agent

The Agent represents the reinforcement learning algorithm. Like just about any RL algorithm, it can be asked to choose an action in a given state via ```act```, and it can observe S,A,R,S' tuples by storing them in its experience replay buffer.

## Training the Critic

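The critic is trained in basically the same way as the Q-function from DQN. The loss function is defined as the squared TD error (averaged over the sampled batch) plus an L2 regularisation term on the critic network weights, weighted by $ \lambda $ (`l2_reg` in the code):

\begin{equation}
loss = \left[ r + \gamma Q_{target}(s',\pi_{target}(s')) - Q(s,a) \right]^2 + \frac{\lambda}{2} \sum_{w \in \theta_Q} \lVert w \rVert^2
\end{equation}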

Where:
* $ s, a, r, s'$ are a state, action, reward and next state sampled from the experience replay buffer, 
* $ \pi_{target} $ is the actor target network
* $ Q $ and $ Q_{target} $ are the critic and critic target network respectively
* $ \theta_Q $ is the critic network weights

The loss function is then minimised using an AdamOptimiser

## Training the Actor

To train the actor, we just use an AdamOptimiser to maximise the on-policy Q-function $ Q(s,\pi(s)) $ (or equivalently minimise $ - Q(s,\pi(s))$ ) along with a regularisation term on the actor network weights. To slow the actor's learning rate during training, the learning rate (the Adam step size) decays in proportion to $ \epsilon $ (`alpha_decay` in the code) raised to the power of the episode counter.

In [None]:
class Agent(object):
    """DDPG agent: actor and critic networks, their slow target copies,
    Ornstein–Uhlenbeck exploration noise and an experience replay buffer.
    """

    def __init__(self, env, actor_network_spec, critic_network_spec, alpha=1e-3, alpha_decay=1, gamma=0.99, tau=1e-2, l2_reg=5e-7):
        """Build the TensorFlow graph and initialise all variables.

        Args:
            env: environment exposing ``observation_space.shape``,
                ``action_space.shape``, ``action_space.high`` and
                ``action_space.low``.
            actor_network_spec: layer spec list for the actor networks.
            critic_network_spec: layer spec list for the critic networks.
            alpha: base Adam learning rate for both actor and critic.
            alpha_decay: the learning rate is ``alpha * alpha_decay ** episodes``.
            gamma: discount factor applied to the bootstrapped target value.
            tau: blending factor for the slow target-network updates.
            l2_reg: weight of the L2 penalty on the network weights.
        """
        self.sess = tf.Session()
        self.env = env

        state_dim = np.prod(env.observation_space.shape)
        action_dim = np.prod(env.action_space.shape)

        # Factor applied to the raw actor output to produce an action.
        # NOTE(review): with a tanh output layer this maps actions onto
        # [-(high - low), high - low], which is wider than [low, high] --
        # confirm whether (high - low) / 2 plus an offset was intended.
        action_range = self.env.action_space.high - self.env.action_space.low

        # Ornstein–Uhlenbeck process
        self.OU = OU(action_dim, 0.0, 0.15, 0.2)

        # experience replay (the capacity must be an integer: deque rejects a float maxlen)
        self.replay_memory = Experience(int(1e5))

        # episode counter
        self.episodes = tf.Variable(0.0, trainable=False)
        self.episode_inc_op = self.episodes.assign_add(1)

        # tf placeholders
        self.state_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_dim])
        self.action_ph = tf.placeholder(dtype=tf.float32, shape=[None, action_dim])
        self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        self.next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_dim])
        self.is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None])

        # set up the networks
        critic_network = Network([None, state_dim + action_dim], critic_network_spec, 'critic_net', trainable=True)
        actor_network = Network([None, state_dim], actor_network_spec, 'actor_net', trainable=True)
        slow_critic_network = TargetNetwork([None, state_dim + action_dim], critic_network_spec, 'slow_critic_net', trainable=False)
        slow_actor_network = TargetNetwork([None, state_dim], actor_network_spec, 'slow_actor_net', trainable=False)

        # set up the actor operation
        self.policy_op = actor_network.get_forward_pass_op(self.state_ph) * action_range

        # set up the critic training operation
        # The target actor's output gets the same scaling as policy_op so that
        # the target critic sees actions on the scale the critic is trained on
        # (the replayed actions come from the scaled policy).
        slow_target_next_actions = slow_actor_network.get_forward_pass_op(self.next_state_ph) * action_range
        slow_q_values_next = slow_critic_network.get_forward_pass_op(tf.concat([self.next_state_ph, slow_target_next_actions], axis=1))

        critic_off_pol = critic_network.get_forward_pass_op(tf.concat([self.state_ph, self.action_ph], axis=1))

        # is_not_terminal masks out the bootstrapped value for terminal transitions
        targets = tf.expand_dims(self.reward_ph, 1) + tf.expand_dims(self.is_not_terminal_ph, 1) * gamma * slow_q_values_next
        td_errors = targets - critic_off_pol

        critic_loss = tf.reduce_mean(tf.square(td_errors)) + l2_reg * critic_network.sum_weights()
        self.critic_train_op = tf.train.AdamOptimizer(alpha * alpha_decay ** self.episodes).minimize(critic_loss)

        # set up the actor training operation
        critic_on_pol = critic_network.get_forward_pass_op(tf.concat([self.state_ph, self.policy_op], axis=1))
        actor_loss = -1 * tf.reduce_mean(critic_on_pol) + l2_reg * actor_network.sum_weights()
        # var_list restricts this update to the actor; the loss also depends on the critic's variables
        self.actor_train_op = tf.train.AdamOptimizer(alpha * alpha_decay ** self.episodes).minimize(actor_loss, var_list=actor_network.vars)

        # train slow networks
        self.slow_actor_train_op = slow_actor_network.get_target_train_op(actor_network, tau=tau)
        self.slow_critic_train_op = slow_critic_network.get_target_train_op(critic_network, tau=tau)

        self.sess.run(tf.global_variables_initializer())

    def act(self, state, initial_noise_scale=0.0, noise_decay=0.99):
        """Return the policy's action for ``state`` plus scaled exploration noise.

        Args:
            state: array fed to the ``[None, state_dim]`` state placeholder.
            initial_noise_scale: noise multiplier at episode 0; the default of
                0.0 disables the noise entirely.
            noise_decay: the multiplier is scaled by ``noise_decay ** episodes``.

        Returns:
            The action array produced by the policy op with the noise added.
        """
        action = self.sess.run(self.policy_op, feed_dict = {self.state_ph: state})

        self.noise_scale = (initial_noise_scale * noise_decay ** self.sess.run(self.episodes)) * (self.env.action_space.high - self.env.action_space.low)
        action += self.noise_scale * self.OU.get_noise()

        return action

    def train(self, batch_size=1024):
        """Run one critic, actor and target-network update from replay memory.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.replay_memory.buffer) >= batch_size:
            # grab N (s,a,r,s') tuples from replay memory
            S, A, R, S_dash, not_terminal = self.replay_memory.recall(batch_size)

            # update the critic and actor params using mean-square value error and deterministic policy gradient, respectively
            self.sess.run(self.critic_train_op, feed_dict = {self.state_ph: S, self.action_ph: A, self.reward_ph: R, self.next_state_ph: S_dash, self.is_not_terminal_ph: not_terminal})
            self.sess.run(self.actor_train_op, feed_dict = {self.state_ph: S})

            # update slow actor and critic targets towards current actor and critic
            self.sess.run([self.slow_actor_train_op, self.slow_critic_train_op])

    def increment_episode(self):
        """Add one to the episode counter that drives the learning-rate and noise decay."""
        self.sess.run(self.episode_inc_op)

# Simulate!

Finally, we simulate the agent using the Pendulum-v0 environment from the OpenAI Gym. On every iteration of an episode for 1000 episodes, the agent acts, stores the S,A,R,S' tuple and trains its networks. After usually around 20-30 episodes it homes in on a good policy for swinging up and stabilising the pendulum and the total reward increases to around -200.

In [None]:
# Train the DDPG agent on Pendulum-v0: build the environment and the agent,
# then run num_episodes episodes, training once per environment step.
env = gym.make('Pendulum-v0')
# Wrap the env in a Monitor that uses ./pendulum-v0-experiment as its directory.
# NOTE(review): force=True presumably overwrites earlier results there -- confirm
# against the gym version in use.
env = wrappers.Monitor(env, './pendulum-v0-experiment', force=True)
env.seed(0)

# Critic: three hidden ReLU layers of 8 units and a single linear output.
critic_network_spec = [{'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': 1, 'activation': None}]

# Actor: same hidden layers; one tanh output per action dimension.
actor_network_spec =  [{'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': np.prod(env.action_space.shape), 'activation': tf.nn.tanh}]

agent = Agent(env, 
              actor_network_spec,
              critic_network_spec,
              alpha=1e-3,
              alpha_decay=1,
              gamma=0.99,
              tau=1e-2, 
              l2_reg=5e-7)

num_episodes = 1000
# Upper bound on steps per episode; the loop also stops as soon as the env reports done.
max_steps_ep = 10000

for ep in range(num_episodes):
    total_reward = 0
    steps_in_ep = 0

    # Initial state; [None] adds a leading batch axis to match the agent's
    # [None, state_dim] state placeholder.
    s_t = env.reset()[None]
    
    # Render only every 10th episode.
    if ep % 10 == 0: env.render()

    for t in range(max_steps_ep):
        # NOTE(review): act() is called with its default initial_noise_scale=0.0,
        # so the OU noise is multiplied by zero and no exploration noise is
        # added -- confirm this is intended.
        a_t = agent.act(s_t)

        # take step
        s_t_plus_1, r_t, done, _info = env.step(a_t)
                
        if ep % 10 == 0: env.render()
        total_reward += r_t

        # The last field is the not-terminal flag (0.0 on the final step), which
        # the agent uses to mask the bootstrapped target value.
        # NOTE(review): the .T presumably turns a column-shaped observation back
        # into a (1, state_dim) row -- verify the shape env.step returns here.
        agent.replay_memory.store(s_t, a_t, r_t, s_t_plus_1.T, 0.0 if done else 1.0)
        # One training step per environment step; train() is a no-op until the
        # replay buffer holds a full batch.
        agent.train()
            
        s_t = s_t_plus_1.T
        steps_in_ep += 1

        if done: break

    # Advance the episode counter that drives the learning-rate and noise decay.
    agent.increment_episode()
    print('Episode %2i, Reward: %7.3f, Steps: %i'%(ep, total_reward, steps_in_ep))

# Close the (Monitor-wrapped) environment
env.close()