In [1]:
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
from collections import deque
import random

np.random.seed(0)

In [2]:
class Experience(object):
    def __init__(self, bufferSize=1e5):
        self.buffer = deque([],bufferSize)

    def recall(self, batchSize=1024):
        batchSize = min(len(self.buffer), batchSize)
        
        batch = random.sample(self.buffer, batchSize)
    
        S = np.asarray([sample[0] for sample in batch]).reshape(batchSize, -1)
        A = np.asarray([sample[1] for sample in batch]).reshape(batchSize, -1)
        R = np.asarray([sample[2] for sample in batch]).reshape(batchSize)
        S_dash = np.asarray([sample[3] for sample in batch]).reshape(batchSize, -1)
        not_terminal = np.asarray([sample[4] for sample in batch]).reshape(batchSize)

        return S, A, R, S_dash, not_terminal
        
    def store(self, state, action, reward, nextState, not_terminal):
        self.buffer.append([state, action, reward, nextState, not_terminal])

In [3]:
class OU(object):
    def __init__(self, dim, mu, theta, sigma):
        self.dim = dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.noise_process = np.zeros(dim)

    def get_noise(self):
        self.noise_process = self.theta * (self.mu - self.noise_process) + self.sigma * np.random.randn(self.dim)
        return self.noise_process

In [4]:
class Network(object):
    def __init__(self, input_shape, spec, scope, trainable):
        self.spec, self.scope, self.trainable = spec, scope, trainable
        
        self.get_forward_pass_op(tf.placeholder(dtype=tf.float32, shape=input_shape), False)
        self.vars =  tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope)

    def get_forward_pass_op(self, inputs, reuse=True):
        with tf.variable_scope(self.scope, reuse=reuse):
            for layer in self.spec:
                inputs = tf.layers.dense(inputs, layer['units'], activation=layer['activation'], trainable=self.trainable)
                
        return inputs
    
    def sum_weights(self):
        return tf.add_n([tf.nn.l2_loss(var) for var in self.vars if not 'bias' in var.name])
    
class TargetNetwork(Network):
    def __init__(self, input_shape, spec, scope, trainable):
        super(TargetNetwork, self).__init__(input_shape, spec, scope, trainable)
        
    def get_target_train_op(self, target_network, tau):
        update_ops = []
        for i, var in enumerate(self.vars):
            update_op = var.assign(tau * target_network.vars[i] + (1 - tau) * var)
            update_ops.append(update_op)

        return tf.group(*update_ops)

In [5]:
class Agent(object):
    def __init__(self, env, actor_network_spec, critic_network_spec, alpha=1e-3, alpha_decay=1, gamma=0.99, tau=1e-2, l2_reg=5e-7):
        self.sess = tf.Session()
        self.env = env
        
        state_dim = np.prod(env.observation_space.shape)
        action_dim = np.prod(env.action_space.shape)
        
        # Ornstein–Uhlenbeck process
        self.OU = OU(action_dim, 0.0, 0.15, 0.2)
        
        # experience replay
        self.replay_memory = Experience(1e5)

        # episode counter
        self.episodes = tf.Variable(0.0, trainable=False)
        self.episode_inc_op = self.episodes.assign_add(1)
        
        # tf placeholders
        self.state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
        self.action_ph = tf.placeholder(dtype=tf.float32, shape=[None,action_dim])
        self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        self.next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
        self.is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        
        # set up the networks
        critic_network = Network([None, state_dim + action_dim], critic_network_spec, 'critic_net', trainable=True)
        actor_network = Network([None, state_dim], actor_network_spec, 'actor_net', trainable=True)
        slow_critic_network = TargetNetwork([None, state_dim + action_dim], critic_network_spec, 'slow_critic_net', trainable=False)
        slow_actor_network = TargetNetwork([None, state_dim], actor_network_spec, 'slow_actor_net', trainable=False)

        # actors
        self.policy_op = actor_network.get_forward_pass_op(self.state_ph) * (self.env.action_space.high - self.env.action_space.low)
        slow_target_next_actions = slow_actor_network.get_forward_pass_op(self.next_state_ph)
        
        # critics
        critic_off_pol = critic_network.get_forward_pass_op(tf.concat([self.state_ph, self.action_ph], axis=1))
        critic_on_pol = critic_network.get_forward_pass_op(tf.concat([self.state_ph, self.policy_op], axis=1))
        slow_q_values_next = slow_critic_network.get_forward_pass_op(tf.concat([self.next_state_ph, slow_target_next_actions], axis=1))
        
        # train critic
        targets = tf.expand_dims(self.reward_ph, 1) + tf.expand_dims(self.is_not_terminal_ph, 1) * gamma * slow_q_values_next
        td_errors = targets - critic_off_pol
        critic_loss = tf.reduce_mean(tf.square(td_errors)) + l2_reg * critic_network.sum_weights()
        self.critic_train_op = tf.train.AdamOptimizer(alpha * alpha_decay ** self.episodes).minimize(critic_loss)
        
        # train actor
        actor_loss = -1 * tf.reduce_mean(critic_on_pol) + l2_reg * actor_network.sum_weights()
        self.actor_train_op = tf.train.AdamOptimizer(alpha * alpha_decay ** self.episodes).minimize(actor_loss, var_list=actor_network.vars)
        
        # train slow networks
        self.slow_actor_train_op = slow_actor_network.get_target_train_op(actor_network, tau=tau)
        self.slow_critic_train_op = slow_critic_network.get_target_train_op(critic_network, tau=tau)

        self.sess.run(tf.global_variables_initializer())
        
    def act(self, state, initial_noise_scale=0.0, noise_decay=0.99):
        action = self.sess.run(self.policy_op, feed_dict = {self.state_ph: state})

        self.noise_scale = (initial_noise_scale * noise_decay ** self.sess.run(self.episodes)) * (self.env.action_space.high - self.env.action_space.low)
        action += self.noise_scale * self.OU.get_noise()
        
        return action

    def train(self, batch_size=1024):
        if len(self.replay_memory.buffer) >= batch_size:        
            # grab N (s,a,r,s') tuples from replay memory
            S, A, R, S_dash, not_terminal = self.replay_memory.recall(batch_size)

            # update the critic and actor params using mean-square value error and deterministic policy gradient, respectively
            self.sess.run(self.critic_train_op, feed_dict = {self.state_ph: S, self.action_ph: A, self.reward_ph: R, self.next_state_ph: S_dash, self.is_not_terminal_ph: not_terminal})
            self.sess.run(self.actor_train_op, feed_dict = {self.state_ph: S})

            # update slow actor and critic targets towards current actor and critic
            self.sess.run([self.slow_actor_train_op, self.slow_critic_train_op])
        
    def increment_episode(self):
        self.sess.run(self.episode_inc_op)

In [6]:
# initialize session
tf.reset_default_graph()

env = gym.make('Pendulum-v0')
env = wrappers.Monitor(env, './pendulum-v0-experiment', force=True)
env.seed(0)

critic_network_spec = [{'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': 1, 'activation': None}]

actor_network_spec =  [{'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': np.prod(env.action_space.shape), 'activation': tf.nn.tanh}]

agent = Agent(env, 
              actor_network_spec,
              critic_network_spec,
              alpha=1e-3,
              alpha_decay=1,
              gamma=0.99,
              tau=1e-2, 
              l2_reg=5e-7)

num_episodes = 1000
max_steps_ep = 10000

for ep in range(num_episodes):
    total_reward = 0
    steps_in_ep = 0

    # Initial state
    s_t = env.reset()[None]
    
    if ep % 10 == 0: env.render()

    for t in range(max_steps_ep):
        a_t = agent.act(s_t)

        # take step
        s_t_plus_1, r_t, done, _info = env.step(a_t)
                
        if ep % 10 == 0: env.render()
        total_reward += r_t

        agent.replay_memory.store(s_t, a_t, r_t, s_t_plus_1.T, 0.0 if done else 1.0)
        agent.train()
            
        s_t = s_t_plus_1.T
        steps_in_ep += 1

        if done: break

    agent.increment_episode()
    print('Episode %2i, Reward: %7.3f, Steps: %i'%(ep, total_reward, steps_in_ep))

# Finalize and upload results
env.close()

Episode  0, Reward: -1459.679, Steps: 200
Episode  1, Reward: -1575.499, Steps: 200
Episode  2, Reward: -1512.019, Steps: 200
Episode  3, Reward: -1647.590, Steps: 200
Episode  4, Reward: -1465.100, Steps: 200
Episode  5, Reward: -1376.639, Steps: 200
Episode  6, Reward: -1521.474, Steps: 200
Episode  7, Reward: -1403.017, Steps: 200
Episode  8, Reward: -1529.288, Steps: 200
Episode  9, Reward: -1589.592, Steps: 200
Episode 10, Reward: -1587.410, Steps: 200
Episode 11, Reward: -1219.870, Steps: 200
Episode 12, Reward: -1702.513, Steps: 200
Episode 13, Reward: -1566.213, Steps: 200
Episode 14, Reward: -690.815, Steps: 200
Episode 15, Reward: -1493.743, Steps: 200
Episode 16, Reward: -1017.100, Steps: 200
Episode 17, Reward: -794.358, Steps: 200
Episode 18, Reward: -1140.858, Steps: 200
Episode 19, Reward: -1237.183, Steps: 200
Episode 20, Reward: -1309.831, Steps: 200
Episode 21, Reward: -1359.508, Steps: 200
Episode 22, Reward: -1152.257, Steps: 200
Episode 23, Reward: -1171.089, Steps

KeyboardInterrupt: 