In [1]:
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
from collections import deque
import random

# hyperparameters
gamma = 0.99				# reward discount factor
lr_actor = 1e-3				# learning rate for the actor
lr_critic = 1e-3			# learning rate for the critic
lr_decay = 1				# learning rate decay (per episode)
l2_reg_actor = 5e-7			# L2 regularization factor for the actor
l2_reg_critic = 5e-7		# L2 regularization factor for the critic
num_episodes = 1000		# number of episodes
max_steps_ep = 10000	# default max number of steps per episode (unless env has a lower hardcoded limit)
tau = 1e-2				# soft target update rate
train_every = 1 # number of steps to run the policy (and collect experience) before updating network weights
replay_memory_capacity = int(1e5)	# capacity of experience replay memory
minibatch_size = 1024	# size of minibatch from experience replay memory for updates
initial_noise_scale = 0.1	# scale of the exploration noise process (1.0 is the range of each action dimension)
noise_decay = 0.99		# decay rate (per episode) of the scale of the exploration noise process
exploration_mu = 0.0	# mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_theta = 0.15 # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_sigma = 0.2	# sigma parameter for the exploration noise process: dXt = theta*(mu-Xt	)*dt + sigma*dWt

env = gym.make('Pendulum-v0')
state_dim = np.prod(np.array(env.observation_space.shape)) 	# Get total number of dimensions in state
action_dim = np.prod(np.array(env.action_space.shape))		# Assuming continuous action space

critic_network_spec = [{'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': 1, 'activation': None}]

actor_network_spec =  [{'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': 8, 'activation': tf.nn.relu},
                       {'units': action_dim, 'activation': tf.nn.tanh}]

In [2]:
class Experience(object):
    def __init__(self, bufferSize):
        self.buffer = deque([],bufferSize)

    def recall(self, batchSize):
        batchSize = min(len(self.buffer), batchSize)
        
        batch = random.sample(self.buffer, batchSize)
    
        S = np.asarray([sample[0] for sample in batch]).reshape(batchSize, -1)
        A = np.asarray([sample[1] for sample in batch]).reshape(batchSize, -1)
        R = np.asarray([sample[2] for sample in batch]).reshape(batchSize)
        S_dash = np.asarray([sample[3] for sample in batch]).reshape(batchSize, -1)
        not_terminal = np.asarray([sample[4] for sample in batch]).reshape(batchSize)

        return S, A, R, S_dash, not_terminal
        
    def store(self, state, action, reward, nextState, not_terminal):
        self.buffer.append([state, action, reward, nextState, not_terminal])

In [3]:
class OU(object):
    def __init__(self, dim, mu, theta, sigma):
        self.dim = dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.noise_process = np.zeros(dim)

    def get_noise(self):
        self.noise_process = exploration_theta * (self.mu - self.noise_process) + self.sigma * np.random.randn(self.dim)
        return self.noise_process

In [4]:
class Network(object):
    def __init__(self, input_shape, spec, scope, trainable):
        self.spec, self.scope, self.trainable = spec, scope, trainable
        
        self.get_forward_pass_op(tf.placeholder(dtype=tf.float32, shape=input_shape), False)
        self.vars =  tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope)

    def get_forward_pass_op(self, inp, reuse=True):
        with tf.variable_scope(self.scope, reuse=reuse):
            for layer in self.spec:
                inp = tf.layers.dense(inp, layer['units'], activation=layer['activation'], trainable=self.trainable)
                
        return inp

    def get_target_train_op(self, target_network, tau):
        update_ops = []
        for i, var in enumerate(self.vars):
            update_op = var.assign(tau * target_network.vars[i] + (1 - tau) * var)
            update_ops.append(update_op)

        return tf.group(*update_ops)
    
    def sum_weights(self):
        return tf.add_n([tf.nn.l2_loss(var) for var in self.vars if not 'bias' in var.name])

In [5]:
class Agent(object):
    def __init__(self, env):
        self.sess = tf.Session()
        self.env = env
        
        # Ornstein–Uhlenbeck process
        self.OU = OU(action_dim, exploration_mu, exploration_theta, exploration_sigma)
        
        # experience replay
        self.replay_memory = Experience(int(1e5))

        # episode counter
        self.episodes = tf.Variable(0.0, trainable=False)
        self.episode_inc_op = self.episodes.assign_add(1)
        
        # tf placeholders
        self.state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
        self.action_ph = tf.placeholder(dtype=tf.float32, shape=[None,action_dim])
        self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        self.next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
        self.is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        
        # set up the networks
        critic_network = Network([None, state_dim + action_dim], critic_network_spec, 'critic_net', trainable=True)
        actor_network = Network([None, state_dim], actor_network_spec, 'actor_net', trainable=True)
        slow_critic_network = Network([None, state_dim + action_dim], critic_network_spec, 'slow_critic_net', trainable=False)
        slow_actor_network = Network([None, state_dim], actor_network_spec, 'slow_actor_net', trainable=False)

        # actors        
        self.policy_op = actor_network.get_forward_pass_op(self.state_ph) * 4
        slow_target_next_actions = slow_actor_network.get_forward_pass_op(self.next_state_ph)
        
        # critics
        critic_off_pol = critic_network.get_forward_pass_op(tf.concat([self.state_ph, self.action_ph], axis=1))
        critic_on_pol = critic_network.get_forward_pass_op(tf.concat([self.state_ph, self.policy_op], axis=1))
        slow_q_values_next = slow_critic_network.get_forward_pass_op(tf.concat([self.next_state_ph, slow_target_next_actions], axis=1))
        
        # train critic        
        targets = tf.expand_dims(self.reward_ph, 1) + tf.expand_dims(self.is_not_terminal_ph, 1) * gamma * slow_q_values_next
        td_errors = targets - critic_off_pol
        critic_loss = tf.reduce_mean(tf.square(td_errors)) + l2_reg_critic * critic_network.sum_weights()
        self.critic_train_op = tf.train.AdamOptimizer(lr_critic * lr_decay ** self.episodes).minimize(critic_loss)
        
        # train actor
        actor_loss = -1 * tf.reduce_mean(critic_on_pol) + l2_reg_actor * actor_network.sum_weights()
        self.actor_train_op = tf.train.AdamOptimizer(lr_actor * lr_decay ** self.episodes).minimize(actor_loss, var_list=actor_network.vars)
        
        # train slow networks
        self.slow_actor_train_op = slow_actor_network.get_target_train_op(actor_network, tau)
        self.slow_critic_train_op = slow_critic_network.get_target_train_op(critic_network, tau)

        self.noise_scale = initial_noise_scale
        self.sess.run(tf.global_variables_initializer())
        
    def act(self, state, stochastic=True):
        action = self.sess.run(self.policy_op, feed_dict = {self.state_ph: state})

        if stochastic:
            self.noise_scale = (initial_noise_scale * noise_decay ** self.sess.run(self.episodes)) * (self.env.action_space.high - self.env.action_space.low)
            action += self.noise_scale * self.OU.get_noise()
        
        return action

    def train(self):
        # grab N (s,a,r,s') tuples from replay memory
        S, A, R, S_dash, not_terminal = self.replay_memory.recall(minibatch_size)

        # update the critic and actor params using mean-square value error and deterministic policy gradient, respectively
        self.sess.run(self.critic_train_op, feed_dict = {self.state_ph: S, self.action_ph: A, self.reward_ph: R, self.next_state_ph: S_dash, self.is_not_terminal_ph: not_terminal})
        self.sess.run(self.actor_train_op, feed_dict = {self.state_ph: S})

        # update slow actor and critic targets towards current actor and critic
        self.sess.run([self.slow_actor_train_op, self.slow_critic_train_op])
        
    def increment_episode(self):
        self.sess.run(self.episode_inc_op)

In [6]:
# initialize session
tf.reset_default_graph()

env = wrappers.Monitor(env, '/home/tom/Desktop/work/reinforcement_learning/pendulum-v0')

# set seeds to 0
env.seed(0)
np.random.seed(0)

agent = Agent(env)

total_steps = 0
for ep in range(num_episodes):

    total_reward = 0
    steps_in_ep = 0

    # Initial state
    s_t = env.reset()
    if ep % 10 == 0: env.render()

    for t in range(max_steps_ep):
        a_t = agent.act(s_t.reshape(1,3), stochastic=False)

        # take step
        s_t_plus_1, r_t, done, _info = env.step(a_t)
        if ep % 10 == 0: env.render()
        total_reward += r_t

        agent.replay_memory.store(s_t.reshape(1,3), a_t, r_t, s_t_plus_1, 0.0 if done else 1.0)
        
        # update network weights to fit a minibatch of experience
        if total_steps % train_every == 0 and len(agent.replay_memory.buffer) >= minibatch_size:
            agent.train()
            
        s_t = s_t_plus_1
        total_steps += 1
        steps_in_ep += 1

        if done: break

    agent.increment_episode()
    print('Episode %2i, Reward: %7.3f, Steps: %i'%(ep, total_reward, steps_in_ep))

# Finalize and upload results
env.close()

Episode  0, Reward: -1810.096, Steps: 200
Episode  1, Reward: -1585.307, Steps: 200
Episode  2, Reward: -1776.643, Steps: 200
Episode  3, Reward: -1592.150, Steps: 200
Episode  4, Reward: -1820.844, Steps: 200
Episode  5, Reward: -1641.447, Steps: 200
Episode  6, Reward: -1213.997, Steps: 200
Episode  7, Reward: -1507.079, Steps: 200
Episode  8, Reward: -1520.788, Steps: 200
Episode  9, Reward: -1482.488, Steps: 200
Episode 10, Reward: -1589.147, Steps: 200
Episode 11, Reward: -1072.305, Steps: 200
Episode 12, Reward: -1521.754, Steps: 200
Episode 13, Reward: -1400.231, Steps: 200
Episode 14, Reward: -1083.124, Steps: 200
Episode 15, Reward: -1163.005, Steps: 200
Episode 16, Reward: -1052.372, Steps: 200
Episode 17, Reward: -877.601, Steps: 200
Episode 18, Reward: -1497.745, Steps: 200
Episode 19, Reward:  -7.678, Steps: 200
Episode 20, Reward: -538.713, Steps: 200
Episode 21, Reward: -913.999, Steps: 200
Episode 22, Reward: -1055.146, Steps: 200
Episode 23, Reward: -1061.796, Steps: 2

Episode 197, Reward: -241.736, Steps: 200
Episode 198, Reward: -256.892, Steps: 200
Episode 199, Reward: -131.153, Steps: 200
Episode 200, Reward: -307.568, Steps: 200
Episode 201, Reward: -120.981, Steps: 200
Episode 202, Reward: -130.985, Steps: 200
Episode 203, Reward: -121.080, Steps: 200
Episode 204, Reward: -126.730, Steps: 200
Episode 205, Reward: -129.360, Steps: 200
Episode 206, Reward: -130.970, Steps: 200
Episode 207, Reward: -122.020, Steps: 200
Episode 208, Reward: -119.665, Steps: 200
Episode 209, Reward:  -0.441, Steps: 200
Episode 210, Reward: -125.703, Steps: 200
Episode 211, Reward: -121.232, Steps: 200
Episode 212, Reward: -121.071, Steps: 200
Episode 213, Reward:  -0.481, Steps: 200
Episode 214, Reward: -378.245, Steps: 200
Episode 215, Reward:  -0.254, Steps: 200
Episode 216, Reward:  -1.629, Steps: 200
Episode 217, Reward: -120.162, Steps: 200
Episode 218, Reward: -1490.886, Steps: 200
Episode 219, Reward: -280.640, Steps: 200
Episode 220, Reward: -120.978, Steps:

Episode 393, Reward: -234.350, Steps: 200
Episode 394, Reward: -116.760, Steps: 200
Episode 395, Reward: -130.960, Steps: 200
Episode 396, Reward: -351.956, Steps: 200
Episode 397, Reward: -243.873, Steps: 200
Episode 398, Reward: -114.500, Steps: 200
Episode 399, Reward: -126.738, Steps: 200
Episode 400, Reward: -127.978, Steps: 200
Episode 401, Reward: -123.208, Steps: 200
Episode 402, Reward: -128.876, Steps: 200
Episode 403, Reward:  -0.055, Steps: 200
Episode 404, Reward: -235.491, Steps: 200
Episode 405, Reward: -122.603, Steps: 200
Episode 406, Reward: -260.107, Steps: 200
Episode 407, Reward:  -0.408, Steps: 200
Episode 408, Reward: -116.048, Steps: 200
Episode 409, Reward: -242.064, Steps: 200
Episode 410, Reward: -122.194, Steps: 200
Episode 411, Reward: -119.902, Steps: 200
Episode 412, Reward: -239.573, Steps: 200
Episode 413, Reward: -288.006, Steps: 200
Episode 414, Reward: -127.550, Steps: 200
Episode 415, Reward: -116.616, Steps: 200
Episode 416, Reward: -123.411, Steps

Episode 589, Reward:  -4.596, Steps: 200
Episode 590, Reward: -358.328, Steps: 200
Episode 591, Reward: -123.349, Steps: 200
Episode 592, Reward:  -5.402, Steps: 200
Episode 593, Reward: -291.838, Steps: 200
Episode 594, Reward: -124.081, Steps: 200
Episode 595, Reward: -253.667, Steps: 200
Episode 596, Reward: -132.024, Steps: 200
Episode 597, Reward: -259.152, Steps: 200
Episode 598, Reward: -363.536, Steps: 200
Episode 599, Reward: -132.812, Steps: 200
Episode 600, Reward: -11.828, Steps: 200
Episode 601, Reward: -132.003, Steps: 200
Episode 602, Reward: -1272.291, Steps: 200
Episode 603, Reward: -1065.860, Steps: 200
Episode 604, Reward: -1083.277, Steps: 200
Episode 605, Reward: -1053.255, Steps: 200
Episode 606, Reward: -1121.589, Steps: 200
Episode 607, Reward: -915.979, Steps: 200
Episode 608, Reward: -1017.573, Steps: 200
Episode 609, Reward: -270.929, Steps: 200
Episode 610, Reward: -252.228, Steps: 200
Episode 611, Reward: -10.750, Steps: 200
Episode 612, Reward: -133.791, S

Episode 785, Reward: -128.007, Steps: 200
Episode 786, Reward: -1503.783, Steps: 200
Episode 787, Reward: -124.022, Steps: 200
Episode 788, Reward: -494.150, Steps: 200
Episode 789, Reward: -224.403, Steps: 200
Episode 790, Reward:  -1.561, Steps: 200
Episode 791, Reward: -128.308, Steps: 200
Episode 792, Reward: -225.665, Steps: 200
Episode 793, Reward: -1497.410, Steps: 200
Episode 794, Reward: -237.520, Steps: 200
Episode 795, Reward: -238.194, Steps: 200
Episode 796, Reward: -227.664, Steps: 200
Episode 797, Reward: -243.171, Steps: 200
Episode 798, Reward: -230.200, Steps: 200
Episode 799, Reward: -118.392, Steps: 200
Episode 800, Reward: -246.885, Steps: 200
Episode 801, Reward: -118.754, Steps: 200
Episode 802, Reward: -235.122, Steps: 200
Episode 803, Reward:  -0.013, Steps: 200
Episode 804, Reward: -508.942, Steps: 200
Episode 805, Reward: -253.074, Steps: 200
Episode 806, Reward: -252.168, Steps: 200
Episode 807, Reward: -240.927, Steps: 200
Episode 808, Reward: -126.246, Ste

Episode 981, Reward: -1491.419, Steps: 200
Episode 982, Reward: -391.743, Steps: 200
Episode 983, Reward: -241.576, Steps: 200
Episode 984, Reward: -234.624, Steps: 200
Episode 985, Reward: -114.078, Steps: 200
Episode 986, Reward: -244.728, Steps: 200
Episode 987, Reward: -244.967, Steps: 200
Episode 988, Reward: -121.275, Steps: 200
Episode 989, Reward: -233.381, Steps: 200
Episode 990, Reward: -119.542, Steps: 200
Episode 991, Reward: -231.355, Steps: 200
Episode 992, Reward: -681.466, Steps: 200
Episode 993, Reward: -231.554, Steps: 200
Episode 994, Reward: -340.893, Steps: 200
Episode 995, Reward: -379.802, Steps: 200
Episode 996, Reward: -243.913, Steps: 200
Episode 997, Reward: -119.213, Steps: 200
Episode 998, Reward:  -0.141, Steps: 200
Episode 999, Reward: -119.709, Steps: 200


In [7]:
gym.upload('/tmp/pendulum-v0', api_key='MY_API_KEY')

Error: [Pendulum-v0] No such video file /tmp/pendulum-v0/openaigym.video.0.4323.video000000.mp4. (HINT: Your video recorder may have broken midway through the run. You can check this with `video_recorder.functional`.)