In [1]:
import numpy as np
import tensorflow as tf
from sonnet.python.modules.basic import Linear
from sonnet.python.modules.base import AbstractModule

In [2]:
LEARNING_RATE = 1e-3

In [3]:
_EPSILON = 1e-6 # avoid nan

def swich(tensor):
    return tensor * tf.nn.sigmoid(tensor + _EPSILON)

# test for add eps
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

print (sigmoid(_EPSILON) - sigmoid(0))

2.50000000035e-07


In [4]:
# shared neural network
def _build_shared_network(inputs):
    # inputs [batch_size, state_size]
    network = Linear(32, 'input_layer')(inputs)
    network = swich(network)
    network = Linear(64, 'hidden_layer')(network)
    return swich(network)

# build approximate neural network
def _build_approximate_network(inputs, action_size):
    shared_network = _build_shared_network(inputs)
    policy = Linear(action_size, 'policy')(shared_network)
    policy = tf.nn.softmax(policy + _EPSILON) # avoid nan   
    value = Linear(1, 'value')(shared_network)
    return policy, value

class simple_approximate_network(AbstractModule):
    def __init__(self, name):
        super().__init__(name=name)
    
    def _build(self, inputs, action_size):
        return _build_approximate_network(inputs, action_size)
        

In [5]:
# test 
tf.reset_default_graph()

W1 = simple_approximate_network('W1')
state = tf.placeholder(tf.float32, [5, 4], 'state')
W1(state, 2)
W1.get_variables()

(<tf.Variable 'W1/linear/w:0' shape=(4, 32) dtype=float32_ref>,
 <tf.Variable 'W1/linear/b:0' shape=(32,) dtype=float32_ref>,
 <tf.Variable 'W1/linear_1/w:0' shape=(32, 64) dtype=float32_ref>,
 <tf.Variable 'W1/linear_1/b:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'W1/linear_2/w:0' shape=(64, 2) dtype=float32_ref>,
 <tf.Variable 'W1/linear_2/b:0' shape=(2,) dtype=float32_ref>,
 <tf.Variable 'W1/linear_3/w:0' shape=(64, 1) dtype=float32_ref>,
 <tf.Variable 'W1/linear_3/b:0' shape=(1,) dtype=float32_ref>)

In [6]:
# global network for buffer weights and calculate gardients
class Access(object):
    def __init__(self, state_size, action_size, name='access'):
        #variable_scope for more clear graph, not necessary
        with tf.variable_scope(name):                   
            # placeholder for state and next state or you may like call it observation
            self.inputs = tf.placeholder(tf.float32, [None, state_size], 'inputs')     
            self.network = simple_approximate_network('global_network')
            self.policy, self.value = self.network(self.inputs, action_size)
            
        self.trainer = tf.train.RMSPropOptimizer(LEARNING_RATE, name='RMSProp')
        
    def get_trainable_variables(self):
        return self.network.get_variables()

In [7]:
# test 
A = Access(5, 3)

A.get_trainable_variables()

(<tf.Variable 'access/global_network/linear/w:0' shape=(5, 32) dtype=float32_ref>,
 <tf.Variable 'access/global_network/linear/b:0' shape=(32,) dtype=float32_ref>,
 <tf.Variable 'access/global_network/linear_1/w:0' shape=(32, 64) dtype=float32_ref>,
 <tf.Variable 'access/global_network/linear_1/b:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'access/global_network/linear_2/w:0' shape=(64, 3) dtype=float32_ref>,
 <tf.Variable 'access/global_network/linear_2/b:0' shape=(3,) dtype=float32_ref>,
 <tf.Variable 'access/global_network/linear_3/w:0' shape=(64, 1) dtype=float32_ref>,
 <tf.Variable 'access/global_network/linear_3/b:0' shape=(1,) dtype=float32_ref>)

In [8]:
# batch gather function from https://github.com/deepmind/dnc/blob/master/util.py
def _batch_gather(values, indices):
  """Returns batched `tf.gather` for every row in the input."""
  with tf.name_scope('batch_gather', values=[values, indices]):
    unpacked = zip(tf.unstack(values), tf.unstack(indices))
    result = [tf.gather(value, index) for value, index in unpacked]
    return tf.stack(result)

In [9]:
# test
tf.reset_default_graph()
a = tf.get_variable('values', [6, 3], tf.float32)

b = np.random.randint(0, 3, 6)
b = tf.convert_to_tensor(b)

c = _batch_gather(a, b)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print (sess.run(c))


[-0.29143524 -0.70739245  0.46391261 -0.15822595 -0.08984572  0.17364264]


In [10]:
ENTROPY_BETA = 1e-2

In [17]:
# local network for advantage actor-critic which are also know as A2C
class ACNet(object):
    def __init__(self, Access, state_size, action_size, name):
        self.Access = Access
        self.state_size = state_size
        self.action_size = action_size
        # action space, we assume that action space is range(0 to action_size-1)
        self.action_space = np.arange(action_size, dtype=np.int32)
        
        #variable_scope local graph, necessary
        with tf.variable_scope(name):
            # placeholder for state and next state or you may like call it observation
            self.inputs = tf.placeholder(tf.float32, [5, state_size], 'inputs')   
            self.action = tf.placeholder(tf.int32, [5], 'action')
            # n-step reward and discounted n next step value
            self.target = tf.placeholder(tf.float32, [None, 1], 'target')
            
            self.network = simple_approximate_network('ACNet')
            self.policy, self.value = self.network(self.inputs, action_size)
            
            self._build_loss_function()
            self.update_local, self.update_access = self._build_update()
            
        
    def _build_loss_function(self):
        self.advantage = self.target - self.value
        # value loss
        self.value_loss = tf.reduce_mean(tf.square(self.advantage))
    
        # policy loss
        # get the stochastic policy action probability
        policy_action = _batch_gather(self.policy, self.action)
        log_policy_action = tf.log(policy_action + _EPSILON)
        # no grad pass through advantage in actor network 
        policy_loss = tf.stop_gradient(self.advantage) * tf.expand_dims(log_policy_action, axis=1)
        # entropy loss
        entropy_loss = tf.reduce_mean(self.policy * tf.log(self.policy + _EPSILON), axis=1, keep_dims=True)
        self.policy_loss = tf.reduce_mean(policy_loss + ENTROPY_BETA * entropy_loss)
        
        self.total_loss = self.value_loss + self.policy_loss
        # adjust some params
        self.a_policy_loss = tf.reduce_sum(policy_loss)
        self.a_entropy_loss = tf.reduce_mean(entropy_loss)
        self.a_value_loss = self.value_loss

    def _build_update(self):
        global_params = list(self.Access.get_trainable_variables())
        local_params = list(self.get_trainable_variables())
        
        # update local network weights
        zip_list = []
        for g,l in zip(global_params, local_params):
            zip_list.append(l.assign(g))
        
        # update global network gradients
        local_grads = tf.gradients(self.total_loss, local_params)
        apply_gradients = self.Access.trainer.apply_gradients(zip(local_grads, global_params))
        return zip_list, apply_gradients    
    
    def get_trainable_variables(self):
        return self.network.get_variables()

In [18]:
tf.reset_default_graph()

A = Access(5, 3)
B = ACNet(A, 5, 3, 'W1')

[<tf.Variable 'W1/ACNet/linear/w:0' shape=(5, 32) dtype=float32_ref>, <tf.Variable 'W1/ACNet/linear/b:0' shape=(32,) dtype=float32_ref>, <tf.Variable 'W1/ACNet/linear_1/w:0' shape=(32, 64) dtype=float32_ref>, <tf.Variable 'W1/ACNet/linear_1/b:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'W1/ACNet/linear_2/w:0' shape=(64, 3) dtype=float32_ref>, <tf.Variable 'W1/ACNet/linear_2/b:0' shape=(3,) dtype=float32_ref>, <tf.Variable 'W1/ACNet/linear_3/w:0' shape=(64, 1) dtype=float32_ref>, <tf.Variable 'W1/ACNet/linear_3/b:0' shape=(1,) dtype=float32_ref>]
