In [49]:
import gym
import numpy as np

from tensorflow.python.framework.ops import disable_eager_execution

from tensorflow.keras.layers import Dense, Flatten, Input
from tensorflow.keras.models import Model

In [39]:
# Build the 'CartPole-v0' environment through gym's registry; `env` is
# inspected by the following cells.
env = gym.make('CartPole-v0')

In [47]:
# Display the `high` array of the observation space (the recorded output
# shows four float32 entries, two of them at the float32 maximum).
env.observation_space.high

array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
      dtype=float32)

In [40]:
# Display the action space (the recorded output is Discrete(2)).
env.action_space

Discrete(2)

In [51]:
from tensorflow.keras.optimizers import RMSprop

class Agent:
    """
    Shared base for the network heads.

    Stores the input/output dimensions, `tau` and an RMSprop optimizer, and
    forwards `fit` / `predict` to `self.model`. `self.model` is NOT created
    here: it must be assigned (e.g. by a subclass) before `fit` or `predict`
    is called.
    """
    def __init__(self, inp_dim, out_dim, lr, tau = 0.001):
        """
        Args:
            inp_dim: shape tuple of a single input sample (without batch axis).
            out_dim: size of the head's output.
            lr: learning rate handed to RMSprop.
            tau: stored on the instance; not used by this class itself.
        """
        self.inp_dim, self.out_dim = inp_dim, out_dim
        self.tau = tau
        # NOTE: this is an *instance* attribute named `optimizer`; it takes
        # precedence over any method of the same name defined on a subclass.
        self.optimizer = RMSprop(lr=lr, epsilon=0.1, rho=0.99)

    def fit(self, inp, targ):
        """
        Run a single training epoch of `self.model` on (inp, targ).
        """
        batch = self.reshape(inp)
        self.model.fit(batch, targ, epochs=1, verbose=0)

    def predict(self, inp):
        """
        Return `self.model`'s prediction for `inp` (batch axis added if needed).
        """
        batch = self.reshape(inp)
        return self.model.predict(batch)

    def reshape(self, x):
        """
        Prepend a batch axis to `x` when it looks like a single sample.

        A batch axis is added when `x` has rank < 4 and `inp_dim` has more
        than two entries, or when `x` has rank < 2. Otherwise `x` is returned
        untouched.
        """
        rank = len(x.shape)
        is_single_sample = (rank < 4 and len(self.inp_dim) > 2) or rank < 2
        return np.expand_dims(x, axis=0) if is_single_sample else x

In [52]:
import tensorflow.keras.backend as K

In [93]:
class Actor(Agent):
    """
    Actor head for the A3C algorithm.

    Extends the shared `network` with a softmax policy head and exposes
    `optimizer()`, which builds the backend function performing one
    policy-gradient update.
    """
    def __init__(self, inp_dim, out_dim, network, lr):
        """
        Args:
            inp_dim: shape tuple of a single input sample.
            out_dim: number of actions (size of the softmax output).
            network: shared Keras model the policy head is attached to.
            lr: learning rate forwarded to the base class' RMSprop.
        """
        Agent.__init__(self, inp_dim, out_dim, lr)
        # The base class binds its RMSprop instance to the instance attribute
        # `optimizer`. Instance attributes win over class attributes, so that
        # would hide the `optimizer()` method below and make
        # `actor.optimizer()` raise "object is not callable". Move it to
        # `rms_optimizer` instead.
        if 'optimizer' in vars(self):
            self.rms_optimizer = vars(self).pop('optimizer')
        self.model = self.addHead(network)
        # Fed at training time; multiplied element-wise with the policy output
        # (presumably one-hot encoded actions -- confirm against the caller).
        self.action_pl = K.placeholder(shape=(None, self.out_dim))
        self.advantages_pl = K.placeholder(shape=(None,))
        # Pre-compile for threading
        self.model._make_predict_function()

    def addHead(self, network):
        """
        Assemble Actor network to predict probability of each action.

        Returns a new Model sharing `network`'s input and layers, followed by
        Dense(128, relu) and a softmax over `out_dim` actions.
        """
        x = Dense(128, activation='relu')(network.output)
        out = Dense(self.out_dim, activation='softmax')(x)
        return Model(network.input, out)

    def optimizer(self):
        """ Actor Optimization: Advantages + Entropy term to encourage exploration
        (Cf. https://arxiv.org/abs/1602.01783)

        Returns:
            A backend function taking [states, actions, advantages] that
            applies one RMSprop update and returns no outputs.
        """
        policy = self.model.output
        # Probability the policy assigned to the action that was taken.
        weighted_actions = K.sum(self.action_pl * policy, axis=1)
        # Advantages are treated as constants (no gradient flows through them).
        eligibility = K.log(weighted_actions + 1e-10) * K.stop_gradient(self.advantages_pl)
        # sum(p * log p) is the *negative* entropy, so adding it to the loss
        # with a positive weight pushes the policy towards higher entropy.
        entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
        loss = 0.001 * entropy - K.sum(eligibility)

        # tf.keras optimizers expose get_updates(loss, params); the legacy
        # (params, constraints, loss) positional order is not accepted, so
        # pass the arguments by keyword.
        updates = self.rms_optimizer.get_updates(loss=loss,
                                                 params=self.model.trainable_weights)
        return K.function([self.model.input,
                          self.action_pl,
                          self.advantages_pl],
                          [],
                          updates=updates)

    def save(self, path):
        """Save the model weights to `path + '_actor.h5'`."""
        self.model.save_weights(path + '_actor.h5')

    def load_weights(self, path):
        """
        Load model weights from `path`.

        Unlike `save`, no suffix is appended: pass the full file name.
        """
        self.model.load_weights(path)

In [101]:
class Critic(Agent):
    """
    Critic head for the A3C algorithm.

    Extends the shared `network` with a single linear unit estimating the
    state value and exposes `optimizer()`, which builds the backend function
    performing one regression update towards the discounted returns.
    """
    def __init__(self, inp_dim, out_dim, network, lr):
        """
        Args:
            inp_dim: shape tuple of a single input sample.
            out_dim: stored by the base class; the value head itself always
                has exactly one output unit.
            network: shared Keras model the value head is attached to.
            lr: learning rate forwarded to the base class' RMSprop.
        """
        Agent.__init__(self, inp_dim, out_dim, lr)
        # The base class binds its RMSprop instance to the instance attribute
        # `optimizer`. Instance attributes win over class attributes, so that
        # would hide the `optimizer()` method below and make
        # `critic.optimizer()` raise "object is not callable". Move it to
        # `rms_optimizer` instead.
        if 'optimizer' in vars(self):
            self.rms_optimizer = vars(self).pop('optimizer')
        # Fixed: the method is named `addHead` (was `add_Head`).
        self.model = self.addHead(network)
        self.discounted_r = K.placeholder(shape=(None,))
        # Pre-compile for threading
        self.model._make_predict_function()

    def addHead(self, network):
        """
        Assemble the Critic head of the network to predict the value
        of each state.

        Returns a new Model sharing `network`'s input and layers, followed by
        Dense(128, relu) and a single linear output unit.
        """
        x = Dense(128, activation='relu')(network.output)
        out = Dense(1, activation='linear')(x)
        return Model(network.input, out)

    def optimizer(self):
        """
        Critic Optimization: Mean Squared Error over discounted
        rewards.

        Returns:
            A backend function taking [states, discounted_rewards] that
            applies one RMSprop update and returns no outputs.
        """
        # The head outputs shape (batch, 1) while the targets are (batch,).
        # Subtracting them directly would broadcast to (batch, batch), so
        # flatten the predictions first. (Also fixes the `self.mode` typo.)
        values = K.flatten(self.model.output)
        critic_loss = K.mean(K.square(self.discounted_r - values))
        # tf.keras optimizers expose get_updates(loss, params); the legacy
        # (params, constraints, loss) positional order is not accepted, so
        # pass the arguments by keyword.
        updates = self.rms_optimizer.get_updates(loss=critic_loss,
                                                 params=self.model.trainable_weights)
        return K.function([self.model.input, self.discounted_r],
                          [],
                          updates=updates)

    def save(self, path):
        """Save the model weights to `path + '_critic.h5'`."""
        self.model.save_weights(path + '_critic.h5')

    def load_weights(self, path):
        """
        Load model weights from `path`.

        Unlike `save`, no suffix is appended: pass the full file name.
        """
        self.model.load_weights(path)

In [102]:
class A3C:
    """
    Asynchronous Advantage Actor-Critic (A3C), main algorithm.

    Builds one shared trunk network, attaches an Actor (policy) head and a
    Critic (value) head to it, and provides the update step that trains both
    heads from a batch of experience.
    """
    def __init__(self,
                 act_dim,
                 env_dim,
                 consecutive_frames,
                 gamma=0.99,
                 lr=0.0001):
        """
        Initialization

        Args:
            act_dim: number of discrete actions.
            env_dim: shape (tuple or other sequence) of a single observation.
            consecutive_frames: number of stacked observations; prepended to
                `env_dim` to form the network input shape.
            gamma: discount factor for future rewards.
            lr: learning rate for both heads.
        """
        # The heads rely on placeholders and backend update functions, which
        # need graph mode; this must run before any model is created.
        disable_eager_execution()

        self.act_dim = act_dim
        self.env_dim = (consecutive_frames,) + tuple(env_dim)
        self.gamma = gamma
        self.lr = lr
        # Create shared network
        self.shared = self.build_network()
        # Create actor and critic heads
        self.actor = Actor(self.env_dim, act_dim, self.shared, lr)
        # Fixed: `self,shared` (comma) passed the wrong arguments.
        self.critic = Critic(self.env_dim, act_dim, self.shared, lr)
        # Build optimizers
        self.a_opt = self.actor.optimizer()
        self.c_opt = self.critic.optimizer()

    def build_network(self):
        """
        Assemble the shared layers.

        The input has shape `self.env_dim` = (consecutive_frames, ...). It is
        flattened before the Dense stack so the trunk output is
        (batch, 256); without that, Dense would act on the last axis only and
        the heads would emit one prediction per stacked frame.
        """
        inp = Input(self.env_dim, name="Input")
        flat = Flatten()(inp)
        layer1 = Dense(512, activation="relu")(flat)
        layer2 = Dense(512, activation="relu")(layer1)
        layer3 = Dense(64, activation="relu")(layer2)
        layer4 = Dense(256, activation="relu")(layer3)
        return Model(inp, layer4)

    def policy_action(self, s):
        """
        Use the actor head of the network to predict
        the next action to take, using the policy.

        Samples one action index in [0, act_dim) according to the
        probabilities returned by the actor for state `s`.
        """
        return np.random.choice(np.arange(self.act_dim),
                                1,
                                p=self.actor.predict(s).ravel())[0]

    def discount(self, r, done, s):
        """
        Compute the gamma-discounted rewards over an episode.

        Args:
            r: sequence of per-step rewards.
            done: accepted for interface compatibility; not used.
            s: accepted for interface compatibility; not used.

        Returns:
            Array with the same shape as `r` where entry t holds
            r[t] + gamma * r[t+1] + gamma^2 * r[t+2] + ...
        """
        rewards = np.asarray(r)
        # Keep a floating dtype if the caller supplied one; otherwise use
        # float64 so integer rewards do not truncate the discounted sums.
        if np.issubdtype(rewards.dtype, np.floating):
            out_dtype = rewards.dtype
        else:
            out_dtype = np.float64
        discounted_r, cumul_r = np.zeros(rewards.shape, dtype=out_dtype), 0
        for t in reversed(range(0, len(rewards))):
            cumul_r = rewards[t] + cumul_r * self.gamma
            discounted_r[t] = cumul_r
        return discounted_r

    def train_models(self, states, actions, rewards, done):
        """
        Update actor and critic heads from experience.

        Args:
            states: sequence of visited states.
            actions: actions taken, in the encoding the actor's action
                placeholder expects (shape (batch, act_dim)).
            rewards: per-step rewards.
            done: forwarded to `discount` (currently unused there).
        """
        states = np.asarray(states)
        # Compute discounted rewards and Advantage (TD, Error)
        discounted_rewards = self.discount(rewards, done, states[-1])
        state_values = self.critic.predict(states)
        advantages = discounted_rewards - np.reshape(state_values,
                                                     len(state_values))
        # Networks optimization
        self.a_opt([states, actions, advantages])
        self.c_opt([states, discounted_rewards])

    def train(self, env, args, summary_writer):
        """Training loop entry point; not implemented yet."""
        raise NotImplementedError