In [1]:
import gym
import logging

import os
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, losses, optimizers, models

In [19]:

class ProbabilityDistribution(models.Model):
    """Keras model wrapper that samples one categorical index per row of logits."""

    def call(self, logits):
        # draw a single sample for every row; result carries a trailing axis of size 1
        sampled = tf.random.categorical(logits, num_samples=1)
        # drop that trailing sample axis so one index per row is returned
        return tf.squeeze(sampled, axis=-1)

class MyModel(models.Model):
    """Two-headed MLP producing policy logits and a state-value estimate.

    Each head has its own 128-unit ReLU hidden layer fed from the same input;
    no hidden layer is shared between the two heads.
    """

    def __init__(self, num_actions):
        """Create the layers.

        Args:
            num_actions: number of units of the policy-logits output layer.
        """
        super().__init__(name="mlp_policy")
        self.hidden1 = layers.Dense(units=128, activation="relu", name="hd_1")
        self.hidden2 = layers.Dense(units=128, activation="relu", name="hd_2")
        self.value = layers.Dense(units=1, name="value")
        self.logits = layers.Dense(num_actions, name="policy_logits")
        self.dist = ProbabilityDistribution()

    def call(self, inputs):
        """Forward pass; returns the pair ``(policy_logits, value)``."""
        # inputs may arrive as a numpy array, convert to Tensor
        x = tf.convert_to_tensor(inputs)
        # separate hidden layers from the same input tensor
        hidden_logs = self.hidden1(x)
        hidden_vals = self.hidden2(x)
        return self.logits(hidden_logs), self.value(hidden_vals)

    def action_value(self, obs):
        """Sample an action and estimate the value for a batch of observations.

        Returns:
            ``(action, value)``, each with its last axis squeezed away.
        """
        # executes call() under the hood; predict_on_batch is used instead of
        # predict() because this runs once per environment step on a single
        # batch, where predict()'s per-call input-pipeline setup dominates
        logits, value = self.predict_on_batch(obs)
        action = self.dist.predict_on_batch(logits)
        # a simpler option would be sampling directly:
        # action = tf.random.categorical(logits, 1)
        return np.squeeze(action, axis=-1), np.squeeze(value, axis=-1)
        
class A2CAgent:
    """Advantage actor-critic agent driving a Keras policy/value model.

    The wrapped model must provide ``action_value(obs)`` returning an
    ``(action, value)`` pair, and is compiled here with two losses: one for
    its logits output and one for its value output.
    """

    def __init__(self, model):
        """Store hyperparameters and compile ``model`` with the A2C losses."""
        # gamma: discount factor; value / entropy: loss-term coefficients
        self.params = {
            "gamma": 0.99,
            "value": 0.5,
            "entropy": 0.0001
        }
        self.model = model
        # `learning_rate` is the supported keyword; the `lr` alias is deprecated
        # and rejected by newer Keras optimizers
        self.model.compile(optimizer=optimizers.RMSprop(learning_rate=0.0007),
                           loss=[self._logits_loss, self._value_loss])

    def train(self, env, batch_sz=32, updates=1000):
        """Collect ``batch_sz`` steps from ``env`` and fit on them, ``updates`` times.

        Args:
            env: environment exposing ``reset()``, ``step(action)`` (returning a
                4-tuple) and ``observation_space.shape``.
            batch_sz: number of environment steps per optimizer update.
            updates: number of collect-then-fit iterations.

        Returns:
            List of per-episode reward totals; the last entry belongs to the
            episode still in progress when training stopped.
        """
        # storage helpers for a single batch of data
        actions = np.empty((batch_sz,), dtype=np.int32)
        rewards, dones, values = np.empty((3, batch_sz))
        observations = np.empty((batch_sz,) + env.observation_space.shape)
        # training loop: collect samples, send to optimizer, repeat updates times
        ep_rews = [0.0]
        next_obs = env.reset()
        for update in range(updates):
            for step in range(batch_sz):
                observations[step] = next_obs.copy()
                actions[step], values[step] = self.model.action_value(next_obs[None, :])
                next_obs, rewards[step], dones[step], _ = env.step(actions[step])

                ep_rews[-1] += rewards[step]
                if dones[step]:
                    ep_rews.append(0.0)
                    next_obs = env.reset()
                    # lazy %-args: message is only formatted if the record is emitted
                    logging.info("Episode: %03d, Reward: %03d", len(ep_rews)-1, ep_rews[-2])

            # bootstrap from the value estimate of the state after the batch
            _, next_value = self.model.action_value(next_obs[None, :])
            returns, advs = self._returns_advantages(rewards, dones, values, next_value)
            # a trick to input actions and advantages through same API
            acts_and_advs = np.concatenate([actions[:, None], advs[:, None]], axis=-1)
            # performs a full training step on the collected batch
            # note: no need to mess around with gradients, Keras API handles it
            step_losses = self.model.train_on_batch(observations, [acts_and_advs, returns])
            logging.debug("[%d/%d] Losses: %s", update+1, updates, step_losses)
        return ep_rews

    def test(self, env, render=False):
        """Run one episode with the current policy and return its total reward.

        Args:
            env: environment exposing ``reset()``, ``step(action)`` and ``render()``.
            render: when true, ``env.render()`` is called after every step.
        """
        obs, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(obs[None, :])
            # the step result must rebind `done`, otherwise the loop never ends
            obs, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
        return ep_reward

    def _returns_advantages(self, rewards, dones, values, next_value):
        """Compute discounted returns and advantages for one batch.

        Args:
            rewards, dones, values: 1-D arrays of equal length, one entry per step.
            next_value: bootstrap value estimate for the state following the
                last step; appended along the last axis, so it must be at
                least 1-D.

        Returns:
            ``(returns, advantages)`` where ``advantages = returns - values``.
        """
        # next_value is the bootstrap value estimate of a future state (the critic)
        returns = np.append(np.zeros_like(rewards), next_value, axis=-1)
        # returns are calculated as discounted sum of future rewards;
        # (1 - done) cuts the bootstrap at episode boundaries
        for t in reversed(range(rewards.shape[0])):
            returns[t] = rewards[t] + self.params['gamma'] * returns[t+1] * (1-dones[t])
        returns = returns[:-1]
        # advantages are returns - baseline, value estimates in our case
        advantages = returns - values
        return returns, advantages

    def _value_loss(self, returns, value):
        """Critic loss: coefficient-scaled MSE between returns and value estimates."""
        return self.params['value'] * losses.mean_squared_error(returns, value)

    def _logits_loss(self, acts_and_advs, logits):
        """Actor loss: advantage-weighted cross-entropy minus an entropy bonus.

        Args:
            acts_and_advs: tensor whose last axis holds ``[action, advantage]``.
            logits: unnormalized policy outputs of the model.
        """
        # a trick to input actions and advantages through same API
        actions, advantages = tf.split(acts_and_advs, 2, axis=-1)
        # sparse categorical CE loss obj that supports sample_weight arg on call()
        # from_logits argument ensures transformation into normalized probabilities
        weighted_sparse_ce = losses.SparseCategoricalCrossentropy(from_logits=True)
        # policy loss is defined by policy gradients, weighted by advantages
        # note: we only calculate the loss on the actions we've actually taken
        actions = tf.cast(actions, tf.int32)
        policy_loss = weighted_sparse_ce(actions, logits, sample_weight=advantages)
        # entropy is the cross-entropy of the policy distribution with itself;
        # the target must be normalized probabilities, not raw logits
        probs = tf.nn.softmax(logits)
        entropy_loss = losses.categorical_crossentropy(probs, probs)
        # here signs are flipped because optimizer minimizes
        return policy_loss - self.params['entropy']*entropy_loss

In [20]:
# imported in this cell because `plt` is used below and is not imported
# anywhere else in the notebook (fresh-kernel runs would raise NameError)
import matplotlib.pyplot as plt

logging.getLogger().setLevel(logging.INFO)
env = gym.make('CartPole-v0')

model = MyModel(num_actions=env.action_space.n)
agent = A2CAgent(model)

rewards_history = agent.train(env)
print("Finished training.")
print("Total Episode Reward: %d out of 200" % agent.test(env, True))

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*';
# use the original name when it is still available, the renamed one otherwise
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
# plot every 25th episode's total reward
plt.plot(np.arange(0, len(rewards_history), 25), rewards_history[::25])
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.show()

INFO:root:Episode: 001, Reward: 010
INFO:root:Episode: 002, Reward: 022
INFO:root:Episode: 003, Reward: 016
INFO:root:Episode: 004, Reward: 015
INFO:root:Episode: 005, Reward: 019
INFO:root:Episode: 006, Reward: 010
INFO:root:Episode: 007, Reward: 024
INFO:root:Episode: 008, Reward: 021
INFO:root:Episode: 009, Reward: 010
INFO:root:Episode: 010, Reward: 010
INFO:root:Episode: 011, Reward: 011
INFO:root:Episode: 012, Reward: 009
INFO:root:Episode: 013, Reward: 017
INFO:root:Episode: 014, Reward: 028
INFO:root:Episode: 015, Reward: 026
INFO:root:Episode: 016, Reward: 018
INFO:root:Episode: 017, Reward: 016
INFO:root:Episode: 018, Reward: 049
INFO:root:Episode: 019, Reward: 018
INFO:root:Episode: 020, Reward: 020
INFO:root:Episode: 021, Reward: 015
INFO:root:Episode: 022, Reward: 015
INFO:root:Episode: 023, Reward: 013
INFO:root:Episode: 024, Reward: 013
INFO:root:Episode: 025, Reward: 036
INFO:root:Episode: 026, Reward: 024
INFO:root:Episode: 027, Reward: 031
INFO:root:Episode: 028, Rewa

INFO:root:Episode: 229, Reward: 070
INFO:root:Episode: 230, Reward: 050
INFO:root:Episode: 231, Reward: 045
INFO:root:Episode: 232, Reward: 083
INFO:root:Episode: 233, Reward: 032
INFO:root:Episode: 234, Reward: 034
INFO:root:Episode: 235, Reward: 058
INFO:root:Episode: 236, Reward: 045
INFO:root:Episode: 237, Reward: 073
INFO:root:Episode: 238, Reward: 063
INFO:root:Episode: 239, Reward: 052
INFO:root:Episode: 240, Reward: 022
INFO:root:Episode: 241, Reward: 020
INFO:root:Episode: 242, Reward: 081
INFO:root:Episode: 243, Reward: 039
INFO:root:Episode: 244, Reward: 042
INFO:root:Episode: 245, Reward: 038
INFO:root:Episode: 246, Reward: 028
INFO:root:Episode: 247, Reward: 036
INFO:root:Episode: 248, Reward: 038
INFO:root:Episode: 249, Reward: 053
INFO:root:Episode: 250, Reward: 025
INFO:root:Episode: 251, Reward: 021
INFO:root:Episode: 252, Reward: 070


KeyboardInterrupt: 