# Import libraries

In [6]:
import os
import sys
import numpy as np

import pommerman
from pommerman import agents
from pommerman.agents import SimpleAgent, RandomAgent, PlayerAgent, BaseAgent
from pommerman.configs import ffa_v0_fast_env
from pommerman.envs.v0 import Pomme
from pommerman.characters import Bomber
from pommerman import utility

import tensorflow as tf
import tf_agents

from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import py_environment, tf_py_environment
from tf_agents.networks import sequential
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

# driver
from tf_agents.metrics import py_metrics
from tf_agents.drivers import py_driver
from tf_agents.policies import py_tf_eager_policy

from tf_agents.trajectories import time_step as ts

In [7]:
# set hyperparameters
num_iterations = 20000 # @param {type:"integer"}

initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration =   1# @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 200  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}

# Create Environment

In [8]:
def make_np_float(feature):
    return np.array(feature).astype(np.float32)


def featurize(obs):
    board = obs["board"].reshape(-1).astype(np.float32)
    bomb_blast_strength = obs["bomb_blast_strength"].reshape(
        -1).astype(np.float32)
    bomb_life = obs["bomb_life"].reshape(-1).astype(np.float32)
    position = make_np_float(obs["position"])
    ammo = make_np_float([obs["ammo"]])
    blast_strength = make_np_float([obs["blast_strength"]])
    can_kick = make_np_float([obs["can_kick"]])

    teammate = obs["teammate"]
    if teammate is not None:
        teammate = teammate.value
    else:
        teammate = -1
    teammate = make_np_float([teammate])

    enemies = obs["enemies"]
    enemies = [e.value for e in enemies]
    if len(enemies) < 3:
        enemies = enemies + [-1]*(3 - len(enemies))
    enemies = make_np_float(enemies)

    return np.concatenate((board, bomb_blast_strength, bomb_life, position, ammo, blast_strength, can_kick, teammate, enemies))


In [9]:
# Instantiate the environment
config = ffa_v0_fast_env()
env = Pomme(**config["env_kwargs"])
env.seed(0)

# Add agents
class TFAgent(BaseAgent):
    def act(self, obs, action_space):
        pass


# Add 3 simple agents
agents = []
for agent_id in range(3):
    agents.append(RandomAgent(config["agent"](agent_id, config["game_type"])))

# Add TFAgent
agent_id += 1
agents.append(TFAgent(config["agent"](agent_id, config["game_type"])))
env.set_agents(agents)
env.set_init_game_state(None)
env.set_training_agent(3)


In [10]:
class PommePyWrapper(py_environment.PyEnvironment):

    def __init__(self, gym, visualize=False):
        self.gym = gym
        self._episode_ended = False
        self.visualize = visualize

        # this can be implemented more directly
        self._action_spec = tf_agents.environments.gym_wrapper.GymWrapper(
            self.gym).action_spec()
        self._observation_spec = tf_agents.environments.gym_wrapper.GymWrapper(
            self.gym).observation_spec()
        self._state = None

    def _reset(self):
        obs = self.gym.reset()
        agent_obs = featurize(obs[3])
        self._state = agent_obs
        return ts.restart(agent_obs)

    def _step(self, action):
        if self.visualize:
            self.gym.render()
        obs = self.gym.get_observations()
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, action)
        state, reward, done, _ = self.gym.step(all_actions)
        agent_state = featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]

        self._state = agent_state

        if done:
            self._episode_ended = True
            if self.visualize:
                self.gym.render(close=True)
            self.gym.close()
            return ts.termination(np.array(agent_state), agent_reward)

        return ts.transition(observation=np.array(agent_state), reward=agent_reward, discount=1)

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec


In [11]:
# Transform the environment to a Tensorflow environment (train and eval environments)
train_py_env = PommePyWrapper(env)
eval_py_env = PommePyWrapper(env)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)


# Create agent

In [12]:
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])

In [13]:
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

# Policies

In [14]:
eval_policy = agent.policy
collect_policy = agent.collect_policy

# Metrics and evaluation

In [23]:
def compute_avg_return(environment, policy, num_episodes=10):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]

avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
avg_return


# See also the metrics module for standard implementations of different metrics.
# https://github.com/tensorflow/agents/tree/master/tf_agents/metrics

-0.6

# Train the agent

In [24]:
from tf_agents.policies import py_tf_eager_policy

replay_buffer = []
metric = py_metrics.AverageReturnMetric()
observers = [replay_buffer.append, metric]

driver = py_driver.PyDriver(
    train_py_env, 
    py_tf_eager_policy.PyTFEagerPolicy(agent.collect_policy, use_tf_function=True), 
    observers, 
    max_steps=1000, 
    max_episodes=100)

initial_time_step = train_py_env.reset()
final_time_step, _ = driver.run(initial_time_step)

print('Average Return: ', metric.result())

replay_buffer

Average Return:  -96.5


[Trajectory(
 {'action': array(5, dtype=int64),
  'discount': array(1., dtype=float32),
  'next_step_type': array(1),
  'observation': array([ 0.,  2.,  1.,  0.,  0.,  2.,  1.,  1.,  1.,  2.,  2.,  2., 10.,
         0.,  0.,  2.,  2.,  2.,  0.,  0., 13.,  0.,  1.,  0.,  0.,  2.,
         0.,  1.,  1.,  1.,  2.,  0.,  1.,  0.,  0.,  2.,  0.,  0.,  0.,
         2.,  1.,  2.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  1.,  1.,  1.,
         1.,  2.,  1.,  2.,  2.,  1.,  0.,  1.,  0.,  2.,  0.,  0.,  2.,
         0.,  1.,  2.,  1.,  2.,  1.,  2.,  0.,  1.,  2.,  2.,  1.,  1.,
         0.,  1.,  1.,  1.,  0.,  1.,  0.,  2.,  0.,  1.,  1.,  0.,  2.,
         2.,  1.,  0.,  2.,  2.,  0.,  0.,  2.,  2., 11.,  0.,  0.,  2.,
         2.,  2.,  0.,  0., 12.,  1.,  2.,  0.,  1.,  0.,  1.,  0.,  1.,
         1.,  2.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 

In [58]:
# Reset the train step.
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

# Reset the environment.
time_step = train_py_env.reset()

# Create a driver to collect experience.
collect_driver = iver.PyDriver(py_dr
    env,
    py_tf_eager_policy.PyTFEagerPolicy(
      agent.collect_policy, use_tf_function=True),
    [rb_observer],
    max_steps=collect_steps_per_iteration)

for _ in range(num_iterations):

  # Collect a few steps and save to the replay buffer.
  time_step, _ = collect_driver.run(time_step)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

NameError: name 'rb_observer' is not defined