In [10]:
import numpy as np
import os
import tensorflow as tf
import warnings

from keras.models import Model
from keras.layers import Dense, Flatten, Convolution2D, Input, Concatenate, Activation
from keras.optimizers import Adam
from pommerman.configs import ffa_v0_env
from pommerman.envs.v0 import Pomme
from pommerman.agents import SimpleAgent, BaseAgent
from pommerman.constants import BOARD_SIZE
from rl.agents import DDPGAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.core import Env, Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint, Callback
from rl.random import OrnsteinUhlenbeckProcess

In [11]:
# --- Run configuration -------------------------------------------------------
# Output locations for this run (text log, TensorBoard events, weight files).
# NOTE(review): the weight file extension is '.h4' (not the usual '.h5') and the
# name contains a literal '{step}' placeholder -- confirm both are intended.
model_path = './dqn/model/ddgp_dense_8_2/model{step}.h4'
tensorboard_path = './dqn/logs/ddgp_dense_8_2/'
file_log_path = './dqn/rl_logs/ddgp_dense_8_2/log.txt'

# Total number of environment steps to train for.
number_of_training_steps = 500000
# Interval (in steps) handed to the file logger and the checkpoint callback.
log_interval = 1000

In [12]:
# Make sure the directories that will hold the text log and the weight files
# exist. exist_ok=True makes this idempotent and avoids the check-then-create
# race of a separate isdir() test.
for output_file in (file_log_path, model_path):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

In [13]:
class TensorforceAgent(BaseAgent):
    """Placeholder agent that never chooses an action itself."""

    def act(self, obs, action_space):
        """Ignore the observation and action space and return None."""
        return None


class TensorboardLogger(Callback):
    """Logging in tensorboard without tensorflow ops.

    Buffers observations, rewards, actions and metrics per episode and, when
    an episode ends, writes one scalar summary per statistic to an event file
    in ``log_dir``.
    """

    def __init__(self, log_dir):
        """Create the per-episode buffers and the summary writer.

        Parameters
        ----------
        log_dir : str
            Directory the summary writer logs to.
        """
        super().__init__()
        # Some algorithms compute multiple episodes at once since they are multi-threaded.
        # We therefore use a dictionary that is indexed by the episode to separate episodes
        # from each other.
        self.observations = {}
        self.rewards = {}
        self.actions = {}
        self.metrics = {}
        # Number of steps seen so far across all episodes.
        self.step = 0
        # Summary writer logging to log_dir.
        self.writer = tf.summary.FileWriter(log_dir)

    def log_scalar(self, tag, value, step):
        """Log a scalar variable.

        Parameters
        ----------
        tag : basestring
            Name of the scalar
        value
            Value to record under ``tag``
        step : int
            training iteration
        """
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def on_train_begin(self, logs):
        """Remember the metric names reported by the model being trained."""
        self.metrics_names = self.model.metrics_names

    def on_episode_begin(self, episode, logs):
        """Start empty buffers for the given episode."""
        self.observations[episode] = []
        self.rewards[episode] = []
        self.actions[episode] = []
        self.metrics[episode] = []

    def on_episode_end(self, episode, logs):
        """Summarise the finished episode, log the scalars and free its buffers."""
        # Take the buffers out of the dictionaries right away so the memory is
        # released even if logging fails further down.
        observations = self.observations.pop(episode, [])
        rewards = self.rewards.pop(episode, [])
        actions = self.actions.pop(episode, [])
        episode_metrics = self.metrics.pop(episode, [])

        episode_steps = len(observations)
        if episode_steps == 0:
            # Nothing was recorded; min/max over an empty sequence would raise.
            return

        # Each recorded action is treated as a vector; its argmax is the index
        # that gets summarised. Computed once and reused for mean/min/max.
        chosen_actions = np.argmax(actions, axis=1)
        variables = {
            'step': self.step,
            'nb_steps': self.params['nb_steps'],
            'episode_steps': episode_steps,
            'episode_reward': np.sum(rewards),
            'reward_mean': np.mean(rewards),
            'reward_min': np.min(rewards),
            'reward_max': np.max(rewards),
            'action_mean': np.mean(chosen_actions),
            'action_min': np.min(chosen_actions),
            'action_max': np.max(chosen_actions),
            'obs_mean': np.mean(observations),
            'obs_min': np.min(observations),
            'obs_max': np.max(observations),
        }

        # Format all metrics.
        metrics = np.array(episode_metrics)
        with warnings.catch_warnings():
            # Turn warnings into exceptions so that a nanmean over a column
            # that only contains NaN is reported as -1 instead of NaN.
            warnings.filterwarnings('error')
            for idx, name in enumerate(self.metrics_names):
                try:
                    value = np.nanmean(metrics[:, idx])
                except Warning:
                    value = -1
                variables[name] = value
        for key, value in variables.items():
            self.log_scalar(key, value, episode + 1)

        # Push the events to disk so TensorBoard can show them promptly.
        self.writer.flush()

    def on_step_end(self, step, logs):
        """Append this step's observation, reward, action and metrics to its episode."""
        episode = logs['episode']
        self.observations[episode].append(logs['observation'])
        self.rewards[episode].append(logs['reward'])
        self.actions[episode].append(logs['action'])
        self.metrics[episode].append(logs['metrics'])
        self.step += 1

In [14]:
# Build the Pommerman environment from the free-for-all (FFA) v0 configuration.
config = ffa_v0_env()
env = Pomme(**config['env_kwargs'])

# Seed NumPy's global RNG and the environment with 0.
np.random.seed(0)
env.seed(0)

# NOTE(review): None presumably means "no predefined starting state" -- confirm.
env.set_init_game_state(None)

# Size of the discrete action space; used as the width of the actor output.
nb_actions = env.action_space.n


def create_actor(actions, input_shape=(2369,)):
    """Build the actor network.

    Two Dense(8) + ReLU blocks followed by a Dense layer with ``actions``
    units and a softmax activation.

    # Arguments
        actions (int): Number of output units.
        input_shape (tuple): Shape of a single observation.
    # Returns
        The (uncompiled) Keras model.
    """
    observation = Input(input_shape)
    hidden = observation
    for _ in range(2):
        hidden = Dense(8)(hidden)
        hidden = Activation('relu')(hidden)
    logits = Dense(actions)(hidden)
    probabilities = Activation('softmax')(logits)
    return Model(inputs=observation, outputs=probabilities)


def create_critic(actions, input_shape=(2369,)):
    """Build the critic network.

    The action and observation inputs are concatenated, passed through two
    Dense(8) + ReLU blocks and reduced to a single output unit.

    # Arguments
        actions (int): Width of the action input.
        input_shape (tuple): Shape of a single observation.
    # Returns
        Tuple ``(action_input, model)``: the action input tensor and the
        (uncompiled) Keras model taking ``[action_input, observation_input]``.
    """
    action_input = Input(shape=(actions,), name='action_input')
    observation_input = Input(shape=input_shape, name='observation_input')
    hidden = Concatenate()([action_input, observation_input])
    for _ in range(2):
        hidden = Dense(8)(hidden)
        hidden = Activation('relu')(hidden)
    q_value = Dense(1)(hidden)
    critic_model = Model(inputs=[action_input, observation_input], outputs=q_value)
    return action_input, critic_model


# Build both networks; the critic also hands back its action input tensor.
actor = create_actor(nb_actions)
action_input, critic = create_critic(nb_actions)

# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() would only add a stray "None" line to the output.
actor.summary()
critic.summary()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 2369)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 8)                 18960     
_________________________________________________________________
activation_6 (Activation)    (None, 8)                 0         
_________________________________________________________________
dense_8 (Dense)              (None, 8)                 72        
_________________________________________________________________
activation_7 (Activation)    (None, 8)                 0         
_________________________________________________________________
dense_9 (Dense)              (None, 6)                 54        
__________________________________

In [15]:
class EnvWrapper(Env):
    """Single-agent view of a multi-agent Pommerman environment.

    Wraps the underlying environment (``gym``) so that it exposes the
    `step` / `reset` / `render` / `close` API of `Env`. On every step the
    actions of the other agents are obtained from the wrapped environment,
    the training agent's action is inserted at its index, and only the
    training agent's featurized observation and reward are returned.
    """
    reward_range = (-1, 1)
    action_space = None
    observation_space = None

    def __init__(self, gym, board_size):
        """Store the wrapped environment and mirror its spaces.

        # Arguments
            gym: The wrapped environment.
            board_size (int): Side length of the square board.
        """
        self.gym = gym
        self.action_space = gym.action_space
        self.observation_space = gym.observation_space
        self.reward_range = gym.reward_range
        self.board_size = board_size

    def step(self, action):
        """Run one timestep of the environment's dynamics.
        Accepts an action and returns a tuple (observation, reward, done, info).
        # Arguments
            action (object): Action vector for the training agent; the index of
                its largest entry is the action that is executed.
        # Returns
            observation (object): Featurized observation of the training agent.
            reward (float) : Reward of the training agent for this step.
            done (boolean): Whether the episode has ended, in which case further step() calls will return undefined results.
            info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning).
        """
        action = np.argmax(action)
        obs = self.gym.get_observations()
        # Actions of the other agents; the training agent's own action is
        # inserted at its index.
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, action)
        state, reward, terminal, info = self.gym.step(all_actions)
        agent_state = self.featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        return agent_state, agent_reward, terminal, info

    def reset(self):
        """
        Resets the state of the environment and returns an initial observation.

        The training agent is placed in a randomly chosen one of the four
        slots; the remaining three slots are filled with `SimpleAgent`s.
        # Returns
            observation (object): The initial featurized observation of the
                training agent. Initial reward is assumed to be 0.
        """
        train_agent_pos = np.random.randint(0, 4)
        agents = []
        for agent_id in range(4):
            if agent_id == train_agent_pos:
                agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
            else:
                agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
        self.gym.set_agents(agents)
        self.gym.set_training_agent(agents[train_agent_pos].agent_id)

        obs = self.gym.reset()
        agent_obs = self.featurize(obs[self.gym.training_agent])
        return agent_obs

    def render(self, mode='human', close=False):
        """Renders the environment.
        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.)
        # Arguments
            mode (str): The mode to render with.
            close (bool): Close all open renderings.
        """
        self.gym.render(mode=mode, close=close)

    def close(self):
        """Close the wrapped environment."""
        self.gym.close()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        # Returns
            Whatever the wrapped environment's `seed` returns.
        """
        # The result must be returned; raising it would fail on every call.
        return self.gym.seed(seed)

    def configure(self, *args, **kwargs):
        """Provides runtime configuration to the environment.

        Not supported by this wrapper.
        # Raises
            NotImplementedError: Always.
        """
        raise NotImplementedError()

    def featurize(self, obs):
        """Turn one agent's observation dict into a flat feature vector.

        Fourteen ``board_size x board_size`` planes (own position, enemies,
        teammate, one plane per board item 1-9, bomb blast strength, bomb
        life) are stacked along the last axis and flattened; `ammo`,
        `blast_strength` and `can_kick` are appended as three scalars.

        # Arguments
            obs (dict): Observation with the keys 'board', 'position',
                'teammate', 'enemies', 'bomb_blast_strength', 'bomb_life',
                'ammo', 'blast_strength' and 'can_kick'.
        # Returns
            1-D numpy array of length ``14 * board_size ** 2 + 3``.
        """
        shape = (self.board_size, self.board_size, 1)

        def get_matrix(observation, key):
            # Board-shaped entry of the observation as a float32 plane.
            return observation[key].reshape(shape).astype(np.float32)

        def get_map(board, item):
            # Binary plane marking the cells of `board` that equal `item`.
            item_map = np.zeros(shape)
            item_map[board == item] = 1
            return item_map

        board = get_matrix(obs, 'board')

        # TODO: probably not needed Passage = 0
        rigid_map = get_map(board, 1)               # Rigid = 1
        wood_map = get_map(board, 2)                # Wood = 2
        bomb_map = get_map(board, 3)                # Bomb = 3
        flames_map = get_map(board, 4)              # Flames = 4
        fog_map = get_map(board, 5)                 # TODO: not used for first two stages Fog = 5
        extra_bomb_map = get_map(board, 6)          # ExtraBomb = 6
        incr_range_map = get_map(board, 7)          # IncrRange = 7
        kick_map = get_map(board, 8)                # Kick = 8
        skull_map = get_map(board, 9)               # Skull = 9

        position = obs["position"]
        my_position = np.zeros(shape)
        my_position[position[0], position[1], 0] = 1

        team_mates = get_map(board, obs["teammate"].value) # TODO during documentation it should be an array

        enemies = np.zeros(shape)
        for enemy in obs["enemies"]:
            enemies[board == enemy.value] = 1

        bomb_blast_strength = get_matrix(obs, 'bomb_blast_strength')
        bomb_life = get_matrix(obs, 'bomb_life')

        ammo = obs["ammo"]
        blast_strength = obs["blast_strength"]
        can_kick = int(obs["can_kick"])

        features = np.concatenate([my_position, enemies, team_mates, rigid_map,
                                   wood_map, bomb_map, flames_map,
                                   fog_map, extra_bomb_map, incr_range_map,
                                   kick_map, skull_map, bomb_blast_strength,
                                   bomb_life], axis=2).flatten()
        features = np.append(features, [ammo, blast_strength, can_kick])
        return features

    def __del__(self):
        self.close()

    def __str__(self):
        return '<{} instance>'.format(type(self).__name__)


class CustomProcessor(Processor):
    """Processor that adapts state batches and the info dict for the agent."""

    def process_state_batch(self, batch):
        """Drop axis 1 (which must have length 1) from a batch of states.

        # Arguments
            batch (list): List of states
        # Returns
            The batch with axis 1 squeezed out.
        """
        return np.squeeze(batch, axis=1)

    def process_info(self, info):
        """Replace ``info['result']`` by its ``.value`` and return the dict.

        The dictionary is modified in place.
        """
        result = info['result']
        info['result'] = result.value
        return info

In [16]:
# Single-agent wrapper around the Pommerman environment plus its processor.
env_wrapper = EnvWrapper(env, BOARD_SIZE)
processor = CustomProcessor()


# Replay memory, exploration noise and the DDPG agent itself.
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=512, nb_steps_warmup_actor=512,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  batch_size=512, processor=processor)
agent.compile(Adam(lr=0.0001, clipnorm=1.), metrics=['mae'])


# Callbacks: text log, periodic weight checkpoints and TensorBoard scalars.
file_logger = FileLogger(file_log_path, interval=log_interval)
checkpoint = ModelIntervalCheckpoint(model_path, interval=log_interval)
tensorboard = TensorboardLogger(tensorboard_path)
callbacks = [file_logger, checkpoint, tensorboard]

# Resume from previously saved weights if they exist. DDPGAgent stores the
# actor and the critic in two separate files, named by inserting '_actor' /
# '_critic' before the extension of the given path, so `model_path` itself
# never exists on disk and testing it directly would never load anything.
weights_root, weights_ext = os.path.splitext(model_path)
saved_weight_files = [weights_root + suffix + weights_ext for suffix in ('_actor', '_critic')]
if all(os.path.isfile(weight_file) for weight_file in saved_weight_files):
    agent.load_weights(model_path)

In [None]:
# Train the agent; episodes are cut off at the environment's own step limit.
history = agent.fit(
    env_wrapper,
    nb_steps=number_of_training_steps,
    callbacks=callbacks,
    visualize=False,
    verbose=2,
    nb_max_episode_steps=env._max_steps,
)

Training for 500000 steps ...
     44/500000: episode: 1, duration: 0.887s, episode steps: 44, steps per second: 50, episode reward: -1.000, mean reward: -0.023 [-1.000, 0.000], mean action: 0.195 [-0.265, 0.465], mean observation: 0.070 [0.000, 24.000], loss: --, mean_absolute_error: --, mean_q: --
    720/500000: episode: 2, duration: 15.889s, episode steps: 676, steps per second: 43, episode reward: -1.000, mean reward: -0.001 [-1.000, 0.000], mean action: -0.011 [-1.055, 1.412], mean observation: 0.047 [0.000, 24.000], loss: 0.004099, mean_absolute_error: 0.039448, mean_q: 0.070418
   1115/500000: episode: 3, duration: 21.261s, episode steps: 395, steps per second: 19, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.197 [-0.775, 1.029], mean observation: 0.049 [0.000, 24.000], loss: 0.002258, mean_absolute_error: 0.029248, mean_q: 0.009115
   1140/500000: episode: 4, duration: 1.393s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean 

   1965/500000: episode: 30, duration: 1.884s, episode steps: 35, steps per second: 19, episode reward: -1.000, mean reward: -0.029 [-1.000, 0.000], mean action: 0.179 [-0.193, 1.023], mean observation: 0.070 [0.000, 24.000], loss: 0.003115, mean_absolute_error: 0.039058, mean_q: 0.025047
   2058/500000: episode: 31, duration: 5.025s, episode steps: 93, steps per second: 19, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 0.240 [-0.254, 1.039], mean observation: 0.066 [0.000, 24.000], loss: 0.003472, mean_absolute_error: 0.039918, mean_q: 0.024143
   2218/500000: episode: 32, duration: 8.912s, episode steps: 160, steps per second: 18, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 0.167 [-0.439, 1.439], mean observation: 0.058 [0.000, 24.000], loss: 0.003215, mean_absolute_error: 0.040020, mean_q: 0.013994
   2280/500000: episode: 33, duration: 3.333s, episode steps: 62, steps per second: 19, episode reward: -1.000, mean reward: -0.0

   3409/500000: episode: 59, duration: 1.386s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.121 [-0.288, 1.036], mean observation: 0.072 [0.000, 24.000], loss: 0.001949, mean_absolute_error: 0.032546, mean_q: -0.011619
   3434/500000: episode: 60, duration: 1.344s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.151 [-0.271, 1.232], mean observation: 0.065 [0.000, 24.000], loss: 0.002021, mean_absolute_error: 0.032848, mean_q: -0.016352
   3459/500000: episode: 61, duration: 1.393s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.194 [-0.130, 1.120], mean observation: 0.071 [0.000, 24.000], loss: 0.001951, mean_absolute_error: 0.032966, mean_q: -0.011841
   3484/500000: episode: 62, duration: 1.381s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0

   4160/500000: episode: 88, duration: 1.338s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.154 [-0.151, 1.120], mean observation: 0.070 [0.000, 24.000], loss: 0.001912, mean_absolute_error: 0.032521, mean_q: -0.011332
   4185/500000: episode: 89, duration: 1.346s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.131 [-0.372, 1.016], mean observation: 0.072 [0.000, 24.000], loss: 0.001705, mean_absolute_error: 0.031613, mean_q: -0.009297
   4210/500000: episode: 90, duration: 1.380s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.226 [-0.099, 1.150], mean observation: 0.072 [0.000, 24.000], loss: 0.001903, mean_absolute_error: 0.031903, mean_q: -0.013797
   4235/500000: episode: 91, duration: 1.384s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0

   4913/500000: episode: 117, duration: 1.328s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.235 [-0.144, 1.182], mean observation: 0.072 [0.000, 24.000], loss: 0.002043, mean_absolute_error: 0.032446, mean_q: -0.026052
   4938/500000: episode: 118, duration: 1.349s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.222 [-0.175, 1.123], mean observation: 0.072 [0.000, 24.000], loss: 0.001881, mean_absolute_error: 0.031520, mean_q: -0.021851
   4963/500000: episode: 119, duration: 1.357s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.177 [-0.178, 1.027], mean observation: 0.072 [0.000, 24.000], loss: 0.001861, mean_absolute_error: 0.031340, mean_q: -0.023628
   4988/500000: episode: 120, duration: 1.404s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward

   5638/500000: episode: 146, duration: 1.468s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.178 [-0.353, 1.106], mean observation: 0.072 [0.000, 24.000], loss: 0.001930, mean_absolute_error: 0.032335, mean_q: -0.043011
   5663/500000: episode: 147, duration: 1.498s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.227 [-0.286, 1.163], mean observation: 0.072 [0.000, 24.000], loss: 0.001881, mean_absolute_error: 0.032525, mean_q: -0.041068
   5688/500000: episode: 148, duration: 1.451s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.182 [-0.155, 1.167], mean observation: 0.072 [0.000, 24.000], loss: 0.001930, mean_absolute_error: 0.031767, mean_q: -0.040887
   5713/500000: episode: 149, duration: 1.289s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward

   6421/500000: episode: 175, duration: 1.358s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.172 [-0.184, 1.230], mean observation: 0.072 [0.000, 24.000], loss: 0.001885, mean_absolute_error: 0.032631, mean_q: -0.071815
   6446/500000: episode: 176, duration: 1.400s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.168 [-0.180, 1.304], mean observation: 0.072 [0.000, 24.000], loss: 0.001972, mean_absolute_error: 0.032485, mean_q: -0.070970
   6497/500000: episode: 177, duration: 2.696s, episode steps: 51, steps per second: 19, episode reward: -1.000, mean reward: -0.020 [-1.000, 0.000], mean action: 0.110 [-0.521, 1.002], mean observation: 0.071 [0.000, 24.000], loss: 0.001983, mean_absolute_error: 0.032806, mean_q: -0.067918
   6542/500000: episode: 178, duration: 2.390s, episode steps: 45, steps per second: 19, episode reward: -1.000, mean reward

   7650/500000: episode: 204, duration: 4.332s, episode steps: 82, steps per second: 19, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 0.149 [-0.339, 1.179], mean observation: 0.068 [0.000, 24.000], loss: 0.002170, mean_absolute_error: 0.034874, mean_q: -0.105200
   7759/500000: episode: 205, duration: 5.480s, episode steps: 109, steps per second: 20, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 0.015 [-0.675, 1.299], mean observation: 0.059 [0.000, 24.000], loss: 0.002203, mean_absolute_error: 0.034923, mean_q: -0.108401
   7786/500000: episode: 206, duration: 1.476s, episode steps: 27, steps per second: 18, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 0.179 [-0.229, 1.114], mean observation: 0.070 [0.000, 24.000], loss: 0.002039, mean_absolute_error: 0.034359, mean_q: -0.111071
   7814/500000: episode: 207, duration: 1.550s, episode steps: 28, steps per second: 18, episode reward: -1.000, mean rewar

  14938/500000: episode: 232, duration: 23.067s, episode steps: 432, steps per second: 19, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.255 [-0.899, 1.739], mean observation: 0.052 [0.000, 24.000], loss: 0.001368, mean_absolute_error: 0.025530, mean_q: -0.170953
  15100/500000: episode: 233, duration: 8.724s, episode steps: 162, steps per second: 19, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: -0.032 [-0.878, 1.273], mean observation: 0.063 [0.000, 24.000], loss: 0.001330, mean_absolute_error: 0.025395, mean_q: -0.171189
  15615/500000: episode: 234, duration: 27.531s, episode steps: 515, steps per second: 19, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.171 [-1.117, 1.964], mean observation: 0.049 [0.000, 24.000], loss: 0.001326, mean_absolute_error: 0.025888, mean_q: -0.176642
  15876/500000: episode: 235, duration: 14.367s, episode steps: 261, steps per second: 18, episode reward: -1.000, mea

  24050/500000: episode: 260, duration: 23.984s, episode steps: 451, steps per second: 19, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.096 [-1.160, 1.677], mean observation: 0.051 [0.000, 24.000], loss: 0.001064, mean_absolute_error: 0.023738, mean_q: -0.202797
  24391/500000: episode: 261, duration: 18.682s, episode steps: 341, steps per second: 18, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.283 [-0.942, 1.278], mean observation: 0.053 [0.000, 24.000], loss: 0.001088, mean_absolute_error: 0.023746, mean_q: -0.204676
  24703/500000: episode: 262, duration: 16.769s, episode steps: 312, steps per second: 19, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.226 [-0.555, 1.618], mean observation: 0.057 [0.000, 24.000], loss: 0.001094, mean_absolute_error: 0.024026, mean_q: -0.203727
  24986/500000: episode: 263, duration: 15.221s, episode steps: 283, steps per second: 19, episode reward: -1.000, mea

In [None]:
# Persist the final weights, replacing any files already present at this path.
agent.save_weights(filepath=model_path, overwrite=True)