In [1]:
import numpy as np
import os
import tensorflow as tf
import warnings

from keras.layers import Dense, Concatenate, Input, Convolution2D, Flatten, Activation, BatchNormalization, Add
from keras.models import Model, load_model
from keras.optimizers import Adam
from pommerman.configs import ffa_competition_env
from pommerman.envs.v0 import Pomme
from pommerman.agents import SimpleAgent, BaseAgent
from pommerman.constants import BOARD_SIZE
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.core import Env, Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint, Callback
from rl.random import OrnsteinUhlenbeckProcess

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Import error! You will not be able to render --> Cannot connect to "None"


In [2]:
# Run configuration shared by the cells below.
# Total number of environment steps handed to agent.fit(nb_steps=...).
number_of_training_steps = 500000
# Interval (in steps) used for both the FileLogger and the ModelIntervalCheckpoint callbacks.
log_interval = 10000
# Destination of the keras-rl FileLogger output.
# NOTE(review): 'ddgp' in these paths looks like a transposition of 'ddpg' - confirm before renaming.
file_log_path = './dqn/rl_logs/ddgp_go_3blocks/log.txt'
# Directory handed to the TensorboardLogger summary writer.
tensorboard_path = './dqn/logs/ddgp_go_3blocks/'
# Weight-file template; '{step}' is presumably filled in by ModelIntervalCheckpoint - verify.
# NOTE(review): the '.h4' extension is probably meant to be '.h5' - confirm.
model_path = './dqn/model/ddgp_go_3blocks/model{step}.h4'

In [3]:
# Make sure the directories that will hold the log file and the model weights exist.
# exist_ok=True makes the call idempotent and avoids the check-then-create race of
# an explicit os.path.isdir() test.
os.makedirs(os.path.dirname(file_log_path), exist_ok=True)
os.makedirs(os.path.dirname(model_path), exist_ok=True)

In [4]:
class TensorforceAgent(BaseAgent):
    """Placeholder agent occupying the slot of the agent being trained."""

    def act(self, obs, action_space):
        """Ignore the observation and return None; no action is chosen here."""
        return None


class TensorboardLogger(Callback):
    """Logging in tensorboard without tensorflow ops.

    Per-step observations, rewards, actions and metrics are buffered per episode
    and turned into scalar summaries when the episode ends. The summaries are
    written through a summary writer that logs to ``log_dir``.
    """
    def __init__(self, log_dir):
        super(TensorboardLogger, self).__init__()
        # Some algorithms compute multiple episodes at once since they are multi-threaded.
        # We therefore use a dictionary that is indexed by the episode to separate episodes
        # from each other.
        self.observations = {}
        self.rewards = {}
        self.actions = {}
        self.metrics = {}
        # Running count of steps seen across all episodes.
        self.step = 0
        # Summary writer logging to log_dir.
        self.writer = tf.summary.FileWriter(log_dir)

    def log_scalar(self, tag, value, step):
        """Log a scalar variable.
        Parameter
        ----------
        tag : basestring
            Name of the scalar
        value
            Scalar to record under ``tag``
        step : int
            training iteration
        """
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def on_train_begin(self, logs):
        """Remember the metric names of the model being trained."""
        self.metrics_names = self.model.metrics_names

    def on_train_end(self, logs):
        """Flush pending summaries so the last episodes are not left in the writer's buffer."""
        self.writer.flush()

    def on_episode_begin(self, episode, logs):
        """Start empty per-episode buffers."""
        self.observations[episode] = []
        self.rewards[episode] = []
        self.actions[episode] = []
        self.metrics[episode] = []

    def on_episode_end(self, episode, logs):
        """Summarise the finished episode, log the scalars and drop its buffers."""
        episode_steps = len(self.observations[episode])
        variables = {
            'step': self.step,
            'episode_steps': episode_steps,
            'episode_reward': np.sum(self.rewards[episode]),
        }
        # Action statistics are only defined when at least one step was recorded;
        # argmax over axis 1 of an empty list would raise.
        if episode_steps > 0:
            # Index of the largest component of each recorded action vector.
            chosen_actions = np.argmax(self.actions[episode], axis=1)
            variables['action_mean'] = np.mean(chosen_actions)
            variables['action_min'] = np.min(chosen_actions)
            variables['action_max'] = np.max(chosen_actions)

        # Format all metrics.
        metrics = np.array(self.metrics[episode])
        # Column indexing below needs a non-empty (steps, metrics) array.
        has_metrics = metrics.ndim == 2 and metrics.shape[0] > 0
        with warnings.catch_warnings():
            # Turn warnings into exceptions so an all-NaN column can be
            # detected and reported as -1.
            warnings.filterwarnings('error')
            for idx, name in enumerate(self.metrics_names):
                if has_metrics:
                    try:
                        value = np.nanmean(metrics[:, idx])
                    except Warning:
                        value = -1
                else:
                    value = -1
                variables[name] = value
        for key, value in variables.items():
            self.log_scalar(key, value, episode + 1)

        # Free up resources.
        del self.observations[episode]
        del self.rewards[episode]
        del self.actions[episode]
        del self.metrics[episode]

    def on_step_end(self, step, logs):
        """Append this step's data to the buffers of the episode it belongs to."""
        episode = logs['episode']
        self.observations[episode].append(logs['observation'])
        self.rewards[episode].append(logs['reward'])
        self.actions[episode].append(logs['action'])
        self.metrics[episode].append(logs['metrics'])
        self.step += 1

In [5]:
# Instantiate the environment from the free-for-all competition configuration.
config = ffa_competition_env()
env = Pomme(**config["env_kwargs"])
# Seed NumPy's global generator and the environment with the same fixed value.
np.random.seed(0)
env.seed(0)

# NOTE(review): passing None presumably means "no predefined initial game state" - confirm.
env.set_init_game_state(None)
# Size of the discrete action space; used as the width of the actor output
# and of the critic's action input.
nb_actions = env.action_space.n

def get_res_block(input):
    """Stack one residual block on top of the tensor ``input`` and return the result.

    The block applies two 3x3 'same'-padded convolutions with 256 filters (ELU
    after the first), adds ``input`` back in, and applies a final ELU.
    """
    # Convolutional branch.
    branch = Convolution2D(256, 3, padding='same')(input)
    branch = Activation('elu')(branch)
    branch = Convolution2D(256, 3, padding='same')(branch)
    # Skip connection followed by the closing non-linearity.
    merged = Add()([input, branch])
    return Activation('elu')(merged)

def create_actor(actions, input_shape=(BOARD_SIZE, BOARD_SIZE, 17,)):
    """Build the actor network.

    # Arguments
        actions (int): Width of the softmax output layer.
        input_shape (tuple): Shape of one observation tensor.
    # Returns
        A Keras ``Model`` mapping an observation to a softmax vector of length ``actions``.
    """
    board_input = Input(input_shape)

    # Stem: 3x3 convolution with 256 filters followed by ELU.
    features = Convolution2D(256, 3, padding='same')(board_input)
    features = Activation('elu')(features)

    # Three residual blocks.
    for _ in range(3):
        features = get_res_block(features)

    # Output head: 1x1 convolution down to 4 filters, ELU, flatten, softmax.
    head = Convolution2D(4, 1, padding='same')(features)
    head = Activation('elu')(head)
    head = Flatten()(head)
    policy = Dense(actions, activation='softmax')(head)

    return Model(inputs = board_input, outputs=policy)


def create_critic(actions, input_shape=(BOARD_SIZE, BOARD_SIZE, 17,)):
    """Build the critic network.

    The observation goes through the same convolutional trunk layout as the
    actor (stem, three residual blocks, 1x1 convolution, flatten). The
    flattened features are concatenated with the action vector and reduced
    to a single output value by two dense layers.

    # Arguments
        actions (int): Length of the action vector fed to the critic.
        input_shape (tuple): Shape of one observation tensor.
    # Returns
        (action_input, model): the action ``Input`` tensor and the Keras
        ``Model`` taking ``[action_input, observation_input]``.
    """
    action_input = Input(shape=(actions,), name='action_input')

    obs_inp = Input(input_shape)
    x = Convolution2D(256, 3, padding='same')(obs_inp)
    x = Activation('elu')(x)

    # Three residual blocks
    for _ in range(3):
        x = get_res_block(x)

    # Feature head: 1x1 convolution down to 4 filters, then flatten.
    x = Convolution2D(4, 1, padding='same')(x)
    x = Activation('elu')(x)
    x = Flatten()(x)

    # Combine the action with the observation features and regress one value.
    x = Concatenate()([action_input, x])
    x = Dense(128, activation='elu')(x)
    out = Dense(1)(x)

    model = Model(inputs=[action_input, obs_inp], outputs=out)
    return action_input, model

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [6]:
class EnvWrapper(Env):
    """Single-agent view of a multi-agent environment for keras-rl.

    Wraps the environment passed as ``gym``. ``step`` and ``reset`` return only
    the training agent's observation (converted to a stacked feature tensor by
    ``featurize``) and reward, while the actions of the other agents are
    obtained from the wrapped environment itself.
    """
    reward_range = (-1, 1)
    action_space = None
    observation_space = None

    def __init__(self, gym, board_size):
        """Store the wrapped environment and mirror its spaces and reward range."""
        self.gym = gym
        self.action_space = gym.action_space
        self.observation_space = gym.observation_space
        self.reward_range = gym.reward_range
        self.board_size = board_size

    def step(self, action):
        """Run one timestep of the environment's dynamics.
        Accepts an action and returns a tuple (observation, reward, done, info).
        # Arguments
            action (object): Action vector; the index of its largest entry is
                used as the training agent's discrete action.
        # Returns
            observation (object): Featurized observation of the training agent.
            reward (float) : Reward of the training agent for this step.
            done (boolean): Whether the episode has ended, in which case further step() calls will return undefined results.
            info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning).
        """
        action = np.argmax(action)
        obs = self.gym.get_observations()
        # Actions of the other agents; the training agent's action is spliced
        # in at its own index.
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, action)
        state, reward, terminal, info = self.gym.step(all_actions)
        agent_state = self.featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        return agent_state, agent_reward, terminal, info

    def reset(self):
        """
        Resets the state of the environment and returns an initial observation.
        # Returns
            observation (object): The initial observation of the space. Initial reward is assumed to be 0.
        """
        # Put the training placeholder at a random seat and fill the other
        # three seats with SimpleAgent opponents.
        train_agent_pos = np.random.randint(0, 4)
        agents = []
        for agent_id in range(4):
            if agent_id == train_agent_pos:
                agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
            else:
                agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
        self.gym.set_agents(agents)
        self.gym.set_training_agent(agents[train_agent_pos].agent_id)

        obs = self.gym.reset()
        agent_obs = self.featurize(obs[self.gym.training_agent])
        return agent_obs

    def render(self, mode='human', close=False):
        """Renders the environment.
        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.)
        # Arguments
            mode (str): The mode to render with.
            close (bool): Close all open renderings.
        """
        self.gym.render(mode=mode, close=close)

    def close(self):
        """Close the wrapped environment."""
        self.gym.close()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        # Returns
            Whatever the wrapped environment's ``seed`` returns.
        """
        # The result must be returned; raising it would fail on every call
        # because the value is not an exception.
        return self.gym.seed(seed)

    def featurize(self, obs):
        """Convert one agent observation dict into a (BOARD_SIZE, BOARD_SIZE, 17) array.

        The 17 channels are, in order: own position, enemies, teammate, one
        binary map for each of the board values 1-9, bomb blast strength,
        bomb life, and three constant planes holding ammo, blast strength
        and the can-kick flag.
        """
        shape = (BOARD_SIZE, BOARD_SIZE, 1)

        def get_matrix(source, key):
            # Board-sized entry of the observation as a float32 single-channel plane.
            res = source[key]
            return res.reshape(shape).astype(np.float32)

        def get_map(board, item):
            # Binary plane that is 1 wherever the board holds `item`.
            layer = np.zeros(shape)
            layer[board == item] = 1
            return layer

        board = get_matrix(obs, 'board')

        # TODO: probably not needed Passage = 0
        rigid_map = get_map(board, 1)               # Rigid = 1
        wood_map = get_map(board, 2)                # Wood = 2
        bomb_map = get_map(board, 3)                # Bomb = 3
        flames_map = get_map(board, 4)              # Flames = 4
        fog_map = get_map(board, 5)                 # TODO: not used for first two stages Fog = 5
        extra_bomb_map = get_map(board, 6)          # ExtraBomb = 6
        incr_range_map = get_map(board, 7)          # IncrRange = 7
        kick_map = get_map(board, 8)                # Kick = 8
        skull_map = get_map(board, 9)               # Skull = 9

        position = obs["position"]
        my_position = np.zeros(shape)
        my_position[position[0], position[1], 0] = 1

        team_mates = get_map(board, obs["teammate"].value) # TODO during documentation it should be an array

        enemies = np.zeros(shape)
        for enemy in obs["enemies"]:
            enemies[board == enemy.value] = 1

        bomb_blast_strength = get_matrix(obs, 'bomb_blast_strength')
        bomb_life = get_matrix(obs, 'bomb_life')

        # Scalar attributes are broadcast to full board-sized planes.
        ammo = np.full(shape, obs["ammo"])
        blast_strength = np.full(shape, obs["blast_strength"])
        can_kick = np.full(shape, int(obs["can_kick"]))

        features = np.concatenate([my_position, enemies, team_mates, rigid_map,
                                   wood_map, bomb_map, flames_map,
                                   fog_map, extra_bomb_map, incr_range_map,
                                   kick_map, skull_map, bomb_blast_strength,
                                   bomb_life, ammo, blast_strength, can_kick], axis=2)
        return features

    def __del__(self):
        self.close()

    def __str__(self):
        return '<{} instance>'.format(type(self).__name__)


class CustomProcessor(Processor):
    """Processor adapting state batches and step info for the agent."""

    def process_state_batch(self, batch):
        """Drop axis 1 of the state batch and cast it to float.

        # Arguments
            batch (list): List of states
        # Returns
            The batch as a float array with axis 1 squeezed out
        """
        squeezed = np.squeeze(batch, axis=1)
        return squeezed.astype('float')

    def process_info(self, info):
        """Replace ``info['result']`` with its ``.value`` (in place) and return ``info``."""
        result = info['result']
        info['result'] = result.value
        return info

In [7]:
# Wrap the environment and build the processor used by the agent.
env_wrapper = EnvWrapper(env, BOARD_SIZE)
processor = CustomProcessor()

# Actor and critic networks sized for the environment's action space.
actor = create_actor(nb_actions)
action_input, critic = create_critic(nb_actions)

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1024, nb_steps_warmup_actor=1024,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  batch_size=1024, processor=processor)
agent.compile(Adam(lr=0.0001, clipnorm=1.), metrics=['mae'])


file_logger = FileLogger(file_log_path, interval=log_interval)
checkpoint = ModelIntervalCheckpoint(model_path, interval=log_interval)
tensorboard = TensorboardLogger(tensorboard_path)
callbacks=[file_logger, checkpoint, tensorboard]

# Resume from previously saved weights when they exist. The DDPG agent never
# reads or writes `model_path` itself: save_weights/load_weights insert
# '_actor' and '_critic' before the extension, so those two files are what has
# to be present on disk.
weights_root, weights_ext = os.path.splitext(model_path)
if all(os.path.isfile(weights_root + suffix + weights_ext) for suffix in ('_actor', '_critic')):
    agent.load_weights(model_path)

In [8]:
# Train the agent on the wrapped environment for the configured number of steps.
# Episodes are capped at the wrapped environment's own step limit (env._max_steps),
# and the three callbacks handle file logging, checkpoints and TensorBoard scalars.
history = agent.fit(env_wrapper, nb_steps=number_of_training_steps, visualize=False, verbose=2,
        nb_max_episode_steps=env._max_steps, callbacks=callbacks)

Training for 500000 steps ...
    104/500000: episode: 1, duration: 2.738s, episode steps: 104, steps per second: 38, episode reward: -1.000, mean reward: -0.010 [-1.000, 0.000], mean action: 0.098 [-0.424, 0.549], mean observation: 0.213 [0.000, 9.000], loss: --, mean_absolute_error: --, mean_q: --
    143/500000: episode: 2, duration: 0.609s, episode steps: 39, steps per second: 64, episode reward: -1.000, mean reward: -0.026 [-1.000, 0.000], mean action: 0.193 [-0.076, 0.425], mean observation: 0.196 [0.000, 9.000], loss: --, mean_absolute_error: --, mean_q: --
    153/500000: episode: 3, duration: 0.140s, episode steps: 10, steps per second: 72, episode reward: -1.000, mean reward: -0.100 [-1.000, 0.000], mean action: 0.164 [-0.036, 0.418], mean observation: 0.174 [0.000, 9.000], loss: --, mean_absolute_error: --, mean_q: --
    173/500000: episode: 4, duration: 0.276s, episode steps: 20, steps per second: 73, episode reward: -1.000, mean reward: -0.050 [-1.000, 0.000], mean action

   5000/500000: episode: 32, duration: 57.293s, episode steps: 85, steps per second: 1, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 0.243 [-0.204, 1.332], mean observation: 0.217 [0.000, 9.000], loss: 0.002001, mean_absolute_error: 0.038458, mean_q: 1.518223
   5074/500000: episode: 33, duration: 49.532s, episode steps: 74, steps per second: 1, episode reward: -1.000, mean reward: -0.014 [-1.000, 0.000], mean action: 0.131 [-0.575, 1.349], mean observation: 0.217 [0.000, 9.000], loss: 0.002004, mean_absolute_error: 0.037872, mean_q: 1.517834
   5291/500000: episode: 34, duration: 145.200s, episode steps: 217, steps per second: 1, episode reward: -1.000, mean reward: -0.005 [-1.000, 0.000], mean action: 0.260 [-0.723, 1.225], mean observation: 0.211 [0.000, 9.000], loss: 0.001447, mean_absolute_error: 0.030355, mean_q: 1.507884
   5528/500000: episode: 35, duration: 157.772s, episode steps: 237, steps per second: 2, episode reward: -1.000, mean reward: -0.0

  14006/500000: episode: 61, duration: 527.952s, episode steps: 800, steps per second: 2, episode reward: 0.000, mean reward: 0.000 [0.000, 0.000], mean action: 0.187 [-1.325, 1.980], mean observation: 0.202 [0.000, 9.000], loss: 0.000969, mean_absolute_error: 0.020660, mean_q: 1.292278
  14165/500000: episode: 62, duration: 106.954s, episode steps: 159, steps per second: 1, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 0.008 [-0.759, 1.060], mean observation: 0.218 [0.000, 9.000], loss: 0.001047, mean_absolute_error: 0.020665, mean_q: 1.280582


KeyError: None

In [None]:
# Persist the current agent weights, overwriting existing files.
# NOTE(review): model_path is passed unformatted, so the literal '{step}' ends up in the
# file name; the DDPG agent presumably writes separate '_actor'/'_critic' files - confirm.
agent.save_weights(model_path, overwrite=True)

In [None]:
# Print the layer-by-layer summary of the actor network.
actor.summary()

In [None]:
# Scratch check: default dtype NumPy picks for a small integer list.
# NOTE(review): nothing else in the notebook uses this cell - consider deleting it.
np.array([123,1,2,4]).dtype