In [1]:
import numpy as np
import os
import tensorflow as tf

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.layers import Input, Dense, Flatten, Convolution2D
from keras.models import Model
from keras.utils import to_categorical
from pommerman.agents import BaseAgent, SimpleAgent
from pommerman.configs import ffa_v0_env
from pommerman.constants import BOARD_SIZE
from pommerman.envs.v0 import Pomme

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# --- Training hyper-parameters ---
batch_size = 16384          # mini-batch size passed to model.fit
epochs = 100                # maximum epochs per DAgger training round
initial_rollouts = 80       # expert rollouts used to build the initial dataset
num_rollouts = 80           # learner rollouts collected per DAgger iteration
dagger_iterations = 100     # number of DAgger iterations
early_stopping = 10         # EarlyStopping patience (epochs)

# --- Paths ---
# The trailing separator matters: Agent.train builds the CSV log file name as
# `log_path + 'log.csv'`, so without it the file would be created next to the
# run directory (".../cnn3dense1log.csv") instead of inside it.
# NOTE(review): the log directory says "cnn3dense1" while the model directory
# says "cnn2dense1" -- confirm which run name is intended.
log_path = './dagger/logs/cnn3dense1/'
# NOTE(review): ".h4" looks like a typo for ".h5"; left unchanged so existing
# checkpoints are still found.
model_path = './dagger/model/cnn2dense1/model.h4'
train_data_path = './dagger/train_data/'
train_data_obs = 'obs.npy'
train_data_labels = 'labels.npy'

In [3]:
class Logger(object):
    """Logging in tensorboard without tensorflow ops."""

    def __init__(self, log_dir):
        """Creates a summary writer logging to log_dir.

        Parameters
        ----------
        log_dir : str
            Directory handed to ``tf.summary.FileWriter``.
        """
        self.writer = tf.summary.FileWriter(log_dir)

    def log_scalar(self, tag, value, step):
        """Log a scalar variable.

        Parameters
        ----------
        tag : str
            Name of the scalar
        value
            Scalar value stored as the summary's ``simple_value``
        step : int
            training iteration
        """
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag,
                                                     simple_value=value)])
        self.writer.add_summary(summary, step)
        # Flush right away so the event reaches disk even if the kernel is
        # interrupted before the writer's own periodic flush runs.
        self.writer.flush()

In [4]:
class Agent:
    """Convolutional policy trained by imitation on featurized Pommerman observations.

    The agent owns a Keras model, a TensorBoard ``Logger`` and the bookkeeping
    (epoch counter, recorded rewards) needed to train over several rounds.
    """

    def __init__(self, actions, seed=0, save_path="./dagger/model/model.h4",
                 log_path='./dagger/logs/', save_best_only=True):
        """Build the model and, if a checkpoint exists at ``save_path``, load it.

        Parameters
        ----------
        actions : int
            Number of discrete actions (size of the output layer).
        seed : int
            Currently unused; kept for interface compatibility.
        save_path : str
            File used by ``ModelCheckpoint`` and for reloading weights.
        log_path : str
            Directory for TensorBoard events and the CSV training log.
        save_best_only : bool
            Forwarded to ``ModelCheckpoint``.
        """
        self.log_path = log_path
        self.save_path = save_path
        self.actions = actions
        self.save_best_only = save_best_only
        self.rewards = []
        self.current_epoch = 0
        self.logger = Logger(self.log_path)

        self.model = self.create_model(actions)

        # dirname is '' for a bare file name; os.makedirs('') would raise.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        if os.path.isfile(self.save_path):
            print("Trying to load model")
            try:
                self.model.load_weights(self.save_path)
            except Exception as exc:
                # Best effort: keep the freshly initialised model, but say why
                # the checkpoint could not be used instead of hiding the error.
                print("Model load failed: %s" % exc)
            else:
                print("Model was loaded successful")

    def create_model(self, actions, input_shape=(13, 13, 17,)):
        """Create and compile the policy network.

        Parameters
        ----------
        actions : int
            Number of output classes.
        input_shape : tuple
            Shape of one featurized observation.

        Returns
        -------
        A compiled Keras ``Model``.
        """
        inp = Input(input_shape)
        # NOTE(review): the convolutions have no activation, so the three layers
        # compose into a single linear map -- confirm whether that is intended.
        x = Convolution2D(64, 3)(inp)
        x = Convolution2D(64, 3)(x)
        x = Convolution2D(64, 3)(x)
        x = Flatten()(x)
        x = Dense(128, activation='relu')(x)
        # categorical_crossentropy expects a probability distribution, so the
        # output layer needs a softmax; argmax in act() is unaffected by it.
        out = Dense(actions, activation='softmax')(x)
        model = Model(inputs=inp, outputs=out)
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def train(self, obs, labels, batch_size=16384, epochs=100, early_stopping = 10):
        """Fit the model, reload the checkpointed weights and log the history.

        Parameters
        ----------
        obs : array
            Featurized observations (model inputs).
        labels : array
            One-hot action labels.
        batch_size : int
        epochs : int
            Maximum number of epochs for this call.
        early_stopping : int
            Patience (in epochs) of the ``EarlyStopping`` callback.
        """
        if self.log_path:
            os.makedirs(self.log_path, exist_ok=True)
        callbacks = [
            EarlyStopping(monitor='loss', patience=early_stopping),
            ModelCheckpoint(self.save_path, monitor='loss', save_best_only=self.save_best_only),
            ReduceLROnPlateau(monitor='loss', patience=3, factor=0.7),
            # os.path.join keeps the CSV inside log_path whether or not the
            # configured path ends with a separator.
            CSVLogger(os.path.join(self.log_path, 'log.csv'), append=True),
        ]

        history = self.model.fit(x=obs, y=labels, batch_size=batch_size, epochs=epochs, verbose=1,
                                 callbacks=callbacks,
                                 validation_split=0.2, shuffle=True)
        # Continue from the weights written by the checkpoint callback.
        self.model.load_weights(self.save_path)
        self.log_history(history)
        # 'loss' has one entry per completed epoch and is always present.
        self.current_epoch += len(history.history['loss'])

    def log_history(self, history):
        """Write every per-epoch metric of a Keras ``History`` to TensorBoard.

        Steps continue from ``self.current_epoch`` so successive training rounds
        form one curve. Each metric is logged under a fixed tag; alternative
        history key spellings are accepted and metrics that are absent are skipped.
        """
        metric_keys = (
            ('val_loss', ('val_loss',)),
            ('val_acc', ('val_acc', 'val_accuracy')),
            ('loss', ('loss',)),
            ('acc', ('acc', 'accuracy')),
            ('lr', ('lr', 'learning_rate')),
        )
        for tag, candidates in metric_keys:
            key = next((k for k in candidates if k in history.history), None)
            if key is None:
                continue
            for ind, el in enumerate(history.history[key]):
                self.add_log(tag, el, self.current_epoch + ind + 1)

    @staticmethod
    def featurize(obs):
        """Turn one observation dict into a (BOARD_SIZE, BOARD_SIZE, 17) array.

        Channel order: own position, enemies, teammate, rigid, wood, bomb,
        flames, fog, extra-bomb, incr-range, kick, skull, bomb blast strength,
        bomb life, ammo, blast strength, can-kick. The last three are scalars
        broadcast over the whole board.
        """
        shape = (BOARD_SIZE, BOARD_SIZE, 1)

        def get_matrix(source, key):
            # Board-shaped entry as a float32 plane with a trailing channel axis.
            return source[key].reshape(shape).astype(np.float32)

        def get_map(board, item):
            # Binary plane marking the cells whose board value equals `item`.
            plane = np.zeros(shape)
            plane[board == item] = 1
            return plane

        board = get_matrix(obs, 'board')

        # TODO: probably not needed Passage = 0
        rigid_map = get_map(board, 1)               # Rigid = 1
        wood_map = get_map(board, 2)                # Wood = 2
        bomb_map = get_map(board, 3)                # Bomb = 3
        flames_map = get_map(board, 4)              # Flames = 4
        fog_map = get_map(board, 5)                 # TODO: not used for first two stages Fog = 5
        extra_bomb_map = get_map(board, 6)          # ExtraBomb = 6
        incr_range_map = get_map(board, 7)          # IncrRange = 7
        kick_map = get_map(board, 8)                # Kick = 8
        skull_map = get_map(board, 9)               # Skull = 9

        position = obs["position"]
        my_position = np.zeros(shape)
        my_position[position[0], position[1], 0] = 1

        team_mates = get_map(board, obs["teammate"].value) # TODO during documentation it should be an array

        enemies = np.zeros(shape)
        for enemy in obs["enemies"]:
            enemies[board == enemy.value] = 1

        bomb_blast_strength = get_matrix(obs, 'bomb_blast_strength')
        bomb_life = get_matrix(obs, 'bomb_life')

        ammo = np.full(shape, obs["ammo"])
        blast_strength = np.full(shape, obs["blast_strength"])
        can_kick = np.full(shape, int(obs["can_kick"]))

        return np.concatenate([my_position, enemies, team_mates, rigid_map,
                               wood_map, bomb_map, flames_map,
                               fog_map, extra_bomb_map, incr_range_map,
                               kick_map, skull_map, bomb_blast_strength,
                               bomb_life, ammo, blast_strength, can_kick], axis=2)

    def act(self, obs):
        """Return ``(action_index, featurized_obs)`` for one raw observation."""
        obs = self.featurize(obs)
        obs = np.array([obs])  # add the batch axis expected by predict
        predictions = self.model.predict(obs)
        return np.argmax(predictions), obs[0]

    def record_reward(self, reward):
        """Store the mean of ``reward`` in ``self.rewards``."""
        self.rewards.append(np.mean(reward))

    def add_log(self, tag, value, step):
        """Forward one scalar to the TensorBoard logger."""
        self.logger.log_scalar(tag, value, step)

In [5]:
# Thin adapter that gives the scripted SimpleAgent the same methods as Agent
class Expert:
    """Expert policy backed by a ``SimpleAgent``.

    Exposes ``act``, ``record_reward`` and ``add_log``; the two bookkeeping
    methods do nothing.
    """

    def __init__(self, config):
        """Create the wrapped ``SimpleAgent`` from ``config``."""
        self.__agent = SimpleAgent(config)

    def act(self, obs):
        """Return ``(expert_action, None)``; there is no featurized observation."""
        expert_action = self.__agent.act(obs, None)
        return expert_action, None

    def record_reward(self, reward):
        """Ignore rewards."""
        return None

    def add_log(self, tag, value, step):
        """Ignore log requests."""
        return None


class TensorforceAgent(BaseAgent):
    """Placeholder agent registered with the environment as the training agent.

    Its ``act`` never chooses a move; ``Stimulator.stimulate`` inserts the
    training agent's action into the action list itself.
    """

    def act(self, obs, action_space):
        """Return ``None`` regardless of the observation."""
        return None


# Environment wrapper
class Stimulator:
    """Runs rollouts of an agent in the environment and collects training data."""

    def __init__(self, env, config):
        """Store ``env``, register the agents and reset the episode counter."""
        self.env = env
        self.init(config)
        self.episode_number = 0

    def init(self, config):
        """Seed the environment and register three opponents plus the training slot."""
        self.env.seed(0)
        # Three scripted SimpleAgent opponents (agent ids 0..2)
        agents = [SimpleAgent(config["agent"](agent_id, config["game_type"]))
                  for agent_id in range(3)]

        # Placeholder for the agent being trained (agent id 3)
        agents.append(TensorforceAgent(config["agent"](len(agents), config["game_type"])))
        self.env.set_agents(agents)
        self.env.set_training_agent(agents[-1].agent_id)
        self.env.set_init_game_state(None)

    def stimulate(self, agent, num_rollouts, render=False, logging=False, featurized=False):
        """Play ``num_rollouts`` episodes with ``agent`` in the training slot.

        Parameters
        ----------
        agent
            Object with ``act(obs) -> (action, preprocessed_obs)``,
            ``record_reward(returns)`` and ``add_log(tag, value, step)``.
        num_rollouts : int
            Number of episodes to play.
        render : bool
            Render the environment before every step.
        logging : bool
            Log episode reward and length through ``agent.add_log``.
        featurized : bool
            Currently unused; kept for interface compatibility.

        Returns
        -------
        tuple
            ``(observations, one_hot_actions, preprocessed_obs)``. Index ``k`` of
            every array refers to the same decision step: the observation the
            agent saw, the action it chose and the featurized observation
            returned by ``agent.act``.
        """
        returns = []
        observations = []
        preprocessed_obs = []
        actions = []

        for i in range(num_rollouts):
            self.episode_number += 1
            obs = self.env.reset()
            done = False
            total_reward = 0.
            episode_steps = 0

            while not done:
                if render:
                    self.env.render()
                # Keep the observation the decision is based on; after step()
                # `obs` already holds the next state, and recording that would
                # shift observations one step against actions/preprocessed_obs.
                agent_obs = obs[self.env.training_agent]
                action, prep_obs = agent.act(agent_obs)
                all_actions = self.env.act(obs)
                all_actions.insert(self.env.training_agent, action)
                obs, reward, done, _ = self.env.step(all_actions)
                total_reward += reward[self.env.training_agent]
                episode_steps += 1

                observations.append(agent_obs)
                actions.append(action)
                preprocessed_obs.append(prep_obs)
            print('rollout %i/%i return=%f' % (i + 1, num_rollouts, total_reward))
            if logging:
                agent.add_log('Episode reward', total_reward, self.episode_number)
                agent.add_log('Episode length', episode_steps, self.episode_number)
            returns.append(total_reward)
        print('Return summary: mean=%f, std=%f' % (np.mean(returns), np.std(returns)))
        agent.record_reward(returns)
        return (np.array(observations), to_categorical(actions, self.env.action_space.n),  np.array(preprocessed_obs))

    def label_obs(self, expert, obs):
        """Return one-hot expert actions for every observation in ``obs``."""
        actions = [expert.act(o)[0] for o in obs]
        return to_categorical(actions, self.env.action_space.n)

In [6]:
# Instantiate the environment
config = ffa_v0_env()
env = Pomme(**config["env_kwargs"])

agent_dagger = Agent(env.action_space.n, save_path=model_path, log_path=log_path)
expert = Expert(config["agent"](0, config["game_type"]))
stimulator = Stimulator(env, config)

# Use the cached dataset only when both files are present; an existing but
# empty directory must not prevent generating fresh data.
obs_file = os.path.join(train_data_path, train_data_obs)
labels_file = os.path.join(train_data_path, train_data_labels)
if os.path.isfile(obs_file) and os.path.isfile(labels_file):
    # The observations are featurized below, so they are presumably stored as
    # raw observation dicts (an object array), which needs allow_pickle=True on
    # recent numpy. Unpickling executes code: only load files you created.
    raw_obs = np.load(obs_file, allow_pickle=True)
    full_labels = np.load(labels_file)
else:
    # Generate training data from expert rollouts.
    # stimulate() returns (observations, one-hot actions, preprocessed obs).
    raw_obs, full_labels, _ = stimulator.stimulate(expert, num_rollouts=initial_rollouts)

# Convert every raw observation into the model's input representation
full_obs = np.array([Agent.featurize(raw) for raw in raw_obs])

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [None]:
# Train DAgger Agent
# Each iteration: roll out the current learner, have the expert label the
# states it visited, add them to the dataset and retrain on everything.
try:
    # range(1, N + 1) so that exactly `dagger_iterations` rounds are run
    for i in range(1, dagger_iterations + 1):
        print("Train with DAgger, iter %i" % i)
        (obs, _, prep) = stimulator.stimulate(agent_dagger, num_rollouts=num_rollouts, logging=True)
        labels = stimulator.label_obs(expert, obs)
        full_obs = np.append(full_obs, prep, axis=0)
        full_labels = np.append(full_labels, labels, axis=0)
        agent_dagger.train(full_obs, full_labels, batch_size=batch_size, epochs=epochs, early_stopping=early_stopping)
finally:
    # Release the environment even if a rollout or training step fails
    env.close()

Train with DAgger, iter 1
rollout 1/80 return=-1.000000
rollout 2/80 return=-1.000000
rollout 3/80 return=-1.000000
rollout 4/80 return=-1.000000
rollout 5/80 return=-1.000000
rollout 6/80 return=-1.000000
rollout 7/80 return=-1.000000
rollout 8/80 return=-1.000000
rollout 9/80 return=-1.000000
rollout 10/80 return=-1.000000
rollout 11/80 return=-1.000000
rollout 12/80 return=-1.000000
rollout 13/80 return=1.000000
rollout 14/80 return=-1.000000
rollout 15/80 return=-1.000000
rollout 16/80 return=-1.000000
rollout 17/80 return=-1.000000
rollout 18/80 return=-1.000000
rollout 19/80 return=-1.000000
rollout 20/80 return=-1.000000
rollout 21/80 return=-1.000000
rollout 22/80 return=-1.000000
rollout 23/80 return=-1.000000
rollout 24/80 return=-1.000000
rollout 25/80 return=-1.000000
rollout 26/80 return=-1.000000
rollout 27/80 return=-1.000000
rollout 28/80 return=-1.000000
rollout 29/80 return=-1.000000
rollout 30/80 return=-1.000000
rollout 31/80 return=-1.000000
rollout 32/80 return=-1

rollout 76/80 return=-1.000000
rollout 77/80 return=1.000000
rollout 78/80 return=-1.000000
rollout 79/80 return=-1.000000
rollout 80/80 return=-1.000000
Return summary: mean=-0.950000, std=0.312250
{(0, 8): None, (0, 9): None, (0, 11): None, (0, 12): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (2, 5): None, (2, 6): None, (2, 10): None, (2, 11): (3, 11), (2, 12): None, (3, 6): None, (3, 7): None, (3, 8): None, (3, 9): None, (3, 10): (3, 11), (3, 11): (4, 11), (4, 4): None, (4, 5): None, (4, 8): None, (4, 9): (4, 10), (4, 10): (4, 11), (4, 11): None, (5, 4): None, (5, 5): None, (5, 6): None, (5, 8): None, (5, 11): (4, 11), (5, 12): (5, 11), (6, 5): None, (6, 6): None, (6, 7): None, (6, 8): None, (6, 11): (5, 11), (7, 6): None, (7, 7): None, (7, 8): None, (7, 11): (6, 11), (8, 7): None, (8, 8): None, (8, 11): (7, 11), (9, 9): None, (9, 11): (8, 11), (9, 12): (9, 11), (10, 9): None, (10, 11): (9, 11), (10, 12): (9, 12)} None (4, 11)
{(0, 7):

{(0, 8): None, (0, 10): None, (0, 12): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (2, 6): None, (2, 8): None, (2, 10): None, (2, 11): (3, 11), (3, 6): None, (3, 8): None, (3, 9): None, (3, 11): (4, 11), (3, 12): (3, 11), (4, 4): None, (4, 5): None, (4, 8): None, (4, 11): None, (5, 4): None, (5, 5): None, (5, 6): None, (5, 7): None, (5, 8): None, (5, 11): (4, 11), (6, 5): None, (6, 6): None, (6, 8): None, (6, 9): None, (6, 11): (5, 11), (8, 7): None, (8, 8): None, (8, 9): None, (8, 11): None, (8, 12): None, (9, 8): None, (9, 9): None, (9, 11): None, (9, 12): None, (10, 9): None, (10, 11): None, (10, 12): None, (11, 11): None, (11, 12): None} None (4, 11)
{(0, 9): None, (0, 12): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 5): None, (2, 7): None, (2, 10): None, (2, 11): (3, 11), (2, 12): None, (3, 4): None, (3, 5): None, (3, 6): None, (3, 10): (3, 11), (3, 11): (4, 11), (3, 

{(0, 7): None, (0, 8): None, (0, 9): None, (0, 10): None, (0, 11): None, (0, 12): None, (1, 6): None, (1, 7): None, (1, 8): None, (1, 9): None, (1, 10): None, (1, 11): None, (1, 12): None, (2, 5): None, (2, 7): None, (2, 10): None, (2, 11): (3, 11), (3, 5): None, (3, 6): None, (3, 7): None, (3, 8): None, (3, 10): (3, 11), (3, 11): (4, 11), (3, 12): (3, 11), (4, 4): None, (4, 5): None, (4, 8): None, (4, 11): None, (4, 12): (4, 11), (5, 4): None, (5, 5): None, (5, 6): None, (5, 7): None, (5, 8): None, (5, 9): None, (5, 11): (4, 11), (6, 5): None, (6, 6): None, (6, 8): None, (6, 10): (6, 11), (6, 11): (5, 11), (7, 7): None, (7, 10): (7, 11), (7, 11): (6, 11), (8, 8): None, (8, 9): None, (8, 10): (8, 11), (8, 11): (7, 11), (8, 12): (8, 11), (9, 8): None, (9, 9): None, (9, 10): (9, 11), (9, 11): (8, 11), (9, 12): (8, 12), (10, 9): None, (10, 10): (10, 11), (10, 11): (9, 11), (11, 10): (11, 11), (11, 11): (10, 11)} None (4, 11)
{(0, 7): None, (0, 8): None, (0, 11): None, (1, 6): None, (1, 7)