In [1]:
import os
import sys
import numpy as np

from pommerman.agents import SimpleAgent, RandomAgent, PlayerAgent, BaseAgent
from pommerman.configs import ffa_v0_env
from pommerman.envs.v0 import Pomme
from pommerman.characters import Bomber
from pommerman import utility
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Most recent raw observation handed to featurize(); kept only for debugging.
dobz = None
# Side length of the square agent-centred view; also read when declaring the
# network's state shape, so both must stay in sync through this one global.
stateInput = 13

  from ._conv import register_converters as _register_converters


In [2]:
def reward_shaping(obs, state, reward):
    """Placeholder reward-shaping hook: hands the reward back unchanged.

    `obs` and `state` are accepted so shaping logic can be added later,
    but neither is inspected at the moment.
    """
    shaped_reward = reward
    return shaped_reward

In [3]:
def center_view(board, mypos, viewSize=25):
    """Return a square crop of `board` centred on `mypos`.

    The crop is `viewSize` x `viewSize` (an even `viewSize` is bumped to the
    next odd number so that a single centre cell exists). Cells of the crop
    that fall outside the board are filled with 1.
    NOTE(review): 1 is presumably the "rigid wall" item code -- confirm.

    Args:
        board: 2-D array of any shape (no longer restricted to 13x13 or to
            square boards).
        mypos: (row, col) index of the cell that becomes the crop's centre.
        viewSize: requested side length of the crop; may exceed the board.

    Returns:
        A float32 array of shape (viewSize, viewSize).
    """
    # Make sure the view has a well-defined centre cell.
    if viewSize % 2 == 0:
        viewSize += 1
    half = viewSize // 2

    board = np.asarray(board)
    rows, cols = board.shape
    row, col = int(mypos[0]), int(mypos[1])

    # Start from an all-ones view and copy in only the part of the board that
    # overlaps it; this avoids building a (2*rows+1, 2*cols+1) scratch canvas
    # and works for any board shape and any view size.
    view = np.ones((viewSize, viewSize), dtype=np.float32)

    # Board index range covered by the view, clipped to the board's extent.
    row_lo, row_hi = max(row - half, 0), min(row + half + 1, rows)
    col_lo, col_hi = max(col - half, 0), min(col + half + 1, cols)

    if row_lo < row_hi and col_lo < col_hi:
        # Board cell (r, c) lands at view cell (r - row + half, c - col + half).
        view[row_lo - row + half:row_hi - row + half,
             col_lo - col + half:col_hi - col + half] = board[row_lo:row_hi, col_lo:col_hi]

    return view
    
# test    
# center_view(obz['board'], obz['position'], 7)

In [4]:
def featurize(obz):
    """Turn one agent observation into a (stateInput, stateInput, 3) state.

    Each channel is the element-wise sum of three `center_view` crops (board,
    bomb life, bomb blast strength) centred on a particular position:

    * channel 0 -- centred on this agent (`obz['position']`);
    * channel 1 -- centred on the teammate when it can be found on the board,
      otherwise a copy of the agent-centred view;
    * channel 2 -- sum of the views centred on every enemy found on the
      board (all zeros when none is found).

    Side effect: stores `obz` in the module-level `dobz` for debugging.

    Args:
        obz: observation dict; reads the keys "board", "bomb_blast_strength",
            "bomb_life", "position", "teammate" and "enemies".

    Returns:
        Array of shape (stateInput, stateInput, 3).
    """
    global dobz

    dobz = obz  # for debugging purposes
    viewSize = stateInput  # view/state input size

    board = obz["board"]
    bomb_blast_strength = obz["bomb_blast_strength"]
    bomb_life = obz["bomb_life"]
    mypos = obz['position']

    # TODO: obz["ammo"], obz["blast_strength"] and obz["can_kick"] are not
    # encoded into the state yet.

    def stacked_view(pos):
        """Sum of the board, bomb-life and blast-strength crops around `pos`."""
        view = center_view(board, pos, viewSize)
        view += center_view(bomb_life, pos, viewSize)
        view += center_view(bomb_blast_strength, pos, viewSize)
        return view

    def locate(item_value):
        """Return the (row, col) of the first board cell equal to `item_value`, or None."""
        hits = np.argwhere(board == item_value)
        return hits[0] if len(hits) else None

    # agent's channel
    ch1 = stacked_view(mypos)

    # teammate channel: the previous version unconditionally overwrote this
    # with the agent-centred view, which made the teammate lookup dead code.
    teammate = obz["teammate"]
    teammatePos = locate(teammate.value) if teammate is not None else None
    if teammatePos is not None:
        ch2 = stacked_view(teammatePos)
    else:
        # No teammate, or it is not present on the board: fall back to own view.
        ch2 = stacked_view(mypos)

    # opponents channel: accumulate one view per enemy found on the board
    ch3 = np.zeros((viewSize, viewSize))
    for enemy in obz["enemies"]:
        enemyPos = locate(enemy.value)
        if enemyPos is None:
            continue
        ch3 += stacked_view(enemyPos)

    return np.dstack((ch1, ch2, ch3))


class TensorforceAgent(BaseAgent):
    """Stand-in agent for the Tensorforce-controlled player slot."""

    def act(self, obs, action_space):
        """Do nothing and return None; `obs` and `action_space` are ignored."""
        return None

In [5]:
# https://github.com/reinforceio/tensorforce/blob/master/tensorforce/tests/test_tutorial_code.py
from tensorforce.agents import DQNAgent, PPOAgent

# Instantiate the free-for-all environment from its config and seed it.
config = ffa_v0_env()
env = Pomme(**config["env_kwargs"])
env.seed(0)


# Network is an ordered list of layers: two conv layers, then flatten.
network_spec = [
    dict(type='conv2d', size=64, window=8, stride=4),
    dict(type='conv2d', size=32, window=4, stride=2),
    dict(type='flatten')
#     dict(type='dense', size=64),
#     dict(type='dense', size=64)
]

# Define a state: a stateInput x stateInput view with 3 channels,
# matching the three channels that featurize() stacks.
states = dict(shape=(stateInput,stateInput,3), type='float')
# states = dict(shape=(stateInput*stateInput*3,), type='float') # for linear state

# Define an action: one integer out of the environment's discrete action space.
actions = dict(type='int', num_actions=env.action_space.n)

# DQN agent; constructed here but not the one selected below.
agent_dqn = DQNAgent(
    states=states,
    actions=actions,
    network=network_spec,
    update_mode=dict(
        unit='timesteps',
        batch_size=1,
        frequency=1
    ),
    memory=dict(
        type='latest',
        include_next_states=True,
        capacity=100
    ),
    target_sync_frequency=10
)


# Create a Proximal Policy Optimization agent
agent_ppo = PPOAgent(
    states=states,
    actions=actions,
    network=network_spec,
    batching_capacity=1000,
    step_optimizer=dict(
        type='adam',
        learning_rate=1e-4
    )
)

# The PPO agent is the one handed to the Runner later on.
agent = agent_ppo

# Add 3 simple agents (ids 0, 1, 2)
agents = []
for agent_id in range(3):
    agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

# Add TensorforceAgent; relies on the loop variable still holding 2 here,
# so the training agent gets id 3.
agent_id += 1
agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
env.set_agents(agents)
# Mark the last agent in the list (the TensorforceAgent) as the training agent.
env.set_training_agent(agents[-1].agent_id)
env.set_init_game_state(None)

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [6]:
class WrappedEnv(OpenAIGym):
    """Adapter between the multi-agent `gym` environment and a single learner.

    Only the training agent's featurized observation and reward are exposed;
    the actions of all other agents are obtained from the wrapped environment
    on every step.
    """

    def __init__(self, gym, visualize=False):
        """Store the already-built environment.

        NOTE(review): OpenAIGym.__init__ is not called here -- presumably on
        purpose, because the environment is passed in ready-made; confirm.

        Args:
            gym: the wrapped environment instance.
            visualize: when true, render the environment before each step.
        """
        self.gym = gym
        self.visualize = visualize

    def execute(self, actions):
        """Advance the environment by one step using the learner's action.

        Args:
            actions: action chosen for the training agent.

        Returns:
            Tuple (agent_state, terminal, agent_reward) for the training agent.
        """
        if self.visualize:
            self.gym.render()

        obs = self.gym.get_observations()
        # Actions of the other agents; the learner's action is spliced in at
        # the training agent's index.
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, actions)
        state, reward, terminal, _ = self.gym.step(all_actions)
        reward = reward_shaping(obs, state, reward)
        agent_state = featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        return agent_state, terminal, agent_reward

    def reset(self):
        """Reset the environment and return the training agent's featurized state."""
        obs = self.gym.reset()
        # Index by the configured training agent rather than a hard-coded 3,
        # so reset() stays consistent with execute().
        return featurize(obs[self.gym.training_agent])

In [7]:
# Instantiate and run the environment for 5 episodes (rendering enabled).
wrapped_env = WrappedEnv(env, True)
runner = Runner(agent=agent, environment=wrapped_env)
try:
    runner.run(episodes=5, max_episode_timesteps=2000)
    print("Stats: ", runner.episode_rewards, runner.episode_timesteps, runner.episode_times)
finally:
    # Always attempt cleanup, even when the run itself failed.
    try:
        runner.close()
    except AttributeError:
        # Best-effort close: an AttributeError raised during close() is
        # deliberately ignored, as in the original cell.
        pass

Stats:  [-1, -1, -1, -1, -1] [30, 54, 27, 28, 28] [4.317109107971191, 7.399764537811279, 3.5615615844726562, 3.1949238777160645, 3.3633885383605957]


In [9]:
# Debugging aid: display the last raw observation that featurize() stored in `dobz`.
dobz

{'ammo': 1,
 'blast_strength': 3,
 'board': array([[ 0,  8,  1,  1,  2,  1,  1,  0,  0,  1,  1,  1,  1],
        [ 0,  0,  0,  0,  2,  2,  2,  2,  4,  4,  4,  4,  2],
        [ 1,  0, 11,  1,  2,  1,  1,  2,  0,  4,  1,  0,  1],
        [ 1,  0,  1,  0,  1,  1,  2,  1,  2,  4,  1,  0,  2],
        [ 2,  2,  2,  1,  0,  2,  0,  1,  0,  2,  2,  2,  0],
        [ 1,  2,  1,  1,  2,  0,  2,  2,  2,  0,  0,  2,  0],
        [ 1,  2,  1,  2,  0,  2,  0,  0,  0,  0,  2,  2,  1],
        [ 0,  2,  2,  1,  1,  2,  0,  0,  1,  1,  0,  2,  1],
        [ 0,  2,  0,  2,  0,  2,  0,  1,  0,  1,  0,  2,  1],
        [ 1,  0,  0,  0,  2,  0,  0,  1,  1,  0,  0,  3,  0],
        [ 1,  0,  1,  1,  2,  0,  2,  0, 13,  0,  0,  0,  1],
        [ 1, 12,  0,  0,  2,  2,  2,  2,  2,  0,  0,  0,  0],
        [ 1,  2,  1,  2,  0,  0,  1,  1,  1,  0,  1,  0,  0]], dtype=uint8),
 'bomb_blast_strength': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.