In [10]:
import numpy as np
import os
import shutil

from pommerman.envs.v0 import Pomme
from pommerman.agents import SimpleAgent, BaseAgent
from pommerman.configs import ffa_v0_env
from pommerman.constants import BOARD_SIZE, GameType
from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

In [11]:
num_episodes = 30000
batching_capacity = 1000
save_seconds = 300
main_dir = './ppoPcrnn/'
log_path = main_dir + 'logs/'
model_path = main_dir + 'model'

In [12]:
if not os.path.isdir(main_dir):
    os.mkdir(main_dir)
if os.path.isdir(log_path):
    shutil.rmtree(log_path, ignore_errors=True)
os.mkdir(log_path)

In [15]:
# Instantiate the environment
config = ffa_v0_env()
env = Pomme(**config["env_kwargs"])
env.seed(0)

# Create a Proximal Policy Optimization agent
network = dict(type='pomm_network.PommNetwork')
#TODO size
states = {
    "board": dict(shape=(25, 25, 3, ), type='float'),
    "state": dict(shape=(3,), type='float')
}
saver = {
    "directory": model_path,
    "seconds": save_seconds,
    "load": os.path.isdir(model_path)
}
agent = PPOAgent(
    states=states,
    actions=dict(type='int', num_actions=env.action_space.n),
    network=network,
    batching_capacity=batching_capacity,
    step_optimizer=dict(
        type='adam',
        learning_rate=1e-4
    ),
    saver=saver
)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


TensorForceError: Error: object PommNetwork not found in predefined objects: []

In [None]:
class TensorforceAgent(BaseAgent):
    def act(self, obs, action_space):
        pass
# Add 3 random agents
agents = []
for agent_id in range(3):
    agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

# Add TensorforceAgent
agent_id += 1
agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
env.set_agents(agents)
env.set_training_agent(agents[-1].agent_id)
env.set_init_game_state(None)

In [6]:
class WrappedEnv(OpenAIGym):
    def __init__(self, gym, visualize=False):
        self.gym = gym
        self.visualize = visualize

    def execute(self, actions):
        if self.visualize:
            self.gym.render()

        obs = self.gym.get_observations()
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, actions)
        state, reward, terminal, _ = self.gym.step(all_actions)
        agent_state = WrappedEnv.featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        # If nobody die, use some "smart" reward
        if agent_reward == 0:
            agent_reward = self.gym.train_reward
        return agent_state, terminal, agent_reward

    def reset(self):
        obs = self.gym.reset()
        agent_obs = WrappedEnv.featurize(obs[3])
        return agent_obs
    
    @staticmethod
    def center_view(board, mypos, viewSize=25):
        # make sure of odd viewSize
        viewSize = viewSize + 1 if viewSize % 2 == 0 else viewSize 

        # assumed board's odd shape, dimensions must be odd!        
        wmax, hmax = board.shape[1]*2+1, board.shape[0]*2+1 
        agentView = np.ones((wmax,hmax,1)) # agent centric full-world coverage
        center = (agentView.shape[0]//2+1, agentView.shape[1]//2+1)

        # copy board to the new view
        offset_y = center[0]-mypos[0]-1
        offset_x = center[1]-mypos[1]-1
        agentView[offset_y:offset_y+13, offset_x:offset_x+13] = board
        #np.savetxt('board.txt', agentView, fmt="%2.i") # save to file for debug

        # finalize view size
        r = viewSize // 2
        start, end = center[0]-r-1, center[0]+r
        agentView = agentView[start:end, start:end] 
        #np.savetxt('board_cut.txt', agentView, fmt="%2.i") # save to file for debug

        return np.array(agentView, dtype=np.float32)

    @staticmethod
    def featurize(obs):
        def get_matrix(dict, key):
            res = dict[key]
            return res.reshape(res.shape[0], res.shape[1], 1).astype(np.float32)

        board = get_matrix(obs, 'board')
        teammate_position = None
        teammate = obs["teammate"]
        if teammate is not None:
            teammate = teammate.value
            if teammate > 10 and teammate < 15:
                teammate_position = np.argwhere(board == teammate)[0]
        else:
            teammate = None
        # My self - 11
        # Team mate - 12
        # Enemy - 13

        # Everyone enemy
        board[(board > 10) & (board < 15)] = 13
        # I'm not enemy
        my_position = obs['position']
        board[my_position[0], my_position[1], 0] = 11
        # Set teammate
        if teammate_position is not None:
            board[teammate_position[0], teammate_position[1], teammate_position[2]] = 12

        bomb_blast_strength = get_matrix(obs, 'bomb_blast_strength')
        bomb_life = get_matrix(obs, 'bomb_life')
        
        board = WrappedEnv.center_view(board, my_position)
        bomb_blast_strength = WrappedEnv.center_view(bomb_blast_strength, my_position)
        bomb_life = WrappedEnv.center_view(bomb_life, my_position)
        
        conv_inp = np.concatenate([board, bomb_blast_strength, bomb_life], axis=2)
        state = np.array([obs["ammo"], obs["blast_strength"], obs["can_kick"]]).astype(np.float32)        
        return dict(board=conv_inp, state=state)

In [7]:
def episode_finished(r):
    if r.episode % 10 == 0:
        print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts = r.timestep + 1))
        print("Episode reward: {}".format(r.episode_rewards[-1]))
        print("Average of last 10 rewards: {}".format(np.mean(r.episode_rewards[10:])))
    return True

In [None]:
# Instantiate and run the environment for 5 episodes.
wrapped_env = WrappedEnv(env, False)
runner = Runner(agent=agent, environment=wrapped_env)
runner.run(num_episodes=num_episodes, episode_finished=episode_finished, max_episode_timesteps=env._max_steps)
print("Stats: ", runner.episode_rewards, runner.episode_timesteps, runner.episode_times)

try:
    runner.close()
except AttributeError as e:
    pass

Finished episode 11 after 500 timesteps
Episode reward: -1.1320000000000001
Average of last 10 rewards: nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Finished episode 21 after 1030 timesteps
Episode reward: -1.177
Average of last 10 rewards: -1.1018999999999999
Finished episode 31 after 1684 timesteps
Episode reward: -1.051
Average of last 10 rewards: -1.0785
Finished episode 41 after 2511 timesteps
Episode reward: -0.6439999999999999
Average of last 10 rewards: -1.0694333333333332
Finished episode 51 after 3041 timesteps
Episode reward: -1.032
Average of last 10 rewards: -1.0682500000000001
Finished episode 61 after 3608 timesteps
Episode reward: -1.2750000000000001
Average of last 10 rewards: -1.0659
Finished episode 71 after 4113 timesteps
Episode reward: -1.039
Average of last 10 rewards: -1.0701666666666667
Finished episode 81 after 4545 timesteps
Episode reward: -1.027
Average of last 10 rewards: -1.0708571428571425
Finished episode 91 after 4942 timesteps
Episode reward: -1.103
Average of last 10 rewards: -1.0714875
Finished episode 101 after 5541 timesteps
Episode reward: -1.2460000000000002
Average of last 10 rewards: -1.06

Finished episode 711 after 45020 timesteps
Episode reward: -1.199
Average of last 10 rewards: -1.0586442857142857
Finished episode 721 after 45635 timesteps
Episode reward: -1.034
Average of last 10 rewards: -1.058723943661972
Finished episode 731 after 46145 timesteps
Episode reward: -1.058
Average of last 10 rewards: -1.0585597222222223
Finished episode 741 after 46526 timesteps
Episode reward: -1.04
Average of last 10 rewards: -1.058146575342466
Finished episode 751 after 47273 timesteps
Episode reward: -1.059
Average of last 10 rewards: -1.0588297297297298
Finished episode 761 after 48000 timesteps
Episode reward: -0.8660000000000001
Average of last 10 rewards: -1.0591439999999999
Finished episode 771 after 48591 timesteps
Episode reward: -1.074
Average of last 10 rewards: -1.059742105263158
Finished episode 781 after 49242 timesteps
Episode reward: -1.247
Average of last 10 rewards: -1.0592623376623378
Finished episode 791 after 49746 timesteps
Episode reward: -0.6719999999999999


Finished episode 1381 after 86847 timesteps
Episode reward: -1.092
Average of last 10 rewards: -1.0637992700729926
Finished episode 1391 after 87329 timesteps
Episode reward: -1.052
Average of last 10 rewards: -1.0636949275362317
Finished episode 1401 after 87873 timesteps
Episode reward: -0.6839999999999999
Average of last 10 rewards: -1.0635352517985612
Finished episode 1411 after 88342 timesteps
Episode reward: -1.047
Average of last 10 rewards: -1.0633771428571428
INFO:tensorflow:Saving checkpoints for 88963 into ./ppo/model/model.ckpt.
Finished episode 1421 after 88964 timesteps
Episode reward: -1.057
Average of last 10 rewards: -1.0636234042553192
Finished episode 1431 after 89775 timesteps
Episode reward: -1.065
Average of last 10 rewards: -1.0639
Finished episode 1441 after 90308 timesteps
Episode reward: -1.052
Average of last 10 rewards: -1.0637335664335665
Finished episode 1451 after 90756 timesteps
Episode reward: -1.034
Average of last 10 rewards: -1.0637340277777778
Finis

Finished episode 2051 after 125877 timesteps
Episode reward: -1.074
Average of last 10 rewards: -1.063444607843137
Finished episode 2061 after 126565 timesteps
Episode reward: -1.062
Average of last 10 rewards: -1.063361951219512
Finished episode 2071 after 127214 timesteps
Episode reward: -1.079
Average of last 10 rewards: -1.0630757281553398
INFO:tensorflow:Saving checkpoints for 127677 into ./ppo/model/model.ckpt.
Finished episode 2081 after 127971 timesteps
Episode reward: -1.043
Average of last 10 rewards: -1.062919806763285
Finished episode 2091 after 128584 timesteps
Episode reward: -1.3860000000000003
Average of last 10 rewards: -1.0627783653846155
Finished episode 2151 after 131846 timesteps
Episode reward: -1.039
Average of last 10 rewards: -1.0633509345794392
Finished episode 2161 after 132601 timesteps
Episode reward: -1.041
Average of last 10 rewards: -1.0635972093023256
Finished episode 2171 after 133087 timesteps
Episode reward: -1.137
Average of last 10 rewards: -1.0632