In [1]:
import numpy as np
import os
import shutil

from pommerman.envs.v0 import Pomme
from pommerman.agents import SimpleAgent, BaseAgent
from pommerman.configs import ffa_v0_env
from pommerman.constants import BOARD_SIZE, GameType
from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
num_episodes = 30000
batching_capacity = 1000
save_seconds = 300
main_dir = './ppo/'
log_path = main_dir + 'logs/'
model_path = main_dir + 'model'

In [3]:
if not os.path.isdir(main_dir):
    os.mkdir(main_dir)
if os.path.isdir(log_path):
    shutil.rmtree(log_path, ignore_errors=True)
os.mkdir(log_path)

In [4]:
# Instantiate the environment
config = ffa_v0_env()
env = Pomme(**config["env_kwargs"])
env.seed(0)

# Create a Proximal Policy Optimization agent
network = dict(type='pomm_network.PommNetwork')
states = {
    "board": dict(shape=(BOARD_SIZE, BOARD_SIZE, 3, ), type='float'),
    "state": dict(shape=(3,), type='float')
}
saver = {
    "directory": model_path,
    "seconds": save_seconds,
    "load": os.path.isdir(model_path)
}
agent = PPOAgent(
    states=states,
    actions=dict(type='int', num_actions=env.action_space.n),
    network=network,
    batching_capacity=batching_capacity,
    step_optimizer=dict(
        type='adam',
        learning_rate=1e-4
    ),
    saver=saver
)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 0 into ./ppo/model/model.ckpt.


In [5]:
class TensorforceAgent(BaseAgent):
    def act(self, obs, action_space):
        pass
# Add 3 random agents
agents = []
for agent_id in range(3):
    agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

# Add TensorforceAgent
agent_id += 1
agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
env.set_agents(agents)
env.set_training_agent(agents[-1].agent_id)
env.set_init_game_state(None)

In [6]:
class WrappedEnv(OpenAIGym):
    def __init__(self, gym, visualize=False):
        self.gym = gym
        self.visualize = visualize

    def execute(self, actions):
        if self.visualize:
            self.gym.render()

        obs = self.gym.get_observations()
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, actions)
        state, reward, terminal, _ = self.gym.step(all_actions)
        agent_state = WrappedEnv.featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        # If nobody die, use some "smart" reward
        if agent_reward == 0:
            agent_reward = self.gym.train_reward
        return agent_state, terminal, agent_reward

    def reset(self):
        obs = self.gym.reset()
        agent_obs = WrappedEnv.featurize(obs[3])
        return agent_obs

    @staticmethod
    def featurize(obs):
        def get_matrix(dict, key):
            res = dict[key]
            return res.reshape(res.shape[0], res.shape[1], 1).astype(np.float32)

        board = get_matrix(obs, 'board')
        teammate_position = None
        teammate = obs["teammate"]
        if teammate is not None:
            teammate = teammate.value
            if teammate > 10 and teammate < 15:
                teammate_position = np.argwhere(board == teammate)[0]
        else:
            teammate = None
        # My self - 11
        # Team mate - 12
        # Enemy - 13

        # Everyone enemy
        board[(board > 10) & (board < 15)] = 13
        # I'm not enemy
        my_position = obs['position']
        board[my_position[0], my_position[1], 0] = 11
        # Set teammate
        if teammate_position is not None:
            board[teammate_position[0], teammate_position[1], teammate_position[2]] = 12

        bomb_blast_strength = get_matrix(obs, 'bomb_blast_strength')
        bomb_life = get_matrix(obs, 'bomb_life')
        conv_inp = np.concatenate([board, bomb_blast_strength, bomb_life], axis=2)
        state = np.array([obs["ammo"], obs["blast_strength"], obs["can_kick"]]).astype(np.float32)
        return dict(board=conv_inp, state=state)

In [7]:
def episode_finished(r):
    if r.episode % 10 == 0:
        print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts = r.timestep + 1))
        print("Episode reward: {}".format(r.episode_rewards[-1]))
        print("Average of last 10 rewards: {}".format(np.mean(r.episode_rewards[10:])))
    return True

In [None]:
# Instantiate and run the environment for 5 episodes.
wrapped_env = WrappedEnv(env, False)
runner = Runner(agent=agent, environment=wrapped_env)
runner.run(num_episodes=num_episodes, episode_finished=episode_finished, max_episode_timesteps=env._max_steps)
print("Stats: ", runner.episode_rewards, runner.episode_timesteps, runner.episode_times)

try:
    runner.close()
except AttributeError as e:
    pass

Finished episode 11 after 509 timesteps
Episode reward: -1.041
Average of last 10 rewards: nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Finished episode 21 after 1215 timesteps
Episode reward: -1.033
Average of last 10 rewards: -1.1287
Finished episode 31 after 1755 timesteps
Episode reward: -1.08
Average of last 10 rewards: -1.0877500000000002
Finished episode 41 after 2342 timesteps
Episode reward: -1.051
Average of last 10 rewards: -1.0775000000000001
Finished episode 51 after 2770 timesteps
Episode reward: -1.112
Average of last 10 rewards: -1.0662749999999999
Finished episode 61 after 3589 timesteps
Episode reward: -1.065
Average of last 10 rewards: -1.0710799999999998
Finished episode 71 after 4270 timesteps
Episode reward: -1.095
Average of last 10 rewards: -1.0727499999999999
Finished episode 81 after 4695 timesteps
Episode reward: -1.072
Average of last 10 rewards: -1.0671714285714284
Finished episode 91 after 5136 timesteps
Episode reward: -1.115
Average of last 10 rewards: -1.0688625
Finished episode 101 after 5899 timesteps
Episode reward: -1.058
Average of last 10 rewards: -1.0764333333333331
Finished epis

Finished episode 721 after 45280 timesteps
Episode reward: -1.057
Average of last 10 rewards: -1.0741732394366197
Finished episode 731 after 45738 timesteps
Episode reward: -1.137
Average of last 10 rewards: -1.0744194444444444
Finished episode 741 after 46238 timesteps
Episode reward: -1.061
Average of last 10 rewards: -1.0742205479452056
Finished episode 751 after 47109 timesteps
Episode reward: -1.1980000000000002
Average of last 10 rewards: -1.074372972972973
Finished episode 761 after 48084 timesteps
Episode reward: -0.7010000000000001
Average of last 10 rewards: -1.0748146666666667
Finished episode 771 after 48440 timesteps
Episode reward: -1.077
Average of last 10 rewards: -1.074775
Finished episode 781 after 48745 timesteps
Episode reward: -1.111
Average of last 10 rewards: -1.0746376623376623
Finished episode 791 after 49140 timesteps
Episode reward: -0.655
Average of last 10 rewards: -1.073575641025641
Finished episode 801 after 49494 timesteps
Episode reward: -1.053
Average 

INFO:tensorflow:Saving checkpoints for 88617 into ./ppo/model/model.ckpt.
Finished episode 1411 after 89472 timesteps
Episode reward: -1.065
Average of last 10 rewards: -1.0741828571428573
Finished episode 1421 after 90376 timesteps
Episode reward: -1.342
Average of last 10 rewards: -1.0744836879432624
Finished episode 1431 after 90829 timesteps
Episode reward: -1.112
Average of last 10 rewards: -1.0745577464788734
Finished episode 1441 after 91526 timesteps
Episode reward: -1.07
Average of last 10 rewards: -1.074323776223776
Finished episode 1451 after 92118 timesteps
Episode reward: -1.047
Average of last 10 rewards: -1.0745701388888889
Finished episode 1461 after 92931 timesteps
Episode reward: -1.2240000000000002
Average of last 10 rewards: -1.075353103448276
Finished episode 1471 after 93553 timesteps
Episode reward: -0.7690000000000001
Average of last 10 rewards: -1.0753027397260273
Finished episode 1481 after 94210 timesteps
Episode reward: -1.11
Average of last 10 rewards: -1.0

Finished episode 2081 after 129569 timesteps
Episode reward: -1.086
Average of last 10 rewards: -1.0738859903381643
Finished episode 2091 after 130346 timesteps
Episode reward: -1.05
Average of last 10 rewards: -1.0740298076923076
Finished episode 2101 after 130883 timesteps
Episode reward: -1.066
Average of last 10 rewards: -1.073967942583732
Finished episode 2111 after 131269 timesteps
Episode reward: -0.673
Average of last 10 rewards: -1.0737895238095239
Finished episode 2121 after 131695 timesteps
Episode reward: -0.671
Average of last 10 rewards: -1.0734488151658768
Finished episode 2131 after 132114 timesteps
Episode reward: -1.037
Average of last 10 rewards: -1.073233490566038
Finished episode 2141 after 132512 timesteps
Episode reward: -1.1560000000000001
Average of last 10 rewards: -1.073038028169014
Finished episode 2151 after 133011 timesteps
Episode reward: -1.117
Average of last 10 rewards: -1.0732018691588785
Finished episode 2161 after 133691 timesteps
Episode reward: -1

Finished episode 2761 after 172674 timesteps
Episode reward: -1.2480000000000002
Average of last 10 rewards: -1.0733407272727273
Finished episode 2771 after 173167 timesteps
Episode reward: -1.143
Average of last 10 rewards: -1.073253623188406
Finished episode 2781 after 173730 timesteps
Episode reward: -1.065
Average of last 10 rewards: -1.0733064981949458
Finished episode 2791 after 174439 timesteps
Episode reward: -1.055
Average of last 10 rewards: -1.0735061151079137
Finished episode 2801 after 175014 timesteps
Episode reward: -1.116
Average of last 10 rewards: -1.0734491039426521
Finished episode 2811 after 175655 timesteps
Episode reward: -1.033
Average of last 10 rewards: -1.073559285714286
Finished episode 2821 after 176180 timesteps
Episode reward: -0.677
Average of last 10 rewards: -1.0735419928825625
Finished episode 2831 after 176881 timesteps
Episode reward: -1.2480000000000002
Average of last 10 rewards: -1.0735514184397164
INFO:tensorflow:Saving checkpoints for 177921 in

Finished episode 3431 after 214748 timesteps
Episode reward: -1.1920000000000002
Average of last 10 rewards: -1.072011403508772
Finished episode 3441 after 215309 timesteps
Episode reward: -1.032
Average of last 10 rewards: -1.0719932944606414
Finished episode 3451 after 215753 timesteps
Episode reward: -1.059
Average of last 10 rewards: -1.072011046511628
INFO:tensorflow:Saving checkpoints for 216563 into ./ppo/model/model.ckpt.
Finished episode 3461 after 216793 timesteps
Episode reward: -1.064
Average of last 10 rewards: -1.072003768115942
Finished episode 3471 after 217535 timesteps
Episode reward: -1.06
Average of last 10 rewards: -1.0722156069364162
Finished episode 3481 after 217925 timesteps
Episode reward: -1.045
Average of last 10 rewards: -1.0722236311239193
Finished episode 3491 after 218587 timesteps
Episode reward: -1.072
Average of last 10 rewards: -1.0720683908045976
Finished episode 3501 after 219082 timesteps
Episode reward: -1.098
Average of last 10 rewards: -1.07213

Finished episode 4101 after 255733 timesteps
Episode reward: -1.025
Average of last 10 rewards: -1.070319315403423
Finished episode 4111 after 256454 timesteps
Episode reward: -0.74
Average of last 10 rewards: -1.0703082926829268
Finished episode 4121 after 256914 timesteps
Episode reward: -1.1480000000000001
Average of last 10 rewards: -1.0703576642335766
Finished episode 4131 after 257592 timesteps
Episode reward: -0.742
Average of last 10 rewards: -1.07023786407767
Finished episode 4141 after 258212 timesteps
Episode reward: -1.2000000000000002
Average of last 10 rewards: -1.0701249394673122
Finished episode 4151 after 258640 timesteps
Episode reward: -1.053
Average of last 10 rewards: -1.069969082125604
Finished episode 4161 after 259303 timesteps
Episode reward: -1.068
Average of last 10 rewards: -1.0699824096385542
Finished episode 4171 after 259821 timesteps
Episode reward: -1.053
Average of last 10 rewards: -1.0699346153846154
Finished episode 4181 after 260393 timesteps
Episod

Finished episode 4781 after 296144 timesteps
Episode reward: -1.118
Average of last 10 rewards: -1.0698394129979036
Finished episode 4791 after 296821 timesteps
Episode reward: -1.048
Average of last 10 rewards: -1.0699121338912134
Finished episode 4801 after 297415 timesteps
Episode reward: -1.114
Average of last 10 rewards: -1.0697945720250523
Finished episode 4811 after 298366 timesteps
Episode reward: -1.076
Average of last 10 rewards: -1.06993875
Finished episode 4821 after 298924 timesteps
Episode reward: -1.086
Average of last 10 rewards: -1.0699467775467777
Finished episode 4831 after 299577 timesteps
Episode reward: -1.115
Average of last 10 rewards: -1.0698703319502074
Finished episode 4841 after 300215 timesteps
Episode reward: -1.089
Average of last 10 rewards: -1.069813457556936
Finished episode 4851 after 300726 timesteps
Episode reward: -0.351
Average of last 10 rewards: -1.0696491735537192
Finished episode 4861 after 301369 timesteps
Episode reward: -1.1700000000000002


Finished episode 5461 after 338398 timesteps
Episode reward: -1.145
Average of last 10 rewards: -1.0714357798165137
Finished episode 5471 after 338835 timesteps
Episode reward: -1.1380000000000001
Average of last 10 rewards: -1.0714461538461537
Finished episode 5481 after 339519 timesteps
Episode reward: -1.043
Average of last 10 rewards: -1.0714670932358317
Finished episode 5491 after 340161 timesteps
Episode reward: -1.081
Average of last 10 rewards: -1.0713645985401459
Finished episode 5501 after 340856 timesteps
Episode reward: -1.127
Average of last 10 rewards: -1.0713754098360657
Finished episode 5511 after 341479 timesteps
Episode reward: -1.088
Average of last 10 rewards: -1.0712790909090908
Finished episode 5521 after 342263 timesteps
Episode reward: -0.29500000000000026
Average of last 10 rewards: -1.0711562613430128
Finished episode 5531 after 342701 timesteps
Episode reward: -1.049
Average of last 10 rewards: -1.0711528985507246
Finished episode 5541 after 343162 timesteps


Finished episode 6131 after 380713 timesteps
Episode reward: -1.3940000000000003
Average of last 10 rewards: -1.0707071895424836
Finished episode 6141 after 381644 timesteps
Episode reward: -1.02
Average of last 10 rewards: -1.0705977161500817
Finished episode 6151 after 382644 timesteps
Episode reward: -1.2810000000000001
Average of last 10 rewards: -1.0706379478827361
Finished episode 6161 after 383271 timesteps
Episode reward: -1.052
Average of last 10 rewards: -1.070629918699187
Finished episode 6171 after 383835 timesteps
Episode reward: -1.04
Average of last 10 rewards: -1.0705521103896105
INFO:tensorflow:Saving checkpoints for 384030 into ./ppo/model/model.ckpt.
Finished episode 6181 after 384288 timesteps
Episode reward: -1.107
Average of last 10 rewards: -1.070569692058347
Finished episode 6191 after 384716 timesteps
Episode reward: -1.035
Average of last 10 rewards: -1.0705692556634305
Finished episode 6201 after 385305 timesteps
Episode reward: -1.1340000000000001
Average of

INFO:tensorflow:Saving checkpoints for 422563 into ./ppo/model/model.ckpt.
Finished episode 6801 after 422687 timesteps
Episode reward: -1.143
Average of last 10 rewards: -1.0711082474226805
Finished episode 6811 after 423290 timesteps
Episode reward: -0.6799999999999999
Average of last 10 rewards: -1.0711307352941177
Finished episode 6821 after 423752 timesteps
Episode reward: -1.169
Average of last 10 rewards: -1.0711189427312777
Finished episode 6831 after 424183 timesteps
Episode reward: -1.034
Average of last 10 rewards: -1.071082991202346
Finished episode 6841 after 424741 timesteps
Episode reward: -1.137
Average of last 10 rewards: -1.071132503660322
Finished episode 6851 after 425155 timesteps
Episode reward: -1.113
Average of last 10 rewards: -1.071106725146199
Finished episode 6861 after 425680 timesteps
Episode reward: -1.079
Average of last 10 rewards: -1.0710982481751827
Finished episode 6871 after 426554 timesteps
Episode reward: -1.081
Average of last 10 rewards: -1.0712

Finished episode 7471 after 463447 timesteps
Episode reward: -1.077
Average of last 10 rewards: -1.0711231903485254
Finished episode 7481 after 464098 timesteps
Episode reward: -1.068
Average of last 10 rewards: -1.071188219544846
Finished episode 7491 after 464915 timesteps
Episode reward: -1.059
Average of last 10 rewards: -1.0711208556149734
Finished episode 7501 after 465277 timesteps
Episode reward: -1.04
Average of last 10 rewards: -1.0711248331108145
Finished episode 7511 after 466211 timesteps
Episode reward: -1.219
Average of last 10 rewards: -1.071169866666667
Finished episode 7521 after 466743 timesteps
Episode reward: -1.055
Average of last 10 rewards: -1.0712203728362184
Finished episode 7531 after 467424 timesteps
Episode reward: -1.035
Average of last 10 rewards: -1.0713063829787235
Finished episode 7541 after 467856 timesteps
Episode reward: -1.041
Average of last 10 rewards: -1.071272908366534
Finished episode 7551 after 468299 timesteps
Episode reward: -1.141
Average 

Finished episode 8141 after 505822 timesteps
Episode reward: -1.049
Average of last 10 rewards: -1.0720680196801968
Finished episode 8151 after 506436 timesteps
Episode reward: -1.076
Average of last 10 rewards: -1.0721159705159706
Finished episode 8161 after 506930 timesteps
Episode reward: -1.143
Average of last 10 rewards: -1.072130920245399
Finished episode 8171 after 507548 timesteps
Episode reward: -1.04
Average of last 10 rewards: -1.0721708333333335
Finished episode 8181 after 508330 timesteps
Episode reward: -1.096
Average of last 10 rewards: -1.0722571603427173
Finished episode 8191 after 508908 timesteps
Episode reward: -1.1760000000000002
Average of last 10 rewards: -1.0722940097799512
Finished episode 8201 after 509443 timesteps
Episode reward: -1.085
Average of last 10 rewards: -1.0723091575091575
Finished episode 8211 after 510206 timesteps
Episode reward: -1.1920000000000002
Average of last 10 rewards: -1.0723219512195121
Finished episode 8221 after 510541 timesteps
Epi

Finished episode 8831 after 543915 timesteps
Episode reward: -1.081
Average of last 10 rewards: -1.071544671201814
Finished episode 8841 after 544294 timesteps
Episode reward: -1.023
Average of last 10 rewards: -1.0715347678369196
Finished episode 8851 after 545036 timesteps
Episode reward: -1.088
Average of last 10 rewards: -1.0714157239819002
Finished episode 8861 after 545606 timesteps
Episode reward: -1.086
Average of last 10 rewards: -1.0714131073446327
Finished episode 8871 after 546280 timesteps
Episode reward: -1.1340000000000001
Average of last 10 rewards: -1.0714628668171557
Finished episode 8881 after 547067 timesteps
Episode reward: -1.029
Average of last 10 rewards: -1.0714880496054113
Finished episode 8891 after 547909 timesteps
Episode reward: -1.139
Average of last 10 rewards: -1.0715280405405405
Finished episode 8901 after 548482 timesteps
Episode reward: -1.069
Average of last 10 rewards: -1.071573453318335
Finished episode 8911 after 549225 timesteps
Episode reward: 

Finished episode 9501 after 587138 timesteps
Episode reward: -0.806
Average of last 10 rewards: -1.07244531085353
Finished episode 9511 after 587735 timesteps
Episode reward: -1.05
Average of last 10 rewards: -1.072446
Finished episode 9521 after 588200 timesteps
Episode reward: -1.111
Average of last 10 rewards: -1.0724211356466875
Finished episode 9531 after 588856 timesteps
Episode reward: -1.282
Average of last 10 rewards: -1.0724713235294117
Finished episode 9541 after 589448 timesteps
Episode reward: -1.1540000000000001
Average of last 10 rewards: -1.0724895068205664
Finished episode 9551 after 589882 timesteps
Episode reward: -1.12
Average of last 10 rewards: -1.0724912997903564
Finished episode 9561 after 590381 timesteps
Episode reward: -0.75
Average of last 10 rewards: -1.0724633507853403
Finished episode 9571 after 590991 timesteps
Episode reward: -1.101
Average of last 10 rewards: -1.0723748953974894
Finished episode 9581 after 591436 timesteps
Episode reward: -1.083
Averag

Finished episode 10181 after 629376 timesteps
Episode reward: -1.073
Average of last 10 rewards: -1.0723455260570305
Finished episode 10191 after 629923 timesteps
Episode reward: -1.1620000000000001
Average of last 10 rewards: -1.0723594302554027
Finished episode 10201 after 630635 timesteps
Episode reward: -1.086
Average of last 10 rewards: -1.0724182531894013
Finished episode 10211 after 631286 timesteps
Episode reward: -1.121
Average of last 10 rewards: -1.0724564705882353
Finished episode 10221 after 632210 timesteps
Episode reward: -1.0310000000000004
Average of last 10 rewards: -1.0724255631733595
Finished episode 10231 after 632823 timesteps
Episode reward: -1.091
Average of last 10 rewards: -1.072434637964775
Finished episode 10241 after 633500 timesteps
Episode reward: -1.076
Average of last 10 rewards: -1.0724802541544476
Finished episode 10251 after 633957 timesteps
Episode reward: -1.086
Average of last 10 rewards: -1.0724829101562499
Finished episode 10261 after 634496 tim

Finished episode 10851 after 670996 timesteps
Episode reward: -1.161
Average of last 10 rewards: -1.0723689114391144
Finished episode 10861 after 671735 timesteps
Episode reward: -1.078
Average of last 10 rewards: -1.0724280184331796
Finished episode 10871 after 672622 timesteps
Episode reward: -0.7070000000000001
Average of last 10 rewards: -1.072446132596685
Finished episode 10881 after 673125 timesteps
Episode reward: -1.038
Average of last 10 rewards: -1.0724731370745169
Finished episode 10891 after 673708 timesteps
Episode reward: -1.211
Average of last 10 rewards: -1.0724269301470588
Finished episode 10901 after 674502 timesteps
Episode reward: -1.039
Average of last 10 rewards: -1.0724706152433425
Finished episode 10911 after 675084 timesteps
Episode reward: -1.121
Average of last 10 rewards: -1.0724543119266055
Finished episode 10921 after 675725 timesteps
Episode reward: -1.149
Average of last 10 rewards: -1.0724561869844178
Finished episode 10931 after 676518 timesteps
Episod

Finished episode 11531 after 714387 timesteps
Episode reward: -1.086
Average of last 10 rewards: -1.072487673611111
Finished episode 11541 after 715014 timesteps
Episode reward: -1.068
Average of last 10 rewards: -1.0724888117953164
Finished episode 11551 after 715741 timesteps
Episode reward: -1.03
Average of last 10 rewards: -1.0724742634315423
INFO:tensorflow:Saving checkpoints for 715969 into ./ppo/model/model.ckpt.
Finished episode 11561 after 716221 timesteps
Episode reward: -1.106
Average of last 10 rewards: -1.0724526406926407
Finished episode 11571 after 717160 timesteps
Episode reward: -1.051
Average of last 10 rewards: -1.0724175605536332
Finished episode 11581 after 717962 timesteps
Episode reward: -1.109
Average of last 10 rewards: -1.0723778738115817
Finished episode 11591 after 718395 timesteps
Episode reward: -1.147
Average of last 10 rewards: -1.072389896373057
Finished episode 11601 after 718800 timesteps
Episode reward: -1.099
Average of last 10 rewards: -1.072404400

Finished episode 12261 after 758063 timesteps
Episode reward: -1.093
Average of last 10 rewards: -1.0721808163265307
Finished episode 12271 after 758651 timesteps
Episode reward: -1.11
Average of last 10 rewards: -1.0722122349102774
Finished episode 12281 after 759241 timesteps
Episode reward: -0.679
Average of last 10 rewards: -1.0722000814995925
Finished episode 12291 after 759653 timesteps
Episode reward: -1.015
Average of last 10 rewards: -1.0722115635179152
Finished episode 12301 after 760147 timesteps
Episode reward: -1.058
Average of last 10 rewards: -1.0721906427990235
Finished episode 12311 after 760967 timesteps
Episode reward: -0.8710000000000002
Average of last 10 rewards: -1.0721734959349594
Finished episode 12321 after 761477 timesteps
Episode reward: -1.035
Average of last 10 rewards: -1.0721838342810721
Finished episode 12331 after 762213 timesteps
Episode reward: -1.019
Average of last 10 rewards: -1.072175081168831
Finished episode 12341 after 763207 timesteps
Episode

Finished episode 12931 after 800888 timesteps
Episode reward: -0.671
Average of last 10 rewards: -1.0718631578947369
Finished episode 12941 after 801353 timesteps
Episode reward: -1.023
Average of last 10 rewards: -1.0718372003093581
Finished episode 12951 after 801943 timesteps
Episode reward: -1.084
Average of last 10 rewards: -1.0718374806800617
Finished episode 12961 after 802514 timesteps
Episode reward: -1.1540000000000001
Average of last 10 rewards: -1.0718643243243244
Finished episode 12971 after 803069 timesteps
Episode reward: -1.045
Average of last 10 rewards: -1.071858564814815
Finished episode 12981 after 803471 timesteps
Episode reward: -1.045
Average of last 10 rewards: -1.0718297609868928
Finished episode 12991 after 803938 timesteps
Episode reward: -1.057
Average of last 10 rewards: -1.0718441448382126
Finished episode 13001 after 804655 timesteps
Episode reward: -1.07
Average of last 10 rewards: -1.0718685912240185
Finished episode 13011 after 805447 timesteps
Episode

Finished episode 13611 after 843011 timesteps
Episode reward: -1.081
Average of last 10 rewards: -1.0716211029411764
Finished episode 13621 after 843635 timesteps
Episode reward: -1.109
Average of last 10 rewards: -1.071663335782513
Finished episode 13631 after 844234 timesteps
Episode reward: -1.06
Average of last 10 rewards: -1.0716522026431716
Finished episode 13641 after 844729 timesteps
Episode reward: -1.1280000000000001
Average of last 10 rewards: -1.0716473954512105
Finished episode 13651 after 845430 timesteps
Episode reward: -1.056
Average of last 10 rewards: -1.0716552785923754
Finished episode 13661 after 846003 timesteps
Episode reward: -1.027
Average of last 10 rewards: -1.071658315018315
INFO:tensorflow:Saving checkpoints for 846194 into ./ppo/model/model.ckpt.
Finished episode 13671 after 846652 timesteps
Episode reward: -0.806
Average of last 10 rewards: -1.071590336749634
Finished episode 13681 after 847218 timesteps
Episode reward: -1.081
Average of last 10 rewards: 

Finished episode 14271 after 888044 timesteps
Episode reward: -1.061
Average of last 10 rewards: -1.0716612201963533
Finished episode 14281 after 888451 timesteps
Episode reward: -1.087
Average of last 10 rewards: -1.0716379117028731
Finished episode 14291 after 888967 timesteps
Episode reward: -1.053
Average of last 10 rewards: -1.0716519607843136
Finished episode 14301 after 890125 timesteps
Episode reward: -1.1780000000000002
Average of last 10 rewards: -1.0717324702589224
Finished episode 14311 after 890912 timesteps
Episode reward: -1.058
Average of last 10 rewards: -1.0717037762237762
Finished episode 14321 after 891303 timesteps
Episode reward: -0.644
Average of last 10 rewards: -1.0716715583508036
Finished episode 14331 after 891781 timesteps
Episode reward: -1.034
Average of last 10 rewards: -1.0716519553072625
Finished episode 14341 after 892238 timesteps
Episode reward: -1.049
Average of last 10 rewards: -1.0716642009769715
Finished episode 14351 after 893320 timesteps
Episo

Finished episode 14941 after 930104 timesteps
Episode reward: -1.055
Average of last 10 rewards: -1.0718738111185533
Finished episode 14951 after 930776 timesteps
Episode reward: -1.2140000000000002
Average of last 10 rewards: -1.071913922356091
Finished episode 14961 after 931261 timesteps
Episode reward: -1.122
Average of last 10 rewards: -1.0718985284280935
Finished episode 14971 after 931861 timesteps
Episode reward: -1.1640000000000001
Average of last 10 rewards: -1.071916577540107
Finished episode 14981 after 932425 timesteps
Episode reward: -1.045
Average of last 10 rewards: -1.0718832331329327
Finished episode 14991 after 933165 timesteps
Episode reward: -1.071
Average of last 10 rewards: -1.0719074098798398
Finished episode 15001 after 933571 timesteps
Episode reward: -1.101
Average of last 10 rewards: -1.0719032021347565
Finished episode 15011 after 934757 timesteps
Episode reward: -1.2520000000000002
Average of last 10 rewards: -1.0719222666666666
Finished episode 15021 afte

Finished episode 15611 after 975227 timesteps
Episode reward: -1.057
Average of last 10 rewards: -1.0722746153846154
Finished episode 15621 after 976245 timesteps
Episode reward: -1.085
Average of last 10 rewards: -1.072206982703395
INFO:tensorflow:Saving checkpoints for 976880 into ./ppo/model/model.ckpt.
Finished episode 15631 after 977033 timesteps
Episode reward: -1.1580000000000001
Average of last 10 rewards: -1.0722095390524968
Finished episode 15641 after 977858 timesteps
Episode reward: -1.3850000000000002
Average of last 10 rewards: -1.0722360204734485
Finished episode 15651 after 978498 timesteps
Episode reward: -1.084
Average of last 10 rewards: -1.072283567774936
Finished episode 15661 after 979093 timesteps
Episode reward: -0.6619999999999999
Average of last 10 rewards: -1.0722305431309904
Finished episode 15671 after 980129 timesteps
Episode reward: -1.049
Average of last 10 rewards: -1.0723049808429117
Finished episode 15681 after 980594 timesteps
Episode reward: -1.042


Finished episode 16281 after 1015155 timesteps
Episode reward: -1.081
Average of last 10 rewards: -1.0720819913952058
INFO:tensorflow:Saving checkpoints for 1015373 into ./ppo/model/model.ckpt.
Finished episode 16291 after 1015795 timesteps
Episode reward: -1.046
Average of last 10 rewards: -1.072105405405405
Finished episode 16301 after 1016551 timesteps
Episode reward: -1.093
Average of last 10 rewards: -1.072141927562922
Finished episode 16311 after 1017286 timesteps
Episode reward: -1.4060000000000004
Average of last 10 rewards: -1.072158773006135
Finished episode 16321 after 1017955 timesteps
Episode reward: -1.031
Average of last 10 rewards: -1.07218467198038
Finished episode 16331 after 1018523 timesteps
Episode reward: -1.074
Average of last 10 rewards: -1.0721792892156863
Finished episode 16341 after 1019205 timesteps
Episode reward: -1.245
Average of last 10 rewards: -1.0721590324556032
Finished episode 16351 after 1019867 timesteps
Episode reward: -1.089
Average of last 10 r

Finished episode 16941 after 1057420 timesteps
Episode reward: -1.077
Average of last 10 rewards: -1.07228812758417
Finished episode 16951 after 1058271 timesteps
Episode reward: -1.062
Average of last 10 rewards: -1.0723133412042503
Finished episode 16961 after 1059154 timesteps
Episode reward: -1.067
Average of last 10 rewards: -1.0722945132743362
Finished episode 16971 after 1059576 timesteps
Episode reward: -1.05
Average of last 10 rewards: -1.0722972287735848
Finished episode 16981 after 1060382 timesteps
Episode reward: -1.2650000000000001
Average of last 10 rewards: -1.0723166175604006
Finished episode 16991 after 1061048 timesteps
Episode reward: -1.2400000000000002
Average of last 10 rewards: -1.0723025323910482
Finished episode 17001 after 1061706 timesteps
Episode reward: -1.1660000000000001
Average of last 10 rewards: -1.0723335491465569
Finished episode 17011 after 1062615 timesteps
Episode reward: -0.6620000000000001
Average of last 10 rewards: -1.0723377647058825
Finishe