In [1]:
#!/usr/bin/env python3
import gym, gym.spaces
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
# Width of the policy network's hidden layer (passed to Net as hidden_size).
HIDDEN_SIZE = 128
# Number of finished episodes per batch (passed to iterate_batches as batch_size).
BATCH_SIZE = 16
# Reward percentile passed to filter_batch; episodes scoring below it are dropped.
PERCENTILE = 70

In [4]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that turns a discrete state index into a one-hot vector.

    The wrapped environment must expose a ``gym.spaces.Discrete`` observation
    space with ``n`` states; this wrapper advertises a float32 ``Box`` of
    shape ``(n,)`` bounded by 0.0 and 1.0 in its place.
    """

    def __init__(self, env):
        super().__init__(env)
        source_space = env.observation_space
        assert isinstance(source_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=(source_space.n,), dtype=np.float32
        )

    def observation(self, observation):
        """Return a copy of the space's lower bound with entry ``observation`` set to 1.0."""
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot

In [5]:
class Net(nn.Module):
    """Fully connected network with one hidden ReLU layer.

    ``forward`` returns the raw output of the final linear layer (one score
    per action); no softmax is applied inside the module.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        # Stored under the attribute name ``net`` so parameter names
        # (``net.0.weight`` ...) stay the same.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the resulting scores."""
        scores = self.net(x)
        return scores

In [6]:
# One finished episode: its summed reward and the list of steps it consisted of.
Episode = namedtuple("Episode", ["reward", "steps"])
# One step of an episode: the observation the agent acted on and the action it took.
EpisodeStep = namedtuple("EpisodeStep", ["observation", "action"])

In [7]:
def iterate_batches(env, net, batch_size):
    """Play episodes with the current policy forever, yielding them in batches.

    Actions are sampled from the softmax of ``net``'s output for the current
    observation.  Each time ``batch_size`` episodes have finished, the list of
    ``Episode`` records collected so far is yielded and a new list is started.
    The generator never terminates on its own; the consumer decides when to
    stop iterating.

    Args:
        env: Environment driven through ``reset()`` and ``step(action)``;
            ``step`` must return a 4-tuple ``(next_obs, reward, is_done, info)``.
        net: Callable that takes a float tensor holding a single observation
            with a leading batch axis of size 1 and returns per-action scores
            with the actions along dim 1.
        batch_size: Number of finished episodes per yielded batch.

    Yields:
        list of ``Episode``: each carries the summed reward of one episode and
        its ``EpisodeStep`` (observation, action) records.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    while True:
        # Sampling needs no gradients.  The no_grad block is deliberately kept
        # tight around the forward pass: if it spanned the ``yield`` below,
        # grad mode would stay disabled while the consumer runs its training
        # step on the yielded batch.
        with torch.no_grad():
            # Build the tensor from one contiguous ndarray instead of a Python
            # list of ndarrays, which PyTorch converts element by element.
            obs_v = torch.as_tensor(np.asarray([obs], dtype=np.float32))
            act_probs = torch.softmax(net(obs_v), dim=1).numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        # Record the observation the action was chosen for, not its outcome.
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

In [8]:
def filter_batch(batch, percentile):
    """Keep the higher-reward episodes of a batch and pack them into training tensors.

    Args:
        batch: Sequence of episode records exposing ``reward`` and ``steps``;
            every step exposes ``observation`` and ``action``.
        percentile: Percentile (0-100) of the batch's episode rewards that is
            used as the cut-off.

    Returns:
        Tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        a float32 tensor with the observations of all kept steps, an int64
        tensor with the matching actions, the reward cut-off computed by
        ``np.percentile``, and the mean reward of the whole batch as a float.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    # Only episodes strictly below the bound are rejected; ties are kept.
    # Consequently a bound of 0.0 lets every episode with a non-negative
    # reward through.  ``not (a < b)`` mirrors the original comparison exactly.
    kept_steps = [
        step
        for episode in batch
        if not (episode.reward < reward_bound)
        for step in episode.steps
    ]

    # Stack into single ndarrays first: creating tensors straight from Python
    # lists of ndarrays / numpy scalars takes PyTorch's slow per-element path.
    train_obs = np.asarray([step.observation for step in kept_steps], dtype=np.float32)
    train_act = np.asarray([step.action for step in kept_steps], dtype=np.int64)
    train_obs_v = torch.from_numpy(train_obs)
    train_act_v = torch.from_numpy(train_act)
    return train_obs_v, train_act_v, reward_bound, reward_mean

In [9]:
if __name__ == "__main__":
    # The wrapper requires a Discrete observation space and re-exposes it as a
    # one-hot float vector, which is what the network consumes.
    env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))
    # env = gym.wrappers.Monitor(env, directory="mon", force=True)
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n
    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    # CrossEntropyLoss takes raw scores, so Net's un-normalised output is fed directly.
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)
    writer = SummaryWriter(comment="-frozenlake-naive")
    try:
        # iterate_batches is an endless generator; the only exit from this
        # loop is the reward threshold below (or an exception / interrupt).
        for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
            obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            # Train the policy to reproduce the actions taken in the kept episodes.
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()
            print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (
                iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            if reward_m > 0.8:
                print("Solved!")
                break
    finally:
        # Close the writer on every exit path (normal finish, exception, or a
        # manual kernel interrupt), not only when the loop breaks.
        writer.close()

0: loss=1.397, reward_mean=0.0, reward_bound=0.0
1: loss=1.383, reward_mean=0.0, reward_bound=0.0
2: loss=1.383, reward_mean=0.0, reward_bound=0.0
3: loss=1.361, reward_mean=0.0, reward_bound=0.0
4: loss=1.326, reward_mean=0.0, reward_bound=0.0
5: loss=1.357, reward_mean=0.0, reward_bound=0.0
6: loss=1.347, reward_mean=0.1, reward_bound=0.0
7: loss=1.306, reward_mean=0.0, reward_bound=0.0
8: loss=1.292, reward_mean=0.0, reward_bound=0.0
9: loss=1.301, reward_mean=0.0, reward_bound=0.0
10: loss=1.283, reward_mean=0.0, reward_bound=0.0
11: loss=1.287, reward_mean=0.0, reward_bound=0.0
12: loss=1.242, reward_mean=0.0, reward_bound=0.0
13: loss=1.174, reward_mean=0.0, reward_bound=0.0
14: loss=1.204, reward_mean=0.0, reward_bound=0.0
15: loss=1.186, reward_mean=0.0, reward_bound=0.0
16: loss=1.106, reward_mean=0.0, reward_bound=0.0
17: loss=1.089, reward_mean=0.0, reward_bound=0.0
18: loss=1.148, reward_mean=0.0, reward_bound=0.0
19: loss=1.076, reward_mean=0.0, reward_bound=0.0
20: loss=0

167: loss=0.868, reward_mean=0.1, reward_bound=0.0
168: loss=0.913, reward_mean=0.0, reward_bound=0.0
169: loss=0.952, reward_mean=0.0, reward_bound=0.0
170: loss=0.883, reward_mean=0.1, reward_bound=0.0
171: loss=0.906, reward_mean=0.0, reward_bound=0.0
172: loss=0.924, reward_mean=0.1, reward_bound=0.0
173: loss=0.866, reward_mean=0.0, reward_bound=0.0
174: loss=0.890, reward_mean=0.0, reward_bound=0.0
175: loss=0.928, reward_mean=0.0, reward_bound=0.0
176: loss=0.801, reward_mean=0.0, reward_bound=0.0
177: loss=0.787, reward_mean=0.1, reward_bound=0.0
178: loss=0.919, reward_mean=0.1, reward_bound=0.0
179: loss=0.887, reward_mean=0.0, reward_bound=0.0
180: loss=0.663, reward_mean=0.0, reward_bound=0.0
181: loss=0.743, reward_mean=0.1, reward_bound=0.0
182: loss=1.005, reward_mean=0.0, reward_bound=0.0
183: loss=0.754, reward_mean=0.0, reward_bound=0.0
184: loss=0.652, reward_mean=0.0, reward_bound=0.0
185: loss=0.731, reward_mean=0.1, reward_bound=0.0
186: loss=0.841, reward_mean=0.

336: loss=0.678, reward_mean=0.0, reward_bound=0.0
337: loss=0.704, reward_mean=0.0, reward_bound=0.0
338: loss=0.662, reward_mean=0.0, reward_bound=0.0
339: loss=0.714, reward_mean=0.0, reward_bound=0.0
340: loss=0.729, reward_mean=0.0, reward_bound=0.0
341: loss=0.677, reward_mean=0.0, reward_bound=0.0
342: loss=0.749, reward_mean=0.0, reward_bound=0.0
343: loss=0.760, reward_mean=0.0, reward_bound=0.0
344: loss=0.696, reward_mean=0.0, reward_bound=0.0
345: loss=0.752, reward_mean=0.0, reward_bound=0.0
346: loss=0.756, reward_mean=0.0, reward_bound=0.0
347: loss=0.624, reward_mean=0.0, reward_bound=0.0
348: loss=0.711, reward_mean=0.0, reward_bound=0.0
349: loss=0.807, reward_mean=0.0, reward_bound=0.0
350: loss=0.738, reward_mean=0.0, reward_bound=0.0
351: loss=0.763, reward_mean=0.0, reward_bound=0.0
352: loss=0.736, reward_mean=0.0, reward_bound=0.0
353: loss=0.618, reward_mean=0.0, reward_bound=0.0
354: loss=0.628, reward_mean=0.0, reward_bound=0.0
355: loss=0.811, reward_mean=0.

503: loss=0.367, reward_mean=0.0, reward_bound=0.0
504: loss=0.392, reward_mean=0.0, reward_bound=0.0
505: loss=0.489, reward_mean=0.1, reward_bound=0.0
506: loss=0.454, reward_mean=0.0, reward_bound=0.0
507: loss=0.504, reward_mean=0.0, reward_bound=0.0
508: loss=0.501, reward_mean=0.0, reward_bound=0.0
509: loss=0.465, reward_mean=0.0, reward_bound=0.0
510: loss=0.494, reward_mean=0.0, reward_bound=0.0
511: loss=0.542, reward_mean=0.0, reward_bound=0.0
512: loss=0.480, reward_mean=0.0, reward_bound=0.0
513: loss=0.466, reward_mean=0.0, reward_bound=0.0
514: loss=0.485, reward_mean=0.0, reward_bound=0.0
515: loss=0.479, reward_mean=0.0, reward_bound=0.0
516: loss=0.430, reward_mean=0.0, reward_bound=0.0
517: loss=0.464, reward_mean=0.0, reward_bound=0.0
518: loss=0.422, reward_mean=0.0, reward_bound=0.0
519: loss=0.562, reward_mean=0.0, reward_bound=0.0
520: loss=0.402, reward_mean=0.0, reward_bound=0.0
521: loss=0.427, reward_mean=0.0, reward_bound=0.0
522: loss=0.524, reward_mean=0.

671: loss=0.007, reward_mean=0.0, reward_bound=0.0
672: loss=0.007, reward_mean=0.1, reward_bound=0.0
673: loss=0.068, reward_mean=0.0, reward_bound=0.0
674: loss=0.050, reward_mean=0.0, reward_bound=0.0
675: loss=0.141, reward_mean=0.0, reward_bound=0.0
676: loss=0.005, reward_mean=0.0, reward_bound=0.0
677: loss=0.070, reward_mean=0.1, reward_bound=0.0
678: loss=0.007, reward_mean=0.1, reward_bound=0.0
679: loss=0.005, reward_mean=0.0, reward_bound=0.0
680: loss=0.006, reward_mean=0.1, reward_bound=0.0
681: loss=0.086, reward_mean=0.1, reward_bound=0.0
682: loss=0.008, reward_mean=0.1, reward_bound=0.0
683: loss=0.006, reward_mean=0.1, reward_bound=0.0
684: loss=0.076, reward_mean=0.0, reward_bound=0.0
685: loss=0.006, reward_mean=0.0, reward_bound=0.0
686: loss=0.053, reward_mean=0.1, reward_bound=0.0
687: loss=0.125, reward_mean=0.0, reward_bound=0.0
688: loss=0.006, reward_mean=0.0, reward_bound=0.0
689: loss=0.005, reward_mean=0.0, reward_bound=0.0
690: loss=0.004, reward_mean=0.

842: loss=0.005, reward_mean=0.0, reward_bound=0.0
843: loss=0.047, reward_mean=0.0, reward_bound=0.0
844: loss=0.009, reward_mean=0.1, reward_bound=0.0
845: loss=0.006, reward_mean=0.1, reward_bound=0.0
846: loss=0.004, reward_mean=0.1, reward_bound=0.0
847: loss=0.098, reward_mean=0.0, reward_bound=0.0
848: loss=0.003, reward_mean=0.1, reward_bound=0.0
849: loss=0.237, reward_mean=0.0, reward_bound=0.0
850: loss=0.005, reward_mean=0.0, reward_bound=0.0
851: loss=0.007, reward_mean=0.0, reward_bound=0.0
852: loss=0.007, reward_mean=0.0, reward_bound=0.0
853: loss=0.006, reward_mean=0.1, reward_bound=0.0
854: loss=0.009, reward_mean=0.0, reward_bound=0.0
855: loss=0.042, reward_mean=0.1, reward_bound=0.0
856: loss=0.005, reward_mean=0.0, reward_bound=0.0
857: loss=0.007, reward_mean=0.0, reward_bound=0.0
858: loss=0.047, reward_mean=0.1, reward_bound=0.0
859: loss=0.004, reward_mean=0.0, reward_bound=0.0
860: loss=0.054, reward_mean=0.0, reward_bound=0.0
861: loss=0.005, reward_mean=0.

1012: loss=0.135, reward_mean=0.0, reward_bound=0.0
1013: loss=0.163, reward_mean=0.1, reward_bound=0.0
1014: loss=0.100, reward_mean=0.0, reward_bound=0.0
1015: loss=0.148, reward_mean=0.0, reward_bound=0.0
1016: loss=0.193, reward_mean=0.1, reward_bound=0.0
1017: loss=0.268, reward_mean=0.0, reward_bound=0.0
1018: loss=0.081, reward_mean=0.0, reward_bound=0.0
1019: loss=0.091, reward_mean=0.0, reward_bound=0.0
1020: loss=0.109, reward_mean=0.0, reward_bound=0.0
1021: loss=0.142, reward_mean=0.0, reward_bound=0.0
1022: loss=0.074, reward_mean=0.0, reward_bound=0.0
1023: loss=0.148, reward_mean=0.0, reward_bound=0.0
1024: loss=0.271, reward_mean=0.0, reward_bound=0.0
1025: loss=0.096, reward_mean=0.0, reward_bound=0.0
1026: loss=0.181, reward_mean=0.0, reward_bound=0.0
1027: loss=0.228, reward_mean=0.1, reward_bound=0.0
1028: loss=0.105, reward_mean=0.0, reward_bound=0.0
1029: loss=0.099, reward_mean=0.0, reward_bound=0.0
1030: loss=0.178, reward_mean=0.0, reward_bound=0.0
1031: loss=0

1173: loss=0.010, reward_mean=0.0, reward_bound=0.0
1174: loss=0.043, reward_mean=0.0, reward_bound=0.0
1175: loss=0.104, reward_mean=0.1, reward_bound=0.0
1176: loss=0.070, reward_mean=0.0, reward_bound=0.0
1177: loss=0.038, reward_mean=0.1, reward_bound=0.0
1178: loss=0.048, reward_mean=0.0, reward_bound=0.0
1179: loss=0.080, reward_mean=0.0, reward_bound=0.0
1180: loss=0.093, reward_mean=0.1, reward_bound=0.0
1181: loss=0.114, reward_mean=0.0, reward_bound=0.0
1182: loss=0.060, reward_mean=0.1, reward_bound=0.0
1183: loss=0.013, reward_mean=0.0, reward_bound=0.0
1184: loss=0.034, reward_mean=0.1, reward_bound=0.0
1185: loss=0.035, reward_mean=0.1, reward_bound=0.0
1186: loss=0.055, reward_mean=0.1, reward_bound=0.0
1187: loss=0.041, reward_mean=0.0, reward_bound=0.0
1188: loss=0.130, reward_mean=0.0, reward_bound=0.0
1189: loss=0.042, reward_mean=0.0, reward_bound=0.0
1190: loss=0.059, reward_mean=0.0, reward_bound=0.0
1191: loss=0.112, reward_mean=0.0, reward_bound=0.0
1192: loss=0

1342: loss=0.084, reward_mean=0.1, reward_bound=0.0
1343: loss=0.124, reward_mean=0.0, reward_bound=0.0
1344: loss=0.057, reward_mean=0.1, reward_bound=0.0
1345: loss=0.077, reward_mean=0.0, reward_bound=0.0
1346: loss=0.037, reward_mean=0.1, reward_bound=0.0
1347: loss=0.118, reward_mean=0.1, reward_bound=0.0
1348: loss=0.030, reward_mean=0.0, reward_bound=0.0
1349: loss=0.097, reward_mean=0.1, reward_bound=0.0
1350: loss=0.049, reward_mean=0.1, reward_bound=0.0
1351: loss=0.047, reward_mean=0.0, reward_bound=0.0
1352: loss=0.049, reward_mean=0.1, reward_bound=0.0
1353: loss=0.062, reward_mean=0.1, reward_bound=0.0
1354: loss=0.011, reward_mean=0.1, reward_bound=0.0
1355: loss=0.073, reward_mean=0.1, reward_bound=0.0
1356: loss=0.026, reward_mean=0.1, reward_bound=0.0
1357: loss=0.089, reward_mean=0.0, reward_bound=0.0
1358: loss=0.104, reward_mean=0.0, reward_bound=0.0
1359: loss=0.051, reward_mean=0.0, reward_bound=0.0
1360: loss=0.031, reward_mean=0.0, reward_bound=0.0
1361: loss=0

1511: loss=0.000, reward_mean=0.1, reward_bound=0.0
1512: loss=0.001, reward_mean=0.0, reward_bound=0.0
1513: loss=0.000, reward_mean=0.0, reward_bound=0.0
1514: loss=0.000, reward_mean=0.0, reward_bound=0.0
1515: loss=0.000, reward_mean=0.1, reward_bound=0.0
1516: loss=0.000, reward_mean=0.1, reward_bound=0.0
1517: loss=0.000, reward_mean=0.1, reward_bound=0.0
1518: loss=0.000, reward_mean=0.1, reward_bound=0.0
1519: loss=0.000, reward_mean=0.0, reward_bound=0.0
1520: loss=0.108, reward_mean=0.0, reward_bound=0.0
1521: loss=0.001, reward_mean=0.0, reward_bound=0.0
1522: loss=0.000, reward_mean=0.1, reward_bound=0.0
1523: loss=0.000, reward_mean=0.0, reward_bound=0.0
1524: loss=0.000, reward_mean=0.1, reward_bound=0.0
1525: loss=0.001, reward_mean=0.1, reward_bound=0.0
1526: loss=0.000, reward_mean=0.1, reward_bound=0.0
1527: loss=0.000, reward_mean=0.0, reward_bound=0.0
1528: loss=0.000, reward_mean=0.0, reward_bound=0.0
1529: loss=0.000, reward_mean=0.0, reward_bound=0.0
1530: loss=0

1678: loss=0.000, reward_mean=0.0, reward_bound=0.0
1679: loss=0.000, reward_mean=0.0, reward_bound=0.0
1680: loss=0.000, reward_mean=0.1, reward_bound=0.0
1681: loss=0.000, reward_mean=0.0, reward_bound=0.0
1682: loss=0.000, reward_mean=0.0, reward_bound=0.0
1683: loss=0.000, reward_mean=0.0, reward_bound=0.0
1684: loss=0.000, reward_mean=0.1, reward_bound=0.0
1685: loss=0.000, reward_mean=0.1, reward_bound=0.0
1686: loss=0.000, reward_mean=0.0, reward_bound=0.0
1687: loss=0.000, reward_mean=0.0, reward_bound=0.0
1688: loss=0.000, reward_mean=0.0, reward_bound=0.0
1689: loss=0.000, reward_mean=0.1, reward_bound=0.0
1690: loss=0.000, reward_mean=0.1, reward_bound=0.0
1691: loss=0.000, reward_mean=0.0, reward_bound=0.0
1692: loss=0.000, reward_mean=0.0, reward_bound=0.0
1693: loss=0.000, reward_mean=0.0, reward_bound=0.0
1694: loss=0.000, reward_mean=0.0, reward_bound=0.0
1695: loss=0.000, reward_mean=0.0, reward_bound=0.0
1696: loss=0.000, reward_mean=0.0, reward_bound=0.0
1697: loss=0

1840: loss=0.000, reward_mean=0.0, reward_bound=0.0
1841: loss=0.000, reward_mean=0.0, reward_bound=0.0
1842: loss=0.000, reward_mean=0.0, reward_bound=0.0
1843: loss=0.000, reward_mean=0.1, reward_bound=0.0
1844: loss=0.000, reward_mean=0.1, reward_bound=0.0
1845: loss=0.000, reward_mean=0.0, reward_bound=0.0
1846: loss=0.000, reward_mean=0.0, reward_bound=0.0
1847: loss=0.000, reward_mean=0.0, reward_bound=0.0
1848: loss=0.000, reward_mean=0.0, reward_bound=0.0
1849: loss=0.000, reward_mean=0.0, reward_bound=0.0
1850: loss=0.000, reward_mean=0.2, reward_bound=0.0
1851: loss=0.000, reward_mean=0.1, reward_bound=0.0
1852: loss=0.000, reward_mean=0.0, reward_bound=0.0
1853: loss=0.000, reward_mean=0.1, reward_bound=0.0
1854: loss=0.000, reward_mean=0.1, reward_bound=0.0
1855: loss=0.000, reward_mean=0.1, reward_bound=0.0
1856: loss=0.000, reward_mean=0.0, reward_bound=0.0
1857: loss=0.000, reward_mean=0.1, reward_bound=0.0
1858: loss=0.000, reward_mean=0.1, reward_bound=0.0
1859: loss=0

2005: loss=0.000, reward_mean=0.1, reward_bound=0.0
2006: loss=0.000, reward_mean=0.2, reward_bound=0.0
2007: loss=0.000, reward_mean=0.0, reward_bound=0.0
2008: loss=0.000, reward_mean=0.0, reward_bound=0.0
2009: loss=0.000, reward_mean=0.1, reward_bound=0.0
2010: loss=0.000, reward_mean=0.0, reward_bound=0.0
2011: loss=0.000, reward_mean=0.0, reward_bound=0.0
2012: loss=0.000, reward_mean=0.0, reward_bound=0.0
2013: loss=0.000, reward_mean=0.2, reward_bound=0.0
2014: loss=0.000, reward_mean=0.1, reward_bound=0.0
2015: loss=0.000, reward_mean=0.0, reward_bound=0.0
2016: loss=0.000, reward_mean=0.1, reward_bound=0.0
2017: loss=0.000, reward_mean=0.1, reward_bound=0.0
2018: loss=0.000, reward_mean=0.1, reward_bound=0.0
2019: loss=0.000, reward_mean=0.1, reward_bound=0.0
2020: loss=0.000, reward_mean=0.0, reward_bound=0.0
2021: loss=0.000, reward_mean=0.1, reward_bound=0.0
2022: loss=0.000, reward_mean=0.2, reward_bound=0.0
2023: loss=0.000, reward_mean=0.0, reward_bound=0.0
2024: loss=0

2169: loss=0.000, reward_mean=0.1, reward_bound=0.0
2170: loss=0.000, reward_mean=0.1, reward_bound=0.0
2171: loss=0.000, reward_mean=0.1, reward_bound=0.0
2172: loss=0.000, reward_mean=0.1, reward_bound=0.0
2173: loss=0.000, reward_mean=0.1, reward_bound=0.0
2174: loss=0.000, reward_mean=0.1, reward_bound=0.0
2175: loss=0.000, reward_mean=0.0, reward_bound=0.0
2176: loss=0.000, reward_mean=0.0, reward_bound=0.0
2177: loss=0.000, reward_mean=0.0, reward_bound=0.0
2178: loss=0.000, reward_mean=0.0, reward_bound=0.0
2179: loss=0.000, reward_mean=0.0, reward_bound=0.0
2180: loss=0.000, reward_mean=0.0, reward_bound=0.0
2181: loss=0.000, reward_mean=0.1, reward_bound=0.0
2182: loss=0.000, reward_mean=0.0, reward_bound=0.0
2183: loss=0.000, reward_mean=0.0, reward_bound=0.0
2184: loss=0.000, reward_mean=0.0, reward_bound=0.0
2185: loss=0.000, reward_mean=0.1, reward_bound=0.0
2186: loss=0.000, reward_mean=0.1, reward_bound=0.0
2187: loss=0.000, reward_mean=0.1, reward_bound=0.0
2188: loss=0

2331: loss=0.000, reward_mean=0.0, reward_bound=0.0
2332: loss=0.000, reward_mean=0.1, reward_bound=0.0
2333: loss=0.000, reward_mean=0.0, reward_bound=0.0
2334: loss=0.000, reward_mean=0.0, reward_bound=0.0
2335: loss=0.000, reward_mean=0.0, reward_bound=0.0
2336: loss=0.000, reward_mean=0.1, reward_bound=0.0
2337: loss=0.000, reward_mean=0.0, reward_bound=0.0
2338: loss=0.000, reward_mean=0.1, reward_bound=0.0
2339: loss=0.000, reward_mean=0.1, reward_bound=0.0
2340: loss=0.000, reward_mean=0.1, reward_bound=0.0
2341: loss=0.000, reward_mean=0.0, reward_bound=0.0
2342: loss=0.000, reward_mean=0.0, reward_bound=0.0
2343: loss=0.000, reward_mean=0.0, reward_bound=0.0
2344: loss=0.000, reward_mean=0.0, reward_bound=0.0
2345: loss=0.000, reward_mean=0.1, reward_bound=0.0
2346: loss=0.000, reward_mean=0.0, reward_bound=0.0
2347: loss=0.000, reward_mean=0.0, reward_bound=0.0
2348: loss=0.000, reward_mean=0.0, reward_bound=0.0
2349: loss=0.000, reward_mean=0.1, reward_bound=0.0
2350: loss=0

2500: loss=0.000, reward_mean=0.1, reward_bound=0.0
2501: loss=0.000, reward_mean=0.1, reward_bound=0.0
2502: loss=0.000, reward_mean=0.1, reward_bound=0.0
2503: loss=0.000, reward_mean=0.1, reward_bound=0.0
2504: loss=0.000, reward_mean=0.0, reward_bound=0.0
2505: loss=0.000, reward_mean=0.0, reward_bound=0.0
2506: loss=0.001, reward_mean=0.1, reward_bound=0.0
2507: loss=0.000, reward_mean=0.1, reward_bound=0.0
2508: loss=0.000, reward_mean=0.1, reward_bound=0.0
2509: loss=0.000, reward_mean=0.0, reward_bound=0.0
2510: loss=0.000, reward_mean=0.1, reward_bound=0.0
2511: loss=0.000, reward_mean=0.0, reward_bound=0.0
2512: loss=0.001, reward_mean=0.0, reward_bound=0.0
2513: loss=0.000, reward_mean=0.0, reward_bound=0.0
2514: loss=0.000, reward_mean=0.2, reward_bound=0.0
2515: loss=0.000, reward_mean=0.0, reward_bound=0.0
2516: loss=0.000, reward_mean=0.0, reward_bound=0.0
2517: loss=0.001, reward_mean=0.0, reward_bound=0.0
2518: loss=0.000, reward_mean=0.1, reward_bound=0.0
2519: loss=0

2661: loss=0.000, reward_mean=0.0, reward_bound=0.0
2662: loss=0.000, reward_mean=0.1, reward_bound=0.0
2663: loss=0.000, reward_mean=0.0, reward_bound=0.0
2664: loss=0.000, reward_mean=0.1, reward_bound=0.0
2665: loss=0.000, reward_mean=0.0, reward_bound=0.0
2666: loss=0.000, reward_mean=0.0, reward_bound=0.0
2667: loss=0.000, reward_mean=0.0, reward_bound=0.0
2668: loss=0.000, reward_mean=0.0, reward_bound=0.0
2669: loss=0.000, reward_mean=0.0, reward_bound=0.0
2670: loss=0.000, reward_mean=0.0, reward_bound=0.0
2671: loss=0.000, reward_mean=0.0, reward_bound=0.0
2672: loss=0.000, reward_mean=0.0, reward_bound=0.0
2673: loss=0.000, reward_mean=0.0, reward_bound=0.0
2674: loss=0.000, reward_mean=0.1, reward_bound=0.0
2675: loss=0.000, reward_mean=0.0, reward_bound=0.0
2676: loss=0.000, reward_mean=0.0, reward_bound=0.0
2677: loss=0.000, reward_mean=0.0, reward_bound=0.0
2678: loss=0.000, reward_mean=0.1, reward_bound=0.0
2679: loss=0.000, reward_mean=0.1, reward_bound=0.0
2680: loss=0

2825: loss=0.000, reward_mean=0.0, reward_bound=0.0
2826: loss=0.000, reward_mean=0.0, reward_bound=0.0
2827: loss=0.000, reward_mean=0.0, reward_bound=0.0
2828: loss=0.000, reward_mean=0.1, reward_bound=0.0
2829: loss=0.000, reward_mean=0.2, reward_bound=0.0
2830: loss=0.000, reward_mean=0.1, reward_bound=0.0
2831: loss=0.000, reward_mean=0.1, reward_bound=0.0
2832: loss=0.093, reward_mean=0.0, reward_bound=0.0
2833: loss=0.000, reward_mean=0.0, reward_bound=0.0
2834: loss=0.000, reward_mean=0.1, reward_bound=0.0
2835: loss=0.000, reward_mean=0.1, reward_bound=0.0
2836: loss=0.000, reward_mean=0.1, reward_bound=0.0
2837: loss=0.000, reward_mean=0.0, reward_bound=0.0
2838: loss=0.000, reward_mean=0.0, reward_bound=0.0
2839: loss=0.000, reward_mean=0.0, reward_bound=0.0
2840: loss=0.000, reward_mean=0.0, reward_bound=0.0
2841: loss=0.000, reward_mean=0.0, reward_bound=0.0
2842: loss=0.000, reward_mean=0.0, reward_bound=0.0
2843: loss=0.000, reward_mean=0.1, reward_bound=0.0
2844: loss=0

2990: loss=0.000, reward_mean=0.1, reward_bound=0.0
2991: loss=0.000, reward_mean=0.0, reward_bound=0.0
2992: loss=0.000, reward_mean=0.0, reward_bound=0.0
2993: loss=0.000, reward_mean=0.1, reward_bound=0.0
2994: loss=0.000, reward_mean=0.0, reward_bound=0.0
2995: loss=0.000, reward_mean=0.0, reward_bound=0.0
2996: loss=0.000, reward_mean=0.0, reward_bound=0.0
2997: loss=0.000, reward_mean=0.0, reward_bound=0.0
2998: loss=0.000, reward_mean=0.0, reward_bound=0.0
2999: loss=0.000, reward_mean=0.1, reward_bound=0.0
3000: loss=0.000, reward_mean=0.0, reward_bound=0.0
3001: loss=0.000, reward_mean=0.0, reward_bound=0.0
3002: loss=0.000, reward_mean=0.2, reward_bound=0.0
3003: loss=0.000, reward_mean=0.0, reward_bound=0.0
3004: loss=0.000, reward_mean=0.1, reward_bound=0.0
3005: loss=0.000, reward_mean=0.1, reward_bound=0.0
3006: loss=0.000, reward_mean=0.0, reward_bound=0.0
3007: loss=0.000, reward_mean=0.0, reward_bound=0.0
3008: loss=0.000, reward_mean=0.1, reward_bound=0.0
3009: loss=0

3150: loss=0.001, reward_mean=0.1, reward_bound=0.0
3151: loss=0.001, reward_mean=0.1, reward_bound=0.0
3152: loss=0.002, reward_mean=0.0, reward_bound=0.0
3153: loss=0.040, reward_mean=0.0, reward_bound=0.0
3154: loss=0.052, reward_mean=0.0, reward_bound=0.0
3155: loss=0.003, reward_mean=0.0, reward_bound=0.0
3156: loss=0.002, reward_mean=0.0, reward_bound=0.0
3157: loss=0.004, reward_mean=0.0, reward_bound=0.0
3158: loss=0.004, reward_mean=0.0, reward_bound=0.0
3159: loss=0.003, reward_mean=0.0, reward_bound=0.0
3160: loss=0.046, reward_mean=0.1, reward_bound=0.0
3161: loss=0.004, reward_mean=0.0, reward_bound=0.0
3162: loss=0.003, reward_mean=0.0, reward_bound=0.0
3163: loss=0.003, reward_mean=0.0, reward_bound=0.0
3164: loss=0.002, reward_mean=0.1, reward_bound=0.0
3165: loss=0.004, reward_mean=0.1, reward_bound=0.0
3166: loss=0.003, reward_mean=0.0, reward_bound=0.0
3167: loss=0.039, reward_mean=0.0, reward_bound=0.0
3168: loss=0.003, reward_mean=0.1, reward_bound=0.0
3169: loss=0

3310: loss=0.000, reward_mean=0.1, reward_bound=0.0
3311: loss=0.000, reward_mean=0.1, reward_bound=0.0
3312: loss=0.000, reward_mean=0.1, reward_bound=0.0
3313: loss=0.001, reward_mean=0.0, reward_bound=0.0
3314: loss=0.001, reward_mean=0.1, reward_bound=0.0
3315: loss=0.046, reward_mean=0.1, reward_bound=0.0
3316: loss=0.001, reward_mean=0.0, reward_bound=0.0
3317: loss=0.001, reward_mean=0.0, reward_bound=0.0
3318: loss=0.001, reward_mean=0.0, reward_bound=0.0
3319: loss=0.001, reward_mean=0.0, reward_bound=0.0
3320: loss=0.001, reward_mean=0.0, reward_bound=0.0
3321: loss=0.001, reward_mean=0.1, reward_bound=0.0
3322: loss=0.001, reward_mean=0.0, reward_bound=0.0
3323: loss=0.001, reward_mean=0.1, reward_bound=0.0
3324: loss=0.001, reward_mean=0.0, reward_bound=0.0
3325: loss=0.001, reward_mean=0.0, reward_bound=0.0
3326: loss=0.001, reward_mean=0.1, reward_bound=0.0
3327: loss=0.001, reward_mean=0.1, reward_bound=0.0
3328: loss=0.002, reward_mean=0.1, reward_bound=0.0
3329: loss=0

3480: loss=0.001, reward_mean=0.0, reward_bound=0.0
3481: loss=0.000, reward_mean=0.1, reward_bound=0.0
3482: loss=0.000, reward_mean=0.0, reward_bound=0.0
3483: loss=0.000, reward_mean=0.0, reward_bound=0.0
3484: loss=0.053, reward_mean=0.0, reward_bound=0.0
3485: loss=0.000, reward_mean=0.0, reward_bound=0.0
3486: loss=0.001, reward_mean=0.0, reward_bound=0.0
3487: loss=0.001, reward_mean=0.0, reward_bound=0.0
3488: loss=0.001, reward_mean=0.0, reward_bound=0.0
3489: loss=0.000, reward_mean=0.1, reward_bound=0.0
3490: loss=0.001, reward_mean=0.0, reward_bound=0.0
3491: loss=0.000, reward_mean=0.0, reward_bound=0.0
3492: loss=0.000, reward_mean=0.0, reward_bound=0.0
3493: loss=0.001, reward_mean=0.0, reward_bound=0.0
3494: loss=0.001, reward_mean=0.1, reward_bound=0.0
3495: loss=0.000, reward_mean=0.0, reward_bound=0.0
3496: loss=0.000, reward_mean=0.0, reward_bound=0.0
3497: loss=0.000, reward_mean=0.1, reward_bound=0.0
3498: loss=0.001, reward_mean=0.1, reward_bound=0.0
3499: loss=0

3639: loss=0.000, reward_mean=0.0, reward_bound=0.0
3640: loss=0.000, reward_mean=0.1, reward_bound=0.0
3641: loss=0.000, reward_mean=0.0, reward_bound=0.0
3642: loss=0.000, reward_mean=0.0, reward_bound=0.0
3643: loss=0.000, reward_mean=0.0, reward_bound=0.0
3644: loss=0.000, reward_mean=0.0, reward_bound=0.0
3645: loss=0.000, reward_mean=0.0, reward_bound=0.0
3646: loss=0.000, reward_mean=0.1, reward_bound=0.0
3647: loss=0.000, reward_mean=0.0, reward_bound=0.0
3648: loss=0.000, reward_mean=0.0, reward_bound=0.0
3649: loss=0.000, reward_mean=0.0, reward_bound=0.0
3650: loss=0.000, reward_mean=0.1, reward_bound=0.0
3651: loss=0.000, reward_mean=0.0, reward_bound=0.0
3652: loss=0.000, reward_mean=0.0, reward_bound=0.0
3653: loss=0.000, reward_mean=0.0, reward_bound=0.0
3654: loss=0.000, reward_mean=0.0, reward_bound=0.0
3655: loss=0.000, reward_mean=0.1, reward_bound=0.0
3656: loss=0.000, reward_mean=0.1, reward_bound=0.0
3657: loss=0.000, reward_mean=0.1, reward_bound=0.0
3658: loss=0

3806: loss=0.000, reward_mean=0.0, reward_bound=0.0
3807: loss=0.108, reward_mean=0.0, reward_bound=0.0
3808: loss=0.000, reward_mean=0.1, reward_bound=0.0
3809: loss=0.000, reward_mean=0.0, reward_bound=0.0
3810: loss=0.000, reward_mean=0.0, reward_bound=0.0
3811: loss=0.000, reward_mean=0.1, reward_bound=0.0
3812: loss=0.000, reward_mean=0.1, reward_bound=0.0
3813: loss=0.000, reward_mean=0.0, reward_bound=0.0
3814: loss=0.000, reward_mean=0.0, reward_bound=0.0
3815: loss=0.000, reward_mean=0.0, reward_bound=0.0
3816: loss=0.000, reward_mean=0.1, reward_bound=0.0
3817: loss=0.000, reward_mean=0.0, reward_bound=0.0
3818: loss=0.000, reward_mean=0.0, reward_bound=0.0
3819: loss=0.000, reward_mean=0.0, reward_bound=0.0
3820: loss=0.000, reward_mean=0.0, reward_bound=0.0
3821: loss=0.000, reward_mean=0.1, reward_bound=0.0
3822: loss=0.000, reward_mean=0.1, reward_bound=0.0
3823: loss=0.000, reward_mean=0.0, reward_bound=0.0
3824: loss=0.000, reward_mean=0.0, reward_bound=0.0
3825: loss=0

3973: loss=0.001, reward_mean=0.1, reward_bound=0.0
3974: loss=0.000, reward_mean=0.0, reward_bound=0.0
3975: loss=0.001, reward_mean=0.0, reward_bound=0.0
3976: loss=0.001, reward_mean=0.1, reward_bound=0.0
3977: loss=0.000, reward_mean=0.1, reward_bound=0.0
3978: loss=0.001, reward_mean=0.1, reward_bound=0.0
3979: loss=0.000, reward_mean=0.1, reward_bound=0.0
3980: loss=0.001, reward_mean=0.1, reward_bound=0.0
3981: loss=0.057, reward_mean=0.0, reward_bound=0.0
3982: loss=0.000, reward_mean=0.0, reward_bound=0.0
3983: loss=0.001, reward_mean=0.1, reward_bound=0.0
3984: loss=0.000, reward_mean=0.0, reward_bound=0.0
3985: loss=0.001, reward_mean=0.1, reward_bound=0.0
3986: loss=0.001, reward_mean=0.0, reward_bound=0.0
3987: loss=0.050, reward_mean=0.0, reward_bound=0.0
3988: loss=0.001, reward_mean=0.0, reward_bound=0.0
3989: loss=0.059, reward_mean=0.0, reward_bound=0.0
3990: loss=0.062, reward_mean=0.1, reward_bound=0.0
3991: loss=0.001, reward_mean=0.0, reward_bound=0.0
3992: loss=0

4137: loss=0.003, reward_mean=0.0, reward_bound=0.0
4138: loss=0.001, reward_mean=0.0, reward_bound=0.0
4139: loss=0.003, reward_mean=0.0, reward_bound=0.0
4140: loss=0.003, reward_mean=0.0, reward_bound=0.0
4141: loss=0.048, reward_mean=0.0, reward_bound=0.0
4142: loss=0.002, reward_mean=0.1, reward_bound=0.0
4143: loss=0.052, reward_mean=0.1, reward_bound=0.0
4144: loss=0.003, reward_mean=0.0, reward_bound=0.0
4145: loss=0.002, reward_mean=0.1, reward_bound=0.0
4146: loss=0.002, reward_mean=0.0, reward_bound=0.0
4147: loss=0.001, reward_mean=0.0, reward_bound=0.0
4148: loss=0.000, reward_mean=0.1, reward_bound=0.0
4149: loss=0.001, reward_mean=0.0, reward_bound=0.0
4150: loss=0.002, reward_mean=0.0, reward_bound=0.0
4151: loss=0.048, reward_mean=0.0, reward_bound=0.0
4152: loss=0.001, reward_mean=0.0, reward_bound=0.0
4153: loss=0.049, reward_mean=0.1, reward_bound=0.0
4154: loss=0.001, reward_mean=0.0, reward_bound=0.0
4155: loss=0.001, reward_mean=0.0, reward_bound=0.0
4156: loss=0

4301: loss=0.000, reward_mean=0.1, reward_bound=0.0
4302: loss=0.000, reward_mean=0.1, reward_bound=0.0
4303: loss=0.000, reward_mean=0.0, reward_bound=0.0
4304: loss=0.000, reward_mean=0.0, reward_bound=0.0
4305: loss=0.000, reward_mean=0.1, reward_bound=0.0
4306: loss=0.000, reward_mean=0.0, reward_bound=0.0
4307: loss=0.000, reward_mean=0.0, reward_bound=0.0
4308: loss=0.000, reward_mean=0.1, reward_bound=0.0
4309: loss=0.000, reward_mean=0.1, reward_bound=0.0
4310: loss=0.000, reward_mean=0.1, reward_bound=0.0
4311: loss=0.000, reward_mean=0.1, reward_bound=0.0
4312: loss=0.000, reward_mean=0.1, reward_bound=0.0
4313: loss=0.000, reward_mean=0.1, reward_bound=0.0
4314: loss=0.000, reward_mean=0.0, reward_bound=0.0
4315: loss=0.000, reward_mean=0.1, reward_bound=0.0
4316: loss=0.058, reward_mean=0.1, reward_bound=0.0
4317: loss=0.000, reward_mean=0.1, reward_bound=0.0
4318: loss=0.000, reward_mean=0.1, reward_bound=0.0
4319: loss=0.000, reward_mean=0.1, reward_bound=0.0
4320: loss=0

4464: loss=0.002, reward_mean=0.2, reward_bound=0.0
4465: loss=0.003, reward_mean=0.1, reward_bound=0.0
4466: loss=0.030, reward_mean=0.0, reward_bound=0.0
4467: loss=0.003, reward_mean=0.0, reward_bound=0.0
4468: loss=0.002, reward_mean=0.0, reward_bound=0.0
4469: loss=0.002, reward_mean=0.0, reward_bound=0.0
4470: loss=0.001, reward_mean=0.0, reward_bound=0.0
4471: loss=0.002, reward_mean=0.1, reward_bound=0.0
4472: loss=0.003, reward_mean=0.0, reward_bound=0.0
4473: loss=0.050, reward_mean=0.0, reward_bound=0.0
4474: loss=0.043, reward_mean=0.0, reward_bound=0.0
4475: loss=0.001, reward_mean=0.1, reward_bound=0.0
4476: loss=0.004, reward_mean=0.0, reward_bound=0.0
4477: loss=0.001, reward_mean=0.1, reward_bound=0.0
4478: loss=0.004, reward_mean=0.1, reward_bound=0.0
4479: loss=0.040, reward_mean=0.0, reward_bound=0.0
4480: loss=0.001, reward_mean=0.0, reward_bound=0.0
4481: loss=0.003, reward_mean=0.0, reward_bound=0.0
4482: loss=0.001, reward_mean=0.0, reward_bound=0.0
4483: loss=0

4623: loss=0.000, reward_mean=0.0, reward_bound=0.0
4624: loss=0.000, reward_mean=0.1, reward_bound=0.0
4625: loss=0.000, reward_mean=0.0, reward_bound=0.0
4626: loss=0.000, reward_mean=0.0, reward_bound=0.0
4627: loss=0.000, reward_mean=0.0, reward_bound=0.0
4628: loss=0.000, reward_mean=0.0, reward_bound=0.0
4629: loss=0.000, reward_mean=0.0, reward_bound=0.0
4630: loss=0.000, reward_mean=0.2, reward_bound=0.0
4631: loss=0.000, reward_mean=0.0, reward_bound=0.0
4632: loss=0.000, reward_mean=0.0, reward_bound=0.0
4633: loss=0.000, reward_mean=0.0, reward_bound=0.0
4634: loss=0.000, reward_mean=0.0, reward_bound=0.0
4635: loss=0.000, reward_mean=0.1, reward_bound=0.0
4636: loss=0.000, reward_mean=0.0, reward_bound=0.0
4637: loss=0.000, reward_mean=0.1, reward_bound=0.0
4638: loss=0.000, reward_mean=0.0, reward_bound=0.0
4639: loss=0.000, reward_mean=0.1, reward_bound=0.0
4640: loss=0.000, reward_mean=0.0, reward_bound=0.0
4641: loss=0.000, reward_mean=0.0, reward_bound=0.0
4642: loss=0

4785: loss=0.000, reward_mean=0.1, reward_bound=0.0
4786: loss=0.000, reward_mean=0.0, reward_bound=0.0
4787: loss=0.000, reward_mean=0.1, reward_bound=0.0
4788: loss=0.000, reward_mean=0.0, reward_bound=0.0
4789: loss=0.000, reward_mean=0.0, reward_bound=0.0
4790: loss=0.000, reward_mean=0.0, reward_bound=0.0
4791: loss=0.000, reward_mean=0.1, reward_bound=0.0
4792: loss=0.000, reward_mean=0.0, reward_bound=0.0
4793: loss=0.000, reward_mean=0.0, reward_bound=0.0
4794: loss=0.000, reward_mean=0.0, reward_bound=0.0
4795: loss=0.000, reward_mean=0.0, reward_bound=0.0
4796: loss=0.000, reward_mean=0.1, reward_bound=0.0
4797: loss=0.000, reward_mean=0.1, reward_bound=0.0
4798: loss=0.000, reward_mean=0.0, reward_bound=0.0
4799: loss=0.000, reward_mean=0.0, reward_bound=0.0
4800: loss=0.000, reward_mean=0.0, reward_bound=0.0
4801: loss=0.000, reward_mean=0.0, reward_bound=0.0
4802: loss=0.000, reward_mean=0.0, reward_bound=0.0
4803: loss=0.000, reward_mean=0.1, reward_bound=0.0
4804: loss=0

4950: loss=0.000, reward_mean=0.1, reward_bound=0.0
4951: loss=0.000, reward_mean=0.0, reward_bound=0.0
4952: loss=0.000, reward_mean=0.0, reward_bound=0.0
4953: loss=0.000, reward_mean=0.0, reward_bound=0.0
4954: loss=0.000, reward_mean=0.0, reward_bound=0.0
4955: loss=0.000, reward_mean=0.0, reward_bound=0.0
4956: loss=0.000, reward_mean=0.0, reward_bound=0.0
4957: loss=0.000, reward_mean=0.1, reward_bound=0.0
4958: loss=0.000, reward_mean=0.0, reward_bound=0.0
4959: loss=0.000, reward_mean=0.1, reward_bound=0.0
4960: loss=0.000, reward_mean=0.0, reward_bound=0.0
4961: loss=0.000, reward_mean=0.1, reward_bound=0.0
4962: loss=0.000, reward_mean=0.1, reward_bound=0.0
4963: loss=0.000, reward_mean=0.0, reward_bound=0.0
4964: loss=0.000, reward_mean=0.1, reward_bound=0.0
4965: loss=0.000, reward_mean=0.0, reward_bound=0.0
4966: loss=0.000, reward_mean=0.0, reward_bound=0.0
4967: loss=0.000, reward_mean=0.0, reward_bound=0.0
4968: loss=0.000, reward_mean=0.1, reward_bound=0.0
4969: loss=0

5108: loss=0.000, reward_mean=0.0, reward_bound=0.0
5109: loss=0.000, reward_mean=0.0, reward_bound=0.0
5110: loss=0.000, reward_mean=0.0, reward_bound=0.0
5111: loss=0.000, reward_mean=0.1, reward_bound=0.0
5112: loss=0.000, reward_mean=0.0, reward_bound=0.0
5113: loss=0.000, reward_mean=0.0, reward_bound=0.0
5114: loss=0.000, reward_mean=0.0, reward_bound=0.0
5115: loss=0.000, reward_mean=0.0, reward_bound=0.0
5116: loss=0.000, reward_mean=0.0, reward_bound=0.0
5117: loss=0.000, reward_mean=0.1, reward_bound=0.0
5118: loss=0.000, reward_mean=0.1, reward_bound=0.0
5119: loss=0.000, reward_mean=0.1, reward_bound=0.0
5120: loss=0.000, reward_mean=0.1, reward_bound=0.0
5121: loss=0.000, reward_mean=0.0, reward_bound=0.0
5122: loss=0.000, reward_mean=0.0, reward_bound=0.0
5123: loss=0.000, reward_mean=0.0, reward_bound=0.0
5124: loss=0.000, reward_mean=0.0, reward_bound=0.0
5125: loss=0.000, reward_mean=0.1, reward_bound=0.0
5126: loss=0.000, reward_mean=0.0, reward_bound=0.0
5127: loss=0

5266: loss=0.000, reward_mean=0.1, reward_bound=0.0
5267: loss=0.000, reward_mean=0.0, reward_bound=0.0
5268: loss=0.000, reward_mean=0.0, reward_bound=0.0
5269: loss=0.000, reward_mean=0.1, reward_bound=0.0
5270: loss=0.000, reward_mean=0.0, reward_bound=0.0
5271: loss=0.000, reward_mean=0.0, reward_bound=0.0
5272: loss=0.000, reward_mean=0.0, reward_bound=0.0
5273: loss=0.000, reward_mean=0.1, reward_bound=0.0
5274: loss=0.000, reward_mean=0.1, reward_bound=0.0
5275: loss=0.000, reward_mean=0.0, reward_bound=0.0
5276: loss=0.000, reward_mean=0.1, reward_bound=0.0
5277: loss=0.000, reward_mean=0.0, reward_bound=0.0
5278: loss=0.000, reward_mean=0.1, reward_bound=0.0
5279: loss=0.000, reward_mean=0.0, reward_bound=0.0
5280: loss=0.000, reward_mean=0.0, reward_bound=0.0
5281: loss=0.000, reward_mean=0.0, reward_bound=0.0
5282: loss=0.000, reward_mean=0.0, reward_bound=0.0
5283: loss=0.000, reward_mean=0.1, reward_bound=0.0
5284: loss=0.000, reward_mean=0.1, reward_bound=0.0
5285: loss=0

5428: loss=0.000, reward_mean=0.1, reward_bound=0.0
5429: loss=0.000, reward_mean=0.0, reward_bound=0.0
5430: loss=0.000, reward_mean=0.1, reward_bound=0.0
5431: loss=0.000, reward_mean=0.0, reward_bound=0.0
5432: loss=0.000, reward_mean=0.1, reward_bound=0.0
5433: loss=0.000, reward_mean=0.0, reward_bound=0.0
5434: loss=0.000, reward_mean=0.1, reward_bound=0.0
5435: loss=0.000, reward_mean=0.0, reward_bound=0.0
5436: loss=0.000, reward_mean=0.0, reward_bound=0.0
5437: loss=0.000, reward_mean=0.0, reward_bound=0.0
5438: loss=0.000, reward_mean=0.1, reward_bound=0.0
5439: loss=0.000, reward_mean=0.1, reward_bound=0.0
5440: loss=0.000, reward_mean=0.0, reward_bound=0.0
5441: loss=0.000, reward_mean=0.1, reward_bound=0.0
5442: loss=0.000, reward_mean=0.0, reward_bound=0.0
5443: loss=0.000, reward_mean=0.1, reward_bound=0.0
5444: loss=0.000, reward_mean=0.1, reward_bound=0.0
5445: loss=0.000, reward_mean=0.1, reward_bound=0.0
5446: loss=0.000, reward_mean=0.0, reward_bound=0.0
5447: loss=0

KeyboardInterrupt: 