In [1]:
#!/usr/bin/env python3
from lib import wrappers
from lib import dqn_model

In [2]:
import argparse
import time
import numpy as np
import collections

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

In [4]:
from tensorboardX import SummaryWriter

In [5]:
# Environment name handed to wrappers.make_env by the training cell.
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
# Training stops once the mean reward over the last 100 finished games exceeds this value.
MEAN_REWARD_BOUND = 19.5

In [6]:
# Multiplier applied to the next-state values when calc_loss builds its target.
GAMMA = 0.99
# Number of experiences sampled from the buffer for each optimisation step.
BATCH_SIZE = 32
# Capacity passed to ExperienceBuffer.
REPLAY_SIZE = 10000
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-4
# On training frames whose index is a multiple of this, tgt_net is reloaded from net.
SYNC_TARGET_FRAMES = 1000
# No optimisation step is taken until the buffer holds at least this many experiences.
REPLAY_START_SIZE = 10000

In [7]:
# Exploration schedule used by the training loop:
#   epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_START = 1.0
EPSILON_FINAL = 0.02

In [8]:
# One stored transition; Agent.play_step fills the fields in this order.
Experience = collections.namedtuple(
    'Experience',
    ['state', 'action', 'reward', 'done', 'new_state'],
)

In [9]:
class ExperienceBuffer:
    """Bounded replay memory backed by a ``collections.deque``.

    The deque is created with ``maxlen=capacity``, so once it is full every
    append discards the oldest stored item.
    """

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Return how many experiences are currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Store one experience at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Pick ``batch_size`` distinct stored experiences at random.

        Returns:
            A 5-tuple of NumPy arrays
            ``(states, actions, rewards, dones, next_states)``; rewards are
            cast to float32 and dones to uint8.

        Raises:
            ValueError: from ``np.random.choice`` when ``batch_size`` exceeds
                the number of stored experiences (sampling is without
                replacement).
        """
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = (self.buffer[position] for position in picked)
        # Transpose the list of 5-field records into five per-field columns.
        states, actions, rewards, dones, next_states = zip(*batch)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )

In [10]:
class Agent:
    """Plays an environment one step at a time and records each transition.

    The agent keeps the current observation and the running episode reward,
    and appends every transition to ``exp_buffer`` as an ``Experience``.
    All environment access goes through ``self.env``; the class does not rely
    on any module-level ``env`` variable.
    """

    def __init__(self, env, exp_buffer):
        """
        Args:
            env: environment exposing ``reset()``, ``step(action)`` (returning a
                4-tuple) and ``action_space.sample()``.
            exp_buffer: container with an ``append`` method that receives each
                ``Experience``.
        """
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Start a new episode: reset the environment and the reward counter."""
        self.state = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Take one epsilon-greedy step and store the resulting transition.

        Args:
            net: callable mapping a batch of states to per-action values.
            epsilon: probability of taking a random action instead of the
                highest-valued one.
            device: device the state tensor is moved to before calling ``net``.

        Returns:
            The total episode reward if this step ended the episode,
            otherwise ``None``.
        """
        done_reward = None
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # Add a leading batch axis. np.array(..., copy=False) is avoided
            # because NumPy 2 raises when a copy cannot be avoided.
            state_a = np.expand_dims(np.asarray(self.state), axis=0)
            # Action selection needs no gradients.
            with torch.no_grad():
                state_v = torch.tensor(state_a).to(device)
                q_vals_v = net(state_v)
                _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward
        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

In [11]:
def calc_loss(batch, net, tgt_net, device="cpu", gamma=None):
    """Compute the MSE loss between predicted and bootstrapped action values.

    Args:
        batch: 5-tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes, in the order produced by ``ExperienceBuffer.sample``.
        net: network being trained; gradients flow through its output.
        tgt_net: network used to evaluate ``next_states``; it is evaluated
            without gradient tracking.
        device: device the tensors are moved to.
        gamma: multiplier for the next-state values; ``None`` (the default)
            uses the module-level ``GAMMA``.

    Returns:
        Scalar loss tensor
        ``MSE(net(states)[actions], rewards + gamma * max_a tgt_net(next_states))``,
        where the next-state term is zeroed for transitions flagged as done.
    """
    if gamma is None:
        gamma = GAMMA
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    # gather() requires int64 indices regardless of the platform's default int.
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    # Boolean mask: indexing with a uint8 tensor is deprecated in PyTorch.
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The target is a constant with respect to the optimised parameters, so the
    # target network is evaluated without building an autograd graph.
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [14]:
if __name__ == "__main__":
    # Run settings are hard-coded on a simple namespace class rather than
    # parsed from the command line.
    class args:
        env = DEFAULT_ENV_NAME
        # Use the GPU only when one is actually available, so the cell also
        # runs on CPU-only machines instead of failing at the first .to(device).
        cuda = torch.cuda.is_available()
        reward = MEAN_REWARD_BOUND

    device = torch.device("cuda" if args.cuda else "cpu")
    env = wrappers.make_env(args.env)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + args.env)
    print(net)
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []  # final reward of every finished game
    frame_idx = 0
    ts_frame = 0        # frame index at the previous finished game (for speed)
    ts = time.time()    # wall-clock time at the previous finished game
    best_mean_reward = None
    # try/finally guarantees the TensorBoard writer is closed even when the
    # run is interrupted or an exception escapes the loop.
    try:
        while True:
            frame_idx += 1
            epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
            reward = agent.play_step(net, epsilon, device=device)
            if reward is not None:
                # A game just finished: log progress and checkpoint if improved.
                total_rewards.append(reward)
                speed = (frame_idx - ts_frame) / (time.time() - ts)
                ts_frame = frame_idx
                ts = time.time()
                mean_reward = np.mean(total_rewards[-100:])
                print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
                    frame_idx, len(total_rewards), mean_reward, epsilon,
                    speed
                ))
                writer.add_scalar("epsilon", epsilon, frame_idx)
                writer.add_scalar("speed", speed, frame_idx)
                writer.add_scalar("reward_100", mean_reward, frame_idx)
                writer.add_scalar("reward", reward, frame_idx)
                if best_mean_reward is None or best_mean_reward < mean_reward:
                    torch.save(net.state_dict(), args.env + "-best.dat")
                    if best_mean_reward is not None:
                        print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
                    best_mean_reward = mean_reward
                if mean_reward > args.reward:
                    print("Solved in %d frames!" % frame_idx)
                    break
            # Only collect experience until the buffer reaches its start size.
            if len(buffer) < REPLAY_START_SIZE:
                continue
            if frame_idx % SYNC_TARGET_FRAMES == 0:
                tgt_net.load_state_dict(net.state_dict())
            optimizer.zero_grad()
            batch = buffer.sample(BATCH_SIZE)
            loss_t = calc_loss(batch, net, tgt_net, device=device)
            loss_t.backward()
            optimizer.step()
    finally:
        writer.close()

DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)
835: done 1 games, mean reward -20.000, eps 0.99, speed 948.37 f/s
1650: done 2 games, mean reward -20.500, eps 0.98, speed 726.60 f/s
2571: done 3 games, mean reward -20.667, eps 0.97, speed 832.32 f/s
3333: done 4 games, mean reward -20.750, eps 0.97, speed 972.70 f/s
4554: done 5 games, mean reward -20.400, eps 0.95, speed 867.56 f/s
5376: done 6 games, mean reward -20.500, eps 0.95, speed 950.58 f/s
6216: done 7 games, mean reward -20.429, eps 0.94, speed 961.03 f/s
7038: done 8 games, mean reward -20.500, eps 0.93, speed 941.12 f/s
7800: done 9 games, mean reward -20.5

  next_state_values[done_mask] = 0.0


10263: done 12 games, mean reward -20.667, eps 0.90, speed 301.01 f/s
11025: done 13 games, mean reward -20.692, eps 0.89, speed 112.60 f/s
12057: done 14 games, mean reward -20.571, eps 0.88, speed 121.46 f/s
12921: done 15 games, mean reward -20.533, eps 0.87, speed 117.20 f/s
13771: done 16 games, mean reward -20.562, eps 0.86, speed 118.38 f/s
14533: done 17 games, mean reward -20.588, eps 0.85, speed 115.75 f/s
15447: done 18 games, mean reward -20.611, eps 0.85, speed 119.51 f/s
16506: done 19 games, mean reward -20.579, eps 0.83, speed 118.15 f/s
17287: done 20 games, mean reward -20.600, eps 0.83, speed 122.01 f/s
18095: done 21 games, mean reward -20.619, eps 0.82, speed 104.46 f/s
19102: done 22 games, mean reward -20.591, eps 0.81, speed 110.85 f/s
20122: done 23 games, mean reward -20.565, eps 0.80, speed 109.08 f/s
20940: done 24 games, mean reward -20.583, eps 0.79, speed 110.92 f/s
21924: done 25 games, mean reward -20.560, eps 0.78, speed 102.45 f/s
22898: done 26 games

146879: done 98 games, mean reward -17.561, eps 0.02, speed 90.08 f/s
Best mean reward updated -17.639 -> -17.561, model saved
149551: done 99 games, mean reward -17.485, eps 0.02, speed 86.27 f/s
Best mean reward updated -17.561 -> -17.485, model saved
152622: done 100 games, mean reward -17.260, eps 0.02, speed 55.77 f/s
Best mean reward updated -17.485 -> -17.260, model saved
156040: done 101 games, mean reward -17.030, eps 0.02, speed 77.06 f/s
Best mean reward updated -17.260 -> -17.030, model saved
159053: done 102 games, mean reward -16.710, eps 0.02, speed 94.04 f/s
Best mean reward updated -17.030 -> -16.710, model saved
161392: done 103 games, mean reward -16.360, eps 0.02, speed 44.69 f/s
Best mean reward updated -16.710 -> -16.360, model saved
163914: done 104 games, mean reward -16.050, eps 0.02, speed 90.12 f/s
Best mean reward updated -16.360 -> -16.050, model saved
166392: done 105 games, mean reward -15.780, eps 0.02, speed 54.90 f/s
Best mean reward updated -16.050 ->

300775: done 164 games, mean reward 4.370, eps 0.02, speed 101.92 f/s
Best mean reward updated 4.140 -> 4.370, model saved
302726: done 165 games, mean reward 4.710, eps 0.02, speed 98.32 f/s
Best mean reward updated 4.370 -> 4.710, model saved
304587: done 166 games, mean reward 5.070, eps 0.02, speed 94.89 f/s
Best mean reward updated 4.710 -> 5.070, model saved
306557: done 167 games, mean reward 5.380, eps 0.02, speed 98.32 f/s
Best mean reward updated 5.070 -> 5.380, model saved
308908: done 168 games, mean reward 5.720, eps 0.02, speed 100.80 f/s
Best mean reward updated 5.380 -> 5.720, model saved
310832: done 169 games, mean reward 6.080, eps 0.02, speed 101.64 f/s
Best mean reward updated 5.720 -> 6.080, model saved
312986: done 170 games, mean reward 6.420, eps 0.02, speed 101.21 f/s
Best mean reward updated 6.080 -> 6.420, model saved
314716: done 171 games, mean reward 6.800, eps 0.02, speed 100.01 f/s
Best mean reward updated 6.420 -> 6.800, model saved
316700: done 172 ga

435792: done 232 games, mean reward 17.160, eps 0.02, speed 94.16 f/s
437568: done 233 games, mean reward 17.190, eps 0.02, speed 102.48 f/s
Best mean reward updated 17.160 -> 17.190, model saved
439312: done 234 games, mean reward 17.260, eps 0.02, speed 102.09 f/s
Best mean reward updated 17.190 -> 17.260, model saved
440952: done 235 games, mean reward 17.300, eps 0.02, speed 96.55 f/s
Best mean reward updated 17.260 -> 17.300, model saved
442592: done 236 games, mean reward 17.340, eps 0.02, speed 101.65 f/s
Best mean reward updated 17.300 -> 17.340, model saved
444414: done 237 games, mean reward 17.400, eps 0.02, speed 104.21 f/s
Best mean reward updated 17.340 -> 17.400, model saved
446456: done 238 games, mean reward 17.460, eps 0.02, speed 105.08 f/s
Best mean reward updated 17.400 -> 17.460, model saved
448327: done 239 games, mean reward 17.530, eps 0.02, speed 106.92 f/s
Best mean reward updated 17.460 -> 17.530, model saved
450278: done 240 games, mean reward 17.630, eps 0

638518: done 336 games, mean reward 17.800, eps 0.02, speed 98.86 f/s
640320: done 337 games, mean reward 17.800, eps 0.02, speed 97.11 f/s
642014: done 338 games, mean reward 17.820, eps 0.02, speed 95.92 f/s
643654: done 339 games, mean reward 17.850, eps 0.02, speed 94.71 f/s
645651: done 340 games, mean reward 17.860, eps 0.02, speed 96.66 f/s
647424: done 341 games, mean reward 17.850, eps 0.02, speed 102.07 f/s
649226: done 342 games, mean reward 17.920, eps 0.02, speed 101.61 f/s
651020: done 343 games, mean reward 17.960, eps 0.02, speed 98.76 f/s
653067: done 344 games, mean reward 17.930, eps 0.02, speed 101.78 f/s
655114: done 345 games, mean reward 17.920, eps 0.02, speed 101.93 f/s
656758: done 346 games, mean reward 17.950, eps 0.02, speed 99.19 f/s
658597: done 347 games, mean reward 17.980, eps 0.02, speed 91.47 f/s
660400: done 348 games, mean reward 17.980, eps 0.02, speed 90.85 f/s
662110: done 349 games, mean reward 17.970, eps 0.02, speed 90.74 f/s
664120: done 350

823483: done 436 games, mean reward 18.540, eps 0.02, speed 90.36 f/s
825129: done 437 games, mean reward 18.550, eps 0.02, speed 88.52 f/s
826768: done 438 games, mean reward 18.560, eps 0.02, speed 91.68 f/s
828501: done 439 games, mean reward 18.550, eps 0.02, speed 94.75 f/s
830334: done 440 games, mean reward 18.560, eps 0.02, speed 96.31 f/s
831973: done 441 games, mean reward 18.580, eps 0.02, speed 96.97 f/s
833612: done 442 games, mean reward 18.590, eps 0.02, speed 94.56 f/s
835309: done 443 games, mean reward 18.610, eps 0.02, speed 92.05 f/s
837471: done 444 games, mean reward 18.600, eps 0.02, speed 90.42 f/s
839104: done 445 games, mean reward 18.640, eps 0.02, speed 93.18 f/s
840951: done 446 games, mean reward 18.620, eps 0.02, speed 85.63 f/s
842692: done 447 games, mean reward 18.620, eps 0.02, speed 92.22 f/s
844329: done 448 games, mean reward 18.640, eps 0.02, speed 99.04 f/s
845969: done 449 games, mean reward 18.650, eps 0.02, speed 101.25 f/s
847711: done 450 ga

998897: done 535 games, mean reward 19.450, eps 0.02, speed 101.53 f/s
Best mean reward updated 19.430 -> 19.450, model saved
1000854: done 536 games, mean reward 19.410, eps 0.02, speed 98.69 f/s
1002793: done 537 games, mean reward 19.380, eps 0.02, speed 99.86 f/s
1004634: done 538 games, mean reward 19.360, eps 0.02, speed 98.26 f/s
1006505: done 539 games, mean reward 19.350, eps 0.02, speed 102.85 f/s
1008353: done 540 games, mean reward 19.360, eps 0.02, speed 96.66 f/s
1010232: done 541 games, mean reward 19.330, eps 0.02, speed 96.01 f/s
1012230: done 542 games, mean reward 19.300, eps 0.02, speed 100.47 f/s
1013866: done 543 games, mean reward 19.300, eps 0.02, speed 100.73 f/s
1016064: done 544 games, mean reward 19.310, eps 0.02, speed 96.54 f/s
1017700: done 545 games, mean reward 19.310, eps 0.02, speed 95.11 f/s
1019338: done 546 games, mean reward 19.330, eps 0.02, speed 96.28 f/s
1021118: done 547 games, mean reward 19.330, eps 0.02, speed 90.25 f/s
1022832: done 548 g

1208895: done 650 games, mean reward 18.920, eps 0.02, speed 91.72 f/s
1210528: done 651 games, mean reward 18.930, eps 0.02, speed 91.45 f/s
1212306: done 652 games, mean reward 18.910, eps 0.02, speed 91.85 f/s
1214256: done 653 games, mean reward 18.870, eps 0.02, speed 91.80 f/s
1215984: done 654 games, mean reward 18.880, eps 0.02, speed 94.81 f/s
1217733: done 655 games, mean reward 18.870, eps 0.02, speed 92.76 f/s
1219953: done 656 games, mean reward 18.910, eps 0.02, speed 94.73 f/s
1221694: done 657 games, mean reward 18.910, eps 0.02, speed 93.42 f/s
1223338: done 658 games, mean reward 18.990, eps 0.02, speed 90.37 f/s
1225390: done 659 games, mean reward 18.970, eps 0.02, speed 90.13 f/s
1227301: done 660 games, mean reward 18.960, eps 0.02, speed 91.29 f/s
1229152: done 661 games, mean reward 18.940, eps 0.02, speed 93.68 f/s
1230786: done 662 games, mean reward 18.940, eps 0.02, speed 88.73 f/s
1232419: done 663 games, mean reward 18.940, eps 0.02, speed 95.06 f/s
123405

1412989: done 765 games, mean reward 19.280, eps 0.02, speed 99.91 f/s
1414623: done 766 games, mean reward 19.310, eps 0.02, speed 102.28 f/s
1416254: done 767 games, mean reward 19.320, eps 0.02, speed 91.82 f/s
1417979: done 768 games, mean reward 19.310, eps 0.02, speed 94.45 f/s
1419723: done 769 games, mean reward 19.300, eps 0.02, speed 74.12 f/s
1421419: done 770 games, mean reward 19.310, eps 0.02, speed 96.77 f/s
1423339: done 771 games, mean reward 19.270, eps 0.02, speed 81.95 f/s
1425023: done 772 games, mean reward 19.340, eps 0.02, speed 81.50 f/s
1426656: done 773 games, mean reward 19.370, eps 0.02, speed 95.25 f/s
1428286: done 774 games, mean reward 19.370, eps 0.02, speed 105.54 f/s
1429916: done 775 games, mean reward 19.380, eps 0.02, speed 95.78 f/s
1431858: done 776 games, mean reward 19.370, eps 0.02, speed 91.05 f/s
1433618: done 777 games, mean reward 19.360, eps 0.02, speed 93.11 f/s
1435248: done 778 games, mean reward 19.380, eps 0.02, speed 94.05 f/s
1437

1621636: done 880 games, mean reward 18.910, eps 0.02, speed 93.90 f/s
1623401: done 881 games, mean reward 18.920, eps 0.02, speed 100.99 f/s
1625335: done 882 games, mean reward 18.930, eps 0.02, speed 96.10 f/s
1627271: done 883 games, mean reward 18.890, eps 0.02, speed 101.95 f/s
1629000: done 884 games, mean reward 18.900, eps 0.02, speed 98.78 f/s
1631128: done 885 games, mean reward 18.860, eps 0.02, speed 104.57 f/s
1632787: done 886 games, mean reward 18.890, eps 0.02, speed 104.66 f/s
1634482: done 887 games, mean reward 18.900, eps 0.02, speed 104.14 f/s
1636172: done 888 games, mean reward 18.910, eps 0.02, speed 103.27 f/s
1637802: done 889 games, mean reward 18.910, eps 0.02, speed 103.12 f/s
1639499: done 890 games, mean reward 18.930, eps 0.02, speed 101.55 f/s
1641388: done 891 games, mean reward 18.930, eps 0.02, speed 96.30 f/s
1643148: done 892 games, mean reward 19.000, eps 0.02, speed 87.04 f/s
1645087: done 893 games, mean reward 18.980, eps 0.02, speed 96.76 f/

1826651: done 995 games, mean reward 19.290, eps 0.02, speed 107.80 f/s
1828459: done 996 games, mean reward 19.310, eps 0.02, speed 105.76 f/s
1830499: done 997 games, mean reward 19.280, eps 0.02, speed 107.07 f/s
1832285: done 998 games, mean reward 19.280, eps 0.02, speed 108.45 f/s
1833911: done 999 games, mean reward 19.300, eps 0.02, speed 106.30 f/s
1835727: done 1000 games, mean reward 19.320, eps 0.02, speed 105.50 f/s
1837689: done 1001 games, mean reward 19.310, eps 0.02, speed 104.63 f/s
1839318: done 1002 games, mean reward 19.310, eps 0.02, speed 105.12 f/s
1841064: done 1003 games, mean reward 19.300, eps 0.02, speed 106.48 f/s
1842747: done 1004 games, mean reward 19.290, eps 0.02, speed 101.85 f/s
1844469: done 1005 games, mean reward 19.290, eps 0.02, speed 106.21 f/s
1846097: done 1006 games, mean reward 19.300, eps 0.02, speed 108.21 f/s
1848057: done 1007 games, mean reward 19.270, eps 0.02, speed 107.69 f/s
1849686: done 1008 games, mean reward 19.280, eps 0.02, 

2028317: done 1108 games, mean reward 19.280, eps 0.02, speed 107.53 f/s
2029946: done 1109 games, mean reward 19.290, eps 0.02, speed 109.28 f/s
2031572: done 1110 games, mean reward 19.300, eps 0.02, speed 100.34 f/s
2033262: done 1111 games, mean reward 19.300, eps 0.02, speed 101.63 f/s
2034888: done 1112 games, mean reward 19.320, eps 0.02, speed 105.93 f/s
2036610: done 1113 games, mean reward 19.310, eps 0.02, speed 107.23 f/s
2038341: done 1114 games, mean reward 19.300, eps 0.02, speed 106.77 f/s
2040042: done 1115 games, mean reward 19.320, eps 0.02, speed 101.30 f/s
2042157: done 1116 games, mean reward 19.250, eps 0.02, speed 104.95 f/s
2044408: done 1117 games, mean reward 19.190, eps 0.02, speed 103.24 f/s
2046331: done 1118 games, mean reward 19.240, eps 0.02, speed 102.81 f/s
2048187: done 1119 games, mean reward 19.240, eps 0.02, speed 108.49 f/s
2049836: done 1120 games, mean reward 19.240, eps 0.02, speed 107.05 f/s
2051735: done 1121 games, mean reward 19.260, eps 0

2229154: done 1221 games, mean reward 19.200, eps 0.02, speed 108.64 f/s
2231247: done 1222 games, mean reward 19.160, eps 0.02, speed 100.76 f/s
2233174: done 1223 games, mean reward 19.130, eps 0.02, speed 105.92 f/s
2235052: done 1224 games, mean reward 19.100, eps 0.02, speed 106.40 f/s
2236686: done 1225 games, mean reward 19.110, eps 0.02, speed 106.32 f/s
2238485: done 1226 games, mean reward 19.100, eps 0.02, speed 105.33 f/s
2240405: done 1227 games, mean reward 19.100, eps 0.02, speed 111.51 f/s
2242074: done 1228 games, mean reward 19.140, eps 0.02, speed 105.49 f/s
2243970: done 1229 games, mean reward 19.140, eps 0.02, speed 105.05 f/s
2245598: done 1230 games, mean reward 19.170, eps 0.02, speed 106.66 f/s
2247324: done 1231 games, mean reward 19.180, eps 0.02, speed 109.09 f/s
2249007: done 1232 games, mean reward 19.190, eps 0.02, speed 101.71 f/s
2250813: done 1233 games, mean reward 19.190, eps 0.02, speed 108.72 f/s
2252633: done 1234 games, mean reward 19.180, eps 0

2430697: done 1333 games, mean reward 19.260, eps 0.02, speed 99.96 f/s
2432489: done 1334 games, mean reward 19.250, eps 0.02, speed 109.44 f/s
2434389: done 1335 games, mean reward 19.230, eps 0.02, speed 105.86 f/s
2436298: done 1336 games, mean reward 19.210, eps 0.02, speed 108.53 f/s
2437948: done 1337 games, mean reward 19.210, eps 0.02, speed 103.58 f/s
2439591: done 1338 games, mean reward 19.210, eps 0.02, speed 104.25 f/s
2441417: done 1339 games, mean reward 19.200, eps 0.02, speed 104.19 f/s
2443048: done 1340 games, mean reward 19.220, eps 0.02, speed 99.78 f/s
2445015: done 1341 games, mean reward 19.200, eps 0.02, speed 104.98 f/s
2447015: done 1342 games, mean reward 19.160, eps 0.02, speed 100.27 f/s
2448643: done 1343 games, mean reward 19.160, eps 0.02, speed 103.41 f/s
2450370: done 1344 games, mean reward 19.160, eps 0.02, speed 104.36 f/s
2452057: done 1345 games, mean reward 19.190, eps 0.02, speed 106.97 f/s
2453829: done 1346 games, mean reward 19.240, eps 0.0