In [82]:
#https://blog.paperspace.com/building-double-deep-q-network-super-mario-bros/
from IPython import display
!pip install nes-py==0.2.6
!pip install gym-super-mario-bros
!apt-get update
!apt-get install ffmpeg libsm6 libxext6  -y
!apt install -y libgl1-mesa-glx
!pip install opencv-python

Collecting nes-py==0.2.6
  Using cached nes_py-0.2.6-cp39-cp39-macosx_10_9_x86_64.whl
Installing collected packages: nes-py
  Attempting uninstall: nes-py
    Found existing installation: nes-py 8.1.8
    Uninstalling nes-py-8.1.8:
      Successfully uninstalled nes-py-8.1.8
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gym-super-mario-bros 7.3.0 requires nes-py>=8.0.0, but you have nes-py 0.2.6 which is incompatible.[0m[31m
[0mSuccessfully installed nes-py-0.2.6
You should consider upgrading via the '/Users/23zhou/PycharmProjects/JupyterMario/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
Collecting nes-py>=8.0.0
  Using cached nes_py-8.1.8-cp39-cp39-macosx_10_9_x86_64.whl
Installing collected packages: nes-py
  Attempting uninstall: nes-py
    Found existing installation: nes-py 0.2.6
    Uninstalling nes-py-0.2.6:


In [83]:
import torch
import torch.nn as nn
import random
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from tqdm import tqdm
import pickle
from gym_super_mario_bros.actions import RIGHT_ONLY
import gym
import numpy as np
import collections
import cv2
import matplotlib.pyplot as plt

In [84]:
class MaxandSkipEnv(gym.Wrapper):
    """Repeats each action for several emulator frames and max-pools the latest frames."""

    def __init__(self, env=None, skip=4):
        """Wrap ``env``; every ``step`` repeats the action up to ``skip`` times."""
        super().__init__(env)
        self._skip = skip
        # Only the two most recent raw frames are retained for the pixel-wise maximum.
        self._obs_buffer = collections.deque(maxlen=2)

    def step(self, action):
        """Apply ``action`` up to ``skip`` times, stopping early when the episode ends.

        Returns:
            ``(frame, reward, done, info)`` where ``frame`` is the element-wise
            maximum over the buffered frames, ``reward`` is the sum of the
            per-frame rewards, and ``done``/``info`` come from the last inner step.
        """
        reward_sum = 0.0
        done = None
        for _ in range(self._skip):
            frame, step_reward, done, info = self.env.step(action)
            self._obs_buffer.append(frame)
            reward_sum += step_reward
            if done:
                break
        stacked_frames = np.stack(self._obs_buffer)
        return stacked_frames.max(axis=0), reward_sum, done, info

    def reset(self):
        """Empty the frame buffer, reset the wrapped env and return its first frame."""
        self._obs_buffer.clear()
        first_frame = self.env.reset()
        self._obs_buffer.append(first_frame)
        return first_frame

class ProcessFrame84(gym.ObservationWrapper):
    """Converts 240x256 RGB frames into 84x84 single-channel ``uint8`` images."""

    def __init__(self, env=None):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8
        )

    def observation(self, obs):
        """Delegate to :meth:`process` for every observation leaving the wrapped env."""
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """Grayscale, downscale and crop one frame.

        The frame must hold exactly 240 * 256 * 3 values; anything else trips
        the assertion below.
        """
        expected_size = 240 * 256 * 3
        if frame.size == expected_size:
            rgb = np.reshape(frame, [240, 256, 3]).astype(np.float32)
        else:
            assert False, "Unknown resolution."
        red, green, blue = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
        # Weighted sum of the colour channels gives the grayscale intensity.
        gray = red * 0.299 + green * 0.587 + blue * 0.114
        # cv2.resize takes (width, height): the result is 110 rows by 84 columns.
        shrunk = cv2.resize(gray, (84, 110), interpolation=cv2.INTER_AREA)
        # Keep rows 18..101 so the final image is 84x84.
        cropped = shrunk[18:102, :]
        return np.reshape(cropped, [84, 84, 1]).astype(np.uint8)

class ImageToPyTorch(gym.ObservationWrapper):
    """Moves the channel axis of each observation from last to first position."""

    def __init__(self, env):
        super().__init__(env)
        hwc_shape = self.observation_space.shape
        chw_shape = (hwc_shape[-1], hwc_shape[0], hwc_shape[1])
        # The space is declared as float32 in [0, 1]; this wrapper itself only
        # reorders axes and does not rescale the values.
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=chw_shape, dtype=np.float32
        )

    def observation(self, observation):
        """Return ``observation`` with axis 2 moved to the front."""
        return np.moveaxis(observation, source=2, destination=0)

class ScaledFloatFrame(gym.ObservationWrapper):
    """Casts observations to ``float32`` and divides them by 255."""

    def observation(self, obs):
        """Return a new float32 array holding ``obs / 255``."""
        as_float = np.array(obs).astype(np.float32)
        return as_float / 255.0

class BufferWrapper(gym.ObservationWrapper):
    """Keeps a rolling window of the most recent observations stacked along axis 0."""

    def __init__(self, env, n_steps, dtype=np.float32):
        """Declare an observation space whose bounds are repeated ``n_steps`` times on axis 0."""
        super().__init__(env)
        self.dtype = dtype
        base_space = env.observation_space
        stacked_low = base_space.low.repeat(n_steps, axis=0)
        stacked_high = base_space.high.repeat(n_steps, axis=0)
        self.observation_space = gym.spaces.Box(stacked_low, stacked_high, dtype=dtype)

    def reset(self):
        """Zero the window, reset the wrapped env and push its first observation."""
        # The buffer is (re)created here, so `reset` must run before the first step.
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        first_observation = self.env.reset()
        return self.observation(first_observation)

    def observation(self, observation):
        """Shift the window by one slot and store ``observation`` in the last slot.

        The internal buffer itself is returned (not a copy), so it is
        overwritten in place by the next call.
        """
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer

In [85]:
def make_env(env):
    """Apply the preprocessing wrappers to ``env`` and restrict it to the RIGHT_ONLY actions."""
    # Applied in order: frame skipping, grayscale/resize, channel-first layout,
    # a 4-observation rolling buffer, and finally scaling to floats.
    wrapper_pipeline = (
        MaxandSkipEnv,
        ProcessFrame84,
        ImageToPyTorch,
        lambda inner: BufferWrapper(inner, 4),
        ScaledFloatFrame,
    )
    for wrap in wrapper_pipeline:
        env = wrap(env)
    return JoypadSpace(env, RIGHT_ONLY)


In [86]:
class DQNSolver(nn.Module):
    """Convolutional network that maps an observation batch to one value per action."""

    def __init__(self, input_shape, n_actions):
        """Build the convolutional trunk (``conv``) and the fully connected head (``fc``).

        Args:
            input_shape: Shape of one observation, channels first.
            n_actions: Number of outputs of the final linear layer.
        """
        super().__init__()
        in_channels = input_shape[0]
        trunk_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*trunk_layers)

        # The head's input width depends on the observation size, so measure it.
        flat_features = self._get_conv_out(input_shape)
        head_layers = [
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.fc = nn.Sequential(*head_layers)

    def _get_conv_out(self, shape):
        """Return the number of values the trunk emits for a single observation of ``shape``."""
        dummy_batch = torch.zeros(1, *shape)
        feature_map = self.conv(dummy_batch)
        return int(np.prod(feature_map.size()))

    def forward(self, x):
        """Run the trunk, flatten each sample and apply the head."""
        batch_size = x.size()[0]
        features = self.conv(x)
        return self.fc(features.view(batch_size, -1))


In [87]:
class DQNAgent:
    """Epsilon-greedy deep Q-learning agent with a fixed-size, tensor-backed replay memory.

    With ``doubleDQ`` enabled the agent owns two networks: ``local_net`` is trained
    and used to choose actions, while ``target_net`` supplies the bootstrap values
    and is overwritten with ``local_net``'s weights whenever the ``act`` call
    counter is a multiple of ``copy``. Otherwise a single network ``dqn`` plays
    both roles.
    """

    def __init__(self, stateSpace, actionSpace, maxMemorySize, batchSize, gamma, lr, dropout, explorationMax, explorationMin, explorationDecay, doubleDQ, pretrained):
        """Create the network(s), the optimizer and the replay memory.

        Args:
            stateSpace: Shape of one observation; passed to ``DQNSolver`` and used
                as the trailing dimensions of the state memories.
            actionSpace: Number of discrete actions.
            maxMemorySize: Capacity of the replay memory, in transitions.
            batchSize: Number of transitions sampled per training step.
            gamma: Discount factor applied to the bootstrapped next-state value.
            lr: Learning rate of the Adam optimizer.
            dropout: Accepted for interface compatibility; not used by this class.
            explorationMax: Initial exploration rate (epsilon).
            explorationMin: Lower bound of the exploration rate.
            explorationDecay: Factor the exploration rate is multiplied by after
                every training step.
            doubleDQ: Use separate local/target networks when true, one network otherwise.
            pretrained: When true, load network weights and the replay memory from disk.
        """
        # Network definition
        self.stateSpace = stateSpace
        self.actionSpace = actionSpace
        self.doubleDQ = doubleDQ
        self.pretrained = pretrained
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.doubleDQ:
            self.local_net = DQNSolver(stateSpace,actionSpace).to(self.device)
            self.target_net = DQNSolver(stateSpace,actionSpace).to(self.device)

            if self.pretrained:
                self.local_net.load_state_dict(torch.load("./betterMarioModels/dq1.pt", map_location=torch.device(self.device)))
                self.target_net.load_state_dict(torch.load("./betterMarioModels/dq2.pt", map_location=torch.device(self.device)))

            # Only the local net is optimized; the target net changes via copy_model().
            self.optimizer = torch.optim.Adam(self.local_net.parameters(), lr=lr)
            self.copy = 5000  # number of act() calls between target-net refreshes
            self.step = 0

        else:
            self.dqn = DQNSolver(stateSpace, actionSpace).to(self.device)

            if self.pretrained:
                self.dqn.load_state_dict(torch.load("./betterMarioModels/dq.pt", map_location=torch.device(self.device)))
            self.optimizer = torch.optim.Adam(self.dqn.parameters(), lr=lr)

        # Replay memory: preallocated tensors used as a ring buffer.
        self.max_memory_size = maxMemorySize
        if self.pretrained:
            self.STATE_MEM = torch.load("./betterMarioModels/STATE_MEM.pt")
            self.ACTION_MEM = torch.load("./betterMarioModels/ACTION_MEM.pt")
            self.REWARD_MEM = torch.load("./betterMarioModels/REWARD_MEM.pt")
            self.STATE2_MEM = torch.load("./betterMarioModels/STATE2_MEM.pt")
            self.DONE_MEM = torch.load("./betterMarioModels/DONE_MEM.pt")
            # CAUTION: pickle.load can execute arbitrary code embedded in the file.
            # Only load bookkeeping files that were written by a trusted training run.
            with open("ending_position.pkl", 'rb') as f:
                self.ending_position = pickle.load(f)
            with open("num_in_queue.pkl", 'rb') as f:
                self.num_in_queue = pickle.load(f)
        else:
            self.STATE_MEM = torch.zeros(maxMemorySize, *self.stateSpace)
            self.ACTION_MEM = torch.zeros(maxMemorySize, 1)
            self.REWARD_MEM = torch.zeros(maxMemorySize, 1)
            self.STATE2_MEM = torch.zeros(maxMemorySize, *self.stateSpace)
            self.DONE_MEM = torch.zeros(maxMemorySize, 1)
            self.ending_position = 0  # next slot to write
            self.num_in_queue = 0     # number of valid transitions stored

        self.memory_sample_size = batchSize

        # Learning parameters
        self.gamma = gamma
        # Huber loss
        self.l1 = nn.SmoothL1Loss().to(self.device)
        self.exploration_max = explorationMax
        self.exploration_min = explorationMin
        self.exploration_rate = explorationMax
        self.exploration_decay = explorationDecay

    def remember(self, state, action, reward, state2, done):
        """Store one transition, overwriting the oldest entry once the memory is full.

        All five arguments are tensors; each is converted to float before storing.
        """
        slot = self.ending_position
        self.STATE_MEM[slot] = state.float()
        self.ACTION_MEM[slot] = action.float()
        self.REWARD_MEM[slot] = reward.float()
        self.STATE2_MEM[slot] = state2.float()
        self.DONE_MEM[slot] = done.float()
        # Ring buffer: advance the write position and wrap at capacity.
        self.ending_position = (self.ending_position + 1) % self.max_memory_size
        self.num_in_queue = min(self.num_in_queue + 1, self.max_memory_size)

    def recall(self):
        """Sample ``memory_sample_size`` stored transitions uniformly, with replacement.

        Returns:
            Tuple ``(STATE, ACTION, REWARD, STATE2, DONE)`` of batched tensors.
        """
        idx = random.choices(range(self.num_in_queue), k=self.memory_sample_size)

        STATE = self.STATE_MEM[idx]
        ACTION = self.ACTION_MEM[idx]
        REWARD = self.REWARD_MEM[idx]
        STATE2 = self.STATE2_MEM[idx]
        DONE = self.DONE_MEM[idx]

        return STATE, ACTION, REWARD, STATE2, DONE

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation tensor with a leading batch dimension of one.

        Returns:
            Tensor of shape ``(1, 1)`` holding the chosen action index.
        """
        if self.doubleDQ:
            self.step += 1
        if random.random() < self.exploration_rate:
            return torch.tensor([[random.randrange(self.actionSpace)]])
        # The local net drives the policy when two networks are in use.
        policy_net = self.local_net if self.doubleDQ else self.dqn
        # Action selection needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            q_values = policy_net(state.to(self.device))
        return torch.argmax(q_values).unsqueeze(0).unsqueeze(0).cpu()

    def copy_model(self):
        """Overwrite the target network's weights with the local network's."""
        self.target_net.load_state_dict(self.local_net.state_dict())

    def experience_replay(self):
        """Run one optimization step on a sampled batch and decay the exploration rate.

        Does nothing (apart from a possible target-net refresh) until the memory
        holds at least ``memory_sample_size`` transitions.
        """
        if self.doubleDQ and self.step % self.copy == 0:
            self.copy_model()

        if self.memory_sample_size > self.num_in_queue:
            return

        STATE, ACTION, REWARD, STATE2, DONE = (
            batch.to(self.device) for batch in self.recall()
        )

        self.optimizer.zero_grad()

        if self.doubleDQ:
            online_net, bootstrap_net = self.local_net, self.target_net
        else:
            online_net = bootstrap_net = self.dqn

        # Target: r + gamma * max_a Q_bootstrap(S', a), zeroed for terminal transitions.
        # With two networks the maximum is taken over the target net's own values
        # (a fixed-target update; the local net does not pick the action here).
        # The target is a constant for the loss, so it is computed without autograd:
        # otherwise gradients would flow through it (single-network case) or pile
        # up unused on the target net's parameters (two-network case).
        with torch.no_grad():
            next_value = bootstrap_net(STATE2).max(1).values.unsqueeze(1)
            target = REWARD + torch.mul(self.gamma * next_value, 1 - DONE)

        # Current estimate of Q(S, A) for the actions that were actually taken.
        current = online_net(STATE).gather(1, ACTION.long())

        loss = self.l1(current, target)
        loss.backward()        # compute gradients
        self.optimizer.step()  # apply them

        # Decay epsilon, but never below 'exploration_min'.
        self.exploration_rate *= self.exploration_decay
        self.exploration_rate = max(self.exploration_rate, self.exploration_min)




In [88]:
def vectorize_action(action, action_space):
    """Return a one-hot list for the scalar ``action``: zeros with a single 1 at index ``action``."""
    encoded = [0 for _ in range(action)]
    encoded.append(1)
    # Pad with zeros up to `action_space` entries (nothing is added if already past it).
    encoded.extend(0 for _ in range(action + 1, action_space))
    return encoded



In [89]:
def show_state(env, ep=0, info=""):
    """Draw the env's current RGB frame in the notebook, replacing the previous output.

    Args:
        env: Environment whose ``render(mode='rgb_array')`` frame is shown.
        ep: Episode number placed in the figure title.
        info: Extra text appended to the title.
    """
    # Reuse figure number 3 and wipe it so frames do not pile up.
    plt.figure(3)
    plt.clf()
    frame = env.render(mode='rgb_array')
    plt.imshow(frame)
    plt.title("Episode: %d %s" % (ep, info))
    plt.axis('off')

    # wait=True defers clearing until the new figure is ready to be shown.
    display.clear_output(wait=True)
    current_figure = plt.gcf()
    display.display(current_figure)


In [90]:
def run(trainingMode,preTrained, numEpisodes=6000):
    """Train or evaluate a DQN agent on ``SuperMarioBros-1-1-v0``.

    Args:
        trainingMode: When true, transitions are stored and the agent is trained
            after every step, and weights, replay memory and reward history are
            written to disk once all episodes finish. When false, every frame is
            rendered inline and the agent is neither trained nor saved.
        preTrained: Forwarded to ``DQNAgent``; when true the agent loads
            previously saved weights and replay memory.
        numEpisodes: Number of episodes to play (defaults to 6000).
    """
    import os  # function-scope import: only needed to create the checkpoint directory

    def to_batch(observation):
        """Convert one observation to a float32 tensor with a leading batch axis."""
        # Converting through a single ndarray avoids the slow element-wise
        # path of `torch.Tensor([ndarray])`.
        return torch.from_numpy(np.array(observation, dtype=np.float32)).unsqueeze(0)

    if trainingMode:
        # Create the output directory up front so a full training run cannot
        # be lost to a missing folder at save time.
        os.makedirs("./betterMarioModels", exist_ok=True)

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = make_env(env)
    try:
        observationSpace = env.observation_space.shape
        actionSpace = env.action_space.n
        agent = DQNAgent(stateSpace=observationSpace,
                         actionSpace=actionSpace,
                         maxMemorySize=30000,
                         batchSize= 32,
                         gamma= 0.90,
                         lr = 0.00025,
                         dropout= 0.,
                         explorationMax= 1.0,
                         explorationMin=0.02,
                         explorationDecay= 0.99,
                         doubleDQ= True,
                         pretrained= preTrained)

        if not trainingMode:
            # Epsilon only decays inside experience_replay(), which is never
            # called during evaluation; without this the agent would keep
            # acting at explorationMax (fully random).
            agent.exploration_rate = agent.exploration_min

        totalRewards = []

        for epNum in tqdm(range(numEpisodes)):
            state = to_batch(env.reset())
            totalReward = 0
            while True:
                if not trainingMode:
                    show_state(env, epNum)
                action = agent.act(state)

                state_next, reward, done, info = env.step(int(action[0]))
                totalReward += reward
                state_next = to_batch(state_next)

                if trainingMode:
                    # The replay memory stores reward and done flag as (1, 1) tensors.
                    reward_t = torch.tensor([reward]).unsqueeze(0)
                    terminal_t = torch.tensor([int(done)]).unsqueeze(0)
                    agent.remember(state, action, reward_t, state_next, terminal_t)
                    agent.experience_replay()

                state = state_next
                if done:
                    break

            totalRewards.append(totalReward)

            print("Total reward after episode {} is {}".format(epNum +1, totalRewards[-1]))

        if trainingMode:
            with open("ending_position.pkl", "wb") as f:
                pickle.dump(agent.ending_position, f)
            with open("num_in_queue.pkl", "wb") as f:
                pickle.dump(agent.num_in_queue, f)
            with open("total_rewards.pkl", "wb") as f:
                pickle.dump(totalRewards, f)
            if agent.doubleDQ:
                torch.save(agent.local_net.state_dict(), "./betterMarioModels/dq1.pt")
                torch.save(agent.target_net.state_dict(), "./betterMarioModels/dq2.pt")
            else:
                torch.save(agent.dqn.state_dict(), "./betterMarioModels/dq.pt")
            torch.save(agent.STATE_MEM,  "./betterMarioModels/STATE_MEM.pt")
            torch.save(agent.ACTION_MEM, "./betterMarioModels/ACTION_MEM.pt")
            torch.save(agent.REWARD_MEM, "./betterMarioModels/REWARD_MEM.pt")
            torch.save(agent.STATE2_MEM, "./betterMarioModels/STATE2_MEM.pt")
            torch.save(agent.DONE_MEM,   "./betterMarioModels/DONE_MEM.pt")
    finally:
        # Release the emulator even if the run is interrupted or fails.
        env.close()

    window = 500
    # Decide from the rewards actually collected whether a moving average exists.
    if len(totalRewards) > window:
        moving_average = np.convolve(totalRewards, np.ones((window,)) / window, mode="valid")
        plt.title("Episodes trained vs. Average Rewards (per 500 eps)")
        # The first full window ends at episode index window - 1, so pad with
        # window - 1 zeros to line the curve up with the episode axis.
        plt.plot([0 for _ in range(window - 1)] + moving_average.tolist())
        plt.show()


In [91]:
# Train a new agent; no saved weights or replay memory are loaded.
run(trainingMode=True, preTrained=False)


  0%|          | 1/6000 [00:13<22:13:51, 13.34s/it]

Total reward after episode 1 is 602.0


  0%|          | 2/6000 [00:19<15:18:29,  9.19s/it]

Total reward after episode 2 is 226.0


  0%|          | 3/6000 [00:24<11:40:58,  7.01s/it]

Total reward after episode 3 is 240.0


  0%|          | 4/6000 [00:39<16:59:39, 10.20s/it]

Total reward after episode 4 is 712.0


  0%|          | 5/6000 [00:56<21:18:37, 12.80s/it]

Total reward after episode 5 is 581.0


  0%|          | 6/6000 [01:00<16:33:19,  9.94s/it]

Total reward after episode 6 is 230.0


  0%|          | 7/6000 [01:04<12:55:58,  7.77s/it]

Total reward after episode 7 is 248.0


  0%|          | 8/6000 [01:07<10:23:41,  6.25s/it]

Total reward after episode 8 is 252.0


  0%|          | 9/6000 [01:10<8:40:01,  5.21s/it] 

Total reward after episode 9 is 248.0


  0%|          | 10/6000 [01:13<7:32:50,  4.54s/it]

Total reward after episode 10 is 247.0


  0%|          | 11/6000 [01:16<6:46:57,  4.08s/it]

Total reward after episode 11 is 248.0


  0%|          | 12/6000 [01:19<6:17:05,  3.78s/it]

Total reward after episode 12 is 249.0


  0%|          | 13/6000 [01:22<6:02:47,  3.64s/it]

Total reward after episode 13 is 248.0


  0%|          | 14/6000 [01:25<5:52:43,  3.54s/it]

Total reward after episode 14 is 251.0


  0%|          | 15/6000 [01:29<5:39:20,  3.40s/it]

Total reward after episode 15 is 250.0


  0%|          | 16/6000 [01:32<5:32:55,  3.34s/it]

Total reward after episode 16 is 251.0


  0%|          | 17/6000 [01:35<5:29:56,  3.31s/it]

Total reward after episode 17 is 251.0


  0%|          | 18/6000 [01:38<5:27:11,  3.28s/it]

Total reward after episode 18 is 252.0


  0%|          | 19/6000 [01:41<5:25:12,  3.26s/it]

Total reward after episode 19 is 251.0


  0%|          | 20/6000 [01:45<5:23:52,  3.25s/it]

Total reward after episode 20 is 251.0


  0%|          | 21/6000 [01:48<5:36:23,  3.38s/it]

Total reward after episode 21 is 252.0


  0%|          | 22/6000 [01:51<5:28:33,  3.30s/it]

Total reward after episode 22 is 252.0


  0%|          | 23/6000 [01:55<5:35:47,  3.37s/it]

Total reward after episode 23 is 252.0


  0%|          | 24/6000 [01:58<5:35:00,  3.36s/it]

Total reward after episode 24 is 252.0


  0%|          | 25/6000 [02:01<5:30:51,  3.32s/it]

Total reward after episode 25 is 249.0


  0%|          | 26/6000 [02:05<5:29:34,  3.31s/it]

Total reward after episode 26 is 251.0


  0%|          | 27/6000 [02:08<5:26:01,  3.27s/it]

Total reward after episode 27 is 248.0


  0%|          | 28/6000 [02:11<5:24:09,  3.26s/it]

Total reward after episode 28 is 252.0


  0%|          | 29/6000 [02:14<5:22:03,  3.24s/it]

Total reward after episode 29 is 251.0


  0%|          | 30/6000 [02:18<5:21:15,  3.23s/it]

Total reward after episode 30 is 251.0


  1%|          | 31/6000 [02:21<5:21:16,  3.23s/it]

Total reward after episode 31 is 251.0


  1%|          | 32/6000 [02:25<5:56:32,  3.58s/it]

Total reward after episode 32 is 251.0


  1%|          | 33/6000 [02:29<5:48:02,  3.50s/it]

Total reward after episode 33 is 251.0


  1%|          | 34/6000 [02:32<5:39:59,  3.42s/it]

Total reward after episode 34 is 248.0


  1%|          | 35/6000 [02:35<5:35:17,  3.37s/it]

Total reward after episode 35 is 251.0


  1%|          | 36/6000 [02:38<5:33:10,  3.35s/it]

Total reward after episode 36 is 251.0


  1%|          | 37/6000 [02:42<5:29:29,  3.32s/it]

Total reward after episode 37 is 251.0


  1%|          | 38/6000 [02:45<5:27:51,  3.30s/it]

Total reward after episode 38 is 252.0


  1%|          | 39/6000 [02:48<5:37:21,  3.40s/it]

Total reward after episode 39 is 251.0


  1%|          | 40/6000 [02:52<5:33:50,  3.36s/it]

Total reward after episode 40 is 250.0


  1%|          | 41/6000 [02:55<5:28:49,  3.31s/it]

Total reward after episode 41 is 252.0


  1%|          | 42/6000 [02:58<5:24:48,  3.27s/it]

Total reward after episode 42 is 252.0


  1%|          | 43/6000 [03:01<5:23:51,  3.26s/it]

Total reward after episode 43 is 252.0


  1%|          | 44/6000 [03:05<5:24:21,  3.27s/it]

Total reward after episode 44 is 252.0


  1%|          | 45/6000 [03:08<5:23:16,  3.26s/it]

Total reward after episode 45 is 251.0


  1%|          | 46/6000 [03:11<5:21:53,  3.24s/it]

Total reward after episode 46 is 252.0


  1%|          | 47/6000 [03:14<5:17:08,  3.20s/it]

Total reward after episode 47 is 252.0


  1%|          | 48/6000 [03:17<5:17:48,  3.20s/it]

Total reward after episode 48 is 251.0


  1%|          | 49/6000 [03:20<5:14:36,  3.17s/it]

Total reward after episode 49 is 248.0


  1%|          | 50/6000 [03:24<5:31:39,  3.34s/it]

Total reward after episode 50 is 252.0


  1%|          | 51/6000 [03:29<6:03:12,  3.66s/it]

Total reward after episode 51 is 248.0


  1%|          | 52/6000 [03:33<6:33:11,  3.97s/it]

Total reward after episode 52 is 249.0


  1%|          | 53/6000 [03:36<6:08:03,  3.71s/it]

Total reward after episode 53 is 252.0


  1%|          | 54/6000 [03:40<6:08:33,  3.72s/it]

Total reward after episode 54 is 252.0


  1%|          | 55/6000 [03:44<6:03:35,  3.67s/it]

Total reward after episode 55 is 252.0


  1%|          | 56/6000 [03:48<6:13:30,  3.77s/it]

Total reward after episode 56 is 252.0


  1%|          | 57/6000 [03:52<6:18:04,  3.82s/it]

Total reward after episode 57 is 252.0


  1%|          | 58/6000 [03:55<6:06:10,  3.70s/it]

Total reward after episode 58 is 251.0


  1%|          | 59/6000 [04:00<6:47:11,  4.11s/it]

Total reward after episode 59 is 248.0


  1%|          | 60/6000 [04:03<6:21:32,  3.85s/it]

Total reward after episode 60 is 251.0


  1%|          | 61/6000 [04:07<6:12:36,  3.76s/it]

Total reward after episode 61 is 251.0


  1%|          | 62/6000 [04:11<6:09:44,  3.74s/it]

Total reward after episode 62 is 248.0


  1%|          | 63/6000 [04:14<5:59:43,  3.64s/it]

Total reward after episode 63 is 248.0


  1%|          | 64/6000 [04:17<5:43:26,  3.47s/it]

Total reward after episode 64 is 248.0


  1%|          | 65/6000 [04:21<5:56:54,  3.61s/it]

Total reward after episode 65 is 248.0


  1%|          | 66/6000 [04:25<6:02:40,  3.67s/it]

Total reward after episode 66 is 251.0


  1%|          | 67/6000 [04:28<6:00:40,  3.65s/it]

Total reward after episode 67 is 250.0


  1%|          | 68/6000 [04:32<5:53:34,  3.58s/it]

Total reward after episode 68 is 248.0


  1%|          | 69/6000 [04:35<5:40:47,  3.45s/it]

Total reward after episode 69 is 248.0


  1%|          | 70/6000 [04:38<5:35:06,  3.39s/it]

Total reward after episode 70 is 252.0


  1%|          | 71/6000 [04:42<5:31:14,  3.35s/it]

Total reward after episode 71 is 248.0


  1%|          | 72/6000 [04:45<5:27:34,  3.32s/it]

Total reward after episode 72 is 250.0


  1%|          | 73/6000 [04:48<5:24:00,  3.28s/it]

Total reward after episode 73 is 250.0


  1%|          | 74/6000 [04:51<5:23:44,  3.28s/it]

Total reward after episode 74 is 250.0


  1%|▏         | 75/6000 [04:54<5:24:04,  3.28s/it]

Total reward after episode 75 is 248.0


  1%|▏         | 76/6000 [05:06<9:15:00,  5.62s/it]

Total reward after episode 76 is 625.0


  1%|▏         | 77/6000 [05:09<8:06:09,  4.92s/it]

Total reward after episode 77 is 248.0


  1%|▏         | 78/6000 [05:18<10:03:35,  6.12s/it]

Total reward after episode 78 is 636.0


  1%|▏         | 79/6000 [05:32<13:57:19,  8.48s/it]

Total reward after episode 79 is 630.0


  1%|▏         | 80/6000 [06:06<26:39:45, 16.21s/it]

Total reward after episode 80 is 1420.0


  1%|▏         | 81/6000 [06:13<22:18:03, 13.56s/it]

Total reward after episode 81 is 248.0


  1%|▏         | 82/6000 [06:19<18:29:07, 11.24s/it]

Total reward after episode 82 is 252.0


  1%|▏         | 83/6000 [06:25<16:01:05,  9.75s/it]

Total reward after episode 83 is 248.0


  1%|▏         | 84/6000 [06:38<17:18:22, 10.53s/it]

Total reward after episode 84 is 628.0


  1%|▏         | 85/6000 [06:43<14:46:29,  8.99s/it]

Total reward after episode 85 is 244.0


  1%|▏         | 86/6000 [06:47<12:05:59,  7.37s/it]

Total reward after episode 86 is 251.0


  1%|▏         | 87/6000 [07:00<14:48:08,  9.01s/it]

Total reward after episode 87 is 628.0


  1%|▏         | 88/6000 [07:03<12:12:42,  7.44s/it]

Total reward after episode 88 is 246.0


  1%|▏         | 89/6000 [07:23<18:18:27, 11.15s/it]

Total reward after episode 89 is 760.0


  2%|▏         | 90/6000 [07:33<17:26:57, 10.63s/it]

Total reward after episode 90 is 630.0


  2%|▏         | 91/6000 [07:42<16:56:49, 10.32s/it]

Total reward after episode 91 is 632.0


  2%|▏         | 92/6000 [08:14<27:23:46, 16.69s/it]

Total reward after episode 92 is 601.0


  2%|▏         | 93/6000 [08:32<27:58:50, 17.05s/it]

Total reward after episode 93 is 815.0


  2%|▏         | 94/6000 [08:44<25:24:35, 15.49s/it]

Total reward after episode 94 is 633.0


  2%|▏         | 95/6000 [09:03<27:22:38, 16.69s/it]

Total reward after episode 95 is 1320.0


  2%|▏         | 96/6000 [09:06<20:50:32, 12.71s/it]

Total reward after episode 96 is 247.0


  2%|▏         | 97/6000 [09:27<24:52:41, 15.17s/it]

Total reward after episode 97 is 1431.0


  2%|▏         | 98/6000 [09:49<28:14:27, 17.23s/it]

Total reward after episode 98 is 807.0


  2%|▏         | 99/6000 [10:08<28:40:01, 17.49s/it]

Total reward after episode 99 is 814.0


  2%|▏         | 100/6000 [10:27<29:28:03, 17.98s/it]

Total reward after episode 100 is 1347.0


  2%|▏         | 101/6000 [10:43<28:43:59, 17.54s/it]

Total reward after episode 101 is 1347.0


  2%|▏         | 102/6000 [10:56<26:15:09, 16.02s/it]

Total reward after episode 102 is 620.0


  2%|▏         | 103/6000 [11:16<28:32:39, 17.43s/it]

Total reward after episode 103 is 1433.0


  2%|▏         | 104/6000 [11:25<24:03:09, 14.69s/it]

Total reward after episode 104 is 629.0


  2%|▏         | 105/6000 [14:01<93:31:44, 57.12s/it]

Total reward after episode 105 is 592.0


  2%|▏         | 106/6000 [14:51<90:10:29, 55.08s/it]

Total reward after episode 106 is 1292.0


  2%|▏         | 107/6000 [15:10<72:22:17, 44.21s/it]

Total reward after episode 107 is 1435.0


  2%|▏         | 108/6000 [15:14<52:39:20, 32.17s/it]

Total reward after episode 108 is 242.0


  2%|▏         | 109/6000 [15:33<46:15:34, 28.27s/it]

Total reward after episode 109 is 1433.0


  2%|▏         | 110/6000 [15:44<37:37:15, 22.99s/it]

Total reward after episode 110 is 622.0


  2%|▏         | 111/6000 [15:53<31:03:28, 18.99s/it]

Total reward after episode 111 is 631.0


  2%|▏         | 112/6000 [16:11<30:26:38, 18.61s/it]

Total reward after episode 112 is 1435.0


  2%|▏         | 113/6000 [16:21<26:11:06, 16.01s/it]

Total reward after episode 113 is 621.0


  2%|▏         | 114/6000 [16:30<22:30:15, 13.76s/it]

Total reward after episode 114 is 619.0


  2%|▏         | 115/6000 [16:41<21:09:50, 12.95s/it]

Total reward after episode 115 is 620.0


  2%|▏         | 116/6000 [17:05<26:46:29, 16.38s/it]

Total reward after episode 116 is 1441.0


  2%|▏         | 117/6000 [17:09<20:50:08, 12.75s/it]

Total reward after episode 117 is 240.0


  2%|▏         | 118/6000 [17:23<21:25:28, 13.11s/it]

Total reward after episode 118 is 734.0


  2%|▏         | 119/6000 [17:28<17:13:17, 10.54s/it]

Total reward after episode 119 is 245.0


  2%|▏         | 120/6000 [17:32<14:11:29,  8.69s/it]

Total reward after episode 120 is 241.0


  2%|▏         | 121/6000 [18:17<31:54:42, 19.54s/it]

Total reward after episode 121 is 770.0


  2%|▏         | 122/6000 [18:28<27:27:15, 16.81s/it]

Total reward after episode 122 is 633.0


  2%|▏         | 123/6000 [18:43<26:46:39, 16.40s/it]

Total reward after episode 123 is 1043.0


  2%|▏         | 124/6000 [19:02<28:10:18, 17.26s/it]

Total reward after episode 124 is 814.0


  2%|▏         | 125/6000 [19:24<30:27:08, 18.66s/it]

Total reward after episode 125 is 1318.0


  2%|▏         | 126/6000 [19:57<37:19:03, 22.87s/it]

Total reward after episode 126 is 1413.0


  2%|▏         | 127/6000 [20:01<28:00:35, 17.17s/it]

Total reward after episode 127 is 239.0


  2%|▏         | 128/6000 [20:12<25:04:38, 15.37s/it]

Total reward after episode 128 is 631.0


  2%|▏         | 129/6000 [20:15<19:14:40, 11.80s/it]

Total reward after episode 129 is 247.0


  2%|▏         | 130/6000 [20:42<26:21:36, 16.17s/it]

Total reward after episode 130 is 1310.0


  2%|▏         | 131/6000 [20:46<20:22:07, 12.49s/it]

Total reward after episode 131 is 241.0


  2%|▏         | 132/6000 [21:04<22:58:02, 14.09s/it]

Total reward after episode 132 is 1437.0


  2%|▏         | 133/6000 [21:28<28:13:05, 17.31s/it]

Total reward after episode 133 is 1335.0


  2%|▏         | 134/6000 [21:44<27:37:18, 16.95s/it]

Total reward after episode 134 is 738.0


  2%|▏         | 135/6000 [21:49<21:26:27, 13.16s/it]

Total reward after episode 135 is 240.0


  2%|▏         | 136/6000 [22:30<35:12:32, 21.62s/it]

Total reward after episode 136 is 1398.0


  2%|▏         | 137/6000 [22:50<34:17:11, 21.05s/it]

Total reward after episode 137 is 638.0


  2%|▏         | 138/6000 [23:08<32:48:04, 20.14s/it]

Total reward after episode 138 is 815.0


  2%|▏         | 139/6000 [23:27<32:07:37, 19.73s/it]

Total reward after episode 139 is 1438.0


  2%|▏         | 140/6000 [23:59<38:25:34, 23.61s/it]

Total reward after episode 140 is 1426.0


  2%|▏         | 141/6000 [24:12<33:17:36, 20.46s/it]

Total reward after episode 141 is 628.0


  2%|▏         | 142/6000 [24:24<28:59:24, 17.82s/it]

Total reward after episode 142 is 627.0


  2%|▏         | 143/6000 [24:28<22:24:29, 13.77s/it]

Total reward after episode 143 is 240.0


  2%|▏         | 144/6000 [24:38<20:08:55, 12.39s/it]

Total reward after episode 144 is 626.0


  2%|▏         | 145/6000 [24:57<23:38:29, 14.54s/it]

Total reward after episode 145 is 1431.0


  2%|▏         | 146/6000 [25:09<22:12:22, 13.66s/it]

Total reward after episode 146 is 626.0


  2%|▏         | 147/6000 [25:13<17:29:21, 10.76s/it]

Total reward after episode 147 is 232.0


  2%|▏         | 148/6000 [25:36<23:49:59, 14.66s/it]

Total reward after episode 148 is 1428.0


  2%|▏         | 149/6000 [25:45<20:48:46, 12.81s/it]

Total reward after episode 149 is 654.0


  2%|▎         | 150/6000 [25:51<17:21:59, 10.69s/it]

Total reward after episode 150 is 200.0


  3%|▎         | 151/6000 [25:55<14:23:54,  8.86s/it]

Total reward after episode 151 is 229.0


  3%|▎         | 152/6000 [26:12<17:59:42, 11.08s/it]

Total reward after episode 152 is 1047.0


  3%|▎         | 153/6000 [26:31<22:00:45, 13.55s/it]

Total reward after episode 153 is 1338.0


  3%|▎         | 154/6000 [26:52<25:31:44, 15.72s/it]

Total reward after episode 154 is 752.0


  3%|▎         | 155/6000 [27:10<26:58:23, 16.61s/it]

Total reward after episode 155 is 1436.0


  3%|▎         | 156/6000 [27:24<25:35:49, 15.77s/it]

Total reward after episode 156 is 618.0


  3%|▎         | 157/6000 [27:33<22:05:05, 13.61s/it]

Total reward after episode 157 is 654.0


  3%|▎         | 158/6000 [27:43<20:35:04, 12.68s/it]

Total reward after episode 158 is 734.0


  3%|▎         | 159/6000 [27:48<16:53:57, 10.42s/it]

Total reward after episode 159 is 246.0


  3%|▎         | 160/6000 [28:09<22:02:28, 13.59s/it]

Total reward after episode 160 is 751.0


  3%|▎         | 161/6000 [29:01<40:43:18, 25.11s/it]

Total reward after episode 161 is 1281.0


  3%|▎         | 162/6000 [29:05<30:26:09, 18.77s/it]

Total reward after episode 162 is 238.0


  3%|▎         | 163/6000 [29:15<26:09:43, 16.14s/it]

Total reward after episode 163 is 622.0


  3%|▎         | 164/6000 [29:29<24:52:29, 15.34s/it]

Total reward after episode 164 is 633.0


  3%|▎         | 165/6000 [29:46<25:33:29, 15.77s/it]

Total reward after episode 165 is 816.0


  3%|▎         | 166/6000 [30:08<28:45:35, 17.75s/it]

Total reward after episode 166 is 1430.0


  3%|▎         | 167/6000 [30:28<29:42:22, 18.33s/it]

Total reward after episode 167 is 1345.0


  3%|▎         | 168/6000 [30:39<26:23:26, 16.29s/it]

Total reward after episode 168 is 622.0


  3%|▎         | 169/6000 [30:47<22:30:51, 13.90s/it]

Total reward after episode 169 is 654.0


  3%|▎         | 170/6000 [31:14<28:27:08, 17.57s/it]

Total reward after episode 170 is 1428.0


  3%|▎         | 171/6000 [31:40<32:41:58, 20.20s/it]

Total reward after episode 171 is 1318.0


  3%|▎         | 172/6000 [31:58<31:46:41, 19.63s/it]

Total reward after episode 172 is 1342.0


  3%|▎         | 173/6000 [32:17<31:34:17, 19.51s/it]

Total reward after episode 173 is 1343.0


  3%|▎         | 174/6000 [32:27<26:29:22, 16.37s/it]

Total reward after episode 174 is 624.0


  3%|▎         | 175/6000 [32:55<32:13:17, 19.91s/it]

Total reward after episode 175 is 1428.0


  3%|▎         | 176/6000 [33:13<31:16:24, 19.33s/it]

Total reward after episode 176 is 622.0


  3%|▎         | 177/6000 [33:37<33:44:53, 20.86s/it]

Total reward after episode 177 is 810.0


  3%|▎         | 178/6000 [33:47<28:34:55, 17.67s/it]

Total reward after episode 178 is 625.0


  3%|▎         | 179/6000 [34:44<47:17:26, 29.25s/it]

Total reward after episode 179 is 713.0


  3%|▎         | 180/6000 [34:48<35:14:36, 21.80s/it]

Total reward after episode 180 is 237.0


  3%|▎         | 181/6000 [35:01<31:08:59, 19.27s/it]

Total reward after episode 181 is 619.0


  3%|▎         | 182/6000 [35:18<29:37:01, 18.33s/it]

Total reward after episode 182 is 747.0


  3%|▎         | 183/6000 [35:37<30:21:16, 18.79s/it]

Total reward after episode 183 is 815.0


  3%|▎         | 184/6000 [35:56<30:21:08, 18.79s/it]

Total reward after episode 184 is 1438.0


  3%|▎         | 185/6000 [36:00<23:17:03, 14.41s/it]

Total reward after episode 185 is 241.0


  3%|▎         | 186/6000 [36:18<24:48:09, 15.36s/it]

Total reward after episode 186 is 640.0


  3%|▎         | 187/6000 [36:33<24:41:56, 15.30s/it]

Total reward after episode 187 is 1050.0


  3%|▎         | 188/6000 [36:37<19:18:17, 11.96s/it]

Total reward after episode 188 is 241.0


  3%|▎         | 189/6000 [36:41<15:22:55,  9.53s/it]

Total reward after episode 189 is 245.0


  3%|▎         | 190/6000 [37:03<21:10:56, 13.12s/it]

Total reward after episode 190 is 1430.0


  3%|▎         | 191/6000 [37:23<24:41:19, 15.30s/it]

Total reward after episode 191 is 1040.0


  3%|▎         | 192/6000 [37:27<19:16:05, 11.94s/it]

Total reward after episode 192 is 236.0


  3%|▎         | 193/6000 [37:56<27:20:29, 16.95s/it]

Total reward after episode 193 is 1425.0


  3%|▎         | 194/6000 [38:20<30:52:55, 19.15s/it]

Total reward after episode 194 is 806.0


  3%|▎         | 195/6000 [38:44<33:06:11, 20.53s/it]

Total reward after episode 195 is 1334.0


  3%|▎         | 196/6000 [38:47<24:49:56, 15.40s/it]

Total reward after episode 196 is 248.0


  3%|▎         | 197/6000 [39:19<32:43:08, 20.30s/it]

Total reward after episode 197 is 794.0


  3%|▎         | 198/6000 [39:24<25:06:57, 15.58s/it]

Total reward after episode 198 is 237.0


  3%|▎         | 199/6000 [39:35<22:56:13, 14.23s/it]

Total reward after episode 199 is 634.0


  3%|▎         | 200/6000 [39:38<17:41:39, 10.98s/it]

Total reward after episode 200 is 251.0


  3%|▎         | 201/6000 [39:58<22:07:40, 13.74s/it]

Total reward after episode 201 is 1333.0


  3%|▎         | 202/6000 [40:29<30:14:05, 18.77s/it]

Total reward after episode 202 is 1323.0


  3%|▎         | 203/6000 [40:49<31:01:37, 19.27s/it]

Total reward after episode 203 is 636.0


  3%|▎         | 204/6000 [41:13<33:21:58, 20.72s/it]

Total reward after episode 204 is 1427.0


  3%|▎         | 205/6000 [41:38<35:30:11, 22.06s/it]

Total reward after episode 205 is 1426.0


  3%|▎         | 206/6000 [41:49<29:47:29, 18.51s/it]

Total reward after episode 206 is 627.0


  3%|▎         | 207/6000 [42:14<33:06:10, 20.57s/it]

Total reward after episode 207 is 1338.0


  3%|▎         | 208/6000 [42:34<32:50:02, 20.41s/it]

Total reward after episode 208 is 814.0


  3%|▎         | 209/6000 [42:53<32:09:55, 20.00s/it]

Total reward after episode 209 is 1348.0


  4%|▎         | 210/6000 [47:13<148:00:51, 92.03s/it]

Total reward after episode 210 is 267.0


  4%|▎         | 211/6000 [47:23<108:08:38, 67.25s/it]

Total reward after episode 211 is 624.0


  4%|▎         | 212/6000 [47:31<79:29:46, 49.44s/it] 

Total reward after episode 212 is 622.0


  4%|▎         | 213/6000 [47:34<57:16:09, 35.63s/it]

Total reward after episode 213 is 250.0


  4%|▎         | 214/6000 [47:59<52:06:06, 32.42s/it]

Total reward after episode 214 is 1327.0


  4%|▎         | 215/6000 [48:20<46:27:19, 28.91s/it]

Total reward after episode 215 is 1435.0


  4%|▎         | 216/6000 [48:47<45:49:26, 28.52s/it]

Total reward after episode 216 is 1422.0


  4%|▎         | 217/6000 [48:55<36:03:50, 22.45s/it]

Total reward after episode 217 is 631.0


  4%|▎         | 218/6000 [49:03<29:05:56, 18.12s/it]

Total reward after episode 218 is 611.0


  4%|▎         | 219/6000 [49:07<21:58:40, 13.69s/it]

Total reward after episode 219 is 247.0


  4%|▎         | 220/6000 [49:32<27:45:02, 17.28s/it]

Total reward after episode 220 is 1333.0


  4%|▎         | 221/6000 [49:58<31:54:50, 19.88s/it]

Total reward after episode 221 is 1338.0


  4%|▎         | 222/6000 [50:22<33:33:01, 20.90s/it]

Total reward after episode 222 is 1431.0


  4%|▎         | 223/6000 [50:43<33:55:13, 21.14s/it]

Total reward after episode 223 is 1431.0


  4%|▎         | 224/6000 [51:06<34:24:32, 21.45s/it]

Total reward after episode 224 is 808.0


  4%|▍         | 225/6000 [51:09<25:42:06, 16.02s/it]

Total reward after episode 225 is 250.0


  4%|▍         | 226/6000 [51:12<19:36:14, 12.22s/it]

Total reward after episode 226 is 247.0


  4%|▍         | 227/6000 [51:42<27:53:26, 17.39s/it]

Total reward after episode 227 is 620.0


  4%|▍         | 228/6000 [52:03<29:36:56, 18.47s/it]

Total reward after episode 228 is 1434.0


  4%|▍         | 229/6000 [52:07<22:35:18, 14.09s/it]

Total reward after episode 229 is 239.0


  4%|▍         | 230/6000 [52:11<17:47:06, 11.10s/it]

Total reward after episode 230 is 241.0


  4%|▍         | 231/6000 [52:47<29:50:09, 18.62s/it]

Total reward after episode 231 is 1018.0


  4%|▍         | 232/6000 [52:51<22:40:31, 14.15s/it]

Total reward after episode 232 is 248.0


  4%|▍         | 233/6000 [53:08<24:21:12, 15.20s/it]

Total reward after episode 233 is 608.0


  4%|▍         | 234/6000 [53:30<27:40:25, 17.28s/it]

Total reward after episode 234 is 1346.0


  4%|▍         | 235/6000 [54:07<36:52:03, 23.02s/it]

Total reward after episode 235 is 1852.0


  4%|▍         | 236/6000 [54:46<44:26:33, 27.76s/it]

Total reward after episode 236 is 1321.0


  4%|▍         | 237/6000 [55:01<38:27:46, 24.03s/it]

Total reward after episode 237 is 1441.0


  4%|▍         | 238/6000 [55:21<36:46:54, 22.98s/it]

Total reward after episode 238 is 1337.0


  4%|▍         | 239/6000 [55:39<34:04:05, 21.29s/it]

Total reward after episode 239 is 1330.0


  4%|▍         | 240/6000 [55:49<28:48:27, 18.00s/it]

Total reward after episode 240 is 631.0


  4%|▍         | 241/6000 [55:56<23:35:33, 14.75s/it]

Total reward after episode 241 is 608.0


  4%|▍         | 242/6000 [56:13<24:38:35, 15.41s/it]

Total reward after episode 242 is 1336.0


  4%|▍         | 243/6000 [56:21<20:56:39, 13.10s/it]

Total reward after episode 243 is 610.0


  4%|▍         | 244/6000 [56:31<19:17:52, 12.07s/it]

Total reward after episode 244 is 624.0


  4%|▍         | 245/6000 [56:34<14:56:40,  9.35s/it]

Total reward after episode 245 is 248.0


  4%|▍         | 246/6000 [56:41<14:11:03,  8.87s/it]

Total reward after episode 246 is 612.0


  4%|▍         | 247/6000 [56:55<16:32:10, 10.35s/it]

Total reward after episode 247 is 604.0


  4%|▍         | 248/6000 [57:16<21:43:34, 13.60s/it]

Total reward after episode 248 is 1326.0


  4%|▍         | 249/6000 [57:27<20:06:27, 12.59s/it]

Total reward after episode 249 is 634.0


  4%|▍         | 250/6000 [57:43<21:56:18, 13.74s/it]

Total reward after episode 250 is 815.0


  4%|▍         | 251/6000 [57:52<19:34:38, 12.26s/it]

Total reward after episode 251 is 625.0


  4%|▍         | 252/6000 [57:55<15:24:08,  9.65s/it]

Total reward after episode 252 is 239.0


  4%|▍         | 253/6000 [58:17<21:18:32, 13.35s/it]

Total reward after episode 253 is 1425.0


  4%|▍         | 254/6000 [58:21<16:37:57, 10.42s/it]

Total reward after episode 254 is 237.0


  4%|▍         | 255/6000 [58:38<19:45:57, 12.39s/it]

Total reward after episode 255 is 1432.0


  4%|▍         | 256/6000 [58:47<18:05:31, 11.34s/it]

Total reward after episode 256 is 637.0


  4%|▍         | 257/6000 [58:56<17:00:58, 10.67s/it]

Total reward after episode 257 is 634.0


  4%|▍         | 258/6000 [59:16<21:31:53, 13.50s/it]

Total reward after episode 258 is 1326.0


  4%|▍         | 259/6000 [59:25<19:26:47, 12.19s/it]

Total reward after episode 259 is 634.0


  4%|▍         | 260/6000 [59:50<25:42:26, 16.12s/it]

Total reward after episode 260 is 1424.0


  4%|▍         | 261/6000 [59:54<19:40:38, 12.34s/it]

Total reward after episode 261 is 239.0


  4%|▍         | 262/6000 [1:00:03<17:56:28, 11.26s/it]

Total reward after episode 262 is 631.0


  4%|▍         | 263/6000 [1:00:12<16:49:14, 10.56s/it]

Total reward after episode 263 is 637.0


  4%|▍         | 264/6000 [1:00:20<15:47:39,  9.91s/it]

Total reward after episode 264 is 638.0


  4%|▍         | 265/6000 [1:00:46<23:16:17, 14.61s/it]

Total reward after episode 265 is 1689.0


  4%|▍         | 266/6000 [1:00:53<19:59:29, 12.55s/it]

Total reward after episode 266 is 610.0


  4%|▍         | 267/6000 [1:01:01<17:34:16, 11.03s/it]

Total reward after episode 267 is 612.0


  4%|▍         | 268/6000 [1:01:10<16:50:05, 10.57s/it]

Total reward after episode 268 is 623.0


  4%|▍         | 269/6000 [1:01:21<16:42:10, 10.49s/it]

Total reward after episode 269 is 614.0


  4%|▍         | 270/6000 [1:01:25<13:39:17,  8.58s/it]

Total reward after episode 270 is 240.0


  5%|▍         | 271/6000 [1:01:28<11:11:09,  7.03s/it]

Total reward after episode 271 is 239.0


  5%|▍         | 272/6000 [1:01:52<19:06:17, 12.01s/it]

Total reward after episode 272 is 1036.0


  5%|▍         | 273/6000 [1:02:15<24:15:33, 15.25s/it]

Total reward after episode 273 is 1044.0


  5%|▍         | 274/6000 [1:02:40<28:53:59, 18.17s/it]

Total reward after episode 274 is 1339.0


  5%|▍         | 275/6000 [1:02:59<29:29:16, 18.54s/it]

Total reward after episode 275 is 1047.0


  5%|▍         | 276/6000 [1:03:56<47:48:51, 30.07s/it]

Total reward after episode 276 is 566.0


  5%|▍         | 277/6000 [1:04:07<38:37:22, 24.30s/it]

Total reward after episode 277 is 627.0


  5%|▍         | 278/6000 [1:04:18<32:16:02, 20.30s/it]

Total reward after episode 278 is 631.0


  5%|▍         | 279/6000 [1:04:35<30:42:11, 19.32s/it]

Total reward after episode 279 is 743.0


  5%|▍         | 280/6000 [1:05:16<40:52:54, 25.73s/it]

Total reward after episode 280 is 1299.0


  5%|▍         | 281/6000 [1:05:46<43:14:24, 27.22s/it]

Total reward after episode 281 is 1420.0


  5%|▍         | 282/6000 [1:05:57<35:27:57, 22.33s/it]

Total reward after episode 282 is 611.0


  5%|▍         | 283/6000 [1:06:23<37:21:02, 23.52s/it]

Total reward after episode 283 is 803.0


  5%|▍         | 284/6000 [1:06:35<31:44:09, 19.99s/it]

Total reward after episode 284 is 620.0


  5%|▍         | 285/6000 [1:06:47<27:47:11, 17.50s/it]

Total reward after episode 285 is 611.0


  5%|▍         | 286/6000 [1:07:07<29:10:41, 18.38s/it]

Total reward after episode 286 is 1342.0


  5%|▍         | 287/6000 [1:07:33<32:37:39, 20.56s/it]

Total reward after episode 287 is 1335.0


  5%|▍         | 288/6000 [1:07:41<26:48:21, 16.89s/it]

Total reward after episode 288 is 623.0


  5%|▍         | 289/6000 [1:08:06<30:45:06, 19.38s/it]

Total reward after episode 289 is 725.0


  5%|▍         | 290/6000 [1:08:27<31:28:23, 19.84s/it]

Total reward after episode 290 is 1326.0


  5%|▍         | 291/6000 [1:08:55<35:21:19, 22.29s/it]

Total reward after episode 291 is 1317.0


  5%|▍         | 292/6000 [1:09:13<33:10:13, 20.92s/it]

Total reward after episode 292 is 640.0


  5%|▍         | 293/6000 [1:09:25<28:38:56, 18.07s/it]

Total reward after episode 293 is 615.0


  5%|▍         | 294/6000 [1:09:35<25:02:38, 15.80s/it]

Total reward after episode 294 is 630.0


  5%|▍         | 295/6000 [1:10:02<30:28:26, 19.23s/it]

Total reward after episode 295 is 1317.0


  5%|▍         | 296/6000 [1:10:26<32:22:18, 20.43s/it]

Total reward after episode 296 is 726.0


  5%|▍         | 297/6000 [1:10:53<35:40:23, 22.52s/it]

Total reward after episode 297 is 1030.0


  5%|▍         | 298/6000 [1:11:14<34:58:55, 22.09s/it]

Total reward after episode 298 is 1039.0


  5%|▍         | 299/6000 [1:11:26<29:58:00, 18.92s/it]

Total reward after episode 299 is 732.0


  5%|▌         | 300/6000 [1:11:49<32:20:42, 20.43s/it]

Total reward after episode 300 is 1038.0


  5%|▌         | 301/6000 [1:12:01<27:59:37, 17.68s/it]

Total reward after episode 301 is 609.0


  5%|▌         | 302/6000 [1:12:34<35:31:18, 22.44s/it]

Total reward after episode 302 is 1017.0


  5%|▌         | 303/6000 [1:12:46<30:23:23, 19.20s/it]

Total reward after episode 303 is 625.0


  5%|▌         | 304/6000 [1:13:17<36:02:21, 22.78s/it]

Total reward after episode 304 is 1032.0


  5%|▌         | 305/6000 [1:13:34<33:09:59, 20.97s/it]

Total reward after episode 305 is 816.0


  5%|▌         | 306/6000 [1:13:48<29:45:09, 18.81s/it]

Total reward after episode 306 is 625.0


  5%|▌         | 307/6000 [1:14:01<27:03:17, 17.11s/it]

Total reward after episode 307 is 603.0


  5%|▌         | 308/6000 [1:14:18<27:20:03, 17.29s/it]

Total reward after episode 308 is 1049.0


  5%|▌         | 309/6000 [1:14:39<28:43:53, 18.17s/it]

Total reward after episode 309 is 1042.0


  5%|▌         | 310/6000 [1:14:55<27:56:20, 17.68s/it]

Total reward after episode 310 is 1348.0


  5%|▌         | 311/6000 [1:15:06<24:52:35, 15.74s/it]

Total reward after episode 311 is 630.0


  5%|▌         | 312/6000 [1:15:30<28:40:53, 18.15s/it]

Total reward after episode 312 is 1428.0


  5%|▌         | 313/6000 [1:15:39<24:14:26, 15.34s/it]

Total reward after episode 313 is 624.0


  5%|▌         | 314/6000 [1:15:56<25:06:48, 15.90s/it]

Total reward after episode 314 is 1345.0


  5%|▌         | 315/6000 [1:16:08<22:57:19, 14.54s/it]

Total reward after episode 315 is 628.0


  5%|▌         | 316/6000 [1:16:21<22:25:40, 14.20s/it]

Total reward after episode 316 is 628.0


  5%|▌         | 317/6000 [1:16:33<21:21:00, 13.52s/it]

Total reward after episode 317 is 616.0


  5%|▌         | 318/6000 [1:16:43<19:52:03, 12.59s/it]

Total reward after episode 318 is 623.0


  5%|▌         | 319/6000 [1:17:06<24:35:26, 15.58s/it]

Total reward after episode 319 is 1339.0


  5%|▌         | 320/6000 [1:17:27<27:04:46, 17.16s/it]

Total reward after episode 320 is 1042.0


  5%|▌         | 321/6000 [1:17:38<24:07:11, 15.29s/it]

Total reward after episode 321 is 619.0


  5%|▌         | 322/6000 [1:17:48<21:45:02, 13.79s/it]

Total reward after episode 322 is 625.0


  5%|▌         | 323/6000 [1:18:08<24:51:57, 15.77s/it]

Total reward after episode 323 is 1043.0


  5%|▌         | 324/6000 [1:18:31<28:06:06, 17.82s/it]

Total reward after episode 324 is 1038.0


  5%|▌         | 325/6000 [1:18:47<27:08:04, 17.21s/it]

Total reward after episode 325 is 728.0


  5%|▌         | 326/6000 [1:19:02<26:13:53, 16.64s/it]

Total reward after episode 326 is 616.0


  5%|▌         | 327/6000 [1:19:23<28:13:04, 17.91s/it]

Total reward after episode 327 is 1435.0


  5%|▌         | 328/6000 [1:19:27<21:50:44, 13.87s/it]

Total reward after episode 328 is 238.0


  5%|▌         | 329/6000 [1:19:37<20:01:21, 12.71s/it]

Total reward after episode 329 is 632.0


  6%|▌         | 330/6000 [1:19:58<23:32:37, 14.95s/it]

Total reward after episode 330 is 589.0


  6%|▌         | 331/6000 [1:20:21<27:24:31, 17.41s/it]

Total reward after episode 331 is 1339.0


  6%|▌         | 332/6000 [1:20:31<24:01:39, 15.26s/it]

Total reward after episode 332 is 638.0


  6%|▌         | 333/6000 [1:20:49<25:14:29, 16.03s/it]

Total reward after episode 333 is 752.0


  6%|▌         | 334/6000 [1:20:58<21:56:02, 13.94s/it]

Total reward after episode 334 is 618.0


  6%|▌         | 335/6000 [1:21:37<33:42:17, 21.42s/it]

Total reward after episode 335 is 1019.0


  6%|▌         | 336/6000 [1:21:49<29:32:43, 18.78s/it]

Total reward after episode 336 is 623.0


  6%|▌         | 337/6000 [1:22:19<34:55:54, 22.21s/it]

Total reward after episode 337 is 1027.0


  6%|▌         | 338/6000 [1:22:38<33:09:08, 21.08s/it]

Total reward after episode 338 is 815.0


  6%|▌         | 339/6000 [1:23:03<34:59:11, 22.25s/it]

Total reward after episode 339 is 1432.0


  6%|▌         | 340/6000 [1:23:27<35:41:04, 22.70s/it]

Total reward after episode 340 is 1427.0


  6%|▌         | 341/6000 [1:23:57<39:22:41, 25.05s/it]

Total reward after episode 341 is 627.0


  6%|▌         | 342/6000 [1:24:15<35:48:59, 22.79s/it]

Total reward after episode 342 is 616.0


  6%|▌         | 343/6000 [1:24:36<34:53:15, 22.20s/it]

Total reward after episode 343 is 639.0


  6%|▌         | 344/6000 [1:24:46<29:14:44, 18.61s/it]

Total reward after episode 344 is 627.0


  6%|▌         | 345/6000 [1:24:55<24:35:50, 15.66s/it]

Total reward after episode 345 is 609.0


  6%|▌         | 346/6000 [1:25:19<28:30:31, 18.15s/it]

Total reward after episode 346 is 1035.0


  6%|▌         | 347/6000 [1:25:43<31:18:36, 19.94s/it]

Total reward after episode 347 is 1039.0


  6%|▌         | 348/6000 [1:26:02<31:03:34, 19.78s/it]

Total reward after episode 348 is 1043.0


  6%|▌         | 349/6000 [1:26:13<26:41:13, 17.00s/it]

Total reward after episode 349 is 634.0


  6%|▌         | 350/6000 [1:26:39<31:10:48, 19.87s/it]

Total reward after episode 350 is 1333.0


  6%|▌         | 351/6000 [1:26:51<27:39:07, 17.62s/it]

Total reward after episode 351 is 618.0


  6%|▌         | 352/6000 [1:26:59<22:50:50, 14.56s/it]

Total reward after episode 352 is 614.0


  6%|▌         | 353/6000 [1:27:22<26:37:04, 16.97s/it]

Total reward after episode 353 is 746.0


  6%|▌         | 354/6000 [1:27:41<27:44:16, 17.69s/it]

Total reward after episode 354 is 1356.0


  6%|▌         | 355/6000 [1:27:51<24:03:43, 15.35s/it]

Total reward after episode 355 is 623.0


  6%|▌         | 356/6000 [1:28:01<21:30:05, 13.71s/it]

Total reward after episode 356 is 627.0


  6%|▌         | 357/6000 [1:28:23<25:22:05, 16.18s/it]

Total reward after episode 357 is 1328.0


  6%|▌         | 358/6000 [1:28:47<29:25:03, 18.77s/it]

Total reward after episode 358 is 1692.0


  6%|▌         | 359/6000 [1:29:07<29:53:03, 19.07s/it]

Total reward after episode 359 is 1042.0


  6%|▌         | 360/6000 [1:29:19<26:14:31, 16.75s/it]

Total reward after episode 360 is 735.0


  6%|▌         | 361/6000 [1:29:45<30:48:07, 19.66s/it]

Total reward after episode 361 is 742.0


  6%|▌         | 362/6000 [1:29:53<25:30:47, 16.29s/it]

Total reward after episode 362 is 632.0


  6%|▌         | 363/6000 [1:30:17<28:50:02, 18.41s/it]

Total reward after episode 363 is 1702.0


  6%|▌         | 364/6000 [1:30:30<26:26:08, 16.89s/it]

Total reward after episode 364 is 603.0


  6%|▌         | 365/6000 [1:31:12<38:08:22, 24.37s/it]

Total reward after episode 365 is 1003.0


  6%|▌         | 366/6000 [1:31:33<36:44:37, 23.48s/it]

Total reward after episode 366 is 1041.0


  6%|▌         | 367/6000 [1:31:53<34:51:03, 22.27s/it]

Total reward after episode 367 is 1333.0


  6%|▌         | 368/6000 [1:32:03<29:22:08, 18.77s/it]

Total reward after episode 368 is 736.0


  6%|▌         | 369/6000 [1:32:37<36:18:06, 23.21s/it]

Total reward after episode 369 is 1561.0


  6%|▌         | 370/6000 [1:32:45<29:21:45, 18.78s/it]

Total reward after episode 370 is 624.0


  6%|▌         | 371/6000 [1:32:56<25:19:59, 16.20s/it]

Total reward after episode 371 is 623.0


  6%|▌         | 372/6000 [1:33:18<28:07:23, 17.99s/it]

Total reward after episode 372 is 1434.0


  6%|▌         | 373/6000 [1:33:27<24:12:49, 15.49s/it]

Total reward after episode 373 is 654.0


  6%|▌         | 374/6000 [1:33:37<21:40:36, 13.87s/it]

Total reward after episode 374 is 654.0


  6%|▋         | 375/6000 [1:33:49<20:47:45, 13.31s/it]

Total reward after episode 375 is 737.0


  6%|▋         | 376/6000 [1:38:16<139:26:33, 89.26s/it]

Total reward after episode 376 is 267.0


  6%|▋         | 377/6000 [1:38:26<102:17:28, 65.49s/it]

Total reward after episode 377 is 736.0


  6%|▋         | 378/6000 [1:38:37<76:55:49, 49.26s/it] 

Total reward after episode 378 is 624.0


  6%|▋         | 379/6000 [1:38:53<61:18:37, 39.27s/it]

Total reward after episode 379 is 1049.0


  6%|▋         | 380/6000 [1:39:09<50:12:50, 32.17s/it]

Total reward after episode 380 is 1150.0


  6%|▋         | 381/6000 [1:39:18<39:28:42, 25.29s/it]

Total reward after episode 381 is 625.0


  6%|▋         | 382/6000 [1:39:39<37:23:15, 23.96s/it]

Total reward after episode 382 is 1334.0


  6%|▋         | 383/6000 [1:39:48<30:08:28, 19.32s/it]

Total reward after episode 383 is 614.0


  6%|▋         | 384/6000 [1:40:02<28:04:41, 18.00s/it]

Total reward after episode 384 is 1048.0


  6%|▋         | 385/6000 [1:40:12<24:02:09, 15.41s/it]

Total reward after episode 385 is 736.0


  6%|▋         | 386/6000 [1:40:24<22:38:09, 14.52s/it]

Total reward after episode 386 is 734.0


  6%|▋         | 387/6000 [1:40:55<30:13:47, 19.39s/it]

Total reward after episode 387 is 1698.0


  6%|▋         | 388/6000 [1:41:08<27:25:31, 17.59s/it]

Total reward after episode 388 is 1050.0


  6%|▋         | 388/6000 [1:41:21<24:26:05, 15.67s/it]


KeyboardInterrupt: 