<a href="https://colab.research.google.com/github/Einzelganger2502/Mario-RL-Agent/blob/main/Mario-RL-Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup: install the NES emulator bindings, the Super Mario Bros
# gym environment, OpenCV, and system packages via apt.
# Only nes-py is version-pinned here; the other pip packages are unpinned, so a
# fresh install may resolve different versions — TODO pin for reproducibility.
!pip install nes-py==0.2.6
!pip install gym-super-mario-bros
# System libraries — presumably needed by OpenCV / video encoding; confirm.
!apt-get update
!apt-get install ffmpeg libsm6 libxext6  -y
!apt install -y libgl1-mesa-glx
!pip install opencv-python

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nes-py==0.2.6
  Downloading nes_py-0.2.6.tar.gz (75 kB)
[K     |████████████████████████████████| 75 kB 2.4 MB/s 
Collecting pygame>=1.9.3
  Downloading pygame-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 1.3 MB/s 
Building wheels for collected packages: nes-py
  Building wheel for nes-py (setup.py) ... [?25l[?25hdone
  Created wheel for nes-py: filename=nes_py-0.2.6-cp37-cp37m-linux_x86_64.whl size=168540 sha256=876e5b82b936657818e1749813f0d28cabcc6de7eae58453e7e22846a292eb40
  Stored in directory: /root/.cache/pip/wheels/cf/87/a9/d777bc0614683325afc2501fe16a01ae29a9bf6c5650cffbad
Successfully built nes-py
Installing collected packages: pygame, nes-py
Successfully installed nes-py-0.2.6 pygame-2.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple

In [2]:
import torch
import torch.nn as nn
import random
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from tqdm import tqdm
import pickle 
from gym_super_mario_bros.actions import RIGHT_ONLY
import gym
import numpy as np
import collections 
import cv2
import matplotlib.pyplot as plt

In [3]:
class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action for `skip` frames and return only the last result.

    The returned observation is the element-wise maximum over the most recent
    raw frames held in a two-slot buffer; rewards of the repeated steps are summed.
    """

    def __init__(self, env=None, skip=4):
        """Wrap `env`, repeating every action `skip` times per `step` call."""
        super().__init__(env)
        self._skip = skip
        # Holds at most the two latest raw frames for max pooling across time.
        self._obs_buffer = collections.deque(maxlen=2)

    def step(self, action):
        """Apply `action` up to `skip` times, stopping early when the episode ends.

        Returns:
            (max-pooled frame, summed reward, done flag, info of the last inner step)
        """
        reward_sum = 0.0
        done = None
        for _repeat in range(self._skip):
            frame, step_reward, done, info = self.env.step(action)
            self._obs_buffer.append(frame)
            reward_sum += step_reward
            if done:
                break
        pooled_frame = np.stack(self._obs_buffer).max(axis=0)
        return pooled_frame, reward_sum, done, info

    def reset(self):
        """Empty the frame buffer, reset the wrapped env and seed the buffer with its first frame."""
        self._obs_buffer.clear()
        first_frame = self.env.reset()
        self._obs_buffer.append(first_frame)
        return first_frame


class ProcessFrame84(gym.ObservationWrapper):
    """
    Converts a 240x256 RGB frame to greyscale, resizes it to 84x110 and
    crops rows 18..101 to obtain an 84x84x1 image.

    Returns a uint8 numpy array.
    """
    def __init__(self, env=None):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        """Apply `process` to every observation coming from the wrapped env."""
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """Greyscale, downsample and crop a single 240x256x3 frame to 84x84x1 uint8."""
        if frame.size != 240 * 256 * 3:
            assert False, "Unknown resolution."
        else:
            rgb = np.reshape(frame, [240, 256, 3]).astype(np.float32)
        red, green, blue = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
        # Weighted channel sum (same term order as before to keep rounding identical).
        grey = red * 0.299 + green * 0.587 + blue * 0.114
        # cv2 takes (width, height): the result has 110 rows and 84 columns.
        downsampled = cv2.resize(grey, (84, 110), interpolation=cv2.INTER_AREA)
        cropped = downsampled[18:102, :]
        return np.reshape(cropped, [84, 84, 1]).astype(np.uint8)


class ImageToPyTorch(gym.ObservationWrapper):
    """Move the channel axis of each observation from position 2 to position 0."""

    def __init__(self, env):
        super().__init__(env)
        source_shape = self.observation_space.shape
        # Last axis becomes the first; the first two axes follow it.
        target_shape = (source_shape[-1], source_shape[0], source_shape[1])
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=target_shape,
                                                dtype=np.float32)

    def observation(self, observation):
        """Return `observation` with axis 2 moved to the front."""
        return np.moveaxis(observation, source=2, destination=0)


class ScaledFloatFrame(gym.ObservationWrapper):
    """Convert each observation to float32 and divide it by 255."""
    def observation(self, obs):
        as_float = np.array(obs).astype(np.float32)
        return as_float / 255.0


class BufferWrapper(gym.ObservationWrapper):
    """Keep a rolling stack of recent observations along axis 0.

    The observation space bounds of the wrapped env are repeated `n_steps`
    times along axis 0; the newest observation always occupies the last slot.
    """

    def __init__(self, env, n_steps, dtype=np.float32):
        """Wrap `env`, stacking `n_steps` observations stored with `dtype`."""
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        old_space = env.observation_space
        self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
                                                old_space.high.repeat(n_steps, axis=0), dtype=dtype)

    def reset(self):
        """Zero the stack, reset the wrapped env and push its first observation."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        return self.observation(self.env.reset())

    def observation(self, observation):
        """Shift the stack by one slot, append `observation`, and return a snapshot.

        A copy is returned because `self.buffer` is overwritten in place on the
        next call; handing out the buffer itself would make every previously
        returned observation silently change under the caller.
        """
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer.copy()


def make_env(env):
    """Apply the preprocessing wrappers to `env` and restrict actions to RIGHT_ONLY.

    Wrapper order (innermost first): MaxAndSkipEnv, ProcessFrame84,
    ImageToPyTorch, BufferWrapper with 4 slots, ScaledFloatFrame, JoypadSpace.
    """
    for wrapper_cls in (MaxAndSkipEnv, ProcessFrame84, ImageToPyTorch):
        env = wrapper_cls(env)
    stacked = BufferWrapper(env, 4)
    scaled = ScaledFloatFrame(stacked)
    return JoypadSpace(scaled, RIGHT_ONLY)

In [4]:
class DQNSolver(nn.Module):
    """Q-network: three ReLU-activated convolutions followed by a two-layer linear head.

    `input_shape` is the shape of one sample (channels first); the network
    outputs one value per action for every sample in the batch.
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()
        in_channels = input_shape[0]
        feature_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # The head's input width depends on the spatial size left after the convolutions.
        flat_features = self._get_conv_out(input_shape)
        head_layers = [
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.fc = nn.Sequential(*head_layers)

    def _get_conv_out(self, shape):
        """Return the number of features the conv stack emits for one sample of `shape`."""
        probe = torch.zeros(1, *shape)
        return int(self.conv(probe).numel())

    def forward(self, x):
        """Run the conv stack, flatten per sample, and apply the linear head."""
        batch_size = x.size()[0]
        features = self.conv(x)
        flattened = features.view(batch_size, -1)
        return self.fc(flattened)
    

class DQNAgent:
    """Epsilon-greedy deep Q-learning agent with a ring-buffer replay memory.

    With `double_dq` enabled the agent keeps an online network (`local_net`)
    that is trained and used for acting, plus a `target_net` that is overwritten
    with the online weights every `self.copy` calls to `act`. Otherwise a single
    network (`dqn`) is used both for acting and for bootstrapping.
    """

    def __init__(self, state_space, action_space, max_memory_size, batch_size, gamma, lr,
                 dropout, exploration_max, exploration_min, exploration_decay, double_dq, pretrained):
        """Build the network(s), the optimizer and the replay memory.

        Args:
            state_space: Shape of one observation; passed to `DQNSolver` and used
                as the trailing dimensions of the state replay tensors.
            action_space: Number of discrete actions.
            max_memory_size: Capacity of the replay memory in transitions.
            batch_size: Number of transitions sampled per training step.
            gamma: Discount factor applied to the bootstrapped next-state value.
            lr: Learning rate of the Adam optimizer.
            dropout: Accepted but not used anywhere in this class.
            exploration_max: Initial exploration rate (epsilon).
            exploration_min: Lower bound of the exploration rate.
            exploration_decay: Factor the exploration rate is multiplied by
                after every training step.
            double_dq: If true, use separate online and target networks.
            pretrained: If true, load weights and replay memory from files in
                the current working directory.
        """
        # Define DQN Layers
        self.state_space = state_space
        self.action_space = action_space
        self.double_dq = double_dq
        self.pretrained = pretrained
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.double_dq:
            self.local_net = DQNSolver(state_space, action_space).to(self.device)
            self.target_net = DQNSolver(state_space, action_space).to(self.device)

            if self.pretrained:
                self.local_net.load_state_dict(torch.load("dq1.pt", map_location=torch.device(self.device)))
                self.target_net.load_state_dict(torch.load("dq2.pt", map_location=torch.device(self.device)))

            self.optimizer = torch.optim.Adam(self.local_net.parameters(), lr=lr)
            self.copy = 5000  # Copy the local model weights into the target network every 5000 steps
            self.step = 0
        else:
            self.dqn = DQNSolver(state_space, action_space).to(self.device)

            if self.pretrained:
                self.dqn.load_state_dict(torch.load("dq.pt", map_location=torch.device(self.device)))
            self.optimizer = torch.optim.Adam(self.dqn.parameters(), lr=lr)

        # Create memory
        self.max_memory_size = max_memory_size
        if self.pretrained:
            # NOTE: torch.load and pickle.load can execute arbitrary code while
            # deserializing; only load checkpoint files you created yourself.
            self.STATE_MEM = torch.load("STATE_MEM.pt")
            self.ACTION_MEM = torch.load("ACTION_MEM.pt")
            self.REWARD_MEM = torch.load("REWARD_MEM.pt")
            self.STATE2_MEM = torch.load("STATE2_MEM.pt")
            self.DONE_MEM = torch.load("DONE_MEM.pt")
            with open("ending_position.pkl", 'rb') as f:
                self.ending_position = pickle.load(f)
            with open("num_in_queue.pkl", 'rb') as f:
                self.num_in_queue = pickle.load(f)
        else:
            self.STATE_MEM = torch.zeros(max_memory_size, *self.state_space)
            self.ACTION_MEM = torch.zeros(max_memory_size, 1)
            self.REWARD_MEM = torch.zeros(max_memory_size, 1)
            self.STATE2_MEM = torch.zeros(max_memory_size, *self.state_space)
            self.DONE_MEM = torch.zeros(max_memory_size, 1)
            self.ending_position = 0  # next write index into the replay tensors
            self.num_in_queue = 0     # number of valid transitions stored so far

        self.memory_sample_size = batch_size

        # Learning parameters
        self.gamma = gamma
        self.l1 = nn.SmoothL1Loss().to(self.device)  # Also known as Huber loss
        self.exploration_max = exploration_max
        # The exploration rate always starts at its maximum, also when pretrained.
        self.exploration_rate = exploration_max
        self.exploration_min = exploration_min
        self.exploration_decay = exploration_decay

    def remember(self, state, action, reward, state2, done):
        """Store one transition at the write cursor, overwriting the oldest entry when full.

        All arguments are tensors; each is cast to float before being stored.
        """
        self.STATE_MEM[self.ending_position] = state.float()
        self.ACTION_MEM[self.ending_position] = action.float()
        self.REWARD_MEM[self.ending_position] = reward.float()
        self.STATE2_MEM[self.ending_position] = state2.float()
        self.DONE_MEM[self.ending_position] = done.float()
        self.ending_position = (self.ending_position + 1) % self.max_memory_size  # FIFO tensor
        self.num_in_queue = min(self.num_in_queue + 1, self.max_memory_size)

    def recall(self):
        """Sample `memory_sample_size` stored transitions uniformly, with replacement.

        Returns:
            Tuple (STATE, ACTION, REWARD, STATE2, DONE) of batched tensors.
        """
        idx = random.choices(range(self.num_in_queue), k=self.memory_sample_size)

        STATE = self.STATE_MEM[idx]
        ACTION = self.ACTION_MEM[idx]
        REWARD = self.REWARD_MEM[idx]
        STATE2 = self.STATE2_MEM[idx]
        DONE = self.DONE_MEM[idx]

        return STATE, ACTION, REWARD, STATE2, DONE

    def act(self, state):
        """Choose an action epsilon-greedily and return it as a (1, 1) tensor on the CPU.

        In two-network mode every call also advances the step counter that
        schedules target-network synchronisation.
        """
        if self.double_dq:
            self.step += 1
        if random.random() < self.exploration_rate:
            return torch.tensor([[random.randrange(self.action_space)]])
        # The online network (local_net / dqn) is the one used for the policy.
        policy_net = self.local_net if self.double_dq else self.dqn
        # Action selection needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            q_values = policy_net(state.to(self.device))
        return torch.argmax(q_values).unsqueeze(0).unsqueeze(0).cpu()

    def copy_model(self):
        """Overwrite the target network's weights with the online network's weights."""
        self.target_net.load_state_dict(self.local_net.state_dict())

    def experience_replay(self):
        """Run one optimisation step on a sampled batch and decay the exploration rate.

        Does nothing beyond the optional target sync while fewer than
        `memory_sample_size` transitions are stored.
        """
        if self.double_dq and self.step % self.copy == 0:
            self.copy_model()

        if self.memory_sample_size > self.num_in_queue:
            return

        STATE, ACTION, REWARD, STATE2, DONE = (t.to(self.device) for t in self.recall())

        if self.double_dq:
            online_net, bootstrap_net = self.local_net, self.target_net
        else:
            online_net = bootstrap_net = self.dqn

        self.optimizer.zero_grad()
        # Target: r + gamma * max_a Q_bootstrap(S', a), zeroed for terminal transitions.
        # It is a constant regression target, so it is computed without autograd:
        # this keeps gradients from flowing through the bootstrap term in
        # single-network mode and from piling up on target_net in two-network mode.
        # NOTE(review): in two-network mode this takes the max over target_net
        # (fixed-target DQN); the decoupled action selection of Double DQN is
        # not implemented here.
        with torch.no_grad():
            next_value = bootstrap_net(STATE2).max(1).values.unsqueeze(1)
            target = REWARD + torch.mul(self.gamma * next_value, 1 - DONE)

        # Online network's estimate of Q(S, A) for the actions actually taken.
        current = online_net(STATE).gather(1, ACTION.long())

        loss = self.l1(current, target)
        loss.backward()  # Backpropagate the error to compute gradients
        self.optimizer.step()  # Apply the gradient update

        self.exploration_rate *= self.exploration_decay

        # Makes sure that exploration rate is always at least 'exploration min'
        self.exploration_rate = max(self.exploration_rate, self.exploration_min)

In [5]:
def vectorize_action(action, action_space):
    """Return a one-hot list for scalar `action` among `action_space` actions."""
    zeros_before = [0] * len(range(action))
    zeros_after = [0] * len(range(action + 1, action_space))
    return zeros_before + [1] + zeros_after

In [6]:
def show_state(env, ep=0, info=""):
    """Render the current frame of `env` inline, replacing the previous output.

    Args:
        env: Environment exposing `render(mode='rgb_array')`.
        ep: Episode number shown in the figure title.
        info: Extra text appended to the title.
    """
    # `display` is not imported anywhere at notebook level, so import it here;
    # without this the function fails with NameError on first use.
    from IPython import display

    plt.figure(3)  # reuse one figure so repeated calls do not create new ones
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    plt.title("Episode: %d %s" % (ep, info))
    plt.axis('off')

    display.clear_output(wait=True)
    display.display(plt.gcf())

In [7]:
def _save_checkpoint(agent, total_rewards):
    """Write replay cursors, reward history, network weights and replay tensors to the working directory."""
    with open("ending_position.pkl", "wb") as f:
        pickle.dump(agent.ending_position, f)
    with open("num_in_queue.pkl", "wb") as f:
        pickle.dump(agent.num_in_queue, f)
    with open("total_rewards.pkl", "wb") as f:
        pickle.dump(total_rewards, f)
    if agent.double_dq:
        torch.save(agent.local_net.state_dict(), "dq1.pt")
        torch.save(agent.target_net.state_dict(), "dq2.pt")
    else:
        torch.save(agent.dqn.state_dict(), "dq.pt")
    torch.save(agent.STATE_MEM,  "STATE_MEM.pt")
    torch.save(agent.ACTION_MEM, "ACTION_MEM.pt")
    torch.save(agent.REWARD_MEM, "REWARD_MEM.pt")
    torch.save(agent.STATE2_MEM, "STATE2_MEM.pt")
    torch.save(agent.DONE_MEM,   "DONE_MEM.pt")


def run(training_mode, pretrained, num_episodes=10000):
    """Play (and optionally train on) SuperMarioBros-1-1 for `num_episodes` episodes.

    Args:
        training_mode: If true, transitions are stored and the agent is trained
            after every step, and a checkpoint is written when the loop ends.
            If false, each frame is rendered with `show_state` instead.
        pretrained: Forwarded to `DQNAgent`; if true, saved weights and replay
            memory are loaded.
        num_episodes: Number of episodes to play (defaults to the previous
            hard-coded value of 10000).
    """
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = make_env(env)  # Wraps the environment so that frames are grayscale
    observation_space = env.observation_space.shape
    action_space = env.action_space.n
    agent = DQNAgent(state_space=observation_space,
                     action_space=action_space,
                     max_memory_size=30000,
                     batch_size=32,
                     gamma=0.90,
                     lr=0.00025,
                     dropout=0.,
                     exploration_max=1.0,
                     exploration_min=0.02,
                     exploration_decay=0.99,
                     double_dq=True,
                     pretrained=pretrained)

    def to_batch(observation):
        # One array-to-tensor conversion with a leading batch axis; avoids the
        # slow list-of-ndarrays path of torch.Tensor([observation]).
        return torch.tensor(np.asarray(observation)[None], dtype=torch.float32)

    total_rewards = []

    try:
        for ep_num in tqdm(range(num_episodes)):
            state = to_batch(env.reset())
            total_reward = 0
            steps = 0
            while True:
                if not training_mode:
                    show_state(env, ep_num)
                action = agent.act(state)
                steps += 1

                state_next, reward, done, info = env.step(int(action[0]))
                total_reward += reward
                state_next = to_batch(state_next)

                if training_mode:
                    agent.remember(state, action,
                                   torch.tensor([[reward]]),
                                   state_next,
                                   torch.tensor([[int(done)]]))
                    agent.experience_replay()

                state = state_next
                if done:
                    break

            total_rewards.append(total_reward)

            print("Total reward after episode {} is {}".format(ep_num + 1, total_rewards[-1]))
    finally:
        # Also runs when the loop is interrupted or fails, so a long training
        # run is not lost and the emulator is always released.
        try:
            if training_mode:
                _save_checkpoint(agent, total_rewards)
        finally:
            env.close()

    window = 500
    if len(total_rewards) > window:
        plt.title("Episodes trained vs. Average Rewards (per 500 eps)")
        # Pad with zeros so the moving average lines up with the episode index.
        plt.plot([0 for _ in range(window)] +
                 np.convolve(total_rewards, np.ones((window,))/window, mode="valid").tolist())
        plt.xlabel("Episode")
        plt.ylabel("Average reward")
        plt.show()

# Train from scratch: training_mode=True stores transitions, trains the agent
# and writes checkpoint files; pretrained=False skips loading saved weights and
# replay memory. The progress log below shows this is a multi-hour run.
run(training_mode=True, pretrained=False)

  0%|          | 1/10000 [00:00<1:58:06,  1.41it/s]

Total reward after episode 1 is 244.0


  0%|          | 2/10000 [00:04<7:13:19,  2.60s/it]

Total reward after episode 2 is 250.0


  0%|          | 3/10000 [00:26<32:06:55, 11.57s/it]

Total reward after episode 3 is 1046.0


  0%|          | 4/10000 [00:31<24:16:42,  8.74s/it]

Total reward after episode 4 is 239.0


  0%|          | 5/10000 [00:44<28:56:38, 10.43s/it]

Total reward after episode 5 is 601.0


  0%|          | 6/10000 [01:01<34:58:54, 12.60s/it]

Total reward after episode 6 is 595.0


  0%|          | 7/10000 [01:06<28:00:28, 10.09s/it]

Total reward after episode 7 is 231.0


  0%|          | 8/10000 [01:11<23:29:58,  8.47s/it]

Total reward after episode 8 is 231.0


  0%|          | 9/10000 [01:17<21:40:35,  7.81s/it]

Total reward after episode 9 is 224.0


  0%|          | 10/10000 [01:21<18:24:17,  6.63s/it]

Total reward after episode 10 is 243.0


  0%|          | 11/10000 [01:25<15:52:12,  5.72s/it]

Total reward after episode 11 is 251.0


  0%|          | 12/10000 [01:29<14:18:15,  5.16s/it]

Total reward after episode 12 is 250.0


  0%|          | 13/10000 [01:33<13:27:06,  4.85s/it]

Total reward after episode 13 is 252.0


  0%|          | 14/10000 [01:37<12:54:15,  4.65s/it]

Total reward after episode 14 is 252.0


  0%|          | 15/10000 [02:05<32:24:03, 11.68s/it]

Total reward after episode 15 is 593.0


  0%|          | 16/10000 [02:09<26:01:47,  9.39s/it]

Total reward after episode 16 is 247.0


  0%|          | 17/10000 [02:13<21:41:39,  7.82s/it]

Total reward after episode 17 is 251.0


  0%|          | 18/10000 [02:17<18:36:16,  6.71s/it]

Total reward after episode 18 is 247.0


  0%|          | 19/10000 [02:37<29:15:16, 10.55s/it]

Total reward after episode 19 is 638.0


  0%|          | 20/10000 [02:41<23:33:02,  8.50s/it]

Total reward after episode 20 is 251.0


  0%|          | 21/10000 [03:14<44:07:32, 15.92s/it]

Total reward after episode 21 is 732.0


  0%|          | 22/10000 [03:18<33:56:10, 12.24s/it]

Total reward after episode 22 is 251.0


  0%|          | 23/10000 [03:21<26:43:29,  9.64s/it]

Total reward after episode 23 is 252.0


  0%|          | 24/10000 [03:25<21:46:20,  7.86s/it]

Total reward after episode 24 is 251.0


  0%|          | 25/10000 [03:36<24:46:49,  8.94s/it]

Total reward after episode 25 is 626.0


  0%|          | 26/10000 [03:40<20:21:33,  7.35s/it]

Total reward after episode 26 is 250.0


  0%|          | 27/10000 [03:44<18:01:03,  6.50s/it]

Total reward after episode 27 is 237.0


  0%|          | 28/10000 [03:56<21:49:44,  7.88s/it]

Total reward after episode 28 is 640.0


  0%|          | 29/10000 [03:59<18:17:49,  6.61s/it]

Total reward after episode 29 is 250.0


  0%|          | 30/10000 [04:03<15:49:44,  5.72s/it]

Total reward after episode 30 is 252.0


  0%|          | 31/10000 [04:26<30:13:45, 10.92s/it]

Total reward after episode 31 is 809.0


  0%|          | 32/10000 [04:45<37:17:10, 13.47s/it]

Total reward after episode 32 is 1046.0


  0%|          | 33/10000 [04:49<29:18:03, 10.58s/it]

Total reward after episode 33 is 248.0


  0%|          | 34/10000 [05:00<29:12:10, 10.55s/it]

Total reward after episode 34 is 634.0


  0%|          | 35/10000 [05:03<23:36:54,  8.53s/it]

Total reward after episode 35 is 251.0


  0%|          | 36/10000 [05:24<33:51:36, 12.23s/it]

Total reward after episode 36 is 761.0


  0%|          | 37/10000 [05:28<26:53:03,  9.71s/it]

Total reward after episode 37 is 252.0


  0%|          | 38/10000 [05:39<27:43:59, 10.02s/it]

Total reward after episode 38 is 637.0


  0%|          | 39/10000 [05:50<28:27:43, 10.29s/it]

Total reward after episode 39 is 637.0


  0%|          | 40/10000 [05:54<23:08:02,  8.36s/it]

Total reward after episode 40 is 248.0


  0%|          | 41/10000 [06:12<31:18:16, 11.32s/it]

Total reward after episode 41 is 595.0


  0%|          | 42/10000 [06:16<25:07:56,  9.09s/it]

Total reward after episode 42 is 249.0


  0%|          | 43/10000 [06:27<26:30:46,  9.59s/it]

Total reward after episode 43 is 637.0


  0%|          | 44/10000 [06:30<21:45:46,  7.87s/it]

Total reward after episode 44 is 251.0


  0%|          | 45/10000 [06:34<18:26:38,  6.67s/it]

Total reward after episode 45 is 251.0


  0%|          | 46/10000 [06:45<21:57:20,  7.94s/it]

Total reward after episode 46 is 639.0


  0%|          | 47/10000 [06:56<24:16:11,  8.78s/it]

Total reward after episode 47 is 637.0


  0%|          | 48/10000 [07:16<33:35:08, 12.15s/it]

Total reward after episode 48 is 1043.0


  0%|          | 49/10000 [07:20<26:35:56,  9.62s/it]

Total reward after episode 49 is 248.0


  0%|          | 50/10000 [07:48<41:53:49, 15.16s/it]

Total reward after episode 50 is 1031.0


  1%|          | 51/10000 [08:00<39:37:17, 14.34s/it]

Total reward after episode 51 is 614.0


  1%|          | 52/10000 [08:11<36:49:14, 13.32s/it]

Total reward after episode 52 is 639.0


  1%|          | 53/10000 [08:52<59:52:32, 21.67s/it]

Total reward after episode 53 is 784.0


  1%|          | 54/10000 [09:02<49:53:19, 18.06s/it]

Total reward after episode 54 is 617.0


  1%|          | 55/10000 [09:13<44:29:47, 16.11s/it]

Total reward after episode 55 is 626.0


  1%|          | 56/10000 [09:38<51:11:14, 18.53s/it]

Total reward after episode 56 is 808.0


  1%|          | 57/10000 [09:49<44:57:19, 16.28s/it]

Total reward after episode 57 is 633.0


  1%|          | 58/10000 [09:58<39:13:26, 14.20s/it]

Total reward after episode 58 is 614.0


  1%|          | 59/10000 [10:02<31:00:23, 11.23s/it]

Total reward after episode 59 is 239.0


  1%|          | 60/10000 [10:07<25:13:43,  9.14s/it]

Total reward after episode 60 is 239.0


  1%|          | 61/10000 [10:31<37:32:29, 13.60s/it]

Total reward after episode 61 is 1040.0


  1%|          | 62/10000 [10:35<29:44:54, 10.78s/it]

Total reward after episode 62 is 239.0


  1%|          | 63/10000 [10:47<30:35:22, 11.08s/it]

Total reward after episode 63 is 636.0


  1%|          | 64/10000 [10:50<24:35:36,  8.91s/it]

Total reward after episode 64 is 251.0


  1%|          | 65/10000 [11:00<25:03:51,  9.08s/it]

Total reward after episode 65 is 614.0


  1%|          | 66/10000 [11:12<27:28:31,  9.96s/it]

Total reward after episode 66 is 630.0


  1%|          | 67/10000 [11:23<28:11:01, 10.21s/it]

Total reward after episode 67 is 631.0


  1%|          | 68/10000 [11:34<28:56:06, 10.49s/it]

Total reward after episode 68 is 630.0


  1%|          | 69/10000 [11:45<29:26:14, 10.67s/it]

Total reward after episode 69 is 632.0


  1%|          | 70/10000 [11:49<23:37:15,  8.56s/it]

Total reward after episode 70 is 250.0


  1%|          | 71/10000 [11:58<24:41:53,  8.95s/it]

Total reward after episode 71 is 613.0


  1%|          | 72/10000 [12:09<25:42:11,  9.32s/it]

Total reward after episode 72 is 630.0


  1%|          | 73/10000 [12:13<21:37:39,  7.84s/it]

Total reward after episode 73 is 237.0


  1%|          | 74/10000 [12:17<18:12:02,  6.60s/it]

Total reward after episode 74 is 250.0


  1%|          | 75/10000 [12:21<16:08:38,  5.86s/it]

Total reward after episode 75 is 243.0


  1%|          | 76/10000 [12:33<21:16:39,  7.72s/it]

Total reward after episode 76 is 629.0


  1%|          | 77/10000 [13:13<48:04:44, 17.44s/it]

Total reward after episode 77 is 718.0


  1%|          | 78/10000 [13:17<36:53:22, 13.38s/it]

Total reward after episode 78 is 251.0


  1%|          | 79/10000 [13:21<29:26:16, 10.68s/it]

Total reward after episode 79 is 237.0


  1%|          | 80/10000 [13:32<29:11:58, 10.60s/it]

Total reward after episode 80 is 638.0


  1%|          | 81/10000 [13:48<33:44:38, 12.25s/it]

Total reward after episode 81 is 1050.0


  1%|          | 82/10000 [13:52<26:41:40,  9.69s/it]

Total reward after episode 82 is 250.0


  1%|          | 83/10000 [13:56<21:59:04,  7.98s/it]

Total reward after episode 83 is 251.0


  1%|          | 84/10000 [13:59<18:40:49,  6.78s/it]

Total reward after episode 84 is 251.0


  1%|          | 85/10000 [14:03<16:22:41,  5.95s/it]

Total reward after episode 85 is 250.0


  1%|          | 86/10000 [14:08<15:06:05,  5.48s/it]

Total reward after episode 86 is 239.0


  1%|          | 87/10000 [14:28<27:32:09, 10.00s/it]

Total reward after episode 87 is 605.0


  1%|          | 88/10000 [14:50<37:02:02, 13.45s/it]

Total reward after episode 88 is 1440.0


  1%|          | 89/10000 [14:54<29:12:52, 10.61s/it]

Total reward after episode 89 is 248.0


  1%|          | 90/10000 [15:24<45:29:51, 16.53s/it]

Total reward after episode 90 is 1027.0


  1%|          | 91/10000 [15:34<39:56:00, 14.51s/it]

Total reward after episode 91 is 607.0


  1%|          | 92/10000 [15:38<31:05:35, 11.30s/it]

Total reward after episode 92 is 251.0


  1%|          | 93/10000 [15:42<24:53:24,  9.04s/it]

Total reward after episode 93 is 248.0


  1%|          | 94/10000 [15:46<20:40:23,  7.51s/it]

Total reward after episode 94 is 249.0


  1%|          | 95/10000 [15:55<22:31:17,  8.19s/it]

Total reward after episode 95 is 624.0


  1%|          | 96/10000 [16:00<19:17:06,  7.01s/it]

Total reward after episode 96 is 239.0


  1%|          | 97/10000 [16:03<16:33:11,  6.02s/it]

Total reward after episode 97 is 252.0


  1%|          | 98/10000 [16:08<15:05:50,  5.49s/it]

Total reward after episode 98 is 239.0


  1%|          | 99/10000 [16:12<14:11:11,  5.16s/it]

Total reward after episode 99 is 240.0


  1%|          | 100/10000 [16:16<13:05:35,  4.76s/it]

Total reward after episode 100 is 248.0


  1%|          | 101/10000 [16:20<12:26:53,  4.53s/it]

Total reward after episode 101 is 248.0


  1%|          | 102/10000 [16:32<18:33:29,  6.75s/it]

Total reward after episode 102 is 629.0


  1%|          | 103/10000 [16:50<27:51:42, 10.13s/it]

Total reward after episode 103 is 1047.0


  1%|          | 104/10000 [17:02<29:24:54, 10.70s/it]

Total reward after episode 104 is 628.0


  1%|          | 105/10000 [17:06<24:29:04,  8.91s/it]

Total reward after episode 105 is 244.0


  1%|          | 106/10000 [17:10<20:22:03,  7.41s/it]

Total reward after episode 106 is 251.0


  1%|          | 107/10000 [17:22<23:54:46,  8.70s/it]

Total reward after episode 107 is 612.0


  1%|          | 108/10000 [17:31<24:27:28,  8.90s/it]

Total reward after episode 108 is 625.0


  1%|          | 109/10000 [17:55<36:14:11, 13.19s/it]

Total reward after episode 109 is 609.0


  1%|          | 110/10000 [17:59<29:07:55, 10.60s/it]

Total reward after episode 110 is 240.0


  1%|          | 111/10000 [18:03<23:46:45,  8.66s/it]

Total reward after episode 111 is 250.0


  1%|          | 112/10000 [18:07<19:50:40,  7.22s/it]

Total reward after episode 112 is 251.0


  1%|          | 113/10000 [18:11<17:11:18,  6.26s/it]

Total reward after episode 113 is 248.0


  1%|          | 114/10000 [18:15<15:20:16,  5.59s/it]

Total reward after episode 114 is 250.0


  1%|          | 115/10000 [18:19<14:03:31,  5.12s/it]

Total reward after episode 115 is 248.0


  1%|          | 116/10000 [18:40<26:33:16,  9.67s/it]

Total reward after episode 116 is 620.0


  1%|          | 117/10000 [18:45<23:09:38,  8.44s/it]

Total reward after episode 117 is 222.0


  1%|          | 118/10000 [18:49<19:26:56,  7.09s/it]

Total reward after episode 118 is 251.0


  1%|          | 119/10000 [18:54<17:28:05,  6.36s/it]

Total reward after episode 119 is 238.0


  1%|          | 120/10000 [18:58<15:24:27,  5.61s/it]

Total reward after episode 120 is 249.0


  1%|          | 121/10000 [19:19<28:03:47, 10.23s/it]

Total reward after episode 121 is 1044.0


  1%|          | 122/10000 [19:23<23:25:51,  8.54s/it]

Total reward after episode 122 is 240.0


  1%|          | 123/10000 [19:27<19:50:37,  7.23s/it]

Total reward after episode 123 is 238.0


  1%|          | 124/10000 [19:31<17:14:04,  6.28s/it]

Total reward after episode 124 is 248.0


  1%|▏         | 125/10000 [19:36<15:25:51,  5.63s/it]

Total reward after episode 125 is 248.0


  1%|▏         | 126/10000 [19:58<28:53:54, 10.54s/it]

Total reward after episode 126 is 1043.0


  1%|▏         | 127/10000 [20:02<24:08:08,  8.80s/it]

Total reward after episode 127 is 240.0


  1%|▏         | 128/10000 [20:11<24:25:40,  8.91s/it]

Total reward after episode 128 is 626.0


  1%|▏         | 129/10000 [20:17<21:21:04,  7.79s/it]

Total reward after episode 129 is 229.0


  1%|▏         | 130/10000 [20:27<23:20:25,  8.51s/it]

Total reward after episode 130 is 626.0


  1%|▏         | 131/10000 [20:31<19:26:57,  7.09s/it]

Total reward after episode 131 is 252.0


  1%|▏         | 132/10000 [21:00<37:46:15, 13.78s/it]

Total reward after episode 132 is 626.0


  1%|▏         | 133/10000 [21:18<41:35:46, 15.18s/it]

Total reward after episode 133 is 613.0


  1%|▏         | 134/10000 [21:23<32:28:00, 11.85s/it]

Total reward after episode 134 is 251.0


  1%|▏         | 135/10000 [21:26<25:49:19,  9.42s/it]

Total reward after episode 135 is 248.0


  1%|▏         | 136/10000 [21:32<22:26:48,  8.19s/it]

Total reward after episode 136 is 248.0


  1%|▏         | 137/10000 [21:36<18:55:37,  6.91s/it]

Total reward after episode 137 is 249.0


  1%|▏         | 138/10000 [21:40<16:34:53,  6.05s/it]

Total reward after episode 138 is 248.0


  1%|▏         | 139/10000 [21:44<14:57:00,  5.46s/it]

Total reward after episode 139 is 248.0


  1%|▏         | 140/10000 [21:47<13:36:24,  4.97s/it]

Total reward after episode 140 is 250.0


  1%|▏         | 141/10000 [21:51<12:37:22,  4.61s/it]

Total reward after episode 141 is 250.0


  1%|▏         | 142/10000 [21:55<12:08:34,  4.43s/it]

Total reward after episode 142 is 246.0


  1%|▏         | 143/10000 [21:59<11:42:49,  4.28s/it]

Total reward after episode 143 is 251.0


  1%|▏         | 144/10000 [22:11<18:08:25,  6.63s/it]

Total reward after episode 144 is 629.0


  1%|▏         | 145/10000 [22:15<15:54:43,  5.81s/it]

Total reward after episode 145 is 251.0


  1%|▏         | 146/10000 [22:20<14:48:24,  5.41s/it]

Total reward after episode 146 is 238.0


  1%|▏         | 147/10000 [22:24<14:08:31,  5.17s/it]

Total reward after episode 147 is 234.0


  1%|▏         | 148/10000 [22:28<13:06:01,  4.79s/it]

Total reward after episode 148 is 251.0


  1%|▏         | 149/10000 [22:32<12:28:55,  4.56s/it]

Total reward after episode 149 is 245.0


  2%|▏         | 150/10000 [22:36<11:56:17,  4.36s/it]

Total reward after episode 150 is 249.0


  2%|▏         | 151/10000 [22:40<11:32:27,  4.22s/it]

Total reward after episode 151 is 251.0


  2%|▏         | 152/10000 [22:44<11:13:51,  4.11s/it]

Total reward after episode 152 is 251.0


  2%|▏         | 153/10000 [22:48<10:56:21,  4.00s/it]

Total reward after episode 153 is 248.0


  2%|▏         | 154/10000 [22:52<10:56:51,  4.00s/it]

Total reward after episode 154 is 250.0


  2%|▏         | 155/10000 [22:55<10:50:57,  3.97s/it]

Total reward after episode 155 is 251.0


  2%|▏         | 156/10000 [23:07<17:13:56,  6.30s/it]

Total reward after episode 156 is 622.0


  2%|▏         | 157/10000 [23:12<15:44:17,  5.76s/it]

Total reward after episode 157 is 238.0


  2%|▏         | 158/10000 [23:15<14:06:44,  5.16s/it]

Total reward after episode 158 is 252.0


  2%|▏         | 159/10000 [23:19<12:58:53,  4.75s/it]

Total reward after episode 159 is 251.0


  2%|▏         | 160/10000 [23:48<32:44:12, 11.98s/it]

Total reward after episode 160 is 1031.0


  2%|▏         | 161/10000 [23:53<26:35:59,  9.73s/it]

Total reward after episode 161 is 248.0


  2%|▏         | 162/10000 [24:06<29:52:53, 10.93s/it]

Total reward after episode 162 is 616.0


  2%|▏         | 163/10000 [24:10<24:05:58,  8.82s/it]

Total reward after episode 163 is 249.0


  2%|▏         | 164/10000 [24:14<20:01:25,  7.33s/it]

Total reward after episode 164 is 249.0


  2%|▏         | 165/10000 [24:23<21:30:42,  7.87s/it]

Total reward after episode 165 is 625.0


  2%|▏         | 166/10000 [24:27<18:15:22,  6.68s/it]

Total reward after episode 166 is 249.0


  2%|▏         | 167/10000 [24:31<16:03:02,  5.88s/it]

Total reward after episode 167 is 247.0


  2%|▏         | 168/10000 [24:40<18:44:25,  6.86s/it]

Total reward after episode 168 is 623.0


  2%|▏         | 169/10000 [24:52<22:43:51,  8.32s/it]

Total reward after episode 169 is 639.0


  2%|▏         | 170/10000 [24:56<19:04:44,  6.99s/it]

Total reward after episode 170 is 251.0


  2%|▏         | 171/10000 [25:00<16:38:08,  6.09s/it]

Total reward after episode 171 is 246.0


  2%|▏         | 172/10000 [25:04<14:50:26,  5.44s/it]

Total reward after episode 172 is 251.0


  2%|▏         | 173/10000 [25:08<13:54:09,  5.09s/it]

Total reward after episode 173 is 249.0


  2%|▏         | 174/10000 [25:12<13:08:32,  4.82s/it]

Total reward after episode 174 is 246.0


  2%|▏         | 175/10000 [25:16<12:24:59,  4.55s/it]

Total reward after episode 175 is 251.0


  2%|▏         | 176/10000 [25:21<12:32:24,  4.60s/it]

Total reward after episode 176 is 234.0


  2%|▏         | 177/10000 [25:25<11:56:27,  4.38s/it]

Total reward after episode 177 is 248.0


  2%|▏         | 178/10000 [25:29<11:30:07,  4.22s/it]

Total reward after episode 178 is 250.0


  2%|▏         | 179/10000 [25:50<25:18:29,  9.28s/it]

Total reward after episode 179 is 1041.0


  2%|▏         | 180/10000 [26:07<31:59:06, 11.73s/it]

Total reward after episode 180 is 1048.0


  2%|▏         | 181/10000 [26:11<25:33:40,  9.37s/it]

Total reward after episode 181 is 251.0


  2%|▏         | 182/10000 [26:33<35:48:10, 13.13s/it]

Total reward after episode 182 is 1133.0


  2%|▏         | 183/10000 [26:37<28:38:55, 10.51s/it]

Total reward after episode 183 is 238.0


  2%|▏         | 184/10000 [26:42<23:40:11,  8.68s/it]

Total reward after episode 184 is 242.0


  2%|▏         | 185/10000 [27:05<35:38:49, 13.07s/it]

Total reward after episode 185 is 1342.0


  2%|▏         | 186/10000 [27:24<40:42:46, 14.93s/it]

Total reward after episode 186 is 1045.0


  2%|▏         | 187/10000 [27:28<31:41:24, 11.63s/it]

Total reward after episode 187 is 250.0


  2%|▏         | 188/10000 [27:50<39:49:47, 14.61s/it]

Total reward after episode 188 is 755.0


  2%|▏         | 189/10000 [27:54<31:06:44, 11.42s/it]

Total reward after episode 189 is 249.0


  2%|▏         | 190/10000 [27:58<25:32:48,  9.37s/it]

Total reward after episode 190 is 235.0


  2%|▏         | 191/10000 [28:03<21:44:21,  7.98s/it]

Total reward after episode 191 is 251.0


  2%|▏         | 192/10000 [28:27<35:01:13, 12.85s/it]

Total reward after episode 192 is 1586.0


  2%|▏         | 193/10000 [28:32<28:24:52, 10.43s/it]

Total reward after episode 193 is 237.0


  2%|▏         | 194/10000 [28:44<29:35:36, 10.86s/it]

Total reward after episode 194 is 623.0


  2%|▏         | 195/10000 [28:48<23:55:18,  8.78s/it]

Total reward after episode 195 is 249.0


  2%|▏         | 196/10000 [28:52<20:24:08,  7.49s/it]

Total reward after episode 196 is 237.0


  2%|▏         | 197/10000 [28:57<17:57:15,  6.59s/it]

Total reward after episode 197 is 239.0


  2%|▏         | 198/10000 [29:01<15:46:58,  5.80s/it]

Total reward after episode 198 is 251.0


  2%|▏         | 199/10000 [29:05<14:15:44,  5.24s/it]

Total reward after episode 199 is 248.0


  2%|▏         | 200/10000 [29:09<13:11:06,  4.84s/it]

Total reward after episode 200 is 251.0


  2%|▏         | 201/10000 [29:13<12:26:10,  4.57s/it]

Total reward after episode 201 is 248.0


  2%|▏         | 202/10000 [29:17<11:57:04,  4.39s/it]

Total reward after episode 202 is 251.0


  2%|▏         | 203/10000 [29:21<11:41:13,  4.29s/it]

Total reward after episode 203 is 248.0


  2%|▏         | 204/10000 [29:25<11:22:53,  4.18s/it]

Total reward after episode 204 is 251.0


  2%|▏         | 205/10000 [29:34<15:24:52,  5.67s/it]

Total reward after episode 205 is 628.0


  2%|▏         | 206/10000 [29:45<19:55:11,  7.32s/it]

Total reward after episode 206 is 637.0


  2%|▏         | 207/10000 [29:49<17:05:28,  6.28s/it]

Total reward after episode 207 is 251.0


  2%|▏         | 208/10000 [30:06<26:10:04,  9.62s/it]

Total reward after episode 208 is 1048.0


  2%|▏         | 209/10000 [30:10<21:29:03,  7.90s/it]

Total reward after episode 209 is 248.0


  2%|▏         | 210/10000 [30:15<18:45:28,  6.90s/it]

Total reward after episode 210 is 236.0


  2%|▏         | 211/10000 [30:36<30:41:49, 11.29s/it]

Total reward after episode 211 is 768.0


  2%|▏         | 212/10000 [30:40<24:35:44,  9.05s/it]

Total reward after episode 212 is 248.0


  2%|▏         | 213/10000 [30:44<20:23:12,  7.50s/it]

Total reward after episode 213 is 251.0


  2%|▏         | 214/10000 [30:48<17:26:51,  6.42s/it]

Total reward after episode 214 is 248.0


  2%|▏         | 215/10000 [30:52<15:29:59,  5.70s/it]

Total reward after episode 215 is 248.0


  2%|▏         | 216/10000 [31:08<24:13:33,  8.91s/it]

Total reward after episode 216 is 1046.0


  2%|▏         | 217/10000 [31:12<20:21:14,  7.49s/it]

Total reward after episode 217 is 246.0


  2%|▏         | 218/10000 [31:30<28:56:50, 10.65s/it]

Total reward after episode 218 is 625.0


  2%|▏         | 219/10000 [31:35<23:56:00,  8.81s/it]

Total reward after episode 219 is 237.0


  2%|▏         | 220/10000 [31:39<19:57:28,  7.35s/it]

Total reward after episode 220 is 248.0


  2%|▏         | 221/10000 [31:43<17:34:08,  6.47s/it]

Total reward after episode 221 is 239.0


  2%|▏         | 222/10000 [31:47<15:42:48,  5.79s/it]

Total reward after episode 222 is 240.0


  2%|▏         | 223/10000 [31:57<18:28:30,  6.80s/it]

Total reward after episode 223 is 622.0


  2%|▏         | 224/10000 [32:01<16:35:22,  6.11s/it]

Total reward after episode 224 is 237.0


  2%|▏         | 225/10000 [32:06<15:40:56,  5.78s/it]

Total reward after episode 225 is 243.0


  2%|▏         | 226/10000 [32:10<14:06:21,  5.20s/it]

Total reward after episode 226 is 247.0


  2%|▏         | 227/10000 [32:21<18:53:13,  6.96s/it]

Total reward after episode 227 is 607.0


  2%|▏         | 228/10000 [32:25<16:29:31,  6.08s/it]

Total reward after episode 228 is 244.0


  2%|▏         | 229/10000 [32:29<15:10:26,  5.59s/it]

Total reward after episode 229 is 237.0


  2%|▏         | 230/10000 [32:33<13:47:27,  5.08s/it]

Total reward after episode 230 is 251.0


  2%|▏         | 231/10000 [32:42<17:04:46,  6.29s/it]

Total reward after episode 231 is 625.0


  2%|▏         | 232/10000 [32:53<20:51:44,  7.69s/it]

Total reward after episode 232 is 626.0


  2%|▏         | 233/10000 [33:04<23:36:18,  8.70s/it]

Total reward after episode 233 is 617.0


  2%|▏         | 234/10000 [33:08<19:41:43,  7.26s/it]

Total reward after episode 234 is 249.0


  2%|▏         | 235/10000 [33:12<16:55:36,  6.24s/it]

Total reward after episode 235 is 249.0


  2%|▏         | 236/10000 [33:16<14:59:02,  5.52s/it]

Total reward after episode 236 is 251.0


  2%|▏         | 237/10000 [33:20<14:04:00,  5.19s/it]

Total reward after episode 237 is 237.0


  2%|▏         | 238/10000 [33:24<12:58:44,  4.79s/it]

Total reward after episode 238 is 251.0


  2%|▏         | 239/10000 [33:36<18:16:19,  6.74s/it]

Total reward after episode 239 is 625.0


  2%|▏         | 240/10000 [33:40<16:21:06,  6.03s/it]

Total reward after episode 240 is 237.0


  2%|▏         | 241/10000 [33:44<14:34:11,  5.37s/it]

Total reward after episode 241 is 248.0


  2%|▏         | 242/10000 [33:59<22:22:05,  8.25s/it]

Total reward after episode 242 is 622.0


  2%|▏         | 243/10000 [34:03<18:47:55,  6.94s/it]

Total reward after episode 243 is 250.0


  2%|▏         | 244/10000 [34:07<16:43:50,  6.17s/it]

Total reward after episode 244 is 237.0


  2%|▏         | 245/10000 [34:11<15:17:21,  5.64s/it]

Total reward after episode 245 is 237.0


  2%|▏         | 246/10000 [34:23<20:05:21,  7.41s/it]

Total reward after episode 246 is 638.0


  2%|▏         | 247/10000 [34:28<17:51:34,  6.59s/it]

Total reward after episode 247 is 237.0


  2%|▏         | 248/10000 [34:32<15:56:43,  5.89s/it]

Total reward after episode 248 is 247.0


  2%|▏         | 249/10000 [35:06<38:52:50, 14.35s/it]

Total reward after episode 249 is 1021.0


  2%|▎         | 250/10000 [35:11<30:49:19, 11.38s/it]

Total reward after episode 250 is 237.0


  3%|▎         | 251/10000 [35:15<24:50:45,  9.17s/it]

Total reward after episode 251 is 247.0


  3%|▎         | 252/10000 [35:27<27:50:30, 10.28s/it]

Total reward after episode 252 is 606.0


  3%|▎         | 253/10000 [35:39<28:32:56, 10.54s/it]

Total reward after episode 253 is 636.0


  3%|▎         | 254/10000 [35:53<31:34:27, 11.66s/it]

Total reward after episode 254 is 606.0


  3%|▎         | 255/10000 [36:08<34:09:38, 12.62s/it]

Total reward after episode 255 is 624.0


  3%|▎         | 256/10000 [36:11<26:58:26,  9.97s/it]

Total reward after episode 256 is 249.0


  3%|▎         | 257/10000 [36:24<28:42:00, 10.60s/it]

Total reward after episode 257 is 628.0


  3%|▎         | 258/10000 [36:35<29:06:44, 10.76s/it]

Total reward after episode 258 is 639.0


  3%|▎         | 259/10000 [36:39<23:29:59,  8.68s/it]

Total reward after episode 259 is 250.0


  3%|▎         | 260/10000 [36:42<19:33:05,  7.23s/it]

Total reward after episode 260 is 249.0


  3%|▎         | 261/10000 [36:46<17:02:40,  6.30s/it]

Total reward after episode 261 is 244.0


  3%|▎         | 262/10000 [36:50<15:05:32,  5.58s/it]

Total reward after episode 262 is 250.0


  3%|▎         | 263/10000 [36:55<14:00:51,  5.18s/it]

Total reward after episode 263 is 244.0


  3%|▎         | 264/10000 [36:58<12:56:24,  4.78s/it]

Total reward after episode 264 is 248.0


  3%|▎         | 265/10000 [37:02<12:10:45,  4.50s/it]

Total reward after episode 265 is 250.0


  3%|▎         | 266/10000 [37:06<11:34:02,  4.28s/it]

Total reward after episode 266 is 249.0


  3%|▎         | 267/10000 [37:25<23:37:49,  8.74s/it]

Total reward after episode 267 is 1044.0


  3%|▎         | 268/10000 [37:29<19:34:49,  7.24s/it]

Total reward after episode 268 is 249.0


  3%|▎         | 269/10000 [37:33<16:51:18,  6.24s/it]

Total reward after episode 269 is 250.0


  3%|▎         | 270/10000 [37:53<28:09:06, 10.42s/it]

Total reward after episode 270 is 1040.0


  3%|▎         | 271/10000 [37:57<22:42:54,  8.41s/it]

Total reward after episode 271 is 249.0


  3%|▎         | 272/10000 [38:01<19:02:53,  7.05s/it]

Total reward after episode 272 is 248.0


  3%|▎         | 273/10000 [38:05<16:43:08,  6.19s/it]

Total reward after episode 273 is 244.0


  3%|▎         | 274/10000 [38:15<20:01:39,  7.41s/it]

Total reward after episode 274 is 638.0


  3%|▎         | 275/10000 [38:19<17:04:20,  6.32s/it]

Total reward after episode 275 is 249.0


  3%|▎         | 276/10000 [38:23<15:06:20,  5.59s/it]

Total reward after episode 276 is 251.0


  3%|▎         | 277/10000 [38:33<18:41:27,  6.92s/it]

Total reward after episode 277 is 638.0


  3%|▎         | 278/10000 [38:43<20:59:49,  7.78s/it]

Total reward after episode 278 is 635.0


  3%|▎         | 279/10000 [38:46<17:51:07,  6.61s/it]

Total reward after episode 279 is 252.0


  3%|▎         | 280/10000 [38:50<15:38:52,  5.80s/it]

Total reward after episode 280 is 247.0


  3%|▎         | 281/10000 [38:54<14:04:43,  5.21s/it]

Total reward after episode 281 is 250.0


  3%|▎         | 282/10000 [38:59<13:25:50,  4.98s/it]

Total reward after episode 282 is 242.0


  3%|▎         | 283/10000 [39:03<12:58:39,  4.81s/it]

Total reward after episode 283 is 237.0


  3%|▎         | 284/10000 [39:12<16:31:59,  6.13s/it]

Total reward after episode 284 is 629.0


  3%|▎         | 285/10000 [39:16<14:41:49,  5.45s/it]

Total reward after episode 285 is 250.0


  3%|▎         | 286/10000 [39:20<13:49:38,  5.12s/it]

Total reward after episode 286 is 242.0


  3%|▎         | 287/10000 [39:45<29:39:40, 10.99s/it]

Total reward after episode 287 is 1039.0


  3%|▎         | 288/10000 [40:37<62:44:05, 23.25s/it]

Total reward after episode 288 is 709.0


  3%|▎         | 289/10000 [40:48<52:44:00, 19.55s/it]

Total reward after episode 289 is 625.0


  3%|▎         | 290/10000 [41:08<53:11:47, 19.72s/it]

Total reward after episode 290 is 618.0


  3%|▎         | 291/10000 [41:12<40:40:16, 15.08s/it]

Total reward after episode 291 is 239.0


  3%|▎         | 292/10000 [41:16<31:37:01, 11.72s/it]

Total reward after episode 292 is 248.0


  3%|▎         | 293/10000 [41:20<25:13:30,  9.36s/it]

Total reward after episode 293 is 249.0


  3%|▎         | 294/10000 [41:24<20:56:51,  7.77s/it]

Total reward after episode 294 is 243.0


  3%|▎         | 295/10000 [41:28<17:49:38,  6.61s/it]

Total reward after episode 295 is 249.0


  3%|▎         | 296/10000 [41:49<29:19:22, 10.88s/it]

Total reward after episode 296 is 757.0


  3%|▎         | 297/10000 [41:53<23:43:47,  8.80s/it]

Total reward after episode 297 is 246.0


  3%|▎         | 298/10000 [41:57<20:01:13,  7.43s/it]

Total reward after episode 298 is 237.0


  3%|▎         | 299/10000 [42:01<17:34:34,  6.52s/it]

Total reward after episode 299 is 242.0


  3%|▎         | 300/10000 [42:05<15:24:40,  5.72s/it]

Total reward after episode 300 is 249.0


  3%|▎         | 301/10000 [42:09<14:00:33,  5.20s/it]

Total reward after episode 301 is 250.0


  3%|▎         | 302/10000 [42:19<17:22:49,  6.45s/it]

Total reward after episode 302 is 613.0


  3%|▎         | 303/10000 [42:30<21:16:23,  7.90s/it]

Total reward after episode 303 is 636.0


  3%|▎         | 304/10000 [42:34<18:07:00,  6.73s/it]

Total reward after episode 304 is 247.0


  3%|▎         | 305/10000 [42:38<16:22:33,  6.08s/it]

Total reward after episode 305 is 239.0


  3%|▎         | 306/10000 [42:42<14:40:46,  5.45s/it]

Total reward after episode 306 is 247.0


  3%|▎         | 307/10000 [42:46<13:28:18,  5.00s/it]

Total reward after episode 307 is 247.0


  3%|▎         | 308/10000 [42:50<12:36:16,  4.68s/it]

Total reward after episode 308 is 247.0


  3%|▎         | 309/10000 [43:11<25:18:59,  9.40s/it]

Total reward after episode 309 is 1045.0


  3%|▎         | 310/10000 [43:21<26:05:32,  9.69s/it]

Total reward after episode 310 is 608.0


  3%|▎         | 311/10000 [43:26<21:47:25,  8.10s/it]

Total reward after episode 311 is 243.0


  3%|▎         | 312/10000 [43:30<18:33:58,  6.90s/it]

Total reward after episode 312 is 246.0


  3%|▎         | 313/10000 [43:40<21:10:04,  7.87s/it]

Total reward after episode 313 is 608.0


  3%|▎         | 314/10000 [43:50<23:14:17,  8.64s/it]

Total reward after episode 314 is 640.0


  3%|▎         | 315/10000 [44:07<30:05:56, 11.19s/it]

Total reward after episode 315 is 1050.0


  3%|▎         | 316/10000 [44:11<24:16:09,  9.02s/it]

Total reward after episode 316 is 248.0


  3%|▎         | 317/10000 [44:23<26:40:00,  9.91s/it]

Total reward after episode 317 is 633.0


  3%|▎         | 318/10000 [44:27<22:00:13,  8.18s/it]

Total reward after episode 318 is 246.0


  3%|▎         | 319/10000 [44:31<18:30:02,  6.88s/it]

Total reward after episode 319 is 250.0


  3%|▎         | 320/10000 [44:56<32:51:33, 12.22s/it]

Total reward after episode 320 is 809.0


  3%|▎         | 321/10000 [45:00<26:13:38,  9.75s/it]

Total reward after episode 321 is 246.0


  3%|▎         | 322/10000 [45:04<21:55:20,  8.15s/it]

Total reward after episode 322 is 244.0


  3%|▎         | 323/10000 [46:00<60:01:35, 22.33s/it]

Total reward after episode 323 is 694.0


  3%|▎         | 324/10000 [46:04<45:12:22, 16.82s/it]

Total reward after episode 324 is 251.0


  3%|▎         | 325/10000 [46:51<70:00:55, 26.05s/it]

Total reward after episode 325 is 1538.0


  3%|▎         | 326/10000 [47:25<75:47:11, 28.20s/it]

Total reward after episode 326 is 768.0


  3%|▎         | 327/10000 [47:28<56:08:00, 20.89s/it]

Total reward after episode 327 is 252.0


  3%|▎         | 328/10000 [47:48<55:20:36, 20.60s/it]

Total reward after episode 328 is 640.0


  3%|▎         | 329/10000 [47:53<42:18:57, 15.75s/it]

Total reward after episode 329 is 235.0


  3%|▎         | 330/10000 [47:57<32:45:08, 12.19s/it]

Total reward after episode 330 is 250.0


  3%|▎         | 331/10000 [48:00<26:02:46,  9.70s/it]

Total reward after episode 331 is 246.0


  3%|▎         | 332/10000 [48:05<21:57:28,  8.18s/it]

Total reward after episode 332 is 235.0


  3%|▎         | 333/10000 [48:10<18:58:22,  7.07s/it]

Total reward after episode 333 is 242.0


  3%|▎         | 334/10000 [48:14<16:30:55,  6.15s/it]

Total reward after episode 334 is 247.0


  3%|▎         | 335/10000 [48:44<35:53:18, 13.37s/it]

Total reward after episode 335 is 1037.0


  3%|▎         | 336/10000 [48:48<28:41:27, 10.69s/it]

Total reward after episode 336 is 242.0


  3%|▎         | 337/10000 [48:52<23:10:11,  8.63s/it]

Total reward after episode 337 is 248.0


  3%|▎         | 338/10000 [49:04<25:42:31,  9.58s/it]

Total reward after episode 338 is 636.0


  3%|▎         | 339/10000 [49:08<21:06:02,  7.86s/it]

Total reward after episode 339 is 250.0


  3%|▎         | 340/10000 [49:29<31:44:47, 11.83s/it]

Total reward after episode 340 is 778.0


  3%|▎         | 341/10000 [49:33<25:26:25,  9.48s/it]

Total reward after episode 341 is 247.0


  3%|▎         | 342/10000 [49:47<29:24:13, 10.96s/it]

Total reward after episode 342 is 629.0


  3%|▎         | 343/10000 [50:17<44:16:52, 16.51s/it]

Total reward after episode 343 is 765.0


  3%|▎         | 344/10000 [50:46<54:17:18, 20.24s/it]

Total reward after episode 344 is 627.0


  3%|▎         | 345/10000 [51:34<76:46:28, 28.63s/it]

Total reward after episode 345 is 735.0


  3%|▎         | 346/10000 [51:48<65:25:06, 24.39s/it]

Total reward after episode 346 is 619.0


  3%|▎         | 347/10000 [51:53<49:20:54, 18.40s/it]

Total reward after episode 347 is 239.0


  3%|▎         | 348/10000 [51:57<38:06:30, 14.21s/it]

Total reward after episode 348 is 242.0


  3%|▎         | 349/10000 [52:19<44:16:05, 16.51s/it]

Total reward after episode 349 is 624.0


  4%|▎         | 350/10000 [52:43<49:54:30, 18.62s/it]

Total reward after episode 350 is 1040.0


  4%|▎         | 351/10000 [53:23<67:17:26, 25.11s/it]

Total reward after episode 351 is 612.0


  4%|▎         | 352/10000 [53:27<50:11:16, 18.73s/it]

Total reward after episode 352 is 248.0


  4%|▎         | 353/10000 [53:39<45:06:50, 16.84s/it]

Total reward after episode 353 is 638.0


  4%|▎         | 354/10000 [53:53<42:40:48, 15.93s/it]

Total reward after episode 354 is 606.0


  4%|▎         | 355/10000 [54:05<39:23:26, 14.70s/it]

Total reward after episode 355 is 641.0


  4%|▎         | 356/10000 [54:09<30:40:07, 11.45s/it]

Total reward after episode 356 is 246.0


  4%|▎         | 357/10000 [54:13<24:40:43,  9.21s/it]

Total reward after episode 357 is 246.0


  4%|▎         | 358/10000 [54:27<28:34:23, 10.67s/it]

Total reward after episode 358 is 630.0


  4%|▎         | 359/10000 [54:31<23:21:00,  8.72s/it]

Total reward after episode 359 is 246.0


  4%|▎         | 360/10000 [54:35<19:28:43,  7.27s/it]

Total reward after episode 360 is 252.0


  4%|▎         | 361/10000 [54:39<16:46:06,  6.26s/it]

Total reward after episode 361 is 249.0


  4%|▎         | 362/10000 [54:43<14:51:47,  5.55s/it]

Total reward after episode 362 is 248.0


  4%|▎         | 363/10000 [54:47<14:04:06,  5.26s/it]

Total reward after episode 363 is 242.0


  4%|▎         | 364/10000 [54:51<12:56:33,  4.84s/it]

Total reward after episode 364 is 252.0


  4%|▎         | 365/10000 [54:55<12:08:47,  4.54s/it]

Total reward after episode 365 is 252.0


  4%|▎         | 366/10000 [55:15<24:57:10,  9.32s/it]

Total reward after episode 366 is 1347.0


  4%|▎         | 367/10000 [55:49<44:37:40, 16.68s/it]

Total reward after episode 367 is 1308.0


  4%|▎         | 368/10000 [56:02<41:12:48, 15.40s/it]

Total reward after episode 368 is 609.0


  4%|▎         | 369/10000 [56:05<31:53:04, 11.92s/it]

Total reward after episode 369 is 252.0


  4%|▎         | 370/10000 [56:09<25:19:48,  9.47s/it]

Total reward after episode 370 is 252.0


  4%|▎         | 371/10000 [56:13<21:11:28,  7.92s/it]

Total reward after episode 371 is 241.0


  4%|▎         | 372/10000 [56:17<17:49:05,  6.66s/it]

Total reward after episode 372 is 252.0


  4%|▎         | 373/10000 [56:21<15:28:27,  5.79s/it]

Total reward after episode 373 is 247.0


  4%|▎         | 374/10000 [56:25<13:52:02,  5.19s/it]

Total reward after episode 374 is 252.0


  4%|▍         | 375/10000 [56:28<12:45:03,  4.77s/it]

Total reward after episode 375 is 252.0


  4%|▍         | 376/10000 [56:33<12:34:21,  4.70s/it]

Total reward after episode 376 is 236.0


  4%|▍         | 377/10000 [56:37<11:52:06,  4.44s/it]

Total reward after episode 377 is 248.0


  4%|▍         | 378/10000 [56:48<17:06:33,  6.40s/it]

Total reward after episode 378 is 639.0


  4%|▍         | 379/10000 [56:52<14:59:35,  5.61s/it]

Total reward after episode 379 is 246.0


  4%|▍         | 380/10000 [57:15<29:16:39, 10.96s/it]

Total reward after episode 380 is 810.0


  4%|▍         | 381/10000 [57:36<37:05:23, 13.88s/it]

Total reward after episode 381 is 639.0


  4%|▍         | 382/10000 [57:40<29:35:37, 11.08s/it]

Total reward after episode 382 is 236.0


  4%|▍         | 383/10000 [57:45<24:19:46,  9.11s/it]

Total reward after episode 383 is 236.0


  4%|▍         | 384/10000 [57:49<20:39:29,  7.73s/it]

Total reward after episode 384 is 236.0


  4%|▍         | 385/10000 [57:54<17:50:02,  6.68s/it]

Total reward after episode 385 is 250.0


  4%|▍         | 386/10000 [58:22<34:53:21, 13.06s/it]

Total reward after episode 386 is 628.0


  4%|▍         | 387/10000 [59:37<84:48:24, 31.76s/it]

Total reward after episode 387 is 964.0


  4%|▍         | 388/10000 [59:41<62:23:39, 23.37s/it]

Total reward after episode 388 is 250.0


  4%|▍         | 389/10000 [59:44<46:43:09, 17.50s/it]

Total reward after episode 389 is 250.0


  4%|▍         | 390/10000 [59:48<35:44:04, 13.39s/it]

Total reward after episode 390 is 250.0


  4%|▍         | 391/10000 [59:52<28:00:32, 10.49s/it]

Total reward after episode 391 is 252.0


  4%|▍         | 392/10000 [1:00:11<34:39:57, 12.99s/it]

Total reward after episode 392 is 816.0


  4%|▍         | 393/10000 [1:00:15<27:22:49, 10.26s/it]

Total reward after episode 393 is 252.0


  4%|▍         | 394/10000 [1:01:26<76:18:45, 28.60s/it]

Total reward after episode 394 is 971.0


  4%|▍         | 395/10000 [1:01:30<56:51:47, 21.31s/it]

Total reward after episode 395 is 238.0


  4%|▍         | 396/10000 [1:01:43<49:39:38, 18.62s/it]

Total reward after episode 396 is 631.0


  4%|▍         | 397/10000 [1:02:04<51:37:39, 19.35s/it]

Total reward after episode 397 is 638.0


  4%|▍         | 398/10000 [1:03:17<94:37:27, 35.48s/it]

Total reward after episode 398 is 1507.0


  4%|▍         | 399/10000 [1:03:27<74:14:49, 27.84s/it]

Total reward after episode 399 is 601.0


  4%|▍         | 400/10000 [1:04:30<102:20:10, 38.38s/it]

Total reward after episode 400 is 752.0


  4%|▍         | 401/10000 [1:05:16<108:42:09, 40.77s/it]

Total reward after episode 401 is 1003.0


  4%|▍         | 402/10000 [1:05:44<98:35:20, 36.98s/it] 

Total reward after episode 402 is 1034.0


  4%|▍         | 403/10000 [1:07:20<145:09:15, 54.45s/it]

Total reward after episode 403 is 705.0


  4%|▍         | 404/10000 [1:07:30<110:05:24, 41.30s/it]

Total reward after episode 404 is 634.0


  4%|▍         | 405/10000 [1:07:34<80:01:50, 30.03s/it] 

Total reward after episode 405 is 246.0


  4%|▍         | 406/10000 [1:07:38<59:01:18, 22.15s/it]

Total reward after episode 406 is 246.0


  4%|▍         | 407/10000 [1:08:14<70:36:53, 26.50s/it]

Total reward after episode 407 is 1019.0


  4%|▍         | 408/10000 [1:08:18<52:25:18, 19.67s/it]

Total reward after episode 408 is 252.0


  4%|▍         | 409/10000 [1:08:53<64:21:30, 24.16s/it]

Total reward after episode 409 is 1420.0


  4%|▍         | 410/10000 [1:08:57<48:31:52, 18.22s/it]

Total reward after episode 410 is 235.0


  4%|▍         | 411/10000 [1:09:02<37:33:56, 14.10s/it]

Total reward after episode 411 is 242.0


  4%|▍         | 412/10000 [1:09:30<49:08:41, 18.45s/it]

Total reward after episode 412 is 1032.0


  4%|▍         | 413/10000 [1:09:34<37:25:29, 14.05s/it]

Total reward after episode 413 is 248.0


  4%|▍         | 414/10000 [1:09:44<34:14:48, 12.86s/it]

Total reward after episode 414 is 623.0


  4%|▍         | 415/10000 [1:09:55<32:20:30, 12.15s/it]

Total reward after episode 415 is 652.0


  4%|▍         | 416/10000 [1:10:06<31:25:07, 11.80s/it]

Total reward after episode 416 is 652.0


  4%|▍         | 417/10000 [1:10:09<24:58:11,  9.38s/it]

Total reward after episode 417 is 252.0


  4%|▍         | 418/10000 [1:10:20<26:19:02,  9.89s/it]

Total reward after episode 418 is 634.0


  4%|▍         | 419/10000 [1:10:24<21:24:26,  8.04s/it]

Total reward after episode 419 is 252.0


  4%|▍         | 420/10000 [1:10:28<17:58:29,  6.75s/it]

Total reward after episode 420 is 252.0


  4%|▍         | 421/10000 [1:10:32<15:34:20,  5.85s/it]

Total reward after episode 421 is 252.0


  4%|▍         | 422/10000 [1:10:35<13:53:10,  5.22s/it]

Total reward after episode 422 is 252.0


  4%|▍         | 423/10000 [1:11:11<38:15:12, 14.38s/it]

Total reward after episode 423 is 1414.0


  4%|▍         | 424/10000 [1:11:15<29:47:48, 11.20s/it]

Total reward after episode 424 is 252.0


  4%|▍         | 425/10000 [1:11:19<23:53:35,  8.98s/it]

Total reward after episode 425 is 252.0


  4%|▍         | 426/10000 [1:11:22<19:44:49,  7.43s/it]

Total reward after episode 426 is 248.0


  4%|▍         | 427/10000 [1:11:26<16:50:58,  6.34s/it]

Total reward after episode 427 is 252.0


  4%|▍         | 428/10000 [1:12:18<53:18:08, 20.05s/it]

Total reward after episode 428 is 769.0


  4%|▍         | 429/10000 [1:12:22<40:19:49, 15.17s/it]

Total reward after episode 429 is 252.0


  4%|▍         | 430/10000 [1:12:26<31:13:54, 11.75s/it]

Total reward after episode 430 is 252.0


  4%|▍         | 431/10000 [1:12:38<31:52:17, 11.99s/it]

Total reward after episode 431 is 606.0


  4%|▍         | 432/10000 [1:12:42<25:18:03,  9.52s/it]

Total reward after episode 432 is 252.0


  4%|▍         | 433/10000 [1:12:55<27:39:47, 10.41s/it]

Total reward after episode 433 is 649.0


  4%|▍         | 434/10000 [1:12:58<22:21:43,  8.42s/it]

Total reward after episode 434 is 252.0


  4%|▍         | 435/10000 [1:13:28<39:20:53, 14.81s/it]

Total reward after episode 435 is 1332.0


  4%|▍         | 436/10000 [1:13:32<30:31:43, 11.49s/it]

Total reward after episode 436 is 248.0


  4%|▍         | 437/10000 [1:13:36<24:20:26,  9.16s/it]

Total reward after episode 437 is 252.0


  4%|▍         | 438/10000 [1:13:39<19:59:26,  7.53s/it]

Total reward after episode 438 is 252.0


  4%|▍         | 439/10000 [1:13:43<16:54:15,  6.36s/it]

Total reward after episode 439 is 252.0


  4%|▍         | 440/10000 [1:13:47<14:48:20,  5.58s/it]

Total reward after episode 440 is 252.0


  4%|▍         | 441/10000 [1:13:50<13:20:50,  5.03s/it]

Total reward after episode 441 is 252.0


  4%|▍         | 442/10000 [1:13:54<12:18:19,  4.63s/it]

Total reward after episode 442 is 252.0


  4%|▍         | 443/10000 [1:13:58<11:35:08,  4.36s/it]

Total reward after episode 443 is 252.0


  4%|▍         | 444/10000 [1:14:19<25:14:15,  9.51s/it]

Total reward after episode 444 is 759.0


  4%|▍         | 445/10000 [1:14:23<20:35:56,  7.76s/it]

Total reward after episode 445 is 247.0


  4%|▍         | 446/10000 [1:14:33<22:07:30,  8.34s/it]

Total reward after episode 446 is 627.0


  4%|▍         | 447/10000 [1:14:36<18:25:01,  6.94s/it]

Total reward after episode 447 is 252.0


  4%|▍         | 448/10000 [1:14:41<16:14:34,  6.12s/it]

Total reward after episode 448 is 239.0


  4%|▍         | 449/10000 [1:15:03<29:07:28, 10.98s/it]

Total reward after episode 449 is 1039.0


  4%|▍         | 450/10000 [1:15:24<37:23:51, 14.10s/it]

Total reward after episode 450 is 1041.0


  5%|▍         | 451/10000 [1:15:42<40:12:34, 15.16s/it]

Total reward after episode 451 is 818.0


  5%|▍         | 452/10000 [1:15:46<31:06:23, 11.73s/it]

Total reward after episode 452 is 252.0


  5%|▍         | 453/10000 [1:16:10<40:51:20, 15.41s/it]

Total reward after episode 453 is 1036.0


  5%|▍         | 454/10000 [1:16:30<44:21:32, 16.73s/it]

Total reward after episode 454 is 730.0


  5%|▍         | 455/10000 [1:16:33<34:02:21, 12.84s/it]

Total reward after episode 455 is 252.0


  5%|▍         | 456/10000 [1:16:46<33:55:13, 12.79s/it]

Total reward after episode 456 is 619.0


  5%|▍         | 457/10000 [1:18:07<88:18:08, 33.31s/it]

Total reward after episode 457 is 551.0


  5%|▍         | 458/10000 [1:18:27<77:53:26, 29.39s/it]

Total reward after episode 458 is 814.0


  5%|▍         | 459/10000 [1:18:41<64:59:04, 24.52s/it]

Total reward after episode 459 is 605.0


  5%|▍         | 460/10000 [1:18:51<54:10:12, 20.44s/it]

Total reward after episode 460 is 632.0


  5%|▍         | 461/10000 [1:19:21<61:10:02, 23.08s/it]

Total reward after episode 461 is 599.0


  5%|▍         | 462/10000 [1:19:24<45:43:47, 17.26s/it]

Total reward after episode 462 is 252.0


  5%|▍         | 463/10000 [1:19:28<34:56:23, 13.19s/it]

Total reward after episode 463 is 248.0


  5%|▍         | 464/10000 [1:19:56<46:59:46, 17.74s/it]

Total reward after episode 464 is 626.0


  5%|▍         | 465/10000 [1:20:00<35:53:58, 13.55s/it]

Total reward after episode 465 is 252.0


  5%|▍         | 466/10000 [1:20:04<28:24:34, 10.73s/it]

Total reward after episode 466 is 238.0


  5%|▍         | 467/10000 [1:20:24<35:09:19, 13.28s/it]

Total reward after episode 467 is 1048.0


  5%|▍         | 468/10000 [1:20:27<27:39:47, 10.45s/it]

Total reward after episode 468 is 248.0


  5%|▍         | 469/10000 [1:20:47<35:12:14, 13.30s/it]

Total reward after episode 469 is 1043.0


  5%|▍         | 470/10000 [1:20:51<27:37:01, 10.43s/it]

Total reward after episode 470 is 252.0


  5%|▍         | 471/10000 [1:21:11<34:53:33, 13.18s/it]

Total reward after episode 471 is 815.0


  5%|▍         | 472/10000 [1:21:14<27:22:36, 10.34s/it]

Total reward after episode 472 is 252.0


  5%|▍         | 473/10000 [1:21:24<26:27:12, 10.00s/it]

Total reward after episode 473 is 629.0


  5%|▍         | 474/10000 [1:21:27<21:25:07,  8.09s/it]

Total reward after episode 474 is 246.0


  5%|▍         | 475/10000 [1:21:58<39:02:05, 14.75s/it]

Total reward after episode 475 is 1028.0


  5%|▍         | 476/10000 [1:22:20<45:22:52, 17.15s/it]

Total reward after episode 476 is 582.0


  5%|▍         | 477/10000 [1:22:24<34:47:53, 13.15s/it]

Total reward after episode 477 is 246.0


  5%|▍         | 478/10000 [1:22:36<33:46:42, 12.77s/it]

Total reward after episode 478 is 650.0


  5%|▍         | 479/10000 [1:22:48<33:25:37, 12.64s/it]

Total reward after episode 479 is 608.0


  5%|▍         | 480/10000 [1:22:52<26:17:55,  9.94s/it]

Total reward after episode 480 is 248.0


  5%|▍         | 481/10000 [1:23:38<54:59:47, 20.80s/it]

Total reward after episode 481 is 776.0


  5%|▍         | 482/10000 [1:23:42<41:32:44, 15.71s/it]

Total reward after episode 482 is 248.0


  5%|▍         | 483/10000 [1:24:47<80:54:30, 30.61s/it]

Total reward after episode 483 is 1259.0


  5%|▍         | 484/10000 [1:24:52<59:58:58, 22.69s/it]

Total reward after episode 484 is 239.0


  5%|▍         | 485/10000 [1:25:01<49:39:29, 18.79s/it]

Total reward after episode 485 is 610.0


  5%|▍         | 486/10000 [1:25:05<37:47:04, 14.30s/it]

Total reward after episode 486 is 252.0


  5%|▍         | 487/10000 [1:25:09<29:28:52, 11.16s/it]

Total reward after episode 487 is 252.0


  5%|▍         | 488/10000 [1:25:13<23:40:14,  8.96s/it]

Total reward after episode 488 is 247.0


  5%|▍         | 489/10000 [1:25:31<31:04:59, 11.77s/it]

Total reward after episode 489 is 818.0


  5%|▍         | 490/10000 [1:26:08<51:18:56, 19.43s/it]

Total reward after episode 490 is 586.0


  5%|▍         | 491/10000 [1:26:16<42:22:04, 16.04s/it]

Total reward after episode 491 is 613.0


  5%|▍         | 492/10000 [1:26:43<50:24:16, 19.08s/it]

Total reward after episode 492 is 806.0


  5%|▍         | 493/10000 [1:26:46<38:16:20, 14.49s/it]

Total reward after episode 493 is 248.0


  5%|▍         | 494/10000 [1:26:51<30:06:33, 11.40s/it]

Total reward after episode 494 is 240.0


  5%|▍         | 495/10000 [1:26:54<24:04:54,  9.12s/it]

Total reward after episode 495 is 252.0


  5%|▍         | 496/10000 [1:26:59<20:32:43,  7.78s/it]

Total reward after episode 496 is 235.0


  5%|▍         | 497/10000 [1:27:03<17:49:43,  6.75s/it]

Total reward after episode 497 is 239.0


  5%|▍         | 498/10000 [1:27:07<15:29:47,  5.87s/it]

Total reward after episode 498 is 252.0


  5%|▍         | 499/10000 [1:27:11<13:51:22,  5.25s/it]

Total reward after episode 499 is 248.0


  5%|▌         | 500/10000 [1:27:15<12:42:03,  4.81s/it]

Total reward after episode 500 is 246.0


  5%|▌         | 501/10000 [1:27:19<11:53:09,  4.50s/it]

Total reward after episode 501 is 246.0


  5%|▌         | 502/10000 [1:27:22<11:18:41,  4.29s/it]

Total reward after episode 502 is 246.0


  5%|▌         | 503/10000 [1:27:34<17:17:41,  6.56s/it]

Total reward after episode 503 is 630.0


  5%|▌         | 504/10000 [1:27:38<15:04:41,  5.72s/it]

Total reward after episode 504 is 251.0


  5%|▌         | 505/10000 [1:28:04<30:55:55, 11.73s/it]

Total reward after episode 505 is 1037.0


  5%|▌         | 506/10000 [1:28:23<36:37:36, 13.89s/it]

Total reward after episode 506 is 640.0


  5%|▌         | 507/10000 [1:28:26<28:37:19, 10.85s/it]

Total reward after episode 507 is 246.0


  5%|▌         | 508/10000 [1:29:30<70:24:25, 26.70s/it]

Total reward after episode 508 is 576.0


  5%|▌         | 509/10000 [1:29:35<52:42:18, 19.99s/it]

Total reward after episode 509 is 243.0


  5%|▌         | 510/10000 [1:29:47<47:07:41, 17.88s/it]

Total reward after episode 510 is 626.0


  5%|▌         | 511/10000 [1:29:51<35:57:59, 13.65s/it]

Total reward after episode 511 is 252.0


  5%|▌         | 512/10000 [1:29:55<28:16:17, 10.73s/it]

Total reward after episode 512 is 248.0


  5%|▌         | 513/10000 [1:30:06<28:13:14, 10.71s/it]

Total reward after episode 513 is 630.0


  5%|▌         | 514/10000 [1:30:33<41:36:15, 15.79s/it]

Total reward after episode 514 is 1036.0


  5%|▌         | 515/10000 [1:30:37<32:05:43, 12.18s/it]

Total reward after episode 515 is 252.0


  5%|▌         | 516/10000 [1:30:47<30:19:22, 11.51s/it]

Total reward after episode 516 is 626.0


  5%|▌         | 517/10000 [1:31:43<65:42:24, 24.94s/it]

Total reward after episode 517 is 991.0


  5%|▌         | 518/10000 [1:32:10<66:38:04, 25.30s/it]

Total reward after episode 518 is 1034.0


  5%|▌         | 519/10000 [1:32:13<49:39:13, 18.85s/it]

Total reward after episode 519 is 248.0


  5%|▌         | 520/10000 [1:32:17<37:46:56, 14.35s/it]

Total reward after episode 520 is 248.0


  5%|▌         | 521/10000 [1:32:21<29:28:12, 11.19s/it]

Total reward after episode 521 is 248.0


  5%|▌         | 522/10000 [1:32:25<23:39:07,  8.98s/it]

Total reward after episode 522 is 248.0


  5%|▌         | 523/10000 [1:32:53<38:34:56, 14.66s/it]

Total reward after episode 523 is 628.0


  5%|▌         | 524/10000 [1:33:14<43:23:48, 16.49s/it]

Total reward after episode 524 is 1350.0


  5%|▌         | 524/10000 [1:33:16<28:06:42, 10.68s/it]


KeyboardInterrupt: ignored