<a href="https://colab.research.google.com/github/FreeOnel/Deep-RL-Pacman/blob/main/PINBALL_Duelling_Double_PER_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb
import wandb
from wandb.keras import WandbCallback
import random
import gym
import cv2
from itertools import count
import numpy as np
import heapq
from collections import deque
import tensorflow as tf
from keras import backend as K
from keras.models import Model
from keras.layers import Dense, Activation, Flatten, Conv2D, Subtract, Add, Input, Lambda
from keras.optimizers import Adam
from tqdm import tqdm
from numba import cuda

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/47/af/4cfe48fe55046181b992251933cff4ceb3bfd71a42838f5fe683683cd925/wandb-0.10.25-py2.py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 6.9MB/s 
[?25hCollecting GitPython>=1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/a6/99/98019716955ba243657daedd1de8f3a88ca1f5b75057c38e959db22fb87b/GitPython-3.1.14-py3-none-any.whl (159kB)
[K     |████████████████████████████████| 163kB 23.8MB/s 
Collecting sentry-sdk>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/f3/92/5a33be64990ba815364a8f2dd9e6f51de60d23dfddafb4f1fc5577d4dc64/sentry_sdk-1.0.0-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 25.8MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl
Collecting configparser>=3.8.1

In [None]:

class Agent:
    """Dueling Double-DQN agent with n-step returns and a prioritized replay heap.

    The replay buffer is a ``heapq`` min-heap of ``(-priority, insertion_id,
    transition)`` tuples, so the entry with the largest priority sits at
    index 0.  ``transition`` is ``(state, action, n_step_reward, next_state,
    Terminal)``.

    Args:
        state_size: Shape of a single observation, e.g. ``(88, 80, 1)``.
        action_size: Number of discrete actions the network scores.
        n_step: Number of consecutive steps folded into one stored return.
        buffer_size: Maximum number of transitions kept in the replay heap.
    """

    def __init__(self, state_size, action_size, n_step=3, buffer_size=2000):
        self.state_size = state_size
        self.action_size = action_size
        self.buffer = []
        self.buffer_size = buffer_size
        self.n_step = n_step
        self.n_step_buffer = deque(maxlen=n_step)
        # Monotonic id used as the heap tie-breaker so that two entries with
        # equal priority never fall through to comparing the transitions.
        self.count = count()

        self.alpha = 0.6  # Fraction of each batch taken greedily by priority
        self.gamma = 0.99  # Discount factor
        self.epsilon = 1.0  # Max Prob for Explore
        self.epsilon_min = 0.1  # Min Prob for Explore
        self.epsilon_decay = 0.995  # Decay Rate for Epsilon
        self.update_rate = 1000  # Freq of Network Update

        self.model = self._build_model()
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())
        self.model.summary()

    def _build_model(self):
        """Build and compile the dueling convolutional Q-network.

        Three convolutional layers feed a scalar value head and an
        ``action_size``-wide advantage head; the outputs are combined as
        ``value + (advantage - mean(advantage))``.

        Returns:
            A compiled Keras ``Model`` (MSE loss, Adam optimizer).
        """
        inputs = Input(shape=(self.state_size))

        x = Conv2D(32, (8, 8), strides=4, padding='same', activation='relu')(inputs)
        x = Conv2D(64, (4, 4), strides=2, padding='same', activation='relu')(x)
        x = Conv2D(64, (3, 3), strides=1, padding='same', activation='relu')(x)
        x = Flatten()(x)

        # Dueling Network
        val = Dense(1, activation='linear')(x)
        advantage = Dense(self.action_size, activation='linear')(x)

        # Using Mean for Advantage
        mean = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(advantage)
        advantage = Subtract()([advantage, mean])
        outputs = Add()([val, advantage])

        model = Model(inputs=inputs, outputs=outputs)
        model.compile(loss='mse', optimizer=Adam())

        return model

    def store(self, state, action, reward, next_state, Terminal, td_error):
        """Fold the newest step into an n-step transition and push it on the heap.

        Nothing is stored until ``n_step`` steps have been collected.  Once
        the window is full, the stored transition starts at the oldest step
        in the window and carries the discounted reward sum; if a terminal
        step occurs inside the window the return is cut there.

        Args:
            state, action, reward, next_state, Terminal: The newest 1-step
                transition.
            td_error: TD error used as the replay priority.  Only its
                magnitude matters.
                NOTE(review): callers compute this for the newest 1-step
                transition, while the stored transition starts ``n_step - 1``
                steps earlier -- confirm this is intended.
        """
        # n-step queue for calculating return of n previous steps
        self.n_step_buffer.append((state, action, reward, next_state, Terminal))

        if len(self.n_step_buffer) < self.n_step:
            return

        l_reward, l_next_state, l_Terminal = self.n_step_buffer[-1][-3:]

        # Walk backwards so each earlier reward discounts everything after it.
        for transition in reversed(list(self.n_step_buffer)[:-1]):
            r, n_s, T = transition[-3:]

            l_reward = r + self.gamma * l_reward * (1 - T)
            l_next_state, l_Terminal = (n_s, T) if T else (l_next_state, l_Terminal)

        l_state, l_action = self.n_step_buffer[0][:2]
        t = (l_state, l_action, l_reward, l_next_state, l_Terminal)

        # Priority is the magnitude of the TD error: a large negative error
        # is just as informative as a large positive one.  float() also keeps
        # framework tensors out of the heap's comparison keys.
        priority = abs(float(td_error))
        heapq.heappush(self.buffer, (-priority, next(self.count), t))

        if len(self.buffer) > self.buffer_size:
            # The last slot of a heap array is not its largest key, so the
            # lowest-priority entry has to be located explicitly.
            worst = max(range(len(self.buffer)), key=lambda i: self.buffer[i][:2])
            del self.buffer[worst]
            heapq.heapify(self.buffer)

    def act(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: A batch containing one observation.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            argmax of the online network's Q-values for ``state``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        act_values = self.model(state, training=False)

        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the online network on one semi-stochastically prioritized batch.

        ``int(batch_size * alpha)`` transitions are taken greedily from the
        top of the priority heap and the rest are drawn uniformly (the two
        parts may overlap).  Targets follow Double DQN: the online network
        picks the next action and the target network scores it.  Epsilon is
        decayed once per call.  Stored priorities are not refreshed here.

        Args:
            batch_size: Number of transitions to train on.  The call is a
                no-op while the buffer holds fewer entries than this.
        """
        if len(self.buffer) < batch_size:
            return

        # Semi Stochastic Prioritization
        prioritization = int(batch_size * self.alpha)
        batch_prioritized = heapq.nsmallest(prioritization, self.buffer)
        batch_uniform = random.sample(self.buffer, batch_size - prioritization)
        batch = [e for (_, _, e) in batch_prioritized + batch_uniform]

        states = np.stack([np.asarray(b[0]).reshape(self.state_size) for b in batch])
        actions = np.array([int(b[1]) for b in batch])
        rewards = np.array([b[2] for b in batch], dtype=np.float32)
        next_states = np.stack([np.asarray(b[3]).reshape(self.state_size) for b in batch])
        terminals = np.array([bool(b[4]) for b in batch], dtype=np.float32)
        rows = np.arange(len(batch))

        # Double DQN, evaluated for the whole batch in one pass per network.
        next_q_online = np.array(self.model(next_states, training=False))
        next_q_target = np.array(self.target_model(next_states, training=False))
        best_next = np.argmax(next_q_online, axis=1)

        # Stored rewards already cover n_step steps, so the bootstrap value
        # must be discounted by gamma ** n_step rather than by gamma.
        discount = self.gamma ** self.n_step
        bootstrap = next_q_target[rows, best_next]
        target_values = rewards + discount * bootstrap * (1.0 - terminals)

        # Only the taken action's output is moved; the other outputs keep the
        # network's current prediction and so contribute zero loss.
        targets = np.array(self.model(states, training=False))
        targets[rows, actions] = target_values

        self.model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def calculate_td_error(self, state, action, reward, next_state, Terminal):
        """Return ``Q(state, action) - target`` for a single 1-step transition.

        The target is ``reward`` for terminal transitions, otherwise
        ``reward + gamma * Q_target(next_state, argmax_a Q_online(next_state, a))``.

        Returns:
            The signed TD error as produced by the network output arithmetic.
        """
        if not Terminal:
            n_s = np.expand_dims(np.asarray(next_state).reshape(self.state_size), axis=0)
            m_a = np.argmax(self.model(n_s, training=False)[0])
            target = (reward + self.gamma * self.target_model(n_s, training=False)[0][m_a])
        else:
            target = reward

        c_s = np.expand_dims(np.asarray(state).reshape(self.state_size), axis=0)
        q_value = self.model(c_s, training=False)[0][action]

        return q_value - target

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def load(self, name):
        """Load online-network weights from ``name`` and sync the target network."""
        self.model.load_weights(name)
        self.target_model.set_weights(self.model.get_weights())

    def save(self, name):
        """Save the online network's weights to ``name``."""
        self.model.save_weights(name)

In [None]:

# Record training with the Weights & Biases (wandb) API.
# `wb` gates wandb.init here; set it to False to skip starting a run.
wb = True
if wb:
    wandb.init(entity='pacman_dqn', project='PERDDQN')

def preprocess(frame):
    """Turn a raw color frame into a normalized ``(88, 80, 1)`` float32 image.

    The frame is cropped to rows 1..175 and downsampled by 2 in both axes,
    converted to grayscale, the pixels that exactly match the Ms. Pacman
    sprite color are blanked to improve contrast, and the result is min-max
    normalized to ``[0, 1]``.

    Args:
        frame: Color frame as a uint8 array.  Assumes gym's RGB channel order
            and a frame large enough to yield an 88x80 crop -- TODO confirm
            for other environments.

    Returns:
        A float32 array of shape ``(88, 80, 1)``.
    """
    # Got some ideas from https://github.com/ageron/tiny-dqn
    mspacman_color = np.array([210, 164, 74])

    img = frame[1:176:2, ::2]

    # Build the sprite mask on the color image: once the frame is grayscale
    # the sprite no longer has a value that can be matched against the
    # channel mean, so comparing there never selects any pixel.
    sprite_mask = np.all(img == mspacman_color, axis=-1)

    # NOTE: frames are treated as RGB here.  Weights trained with the former
    # BGR weighting will see slightly different gray levels.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    img[sprite_mask] = 0
    img = cv2.normalize(img, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
    return img.reshape(88, 80, 1)


# Environment, agent and run-wide settings.
# NOTE(review): the original note on this line said the env skips 4 frames --
# confirm against the gym version in use.
env = gym.make('MsPacman-v4')
state_size = (88, 80, 1)
action_size = 5
agent = Agent(state_size, action_size, n_step=5)  # 5-step return

episodes, batch_size = 20000, 32
total_time, all_rewards = 0, 0
Terminal = False

# Warm-up: play games until the replay buffer holds 2000 transitions.
while len(agent.buffer) < 2000:
    state = preprocess(env.reset())
    # The observation fed to the agent is the mean of the last (up to) 4
    # preprocessed frames rather than a stack of them.
    frame_stack = deque([state], maxlen=4)

    # 90 no-op steps at the start of each game (described in the original
    # note as skipping the first 3 seconds).
    for _noop in range(90):
        env.step(0)

    for step in range(2500):
        state = sum(frame_stack) / len(frame_stack)

        model_input = np.expand_dims(state.reshape(88, 80, 1), axis=0)
        action = agent.act(model_input)
        raw_frame, reward, Terminal, _ = env.step(action)

        frame_stack.append(preprocess(raw_frame))
        next_state = sum(frame_stack) / len(frame_stack)

        td_error = agent.calculate_td_error(state, action, reward, next_state, Terminal)
        agent.store(state, action, reward, next_state, Terminal, td_error)

        state = next_state

        if Terminal:
            break

print("buffer initialized")


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 88, 80, 1)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 22, 20, 32)   2080        input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 11, 10, 64)   32832       conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 11, 10, 64)   36928       conv2d_1[0][0]                   
______________________________________________________________________________________________

In [None]:

# Main training loop: one iteration per game.  Every environment step is
# stored in the replay buffer, the agent trains on a batch every 5 steps, and
# the target network is synced every `agent.update_rate` total steps.
for e in tqdm(range(0, episodes)):
    total_reward = 0
    game_score = 0
    state = preprocess(env.reset())
    # The observation is the mean of the last (up to) 4 preprocessed frames.
    frame_stack = deque(maxlen=4)
    frame_stack.append(state)
    epochs = 0

    # 90 no-op steps at the start of each game.
    for skip in range(90):
        env.step(0)

    for time in range(20000):
        total_time += 1
        epochs += 1

        if total_time % agent.update_rate == 0:
            agent.update_target_model()

        state = sum(frame_stack) / len(frame_stack)

        action = agent.act(np.expand_dims(state.reshape(88, 80, 1), axis=0))
        next_state, reward, Terminal, _ = env.step(action)

        next_state = preprocess(next_state)
        frame_stack.append(next_state)
        next_state = sum(frame_stack) / len(frame_stack)

        td_error = agent.calculate_td_error(state, action, reward, next_state, Terminal)

        agent.store(state, action, reward, next_state, Terminal, td_error)

        state = next_state
        total_reward += reward

        if Terminal:
            all_rewards += total_reward
            if wb:
                wandb.log({'episodes:': e, 'episodic_reward': total_reward})
            print("episode: {}/{}, reward: {}".format(e+1, episodes, total_reward))
            break

        if epochs % 5 == 0:
            agent.replay(batch_size)

    if (e+1) % 500 == 0:
        # An empty path gives the checkpoint no usable file name, so name it
        # after the 1-based episode count.  The ".weights.h5" suffix selects
        # the HDF5 weight format.
        checkpoint_path = "perddqn_mspacman_ep{}.weights.h5".format(e+1)
        agent.save(checkpoint_path)
        # Report the same 1-based episode number as the per-episode log line.
        print("model saved on episode", e+1)

  0%|          | 1/20000 [02:09<720:02:13, 129.61s/it]

episode: 1/20000, reward: 310.0


  0%|          | 2/20000 [04:39<754:28:40, 135.82s/it]

episode: 2/20000, reward: 240.0


  0%|          | 3/20000 [07:22<799:38:39, 143.96s/it]

episode: 3/20000, reward: 550.0


  0%|          | 4/20000 [09:42<791:37:51, 142.52s/it]

episode: 4/20000, reward: 190.0


  0%|          | 5/20000 [12:14<808:01:47, 145.48s/it]

episode: 5/20000, reward: 400.0


  0%|          | 6/20000 [14:29<790:28:00, 142.33s/it]

episode: 6/20000, reward: 360.0


  0%|          | 7/20000 [16:04<711:14:24, 128.07s/it]

episode: 7/20000, reward: 230.0


  0%|          | 8/20000 [18:05<700:06:32, 126.07s/it]

episode: 8/20000, reward: 390.0


  0%|          | 9/20000 [20:03<685:57:49, 123.53s/it]

episode: 9/20000, reward: 170.0


  0%|          | 10/20000 [21:36<634:58:51, 114.35s/it]

episode: 10/20000, reward: 350.0


  0%|          | 11/20000 [22:43<557:09:21, 100.34s/it]

episode: 11/20000, reward: 100.0


  0%|          | 12/20000 [24:43<589:46:47, 106.22s/it]

episode: 12/20000, reward: 390.0


  0%|          | 13/20000 [26:27<585:26:37, 105.45s/it]

episode: 13/20000, reward: 270.0


  0%|          | 14/20000 [29:01<666:04:37, 119.98s/it]

episode: 14/20000, reward: 460.0


  0%|          | 15/20000 [30:40<631:51:38, 113.82s/it]

episode: 15/20000, reward: 330.0


  0%|          | 16/20000 [32:21<610:27:02, 109.97s/it]

episode: 16/20000, reward: 260.0


  0%|          | 17/20000 [34:07<602:41:29, 108.58s/it]

episode: 17/20000, reward: 290.0


  0%|          | 18/20000 [37:40<777:12:47, 140.02s/it]

episode: 18/20000, reward: 1430.0


  0%|          | 19/20000 [38:58<674:05:08, 121.45s/it]

episode: 19/20000, reward: 140.0


  0%|          | 20/20000 [41:21<710:12:05, 127.96s/it]

episode: 20/20000, reward: 990.0


  0%|          | 21/20000 [43:50<744:31:57, 134.16s/it]

episode: 21/20000, reward: 620.0


  0%|          | 22/20000 [45:38<701:09:46, 126.35s/it]

episode: 22/20000, reward: 340.0


  0%|          | 23/20000 [47:28<673:24:06, 121.35s/it]

episode: 23/20000, reward: 330.0


  0%|          | 24/20000 [50:21<760:35:27, 137.07s/it]

episode: 24/20000, reward: 950.0


  0%|          | 25/20000 [54:25<937:38:50, 168.99s/it]

episode: 25/20000, reward: 410.0


  0%|          | 26/20000 [57:22<951:29:27, 171.49s/it]

episode: 26/20000, reward: 480.0


  0%|          | 27/20000 [59:20<861:44:36, 155.32s/it]

episode: 27/20000, reward: 240.0


  0%|          | 28/20000 [1:02:08<883:42:51, 159.29s/it]

episode: 28/20000, reward: 450.0


  0%|          | 29/20000 [1:06:12<1024:06:34, 184.61s/it]

episode: 29/20000, reward: 820.0


  0%|          | 30/20000 [1:08:33<951:22:26, 171.50s/it] 

episode: 30/20000, reward: 260.0


  0%|          | 31/20000 [1:10:45<886:28:44, 159.81s/it]

episode: 31/20000, reward: 290.0


  0%|          | 32/20000 [1:14:12<964:20:31, 173.86s/it]

episode: 32/20000, reward: 580.0


  0%|          | 33/20000 [1:17:11<973:13:36, 175.47s/it]

episode: 33/20000, reward: 580.0


  0%|          | 34/20000 [1:20:20<994:20:05, 179.29s/it]

episode: 34/20000, reward: 1360.0


  0%|          | 35/20000 [1:22:54<953:50:32, 171.99s/it]

episode: 35/20000, reward: 380.0


  0%|          | 36/20000 [1:24:33<831:50:53, 150.00s/it]

episode: 36/20000, reward: 230.0


  0%|          | 37/20000 [1:26:41<794:44:01, 143.32s/it]

episode: 37/20000, reward: 350.0


  0%|          | 38/20000 [1:28:38<751:50:08, 135.59s/it]

episode: 38/20000, reward: 480.0


  0%|          | 39/20000 [1:30:34<717:38:55, 129.43s/it]

episode: 39/20000, reward: 430.0


  0%|          | 40/20000 [1:33:47<824:23:46, 148.69s/it]

episode: 40/20000, reward: 770.0


  0%|          | 41/20000 [1:35:07<710:40:44, 128.19s/it]

episode: 41/20000, reward: 100.0


  0%|          | 42/20000 [1:37:37<746:38:24, 134.68s/it]

episode: 42/20000, reward: 290.0


  0%|          | 43/20000 [1:41:09<874:57:42, 157.83s/it]

episode: 43/20000, reward: 1430.0


  0%|          | 44/20000 [1:44:40<962:21:46, 173.61s/it]

episode: 44/20000, reward: 1070.0


  0%|          | 45/20000 [1:46:30<856:36:38, 154.54s/it]

episode: 45/20000, reward: 280.0


  0%|          | 46/20000 [1:50:10<966:51:02, 174.43s/it]

episode: 46/20000, reward: 710.0


  0%|          | 47/20000 [1:53:02<961:58:34, 173.56s/it]

episode: 47/20000, reward: 540.0


  0%|          | 48/20000 [1:55:19<901:10:49, 162.60s/it]

episode: 48/20000, reward: 590.0


  0%|          | 49/20000 [1:57:38<861:29:17, 155.45s/it]

episode: 49/20000, reward: 400.0


  0%|          | 50/20000 [1:59:43<811:31:29, 146.44s/it]

episode: 50/20000, reward: 450.0


  0%|          | 51/20000 [2:03:43<966:11:18, 174.36s/it]

episode: 51/20000, reward: 740.0


  0%|          | 52/20000 [2:07:21<1039:14:59, 187.55s/it]

episode: 52/20000, reward: 770.0


  0%|          | 53/20000 [2:10:12<1011:08:36, 182.49s/it]

episode: 53/20000, reward: 650.0


  0%|          | 54/20000 [2:12:27<931:54:16, 168.20s/it] 

episode: 54/20000, reward: 970.0


  0%|          | 55/20000 [2:14:21<843:00:49, 152.16s/it]

episode: 55/20000, reward: 460.0


  0%|          | 56/20000 [2:17:56<946:52:21, 170.92s/it]

episode: 56/20000, reward: 800.0


  0%|          | 57/20000 [2:19:31<821:24:12, 148.28s/it]

episode: 57/20000, reward: 220.0


  0%|          | 58/20000 [2:20:38<685:40:14, 123.78s/it]

episode: 58/20000, reward: 110.0


  0%|          | 59/20000 [2:21:47<594:30:06, 107.33s/it]

episode: 59/20000, reward: 130.0


  0%|          | 60/20000 [2:24:27<681:12:45, 122.99s/it]

episode: 60/20000, reward: 590.0


  0%|          | 61/20000 [2:26:23<670:56:44, 121.14s/it]

episode: 61/20000, reward: 570.0


  0%|          | 62/20000 [2:29:38<792:41:07, 143.13s/it]

episode: 62/20000, reward: 1790.0


  0%|          | 63/20000 [2:33:25<932:11:54, 168.33s/it]

episode: 63/20000, reward: 1540.0


  0%|          | 64/20000 [2:35:06<820:25:27, 148.15s/it]

episode: 64/20000, reward: 300.0


  0%|          | 65/20000 [2:37:47<841:50:38, 152.03s/it]

episode: 65/20000, reward: 850.0


  0%|          | 66/20000 [2:39:48<789:51:13, 142.64s/it]

episode: 66/20000, reward: 480.0


  0%|          | 67/20000 [2:42:08<785:42:22, 141.90s/it]

episode: 67/20000, reward: 380.0


  0%|          | 68/20000 [2:45:56<928:12:12, 167.65s/it]

episode: 68/20000, reward: 430.0


  0%|          | 69/20000 [2:47:45<831:56:21, 150.27s/it]

episode: 69/20000, reward: 170.0


  0%|          | 70/20000 [2:49:40<773:22:44, 139.70s/it]

episode: 70/20000, reward: 300.0


  0%|          | 71/20000 [2:52:04<780:16:59, 140.95s/it]

episode: 71/20000, reward: 250.0


  0%|          | 72/20000 [2:53:26<681:36:51, 123.13s/it]

episode: 72/20000, reward: 180.0


  0%|          | 73/20000 [2:55:18<663:54:52, 119.94s/it]

episode: 73/20000, reward: 130.0


  0%|          | 74/20000 [2:58:11<752:04:56, 135.88s/it]

episode: 74/20000, reward: 670.0


  0%|          | 75/20000 [3:00:19<738:17:00, 133.39s/it]

episode: 75/20000, reward: 360.0


  0%|          | 76/20000 [3:02:20<717:26:14, 129.63s/it]

episode: 76/20000, reward: 300.0


  0%|          | 77/20000 [3:05:05<775:38:30, 140.16s/it]

episode: 77/20000, reward: 520.0


  0%|          | 78/20000 [3:06:18<664:22:25, 120.06s/it]

episode: 78/20000, reward: 260.0


  0%|          | 79/20000 [3:09:04<740:24:55, 133.80s/it]

episode: 79/20000, reward: 480.0


  0%|          | 80/20000 [3:10:47<689:46:50, 124.66s/it]

episode: 80/20000, reward: 300.0


  0%|          | 81/20000 [3:12:11<623:03:48, 112.61s/it]

episode: 81/20000, reward: 340.0


  0%|          | 82/20000 [3:14:22<653:36:04, 118.13s/it]

episode: 82/20000, reward: 550.0


  0%|          | 83/20000 [3:16:23<657:42:34, 118.88s/it]

episode: 83/20000, reward: 200.0


  0%|          | 84/20000 [3:18:01<622:06:33, 112.45s/it]

episode: 84/20000, reward: 330.0


  0%|          | 85/20000 [3:20:56<725:54:45, 131.22s/it]

episode: 85/20000, reward: 1040.0


  0%|          | 86/20000 [3:23:56<806:43:03, 145.84s/it]

episode: 86/20000, reward: 470.0


  0%|          | 87/20000 [3:26:02<774:15:24, 139.98s/it]

episode: 87/20000, reward: 380.0


  0%|          | 88/20000 [3:29:41<905:53:27, 163.78s/it]

episode: 88/20000, reward: 430.0


  0%|          | 89/20000 [3:31:26<807:17:19, 145.96s/it]

episode: 89/20000, reward: 270.0


  0%|          | 90/20000 [3:34:36<881:53:12, 159.46s/it]

episode: 90/20000, reward: 460.0


  0%|          | 91/20000 [3:36:28<801:37:18, 144.95s/it]

episode: 91/20000, reward: 300.0


  0%|          | 92/20000 [3:39:50<896:59:15, 162.20s/it]

episode: 92/20000, reward: 1940.0


  0%|          | 93/20000 [3:41:35<801:49:14, 145.00s/it]

episode: 93/20000, reward: 180.0


  0%|          | 94/20000 [3:44:51<886:40:16, 160.35s/it]

episode: 94/20000, reward: 860.0


  0%|          | 95/20000 [3:46:46<810:32:14, 146.59s/it]

episode: 95/20000, reward: 330.0


  0%|          | 96/20000 [3:49:48<869:40:50, 157.30s/it]

episode: 96/20000, reward: 470.0


  0%|          | 97/20000 [3:51:25<770:01:28, 139.28s/it]

episode: 97/20000, reward: 360.0


  0%|          | 98/20000 [3:53:26<739:35:13, 133.78s/it]

episode: 98/20000, reward: 230.0


  0%|          | 99/20000 [3:56:06<782:33:53, 141.56s/it]

episode: 99/20000, reward: 540.0


  0%|          | 100/20000 [3:58:03<741:32:07, 134.15s/it]

episode: 100/20000, reward: 280.0


  1%|          | 101/20000 [4:01:16<839:34:11, 151.89s/it]

episode: 101/20000, reward: 330.0


  1%|          | 102/20000 [4:03:25<802:05:52, 145.12s/it]

episode: 102/20000, reward: 370.0


  1%|          | 103/20000 [4:06:18<848:15:02, 153.48s/it]

episode: 103/20000, reward: 500.0


  1%|          | 104/20000 [4:07:55<754:26:22, 136.51s/it]

episode: 104/20000, reward: 180.0


  1%|          | 105/20000 [4:10:33<790:25:59, 143.03s/it]

episode: 105/20000, reward: 330.0


  1%|          | 106/20000 [4:13:22<833:28:16, 150.82s/it]

episode: 106/20000, reward: 350.0


  1%|          | 107/20000 [4:15:48<825:08:34, 149.32s/it]

episode: 107/20000, reward: 330.0


  1%|          | 108/20000 [4:18:01<797:06:19, 144.26s/it]

episode: 108/20000, reward: 410.0


  1%|          | 109/20000 [4:20:32<809:35:50, 146.53s/it]

episode: 109/20000, reward: 370.0


  1%|          | 110/20000 [4:23:04<818:17:34, 148.11s/it]

episode: 110/20000, reward: 750.0


  1%|          | 111/20000 [4:24:46<741:48:41, 134.27s/it]

episode: 111/20000, reward: 140.0


  1%|          | 112/20000 [4:27:16<768:05:36, 139.04s/it]

episode: 112/20000, reward: 370.0


  1%|          | 113/20000 [4:29:41<777:26:29, 140.73s/it]

episode: 113/20000, reward: 240.0


  1%|          | 114/20000 [4:31:49<756:44:59, 137.00s/it]

episode: 114/20000, reward: 440.0


  1%|          | 115/20000 [4:34:13<767:22:35, 138.93s/it]

episode: 115/20000, reward: 310.0


  1%|          | 116/20000 [4:36:33<769:34:54, 139.33s/it]

episode: 116/20000, reward: 340.0


  1%|          | 117/20000 [4:39:20<815:02:08, 147.57s/it]

episode: 117/20000, reward: 500.0


  1%|          | 118/20000 [4:41:06<747:03:56, 135.27s/it]

episode: 118/20000, reward: 210.0


  1%|          | 119/20000 [4:44:02<814:10:25, 147.43s/it]

episode: 119/20000, reward: 560.0


  1%|          | 120/20000 [4:46:50<848:23:39, 153.63s/it]

episode: 120/20000, reward: 900.0


  1%|          | 121/20000 [4:49:05<817:29:18, 148.04s/it]

episode: 121/20000, reward: 880.0


  1%|          | 122/20000 [4:51:40<828:42:25, 150.08s/it]

episode: 122/20000, reward: 600.0


  1%|          | 123/20000 [4:54:19<843:28:27, 152.76s/it]

episode: 123/20000, reward: 1730.0


  1%|          | 124/20000 [4:56:36<816:42:17, 147.92s/it]

episode: 124/20000, reward: 420.0


  1%|          | 125/20000 [4:58:45<785:35:53, 142.30s/it]

episode: 125/20000, reward: 470.0


  1%|          | 126/20000 [5:00:54<763:41:56, 138.34s/it]

episode: 126/20000, reward: 290.0


  1%|          | 127/20000 [5:02:19<674:41:59, 122.22s/it]

episode: 127/20000, reward: 140.0


  1%|          | 128/20000 [5:04:00<640:40:40, 116.06s/it]

episode: 128/20000, reward: 110.0


  1%|          | 129/20000 [5:05:46<623:01:11, 112.87s/it]

episode: 129/20000, reward: 360.0


  1%|          | 130/20000 [5:08:06<668:48:50, 121.17s/it]

episode: 130/20000, reward: 370.0


  1%|          | 131/20000 [5:09:37<617:33:17, 111.89s/it]

episode: 131/20000, reward: 340.0


  1%|          | 132/20000 [5:11:30<620:04:12, 112.35s/it]

episode: 132/20000, reward: 310.0


  1%|          | 133/20000 [5:13:40<648:39:52, 117.54s/it]

episode: 133/20000, reward: 190.0


  1%|          | 134/20000 [5:16:34<742:03:33, 134.47s/it]

episode: 134/20000, reward: 860.0


  1%|          | 135/20000 [5:18:39<727:44:24, 131.88s/it]

episode: 135/20000, reward: 350.0


  1%|          | 136/20000 [5:20:45<717:32:55, 130.04s/it]

episode: 136/20000, reward: 180.0


  1%|          | 137/20000 [5:23:51<809:00:47, 146.63s/it]

episode: 137/20000, reward: 380.0


  1%|          | 138/20000 [5:25:20<713:48:24, 129.38s/it]

episode: 138/20000, reward: 90.0


  1%|          | 139/20000 [5:26:56<658:41:43, 119.39s/it]

episode: 139/20000, reward: 200.0


  1%|          | 140/20000 [5:29:01<668:51:58, 121.24s/it]

episode: 140/20000, reward: 370.0


  1%|          | 141/20000 [5:31:10<681:39:56, 123.57s/it]

episode: 141/20000, reward: 330.0


  1%|          | 142/20000 [5:33:28<705:35:56, 127.92s/it]

episode: 142/20000, reward: 310.0


  1%|          | 143/20000 [5:35:18<675:15:15, 122.42s/it]

episode: 143/20000, reward: 390.0


  1%|          | 144/20000 [5:36:44<614:08:57, 111.35s/it]

episode: 144/20000, reward: 240.0


  1%|          | 145/20000 [5:39:08<669:01:58, 121.31s/it]

episode: 145/20000, reward: 270.0


  1%|          | 146/20000 [5:40:45<627:57:02, 113.86s/it]

episode: 146/20000, reward: 240.0


  1%|          | 147/20000 [5:43:26<706:54:51, 128.19s/it]

episode: 147/20000, reward: 560.0


  1%|          | 148/20000 [5:44:59<647:59:47, 117.51s/it]

episode: 148/20000, reward: 280.0


  1%|          | 149/20000 [5:46:36<613:43:31, 111.30s/it]

episode: 149/20000, reward: 260.0


  1%|          | 150/20000 [5:48:11<587:49:12, 106.61s/it]

episode: 150/20000, reward: 240.0


  1%|          | 151/20000 [5:49:44<565:35:26, 102.58s/it]

episode: 151/20000, reward: 340.0


  1%|          | 152/20000 [5:51:26<563:25:32, 102.19s/it]

episode: 152/20000, reward: 280.0


  1%|          | 153/20000 [5:54:17<678:17:26, 123.03s/it]

episode: 153/20000, reward: 250.0


  1%|          | 154/20000 [5:56:39<709:46:30, 128.75s/it]

episode: 154/20000, reward: 460.0


  1%|          | 155/20000 [5:58:32<682:20:29, 123.78s/it]

episode: 155/20000, reward: 310.0


  1%|          | 156/20000 [6:00:17<651:59:38, 118.28s/it]

episode: 156/20000, reward: 300.0


  1%|          | 157/20000 [6:02:39<690:22:10, 125.25s/it]

episode: 157/20000, reward: 590.0


  1%|          | 158/20000 [6:04:40<683:23:49, 123.99s/it]

episode: 158/20000, reward: 240.0


  1%|          | 159/20000 [6:06:06<620:22:48, 112.56s/it]

episode: 159/20000, reward: 280.0


  1%|          | 160/20000 [6:08:20<656:26:47, 119.11s/it]

episode: 160/20000, reward: 260.0


  1%|          | 161/20000 [6:10:24<665:07:23, 120.69s/it]

episode: 161/20000, reward: 300.0


  1%|          | 162/20000 [6:12:03<629:05:17, 114.16s/it]

episode: 162/20000, reward: 390.0


  1%|          | 163/20000 [6:13:57<628:17:03, 114.02s/it]

episode: 163/20000, reward: 320.0


  1%|          | 164/20000 [6:15:44<616:53:19, 111.96s/it]

episode: 164/20000, reward: 200.0


  1%|          | 165/20000 [6:17:22<592:55:10, 107.61s/it]

episode: 165/20000, reward: 200.0


  1%|          | 166/20000 [6:19:22<613:26:38, 111.34s/it]

episode: 166/20000, reward: 210.0


  1%|          | 167/20000 [6:22:11<708:34:32, 128.62s/it]

episode: 167/20000, reward: 1020.0


  1%|          | 168/20000 [6:25:21<810:12:12, 147.07s/it]

episode: 168/20000, reward: 960.0


  1%|          | 169/20000 [6:27:17<758:45:05, 137.74s/it]

episode: 169/20000, reward: 360.0


  1%|          | 170/20000 [6:29:15<726:29:09, 131.89s/it]

episode: 170/20000, reward: 570.0


  1%|          | 171/20000 [6:30:40<649:52:12, 117.99s/it]

episode: 171/20000, reward: 130.0


  1%|          | 172/20000 [6:32:45<660:40:26, 119.95s/it]

episode: 172/20000, reward: 260.0


  1%|          | 173/20000 [6:34:38<649:03:32, 117.85s/it]

episode: 173/20000, reward: 410.0


  1%|          | 174/20000 [6:36:40<655:46:36, 119.08s/it]

episode: 174/20000, reward: 270.0


  1%|          | 175/20000 [6:38:26<634:35:12, 115.23s/it]

episode: 175/20000, reward: 390.0


  1%|          | 176/20000 [6:41:33<753:28:31, 136.83s/it]

episode: 176/20000, reward: 1230.0


  1%|          | 177/20000 [6:43:53<757:32:08, 137.57s/it]

episode: 177/20000, reward: 540.0


  1%|          | 178/20000 [6:46:57<835:13:21, 151.69s/it]

episode: 178/20000, reward: 860.0


  1%|          | 179/20000 [6:50:12<905:48:55, 164.52s/it]

episode: 179/20000, reward: 1540.0


  1%|          | 180/20000 [6:52:38<875:32:20, 159.03s/it]

episode: 180/20000, reward: 1060.0


  1%|          | 181/20000 [6:54:29<796:27:21, 144.67s/it]

episode: 181/20000, reward: 270.0


  1%|          | 182/20000 [6:57:47<883:57:45, 160.57s/it]

episode: 182/20000, reward: 1140.0


  1%|          | 183/20000 [6:59:33<794:56:28, 144.41s/it]

episode: 183/20000, reward: 280.0


  1%|          | 184/20000 [7:01:27<744:12:12, 135.20s/it]

episode: 184/20000, reward: 460.0


  1%|          | 185/20000 [7:02:43<645:17:42, 117.24s/it]

episode: 185/20000, reward: 290.0


  1%|          | 186/20000 [7:06:05<786:38:35, 142.92s/it]

episode: 186/20000, reward: 1050.0


  1%|          | 187/20000 [7:07:58<737:14:25, 133.96s/it]

episode: 187/20000, reward: 380.0


  1%|          | 188/20000 [7:10:58<812:32:25, 147.65s/it]

episode: 188/20000, reward: 780.0


  1%|          | 189/20000 [7:12:12<691:40:25, 125.69s/it]

episode: 189/20000, reward: 230.0


  1%|          | 190/20000 [7:14:16<688:47:22, 125.17s/it]

episode: 190/20000, reward: 240.0


  1%|          | 191/20000 [7:16:57<747:26:29, 135.84s/it]

episode: 191/20000, reward: 820.0


  1%|          | 192/20000 [7:20:05<833:16:59, 151.44s/it]

episode: 192/20000, reward: 2110.0


  1%|          | 193/20000 [7:21:30<723:03:00, 131.42s/it]

episode: 193/20000, reward: 170.0


  1%|          | 194/20000 [7:23:26<697:56:01, 126.86s/it]

episode: 194/20000, reward: 320.0


  1%|          | 195/20000 [7:24:55<634:48:26, 115.39s/it]

episode: 195/20000, reward: 240.0


  1%|          | 196/20000 [7:26:18<582:24:26, 105.87s/it]

episode: 196/20000, reward: 150.0


  1%|          | 197/20000 [7:28:05<584:25:44, 106.24s/it]

episode: 197/20000, reward: 260.0


  1%|          | 198/20000 [7:29:27<543:47:40, 98.86s/it] 

episode: 198/20000, reward: 260.0


  1%|          | 199/20000 [7:32:05<641:17:57, 116.59s/it]

episode: 199/20000, reward: 620.0


  1%|          | 200/20000 [7:36:18<865:43:17, 157.40s/it]

episode: 200/20000, reward: 730.0


  1%|          | 201/20000 [7:38:04<782:16:41, 142.24s/it]

episode: 201/20000, reward: 330.0


  1%|          | 202/20000 [7:40:07<749:44:36, 136.33s/it]

episode: 202/20000, reward: 410.0


  1%|          | 203/20000 [7:41:14<635:04:36, 115.49s/it]

episode: 203/20000, reward: 180.0


  1%|          | 204/20000 [7:42:38<583:03:52, 106.03s/it]

episode: 204/20000, reward: 220.0


  1%|          | 205/20000 [7:46:21<776:19:30, 141.19s/it]

episode: 205/20000, reward: 1470.0


  1%|          | 206/20000 [7:48:28<753:04:24, 136.96s/it]

episode: 206/20000, reward: 500.0


  1%|          | 207/20000 [7:49:52<664:50:20, 120.92s/it]

episode: 207/20000, reward: 290.0


  1%|          | 208/20000 [7:51:37<639:19:32, 116.29s/it]

episode: 208/20000, reward: 390.0


  1%|          | 209/20000 [7:53:53<671:01:00, 122.06s/it]

episode: 209/20000, reward: 380.0


  1%|          | 210/20000 [7:57:08<792:08:56, 144.10s/it]

episode: 210/20000, reward: 460.0


  1%|          | 211/20000 [7:59:09<753:24:06, 137.06s/it]

episode: 211/20000, reward: 450.0


  1%|          | 212/20000 [8:00:52<697:43:55, 126.94s/it]

episode: 212/20000, reward: 160.0


  1%|          | 213/20000 [8:03:56<791:45:49, 144.05s/it]

episode: 213/20000, reward: 470.0


  1%|          | 214/20000 [8:06:47<835:21:31, 151.99s/it]

episode: 214/20000, reward: 410.0


  1%|          | 215/20000 [8:08:51<789:26:28, 143.64s/it]

episode: 215/20000, reward: 470.0


  1%|          | 216/20000 [8:10:31<717:57:20, 130.64s/it]

episode: 216/20000, reward: 170.0


  1%|          | 217/20000 [8:14:13<868:13:12, 157.99s/it]

episode: 217/20000, reward: 520.0


  1%|          | 218/20000 [8:15:25<726:31:55, 132.22s/it]

episode: 218/20000, reward: 290.0


  1%|          | 219/20000 [8:16:43<637:58:31, 116.11s/it]

episode: 219/20000, reward: 140.0


  1%|          | 220/20000 [8:18:02<575:51:46, 104.81s/it]

episode: 220/20000, reward: 270.0


  1%|          | 221/20000 [8:20:06<607:18:59, 110.54s/it]

episode: 221/20000, reward: 300.0


  1%|          | 222/20000 [8:21:46<591:03:18, 107.58s/it]

episode: 222/20000, reward: 220.0


  1%|          | 223/20000 [8:23:25<576:37:36, 104.96s/it]

episode: 223/20000, reward: 210.0


  1%|          | 224/20000 [8:25:43<630:42:47, 114.81s/it]

episode: 224/20000, reward: 320.0


  1%|          | 225/20000 [8:27:30<616:53:46, 112.30s/it]

episode: 225/20000, reward: 400.0


  1%|          | 226/20000 [8:28:52<568:10:58, 103.44s/it]

episode: 226/20000, reward: 130.0


  1%|          | 227/20000 [8:31:30<657:18:30, 119.67s/it]

episode: 227/20000, reward: 780.0


  1%|          | 228/20000 [8:33:17<636:37:19, 115.91s/it]

episode: 228/20000, reward: 320.0


  1%|          | 229/20000 [8:34:46<591:37:56, 107.73s/it]

episode: 229/20000, reward: 390.0


  1%|          | 230/20000 [8:37:02<638:30:28, 116.27s/it]

episode: 230/20000, reward: 280.0


  1%|          | 231/20000 [8:38:28<588:56:39, 107.25s/it]

episode: 231/20000, reward: 130.0


  1%|          | 232/20000 [8:41:14<686:04:56, 124.94s/it]

episode: 232/20000, reward: 820.0


  1%|          | 233/20000 [8:42:22<592:04:46, 107.83s/it]

episode: 233/20000, reward: 190.0


  1%|          | 234/20000 [8:44:07<586:52:57, 106.89s/it]

episode: 234/20000, reward: 310.0


  1%|          | 235/20000 [8:46:40<663:46:02, 120.90s/it]

episode: 235/20000, reward: 420.0


  1%|          | 236/20000 [8:48:31<646:24:37, 117.74s/it]

episode: 236/20000, reward: 310.0


  1%|          | 237/20000 [8:49:56<592:49:41, 107.99s/it]

episode: 237/20000, reward: 380.0


  1%|          | 238/20000 [8:52:49<700:21:25, 127.58s/it]

episode: 238/20000, reward: 760.0


  1%|          | 239/20000 [8:54:28<652:05:13, 118.80s/it]

episode: 239/20000, reward: 240.0


  1%|          | 240/20000 [8:56:34<664:29:50, 121.06s/it]

episode: 240/20000, reward: 340.0


  1%|          | 241/20000 [8:57:58<603:01:39, 109.87s/it]

episode: 241/20000, reward: 340.0


  1%|          | 242/20000 [8:59:41<591:56:11, 107.85s/it]

episode: 242/20000, reward: 350.0


  1%|          | 243/20000 [9:01:28<590:51:34, 107.66s/it]

episode: 243/20000, reward: 290.0


  1%|          | 244/20000 [9:03:27<608:52:54, 110.95s/it]

episode: 244/20000, reward: 370.0


  1%|          | 245/20000 [9:04:55<571:17:49, 104.11s/it]

episode: 245/20000, reward: 190.0


  1%|          | 246/20000 [9:06:45<581:50:43, 106.04s/it]

episode: 246/20000, reward: 230.0


  1%|          | 247/20000 [9:08:41<597:37:53, 108.92s/it]

episode: 247/20000, reward: 300.0


  1%|          | 248/20000 [9:10:01<550:19:34, 100.30s/it]

episode: 248/20000, reward: 170.0


  1%|          | 249/20000 [9:11:42<550:22:47, 100.32s/it]

episode: 249/20000, reward: 260.0


  1%|▏         | 250/20000 [9:12:48<494:41:19, 90.17s/it] 

episode: 250/20000, reward: 160.0


  1%|▏         | 251/20000 [9:15:55<654:27:44, 119.30s/it]

episode: 251/20000, reward: 390.0


  1%|▏         | 252/20000 [9:19:30<810:50:53, 147.82s/it]

episode: 252/20000, reward: 560.0


  1%|▏         | 253/20000 [9:22:05<822:23:24, 149.93s/it]

episode: 253/20000, reward: 560.0


  1%|▏         | 254/20000 [9:25:11<881:38:24, 160.74s/it]

episode: 254/20000, reward: 1790.0


  1%|▏         | 255/20000 [9:27:11<814:54:02, 148.58s/it]

episode: 255/20000, reward: 510.0


  1%|▏         | 256/20000 [9:29:08<762:54:26, 139.10s/it]

episode: 256/20000, reward: 240.0


  1%|▏         | 257/20000 [9:31:07<730:52:30, 133.27s/it]

episode: 257/20000, reward: 380.0


  1%|▏         | 258/20000 [9:33:57<791:16:51, 144.29s/it]

episode: 258/20000, reward: 530.0


  1%|▏         | 259/20000 [9:36:43<825:50:06, 150.60s/it]

episode: 259/20000, reward: 460.0


  1%|▏         | 260/20000 [9:40:24<942:21:46, 171.86s/it]

episode: 260/20000, reward: 730.0


  1%|▏         | 261/20000 [9:42:41<885:26:45, 161.49s/it]

episode: 261/20000, reward: 620.0


  1%|▏         | 262/20000 [9:44:43<819:34:59, 149.48s/it]

episode: 262/20000, reward: 420.0


  1%|▏         | 263/20000 [9:46:02<703:13:48, 128.27s/it]

episode: 263/20000, reward: 130.0


  1%|▏         | 264/20000 [9:48:05<694:20:18, 126.65s/it]

episode: 264/20000, reward: 340.0


  1%|▏         | 265/20000 [9:49:26<619:11:20, 112.95s/it]

episode: 265/20000, reward: 290.0


  1%|▏         | 266/20000 [9:50:55<579:40:19, 105.75s/it]

episode: 266/20000, reward: 230.0


  1%|▏         | 267/20000 [9:51:57<507:44:31, 92.63s/it] 

episode: 267/20000, reward: 150.0


  1%|▏         | 268/20000 [9:53:50<542:34:23, 98.99s/it]

episode: 268/20000, reward: 540.0


  1%|▏         | 269/20000 [9:57:09<705:48:55, 128.78s/it]

episode: 269/20000, reward: 1420.0


  1%|▏         | 270/20000 [9:58:55<668:42:37, 122.02s/it]

episode: 270/20000, reward: 370.0


  1%|▏         | 271/20000 [10:01:31<725:21:25, 132.36s/it]

episode: 271/20000, reward: 540.0


  1%|▏         | 272/20000 [10:03:08<666:15:25, 121.58s/it]

episode: 272/20000, reward: 330.0


  1%|▏         | 273/20000 [10:04:45<625:36:51, 114.17s/it]

episode: 273/20000, reward: 250.0


  1%|▏         | 274/20000 [10:06:33<616:20:37, 112.48s/it]

episode: 274/20000, reward: 500.0


  1%|▏         | 275/20000 [10:08:09<589:03:06, 107.51s/it]

episode: 275/20000, reward: 390.0


  1%|▏         | 276/20000 [10:11:23<730:09:47, 133.27s/it]

episode: 276/20000, reward: 580.0


  1%|▏         | 277/20000 [10:13:01<672:35:01, 122.77s/it]

episode: 277/20000, reward: 300.0


  1%|▏         | 278/20000 [10:15:39<730:54:30, 133.42s/it]

episode: 278/20000, reward: 1030.0


  1%|▏         | 279/20000 [10:17:17<672:28:50, 122.76s/it]

episode: 279/20000, reward: 290.0


  1%|▏         | 280/20000 [10:19:51<724:09:13, 132.20s/it]

episode: 280/20000, reward: 1200.0


  1%|▏         | 281/20000 [10:21:14<643:46:35, 117.53s/it]

episode: 281/20000, reward: 210.0


  1%|▏         | 282/20000 [10:22:37<585:27:47, 106.89s/it]

episode: 282/20000, reward: 200.0


  1%|▏         | 283/20000 [10:23:47<525:07:29, 95.88s/it] 

episode: 283/20000, reward: 120.0


  1%|▏         | 284/20000 [10:25:45<561:43:38, 102.57s/it]

episode: 284/20000, reward: 350.0


  1%|▏         | 285/20000 [10:27:05<524:37:15, 95.80s/it] 

episode: 285/20000, reward: 170.0


  1%|▏         | 286/20000 [10:28:48<536:06:05, 97.90s/it]

episode: 286/20000, reward: 160.0


  1%|▏         | 287/20000 [10:30:17<522:17:11, 95.38s/it]

episode: 287/20000, reward: 210.0


  1%|▏         | 288/20000 [10:32:12<554:16:10, 101.23s/it]

episode: 288/20000, reward: 280.0


  1%|▏         | 289/20000 [10:34:11<582:50:00, 106.45s/it]

episode: 289/20000, reward: 270.0


  1%|▏         | 290/20000 [10:36:00<586:50:25, 107.19s/it]

episode: 290/20000, reward: 230.0


  1%|▏         | 291/20000 [10:37:47<586:19:09, 107.10s/it]

episode: 291/20000, reward: 80.0


  1%|▏         | 292/20000 [10:39:35<588:42:59, 107.54s/it]

episode: 292/20000, reward: 180.0


  1%|▏         | 293/20000 [10:40:39<516:27:46, 94.35s/it] 

episode: 293/20000, reward: 120.0


  1%|▏         | 294/20000 [10:43:24<632:17:53, 115.51s/it]

episode: 294/20000, reward: 320.0


  1%|▏         | 295/20000 [10:44:46<578:36:21, 105.71s/it]

episode: 295/20000, reward: 150.0


  1%|▏         | 296/20000 [10:46:39<590:02:48, 107.80s/it]

episode: 296/20000, reward: 250.0


  1%|▏         | 297/20000 [10:49:59<741:08:59, 135.42s/it]

episode: 297/20000, reward: 680.0


  1%|▏         | 298/20000 [10:52:39<781:22:11, 142.77s/it]

episode: 298/20000, reward: 260.0


  1%|▏         | 299/20000 [10:54:11<698:31:48, 127.64s/it]

episode: 299/20000, reward: 130.0


  2%|▏         | 300/20000 [10:56:12<687:40:31, 125.67s/it]

episode: 300/20000, reward: 360.0


  2%|▏         | 301/20000 [10:57:56<652:28:44, 119.24s/it]

episode: 301/20000, reward: 210.0


  2%|▏         | 302/20000 [11:00:41<726:47:16, 132.83s/it]

episode: 302/20000, reward: 500.0


  2%|▏         | 303/20000 [11:03:01<738:36:10, 134.99s/it]

episode: 303/20000, reward: 340.0


  2%|▏         | 304/20000 [11:05:08<724:54:02, 132.50s/it]

episode: 304/20000, reward: 470.0


  2%|▏         | 305/20000 [11:06:37<653:50:03, 119.51s/it]

episode: 305/20000, reward: 150.0


  2%|▏         | 306/20000 [11:07:49<575:05:08, 105.12s/it]

episode: 306/20000, reward: 170.0


  2%|▏         | 307/20000 [11:09:05<528:43:59, 96.66s/it] 

episode: 307/20000, reward: 180.0


  2%|▏         | 308/20000 [11:10:41<526:34:33, 96.27s/it]

episode: 308/20000, reward: 190.0


  2%|▏         | 309/20000 [11:11:57<494:17:18, 90.37s/it]

episode: 309/20000, reward: 170.0


  2%|▏         | 310/20000 [11:13:31<499:21:02, 91.30s/it]

episode: 310/20000, reward: 180.0


  2%|▏         | 311/20000 [11:16:16<620:27:09, 113.45s/it]

episode: 311/20000, reward: 930.0


  2%|▏         | 312/20000 [11:18:01<605:52:03, 110.78s/it]

episode: 312/20000, reward: 360.0


  2%|▏         | 313/20000 [11:21:03<723:30:10, 132.30s/it]

episode: 313/20000, reward: 1300.0


  2%|▏         | 314/20000 [11:22:48<678:23:15, 124.06s/it]

episode: 314/20000, reward: 360.0


  2%|▏         | 315/20000 [11:24:11<611:31:22, 111.84s/it]

episode: 315/20000, reward: 160.0


  2%|▏         | 316/20000 [11:25:41<575:57:31, 105.34s/it]

episode: 316/20000, reward: 200.0


  2%|▏         | 317/20000 [11:28:31<681:10:41, 124.59s/it]

episode: 317/20000, reward: 1100.0


  2%|▏         | 318/20000 [11:30:20<655:29:40, 119.90s/it]

episode: 318/20000, reward: 340.0


  2%|▏         | 319/20000 [11:31:47<601:54:40, 110.10s/it]

episode: 319/20000, reward: 140.0


  2%|▏         | 320/20000 [11:34:19<669:52:27, 122.54s/it]

episode: 320/20000, reward: 510.0


  2%|▏         | 321/20000 [11:38:10<848:34:49, 155.24s/it]

episode: 321/20000, reward: 360.0


  2%|▏         | 322/20000 [11:40:43<845:03:29, 154.60s/it]

episode: 322/20000, reward: 410.0


  2%|▏         | 323/20000 [11:43:03<820:36:07, 150.13s/it]

episode: 323/20000, reward: 430.0


  2%|▏         | 324/20000 [11:45:14<789:17:29, 144.41s/it]

episode: 324/20000, reward: 440.0


  2%|▏         | 325/20000 [11:48:34<880:12:26, 161.05s/it]

episode: 325/20000, reward: 1250.0


  2%|▏         | 326/20000 [11:50:35<814:48:50, 149.10s/it]

episode: 326/20000, reward: 250.0


  2%|▏         | 327/20000 [11:53:47<885:20:06, 162.01s/it]

episode: 327/20000, reward: 1980.0


  2%|▏         | 328/20000 [11:57:19<967:27:42, 177.05s/it]

episode: 328/20000, reward: 610.0
