In [1]:
import copy
import gym
import os
import sys
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
from skimage.transform import resize
from tensorflow.keras.layers import Input,Dense,Conv2D,Flatten
from tensorflow.keras import Model

In [2]:
MAX_EXPERIENCES = 500000
MIN_EXPERIENCES = 50000
TARGET_UPDATE_PERIOD = 10000
IM_SIZE = 84
K = 4 #env.action_space.n

In [3]:
class ImageTransformer:
    # transfrom image from 210,160,3
    # to 84,84
    def transform(self,state):
        state = tf.image.rgb_to_grayscale(state)
        state = tf.image.crop_to_bounding_box(state,34, 0, 160, 160)
        state = tf.image.resize(state,[IM_SIZE, IM_SIZE],method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
        state = tf.squeeze(state)
        return state


In [4]:
def update_state(state, obs_small):
    # throw oldest frame
    # append newest to end
    return np.append(state[:,:,1:],np.expand_dims(obs_small,2),axis=2)
    

In [5]:
class ReplayMemory:
    def __init__(self,size=MAX_EXPERIENCES,frame_height=IM_SIZE,frame_width = IM_SIZE,agent_history_length=4,batch_size=32):
        self.size =size
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.agent_history_length = agent_history_length
        self.batch_size = batch_size
        self.count = 0
        self.current = 0
        
        self.actions = np.empty(self.size,dtype=np.int32)
        self.rewards = np.empty(self.size,dtype=np.float32)
        self.frames = np.empty((self.size,self.frame_height,self.frame_width),dtype=np.uint8)
        self.terminal_flags = np.empty(self.size,dtype=np.bool)
        # pre-allocate memory for batch
        self.states = np.empty((self.batch_size,self.agent_history_length,self.frame_height,self.frame_width),dtype=np.uint8) 
        self.new_states = np.empty((self.batch_size,self.agent_history_length,self.frame_height,self.frame_width),dtype=np.uint8) 
        self.indices = np.empty(self.batch_size,dtype=np.int32)
        
    def add_experience(self,action,frame,reward,terminal):
        self.actions[self.current] = action
        self.frames[self.current] = frame
        self.rewards[self.current] = reward
        self.terminal_flags[self.current] = terminal
        self.count = max(self.count,self.current+1)
        self.current = (self.current+1) % self.size
        
    def _get_state(self,index):
        #frames [t-3,t+1[
        return self.frames[index-self.agent_history_length+1:index+1, ...]
    
    def _get_valid_indices(self):
        for i in range(self.batch_size):
            while True:
                # start at agent_history_length since when sampling , 
                # this frame is taken along the previous agent_history_length-1 frames
                index = random.randint(self.agent_history_length, self.count - 1)
                # current is a circular index
                # reaching here means current made a cycle (see final comment)
                # in which case index = current-1 is newest experience , while index = current is oldest
                # thus having experinces before and after index is invalid since these are non-consecutive
                # if >= current was empty index would be less than current since count be current+1
                
                if index >= self.current and index - self.agent_history_length <= self.current:
                    continue
                # same goes here
                # if any of the experiences mark the end of the episode , the next experience is surely of a new episode
                # thus these are also non-consecutive experiences
                if self.terminal_flags[index - self.agent_history_length:index].any():
                    continue
                break
            self.indices[i] = index
          
    def get_minibatch(self):
        self._get_valid_indices()
        
        for i, idx in enumerate(self.indices):
            self.states[i] = self._get_state(idx - 1) # s (4 consecutive frames , s is last)
            self.new_states[i] = self._get_state(idx) # s'
        # states : N,T,H,W ----> N,H,W,T so it suits tensorflow , T acts like channels
        return np.transpose(self.states, axes=(0, 2, 3, 1)), self.actions[self.indices], self.rewards[self.indices], np.transpose(self.new_states, axes=(0, 2, 3, 1)), self.terminal_flags[self.indices]



In [6]:
def loss(targets,actions,yhat):
    selected_action_values = tf.reduce_sum(yhat * tf.one_hot(actions, K),axis=1)
    #cost = tf.reduce_mean(tf.square(targets - selected_action_values))
    cost = tf.reduce_mean(tf.keras.losses.Huber()(targets, selected_action_values))
    return cost

In [7]:
@tf.function
def train_step(model,opt,inputs,actions,targets):
    with tf.GradientTape() as tape:
        yhat = model(inputs, training=True)
        loss_value = loss(targets,actions,yhat)
    grads = tape.gradient(loss_value, model.trainable_weights)
    opt.apply_gradients(zip(grads, model.trainable_weights))
    return loss_value

In [8]:
def DQN(K,conv_layer_sizes,hidden_layer_sizes): # returns the DQN model
    input_ = Input(shape=(IM_SIZE,IM_SIZE,4))
    x = input_/255.0
    for num_output_filters,filtersz,poolsz in conv_layer_sizes:
        x = Conv2D(num_output_filters,filtersz,poolsz,activation='relu')(x)
    x = Flatten()(x)   
    for M in hidden_layer_sizes:
        x = Dense(M,activation='relu')(x)
    output = Dense(K,activation='relu')(x)
    model = Model(inputs=input_,outputs=output)
    return model

In [9]:
def copy_weights(from_,to):
    weights = [w.numpy() for w in from_.weights]
    to.set_weights(weights)

In [10]:
def sample_action(model,x,eps):
    if np.random.random() < eps:
        return np.random.choice(K)
    else:
        x = np.expand_dims(x,axis=0)
        return np.argmax(model(x).numpy()[0])


In [11]:
def train(model,opt,target_model,rb,gamma,batch_size):
    states, actions, rewards, next_states, dones = rb.get_minibatch()
    next_Qs = target_model.predict(next_states)
    next_Q = np.max(next_Qs,axis=1)
    targets = rewards + np.invert(dones).astype(np.float32) * gamma * next_Q
    loss = train_step(model,opt,states,actions,targets)
    

In [12]:
def play_one(env,total_t,rb,model,opt,target_model,im_transformer,gamma,batch_size,epsilon,epsilon_change,epsilon_min):
    t0 = datetime.now()
    obs = env.reset()
    obs_small = im_transformer.transform(obs)
    state = np.stack([obs_small]*4,axis=2)
    loss = None
    
    total_time_training = 0
    num_steps_in_episode = 0
    episode_reward = 0
    
    done = False
    while not done:
        if total_t % TARGET_UPDATE_PERIOD == 0:
            copy_weights(model,target_model)
            print("Copied model parameters to target network. total_t = %s, period = %s" % (total_t, TARGET_UPDATE_PERIOD))
            
        action = sample_action(model,state,epsilon)
        obs,reward,done,_ = env.step(action)
        obs_small = im_transformer.transform(obs)
        next_state = update_state(state,obs_small)

        episode_reward += reward
        rb.add_experience(action, obs_small, reward, done)

        t0_2 = datetime.now()
        loss = train(model, opt,target_model, rb, gamma, batch_size)
        dt = datetime.now() - t0_2

        # More debugging info
        total_time_training += dt.total_seconds()
        num_steps_in_episode += 1

        state = next_state
        total_t += 1

        epsilon = max(epsilon - epsilon_change, epsilon_min)

    return total_t, episode_reward, (datetime.now() - t0), num_steps_in_episode, total_time_training/num_steps_in_episode, epsilon


In [13]:
conv_layer_sizes = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
hidden_layer_sizes = [512]
gamma = 0.99
batch_sz = 32
num_episodes = 3500
total_t = 0
rb = ReplayMemory()
episode_rewards = np.zeros(num_episodes)


In [14]:
epsilon = 1.0
epsilon_min = 0.1
epsilon_change = (epsilon - epsilon_min) / 500000

In [15]:
env = gym.envs.make("Breakout-v0")
model = DQN(K=K,conv_layer_sizes=conv_layer_sizes,hidden_layer_sizes=hidden_layer_sizes)
target_model = DQN(K=K,conv_layer_sizes=conv_layer_sizes,hidden_layer_sizes=hidden_layer_sizes)
im_transformer = ImageTransformer()
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [16]:
# populate reolay buffer
obs = env.reset()
for i in range(MIN_EXPERIENCES):
    action = np.random.choice(K)
    obs,reward,done,_ = env.step(action)
    obs_small = im_transformer.transform(obs)
    rb.add_experience(action,obs_small,reward,done)
    if done:
        obs = env.reset()

In [17]:
def running_avg(totalrewards):
    # average results over 100 episodes
    N = len(totalrewards)
    running_avg = np.empty(N)
    for t in range(N):
        running_avg[t] = totalrewards[max(0,t-100):(t+1)].mean()
    return running_avg

In [None]:
t0 = datetime.now()
for i in range(num_episodes):
    total_t, episode_reward, duration, num_steps_in_episode, time_per_step, epsilon = play_one(
        env,
        total_t,
        rb,
        model,
        opt,
        target_model,
        im_transformer,
        gamma,
        batch_sz,
        epsilon,
        epsilon_change,
        epsilon_min,
      )

    episode_rewards[i] = episode_reward

    last_100_avg = episode_rewards[max(0, i - 100):i + 1].mean()
    print("Episode:", i,
        "Duration:", duration,
        "Num steps:", num_steps_in_episode,
        "Reward:", episode_reward,
        "Training time per step:", "%.3f" % time_per_step,
        "Avg Reward (Last 100):", "%.3f" % last_100_avg,
        "Epsilon:", "%.3f" % epsilon
      )
    sys.stdout.flush()
print("Total duration:", datetime.now() - t0)
model.save_weights('model.h5')
# Plot the smoothed returns
y = running_avg(episode_rewards)
plt.plot(episode_rewards, label='orig')
plt.plot(y, label='smoothed')
plt.legend()
plt.show()




Copied model parameters to target network. total_t = 0, period = 10000
Episode: 0 Duration: 0:00:08.882938 Num steps: 176 Reward: 0.0 Training time per step: 0.045 Avg Reward (Last 100): 0.000 Epsilon: 1.000
Episode: 1 Duration: 0:00:07.190657 Num steps: 183 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 0.000 Epsilon: 0.999
Episode: 2 Duration: 0:00:10.567375 Num steps: 262 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 0.333 Epsilon: 0.999
Episode: 3 Duration: 0:00:07.105578 Num steps: 172 Reward: 0.0 Training time per step: 0.036 Avg Reward (Last 100): 0.250 Epsilon: 0.999
Episode: 4 Duration: 0:00:11.954805 Num steps: 295 Reward: 2.0 Training time per step: 0.036 Avg Reward (Last 100): 0.600 Epsilon: 0.998
Episode: 5 Duration: 0:00:10.080000 Num steps: 257 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 0.667 Epsilon: 0.998
Episode: 6 Duration: 0:00:10.889602 Num steps: 276 Reward: 2.0 Training time per step: 0.035 Avg Reward (La

Episode: 59 Duration: 0:00:08.800456 Num steps: 203 Reward: 1.0 Training time per step: 0.038 Avg Reward (Last 100): 1.633 Epsilon: 0.971
Episode: 60 Duration: 0:00:11.409208 Num steps: 269 Reward: 2.0 Training time per step: 0.037 Avg Reward (Last 100): 1.639 Epsilon: 0.971
Episode: 61 Duration: 0:00:12.153274 Num steps: 295 Reward: 2.0 Training time per step: 0.036 Avg Reward (Last 100): 1.645 Epsilon: 0.970
Episode: 62 Duration: 0:00:10.588947 Num steps: 270 Reward: 2.0 Training time per step: 0.034 Avg Reward (Last 100): 1.651 Epsilon: 0.970
Episode: 63 Duration: 0:00:11.846984 Num steps: 306 Reward: 3.0 Training time per step: 0.034 Avg Reward (Last 100): 1.672 Epsilon: 0.969
Episode: 64 Duration: 0:00:06.813523 Num steps: 174 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.646 Epsilon: 0.969
Episode: 65 Duration: 0:00:09.486357 Num steps: 230 Reward: 1.0 Training time per step: 0.036 Avg Reward (Last 100): 1.636 Epsilon: 0.969
Episode: 66 Duration: 0:00:11.1604

Episode: 118 Duration: 0:00:14.990181 Num steps: 381 Reward: 4.0 Training time per step: 0.034 Avg Reward (Last 100): 1.574 Epsilon: 0.945
Episode: 119 Duration: 0:00:09.310085 Num steps: 236 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 1.564 Epsilon: 0.944
Episode: 120 Duration: 0:00:07.043348 Num steps: 178 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.535 Epsilon: 0.944
Episode: 121 Duration: 0:00:15.467662 Num steps: 389 Reward: 4.0 Training time per step: 0.035 Avg Reward (Last 100): 1.564 Epsilon: 0.943
Episode: 122 Duration: 0:00:06.958900 Num steps: 179 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.564 Epsilon: 0.943
Episode: 123 Duration: 0:00:08.433597 Num steps: 211 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.564 Epsilon: 0.943
Episode: 124 Duration: 0:00:09.350227 Num steps: 237 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 1.574 Epsilon: 0.942
Episode: 125 Duration: 0:00

Episode: 177 Duration: 0:00:09.738998 Num steps: 247 Reward: 2.0 Training time per step: 0.034 Avg Reward (Last 100): 1.366 Epsilon: 0.919
Episode: 178 Duration: 0:00:06.783107 Num steps: 169 Reward: 0.0 Training time per step: 0.035 Avg Reward (Last 100): 1.366 Epsilon: 0.919
Episode: 179 Duration: 0:00:07.194038 Num steps: 177 Reward: 0.0 Training time per step: 0.035 Avg Reward (Last 100): 1.337 Epsilon: 0.918
Episode: 180 Duration: 0:00:11.264668 Num steps: 274 Reward: 2.0 Training time per step: 0.036 Avg Reward (Last 100): 1.356 Epsilon: 0.918
Episode: 181 Duration: 0:00:06.977211 Num steps: 176 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.337 Epsilon: 0.917
Episode: 182 Duration: 0:00:06.867669 Num steps: 174 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.317 Epsilon: 0.917
Episode: 183 Duration: 0:00:10.676216 Num steps: 269 Reward: 2.0 Training time per step: 0.034 Avg Reward (Last 100): 1.327 Epsilon: 0.917
Episode: 184 Duration: 0:00

Episode: 235 Duration: 0:00:10.627387 Num steps: 264 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.406 Epsilon: 0.892
Episode: 236 Duration: 0:00:11.498547 Num steps: 287 Reward: 2.0 Training time per step: 0.034 Avg Reward (Last 100): 1.406 Epsilon: 0.891
Episode: 237 Duration: 0:00:12.252604 Num steps: 306 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.406 Epsilon: 0.891
Episode: 238 Duration: 0:00:12.648799 Num steps: 317 Reward: 3.0 Training time per step: 0.034 Avg Reward (Last 100): 1.426 Epsilon: 0.890
Episode: 239 Duration: 0:00:18.561332 Num steps: 470 Reward: 5.0 Training time per step: 0.034 Avg Reward (Last 100): 1.475 Epsilon: 0.889
Episode: 240 Duration: 0:00:09.339780 Num steps: 236 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 1.475 Epsilon: 0.889
Episode: 241 Duration: 0:00:13.038895 Num steps: 325 Reward: 3.0 Training time per step: 0.035 Avg Reward (Last 100): 1.505 Epsilon: 0.888
Episode: 242 Duration: 0:00

Episode: 294 Duration: 0:00:11.478538 Num steps: 279 Reward: 2.0 Training time per step: 0.036 Avg Reward (Last 100): 1.624 Epsilon: 0.863
Episode: 295 Duration: 0:00:09.498566 Num steps: 240 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 1.614 Epsilon: 0.863
Episode: 296 Duration: 0:00:06.842028 Num steps: 172 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.594 Epsilon: 0.863
Episode: 297 Duration: 0:00:10.101992 Num steps: 250 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.594 Epsilon: 0.862
Episode: 298 Duration: 0:00:08.038200 Num steps: 203 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.594 Epsilon: 0.862
Episode: 299 Duration: 0:00:11.791135 Num steps: 299 Reward: 2.0 Training time per step: 0.034 Avg Reward (Last 100): 1.604 Epsilon: 0.861
Episode: 300 Duration: 0:00:07.309941 Num steps: 184 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.604 Epsilon: 0.861
Episode: 301 Duration: 0:00

Copied model parameters to target network. total_t = 90000, period = 10000
Episode: 353 Duration: 0:00:19.885785 Num steps: 488 Reward: 6.0 Training time per step: 0.035 Avg Reward (Last 100): 1.327 Epsilon: 0.838
Episode: 354 Duration: 0:00:06.885548 Num steps: 168 Reward: 0.0 Training time per step: 0.035 Avg Reward (Last 100): 1.307 Epsilon: 0.837
Episode: 355 Duration: 0:00:09.808150 Num steps: 242 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.287 Epsilon: 0.837
Episode: 356 Duration: 0:00:17.353870 Num steps: 430 Reward: 4.0 Training time per step: 0.035 Avg Reward (Last 100): 1.287 Epsilon: 0.836
Episode: 357 Duration: 0:00:09.539876 Num steps: 238 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 1.287 Epsilon: 0.836
Episode: 358 Duration: 0:00:07.490492 Num steps: 187 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.287 Epsilon: 0.835
Episode: 359 Duration: 0:00:08.325078 Num steps: 208 Reward: 1.0 Training time per step: 0.

Episode: 411 Duration: 0:00:07.034347 Num steps: 177 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.178 Epsilon: 0.812
Episode: 412 Duration: 0:00:10.784930 Num steps: 267 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.188 Epsilon: 0.812
Episode: 413 Duration: 0:00:09.150136 Num steps: 230 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 1.198 Epsilon: 0.811
Episode: 414 Duration: 0:00:07.138704 Num steps: 178 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.188 Epsilon: 0.811
Episode: 415 Duration: 0:00:07.287293 Num steps: 181 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.168 Epsilon: 0.811
Episode: 416 Duration: 0:00:07.026985 Num steps: 174 Reward: 0.0 Training time per step: 0.035 Avg Reward (Last 100): 1.168 Epsilon: 0.810
Episode: 417 Duration: 0:00:11.724731 Num steps: 285 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.188 Epsilon: 0.810
Episode: 418 Duration: 0:00

Episode: 470 Duration: 0:00:07.694251 Num steps: 179 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.297 Epsilon: 0.786
Episode: 471 Duration: 0:00:08.026129 Num steps: 201 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.297 Epsilon: 0.786
Episode: 472 Duration: 0:00:13.535103 Num steps: 322 Reward: 2.0 Training time per step: 0.036 Avg Reward (Last 100): 1.287 Epsilon: 0.785
Episode: 473 Duration: 0:00:14.707966 Num steps: 340 Reward: 3.0 Training time per step: 0.037 Avg Reward (Last 100): 1.297 Epsilon: 0.785
Episode: 474 Duration: 0:00:07.375237 Num steps: 174 Reward: 0.0 Training time per step: 0.036 Avg Reward (Last 100): 1.297 Epsilon: 0.784
Copied model parameters to target network. total_t = 120000, period = 10000
Episode: 475 Duration: 0:00:17.816208 Num steps: 422 Reward: 4.0 Training time per step: 0.036 Avg Reward (Last 100): 1.317 Epsilon: 0.784
Episode: 476 Duration: 0:00:14.108099 Num steps: 350 Reward: 3.0 Training time per step: 0

Episode: 528 Duration: 0:00:13.755391 Num steps: 340 Reward: 3.0 Training time per step: 0.034 Avg Reward (Last 100): 1.267 Epsilon: 0.761
Episode: 529 Duration: 0:00:09.660136 Num steps: 239 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 1.277 Epsilon: 0.760
Episode: 530 Duration: 0:00:11.810902 Num steps: 289 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.277 Epsilon: 0.760
Episode: 531 Duration: 0:00:08.529419 Num steps: 211 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 1.277 Epsilon: 0.759
Episode: 532 Duration: 0:00:08.047204 Num steps: 200 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.277 Epsilon: 0.759
Episode: 533 Duration: 0:00:09.658303 Num steps: 234 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.287 Epsilon: 0.758
Episode: 534 Duration: 0:00:07.417380 Num steps: 181 Reward: 0.0 Training time per step: 0.035 Avg Reward (Last 100): 1.257 Epsilon: 0.758
Episode: 535 Duration: 0:00

Episode: 587 Duration: 0:00:16.845853 Num steps: 420 Reward: 4.0 Training time per step: 0.034 Avg Reward (Last 100): 1.277 Epsilon: 0.734
Episode: 588 Duration: 0:00:07.239993 Num steps: 180 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.267 Epsilon: 0.733
Episode: 589 Duration: 0:00:14.852760 Num steps: 300 Reward: 2.0 Training time per step: 0.043 Avg Reward (Last 100): 1.287 Epsilon: 0.733
Episode: 590 Duration: 0:00:17.013490 Num steps: 335 Reward: 3.0 Training time per step: 0.044 Avg Reward (Last 100): 1.307 Epsilon: 0.732
Episode: 591 Duration: 0:00:14.247081 Num steps: 285 Reward: 2.0 Training time per step: 0.043 Avg Reward (Last 100): 1.327 Epsilon: 0.732
Episode: 592 Duration: 0:00:26.669002 Num steps: 519 Reward: 6.0 Training time per step: 0.044 Avg Reward (Last 100): 1.366 Epsilon: 0.731
Episode: 593 Duration: 0:00:10.343924 Num steps: 223 Reward: 1.0 Training time per step: 0.040 Avg Reward (Last 100): 1.356 Epsilon: 0.730
Copied model parameters to 

Episode: 645 Duration: 0:00:13.536898 Num steps: 306 Reward: 2.0 Training time per step: 0.038 Avg Reward (Last 100): 1.297 Epsilon: 0.708
Episode: 646 Duration: 0:00:13.364604 Num steps: 305 Reward: 2.0 Training time per step: 0.038 Avg Reward (Last 100): 1.287 Epsilon: 0.708
Episode: 647 Duration: 0:00:10.743572 Num steps: 247 Reward: 1.0 Training time per step: 0.037 Avg Reward (Last 100): 1.277 Epsilon: 0.707
Episode: 648 Duration: 0:00:10.951745 Num steps: 252 Reward: 1.0 Training time per step: 0.037 Avg Reward (Last 100): 1.267 Epsilon: 0.707
Episode: 649 Duration: 0:00:12.609700 Num steps: 283 Reward: 2.0 Training time per step: 0.038 Avg Reward (Last 100): 1.248 Epsilon: 0.706
Episode: 650 Duration: 0:00:07.739970 Num steps: 178 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.218 Epsilon: 0.706
Episode: 651 Duration: 0:00:13.835878 Num steps: 318 Reward: 3.0 Training time per step: 0.037 Avg Reward (Last 100): 1.238 Epsilon: 0.705
Episode: 652 Duration: 0:00

Episode: 704 Duration: 0:00:12.967513 Num steps: 292 Reward: 3.0 Training time per step: 0.038 Avg Reward (Last 100): 1.277 Epsilon: 0.682
Episode: 705 Duration: 0:00:07.307279 Num steps: 166 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.257 Epsilon: 0.681
Episode: 706 Duration: 0:00:11.866780 Num steps: 272 Reward: 2.0 Training time per step: 0.037 Avg Reward (Last 100): 1.267 Epsilon: 0.681
Episode: 707 Duration: 0:00:07.494979 Num steps: 171 Reward: 0.0 Training time per step: 0.038 Avg Reward (Last 100): 1.248 Epsilon: 0.681
Episode: 708 Duration: 0:00:13.605922 Num steps: 310 Reward: 3.0 Training time per step: 0.037 Avg Reward (Last 100): 1.257 Epsilon: 0.680
Episode: 709 Duration: 0:00:11.524226 Num steps: 265 Reward: 2.0 Training time per step: 0.037 Avg Reward (Last 100): 1.267 Epsilon: 0.680
Episode: 710 Duration: 0:00:07.365112 Num steps: 168 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.228 Epsilon: 0.679
Episode: 711 Duration: 0:00

Episode: 762 Duration: 0:00:11.115315 Num steps: 251 Reward: 1.0 Training time per step: 0.038 Avg Reward (Last 100): 1.525 Epsilon: 0.654
Episode: 763 Duration: 0:00:10.015373 Num steps: 222 Reward: 1.0 Training time per step: 0.038 Avg Reward (Last 100): 1.505 Epsilon: 0.654
Episode: 764 Duration: 0:00:17.186650 Num steps: 385 Reward: 3.0 Training time per step: 0.038 Avg Reward (Last 100): 1.505 Epsilon: 0.653
Episode: 765 Duration: 0:00:11.324693 Num steps: 258 Reward: 2.0 Training time per step: 0.037 Avg Reward (Last 100): 1.525 Epsilon: 0.652
Episode: 766 Duration: 0:00:12.646944 Num steps: 288 Reward: 2.0 Training time per step: 0.037 Avg Reward (Last 100): 1.535 Epsilon: 0.652
Episode: 767 Duration: 0:00:12.906423 Num steps: 294 Reward: 3.0 Training time per step: 0.037 Avg Reward (Last 100): 1.554 Epsilon: 0.651
Episode: 768 Duration: 0:00:09.040129 Num steps: 205 Reward: 1.0 Training time per step: 0.037 Avg Reward (Last 100): 1.554 Epsilon: 0.651
Episode: 769 Duration: 0:00

Episode: 821 Duration: 0:00:10.542777 Num steps: 231 Reward: 1.0 Training time per step: 0.039 Avg Reward (Last 100): 1.436 Epsilon: 0.629
Episode: 822 Duration: 0:00:14.605177 Num steps: 302 Reward: 2.0 Training time per step: 0.041 Avg Reward (Last 100): 1.446 Epsilon: 0.628
Episode: 823 Duration: 0:00:08.280129 Num steps: 168 Reward: 0.0 Training time per step: 0.042 Avg Reward (Last 100): 1.436 Epsilon: 0.628
Episode: 824 Duration: 0:00:08.797469 Num steps: 175 Reward: 0.0 Training time per step: 0.043 Avg Reward (Last 100): 1.416 Epsilon: 0.628
Episode: 825 Duration: 0:00:09.873044 Num steps: 204 Reward: 0.0 Training time per step: 0.041 Avg Reward (Last 100): 1.406 Epsilon: 0.627
Episode: 826 Duration: 0:00:10.336840 Num steps: 228 Reward: 1.0 Training time per step: 0.038 Avg Reward (Last 100): 1.386 Epsilon: 0.627
Episode: 827 Duration: 0:00:11.170792 Num steps: 227 Reward: 1.0 Training time per step: 0.042 Avg Reward (Last 100): 1.386 Epsilon: 0.626
Episode: 828 Duration: 0:00

Episode: 879 Duration: 0:00:12.159280 Num steps: 276 Reward: 2.0 Training time per step: 0.037 Avg Reward (Last 100): 1.238 Epsilon: 0.603
Episode: 880 Duration: 0:00:12.731615 Num steps: 288 Reward: 2.0 Training time per step: 0.037 Avg Reward (Last 100): 1.257 Epsilon: 0.603
Episode: 881 Duration: 0:00:07.529477 Num steps: 173 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.257 Epsilon: 0.602
Episode: 882 Duration: 0:00:10.324211 Num steps: 232 Reward: 1.0 Training time per step: 0.038 Avg Reward (Last 100): 1.228 Epsilon: 0.602
Episode: 883 Duration: 0:00:07.308299 Num steps: 165 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.218 Epsilon: 0.602
Episode: 884 Duration: 0:00:07.657609 Num steps: 173 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.208 Epsilon: 0.601
Episode: 885 Duration: 0:00:09.053263 Num steps: 206 Reward: 1.0 Training time per step: 0.037 Avg Reward (Last 100): 1.188 Epsilon: 0.601
Episode: 886 Duration: 0:00

Episode: 938 Duration: 0:00:12.335887 Num steps: 293 Reward: 2.0 Training time per step: 0.036 Avg Reward (Last 100): 1.139 Epsilon: 0.579
Episode: 939 Duration: 0:00:10.237875 Num steps: 239 Reward: 1.0 Training time per step: 0.036 Avg Reward (Last 100): 1.149 Epsilon: 0.578
Episode: 940 Duration: 0:00:10.102492 Num steps: 244 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.119 Epsilon: 0.578
Episode: 941 Duration: 0:00:09.498885 Num steps: 222 Reward: 1.0 Training time per step: 0.036 Avg Reward (Last 100): 1.129 Epsilon: 0.577
Episode: 942 Duration: 0:00:08.704825 Num steps: 199 Reward: 0.0 Training time per step: 0.035 Avg Reward (Last 100): 1.119 Epsilon: 0.577
Episode: 943 Duration: 0:00:11.695812 Num steps: 255 Reward: 1.0 Training time per step: 0.039 Avg Reward (Last 100): 1.129 Epsilon: 0.577
Episode: 944 Duration: 0:00:10.432810 Num steps: 234 Reward: 1.0 Training time per step: 0.037 Avg Reward (Last 100): 1.129 Epsilon: 0.576
Episode: 945 Duration: 0:00

Episode: 997 Duration: 0:00:11.140311 Num steps: 246 Reward: 1.0 Training time per step: 0.038 Avg Reward (Last 100): 1.347 Epsilon: 0.551
Episode: 998 Duration: 0:00:07.751316 Num steps: 171 Reward: 0.0 Training time per step: 0.038 Avg Reward (Last 100): 1.337 Epsilon: 0.551
Episode: 999 Duration: 0:00:10.455906 Num steps: 226 Reward: 1.0 Training time per step: 0.039 Avg Reward (Last 100): 1.347 Epsilon: 0.551
Episode: 1000 Duration: 0:00:13.932227 Num steps: 305 Reward: 2.0 Training time per step: 0.038 Avg Reward (Last 100): 1.356 Epsilon: 0.550
Copied model parameters to target network. total_t = 250000, period = 10000
Episode: 1001 Duration: 0:00:13.256742 Num steps: 294 Reward: 2.0 Training time per step: 0.038 Avg Reward (Last 100): 1.366 Epsilon: 0.550
Episode: 1002 Duration: 0:00:13.744895 Num steps: 295 Reward: 2.0 Training time per step: 0.039 Avg Reward (Last 100): 1.386 Epsilon: 0.549
Episode: 1003 Duration: 0:00:10.319052 Num steps: 230 Reward: 1.0 Training time per ste

Episode: 1055 Duration: 0:00:14.903361 Num steps: 323 Reward: 2.0 Training time per step: 0.039 Avg Reward (Last 100): 1.624 Epsilon: 0.524
Episode: 1056 Duration: 0:00:13.824408 Num steps: 295 Reward: 2.0 Training time per step: 0.039 Avg Reward (Last 100): 1.624 Epsilon: 0.523
Episode: 1057 Duration: 0:00:07.321049 Num steps: 165 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.624 Epsilon: 0.523
Episode: 1058 Duration: 0:00:15.581411 Num steps: 340 Reward: 3.0 Training time per step: 0.038 Avg Reward (Last 100): 1.634 Epsilon: 0.522
Episode: 1059 Duration: 0:00:14.183440 Num steps: 319 Reward: 4.0 Training time per step: 0.037 Avg Reward (Last 100): 1.644 Epsilon: 0.522
Episode: 1060 Duration: 0:00:12.685384 Num steps: 282 Reward: 1.0 Training time per step: 0.038 Avg Reward (Last 100): 1.634 Epsilon: 0.521
Episode: 1061 Duration: 0:00:11.273437 Num steps: 245 Reward: 1.0 Training time per step: 0.039 Avg Reward (Last 100): 1.634 Epsilon: 0.521
Episode: 1062 Durati

Episode: 1113 Duration: 0:00:08.949173 Num steps: 212 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.446 Epsilon: 0.496
Episode: 1114 Duration: 0:00:07.684648 Num steps: 178 Reward: 0.0 Training time per step: 0.036 Avg Reward (Last 100): 1.446 Epsilon: 0.496
Copied model parameters to target network. total_t = 280000, period = 10000
Episode: 1115 Duration: 0:00:11.828070 Num steps: 253 Reward: 1.0 Training time per step: 0.039 Avg Reward (Last 100): 1.426 Epsilon: 0.496
Episode: 1116 Duration: 0:00:07.141294 Num steps: 164 Reward: 0.0 Training time per step: 0.036 Avg Reward (Last 100): 1.406 Epsilon: 0.495
Episode: 1117 Duration: 0:00:07.453188 Num steps: 179 Reward: 0.0 Training time per step: 0.034 Avg Reward (Last 100): 1.396 Epsilon: 0.495
Episode: 1118 Duration: 0:00:09.640220 Num steps: 230 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.396 Epsilon: 0.495
Episode: 1119 Duration: 0:00:11.539188 Num steps: 269 Reward: 1.0 Training time per 

Episode: 1171 Duration: 0:00:07.704512 Num steps: 172 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.376 Epsilon: 0.471
Episode: 1172 Duration: 0:00:11.148317 Num steps: 251 Reward: 1.0 Training time per step: 0.037 Avg Reward (Last 100): 1.366 Epsilon: 0.470
Episode: 1173 Duration: 0:00:11.970523 Num steps: 280 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.366 Epsilon: 0.470
Episode: 1174 Duration: 0:00:09.084908 Num steps: 217 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.337 Epsilon: 0.469
Episode: 1175 Duration: 0:00:08.648950 Num steps: 194 Reward: 0.0 Training time per step: 0.037 Avg Reward (Last 100): 1.327 Epsilon: 0.469
Episode: 1176 Duration: 0:00:09.583264 Num steps: 226 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.327 Epsilon: 0.469
Episode: 1177 Duration: 0:00:08.848117 Num steps: 210 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.307 Epsilon: 0.468
Episode: 1178 Durati

Episode: 1229 Duration: 0:00:10.726240 Num steps: 255 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.218 Epsilon: 0.445
Episode: 1230 Duration: 0:00:09.020876 Num steps: 207 Reward: 1.0 Training time per step: 0.036 Avg Reward (Last 100): 1.208 Epsilon: 0.444
Episode: 1231 Duration: 0:00:11.828819 Num steps: 276 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.208 Epsilon: 0.444
Episode: 1232 Duration: 0:00:14.576123 Num steps: 336 Reward: 3.0 Training time per step: 0.036 Avg Reward (Last 100): 1.218 Epsilon: 0.443
Episode: 1233 Duration: 0:00:09.290857 Num steps: 218 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.228 Epsilon: 0.443
Episode: 1234 Duration: 0:00:13.004648 Num steps: 308 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.248 Epsilon: 0.442
Copied model parameters to target network. total_t = 310000, period = 10000
Episode: 1235 Duration: 0:00:16.763070 Num steps: 392 Reward: 4.0 Training time per 

Episode: 1287 Duration: 0:00:14.178281 Num steps: 334 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.723 Epsilon: 0.415
Episode: 1288 Duration: 0:00:14.267038 Num steps: 326 Reward: 3.0 Training time per step: 0.036 Avg Reward (Last 100): 1.723 Epsilon: 0.414
Episode: 1289 Duration: 0:00:24.188125 Num steps: 565 Reward: 7.0 Training time per step: 0.035 Avg Reward (Last 100): 1.792 Epsilon: 0.413
Episode: 1290 Duration: 0:00:09.428403 Num steps: 219 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.772 Epsilon: 0.413
Episode: 1291 Duration: 0:00:11.977309 Num steps: 284 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 1.792 Epsilon: 0.412
Episode: 1292 Duration: 0:00:11.671595 Num steps: 277 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.802 Epsilon: 0.412
Episode: 1293 Duration: 0:00:11.284248 Num steps: 266 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 1.792 Epsilon: 0.412
Episode: 1294 Durati

Episode: 1345 Duration: 0:00:11.285190 Num steps: 262 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 2.158 Epsilon: 0.382
Episode: 1346 Duration: 0:00:11.168881 Num steps: 262 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 2.168 Epsilon: 0.382
Episode: 1347 Duration: 0:00:11.983751 Num steps: 277 Reward: 2.0 Training time per step: 0.036 Avg Reward (Last 100): 2.168 Epsilon: 0.381
Episode: 1348 Duration: 0:00:09.818490 Num steps: 226 Reward: 0.0 Training time per step: 0.036 Avg Reward (Last 100): 2.158 Epsilon: 0.381
Episode: 1349 Duration: 0:00:09.818892 Num steps: 231 Reward: 1.0 Training time per step: 0.035 Avg Reward (Last 100): 2.149 Epsilon: 0.380
Episode: 1350 Duration: 0:00:10.062965 Num steps: 233 Reward: 0.0 Training time per step: 0.035 Avg Reward (Last 100): 2.129 Epsilon: 0.380
Episode: 1351 Duration: 0:00:13.519989 Num steps: 315 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 2.149 Epsilon: 0.379
Episode: 1352 Durati

Episode: 1403 Duration: 0:00:19.786010 Num steps: 453 Reward: 5.0 Training time per step: 0.036 Avg Reward (Last 100): 2.228 Epsilon: 0.351
Episode: 1404 Duration: 0:00:16.267388 Num steps: 369 Reward: 3.0 Training time per step: 0.036 Avg Reward (Last 100): 2.228 Epsilon: 0.350
Episode: 1405 Duration: 0:00:10.622240 Num steps: 251 Reward: 2.0 Training time per step: 0.035 Avg Reward (Last 100): 2.218 Epsilon: 0.349
Episode: 1406 Duration: 0:00:19.208649 Num steps: 437 Reward: 4.0 Training time per step: 0.036 Avg Reward (Last 100): 2.228 Epsilon: 0.349
Episode: 1407 Duration: 0:00:12.619287 Num steps: 299 Reward: 1.0 Training time per step: 0.034 Avg Reward (Last 100): 2.218 Epsilon: 0.348
Episode: 1408 Duration: 0:00:14.018363 Num steps: 318 Reward: 3.0 Training time per step: 0.036 Avg Reward (Last 100): 2.218 Epsilon: 0.348
Episode: 1409 Duration: 0:00:16.128643 Num steps: 377 Reward: 3.0 Training time per step: 0.035 Avg Reward (Last 100): 2.228 Epsilon: 0.347
Episode: 1410 Durati

Episode: 1461 Duration: 0:00:15.077080 Num steps: 352 Reward: 3.0 Training time per step: 0.035 Avg Reward (Last 100): 2.861 Epsilon: 0.312
Episode: 1462 Duration: 0:00:17.924224 Num steps: 414 Reward: 4.0 Training time per step: 0.035 Avg Reward (Last 100): 2.891 Epsilon: 0.311
Episode: 1463 Duration: 0:00:16.307789 Num steps: 373 Reward: 4.0 Training time per step: 0.036 Avg Reward (Last 100): 2.901 Epsilon: 0.310
Episode: 1464 Duration: 0:00:18.830862 Num steps: 441 Reward: 5.0 Training time per step: 0.035 Avg Reward (Last 100): 2.931 Epsilon: 0.309
Episode: 1465 Duration: 0:00:15.846296 Num steps: 365 Reward: 4.0 Training time per step: 0.035 Avg Reward (Last 100): 2.950 Epsilon: 0.309
Episode: 1466 Duration: 0:00:14.982596 Num steps: 348 Reward: 3.0 Training time per step: 0.035 Avg Reward (Last 100): 2.970 Epsilon: 0.308
Episode: 1467 Duration: 0:00:25.993847 Num steps: 592 Reward: 8.0 Training time per step: 0.036 Avg Reward (Last 100): 3.040 Epsilon: 0.307
Episode: 1468 Durati

Episode: 1519 Duration: 0:00:25.600406 Num steps: 588 Reward: 10.0 Training time per step: 0.035 Avg Reward (Last 100): 4.267 Epsilon: 0.266
Episode: 1520 Duration: 0:00:14.348094 Num steps: 330 Reward: 3.0 Training time per step: 0.035 Avg Reward (Last 100): 4.238 Epsilon: 0.265
Episode: 1521 Duration: 0:00:29.380801 Num steps: 673 Reward: 9.0 Training time per step: 0.035 Avg Reward (Last 100): 4.307 Epsilon: 0.264
Episode: 1522 Duration: 0:00:22.124776 Num steps: 505 Reward: 6.0 Training time per step: 0.036 Avg Reward (Last 100): 4.356 Epsilon: 0.263
Episode: 1523 Duration: 0:00:18.144743 Num steps: 422 Reward: 5.0 Training time per step: 0.035 Avg Reward (Last 100): 4.376 Epsilon: 0.262
Copied model parameters to target network. total_t = 410000, period = 10000
Episode: 1524 Duration: 0:00:16.174979 Num steps: 367 Reward: 3.0 Training time per step: 0.036 Avg Reward (Last 100): 4.376 Epsilon: 0.262
Episode: 1525 Duration: 0:00:21.673768 Num steps: 495 Reward: 5.0 Training time per

Episode: 1576 Duration: 0:00:23.500780 Num steps: 539 Reward: 6.0 Training time per step: 0.035 Avg Reward (Last 100): 5.832 Epsilon: 0.212
Episode: 1577 Duration: 0:00:19.046857 Num steps: 437 Reward: 4.0 Training time per step: 0.035 Avg Reward (Last 100): 5.822 Epsilon: 0.211
Episode: 1578 Duration: 0:00:20.466388 Num steps: 471 Reward: 6.0 Training time per step: 0.035 Avg Reward (Last 100): 5.842 Epsilon: 0.210
Episode: 1579 Duration: 0:00:27.904420 Num steps: 639 Reward: 9.0 Training time per step: 0.035 Avg Reward (Last 100): 5.911 Epsilon: 0.209
Episode: 1580 Duration: 0:00:19.336519 Num steps: 441 Reward: 5.0 Training time per step: 0.035 Avg Reward (Last 100): 5.921 Epsilon: 0.208
Copied model parameters to target network. total_t = 440000, period = 10000
Episode: 1581 Duration: 0:00:32.023954 Num steps: 742 Reward: 13.0 Training time per step: 0.035 Avg Reward (Last 100): 6.010 Epsilon: 0.207
Episode: 1582 Duration: 0:00:28.789462 Num steps: 656 Reward: 8.0 Training time per

Episode: 1633 Duration: 0:00:33.212201 Num steps: 754 Reward: 11.0 Training time per step: 0.035 Avg Reward (Last 100): 7.881 Epsilon: 0.150
Episode: 1634 Duration: 0:00:38.405817 Num steps: 874 Reward: 11.0 Training time per step: 0.035 Avg Reward (Last 100): 7.960 Epsilon: 0.148
Episode: 1635 Duration: 0:00:23.832825 Num steps: 541 Reward: 7.0 Training time per step: 0.035 Avg Reward (Last 100): 7.941 Epsilon: 0.147
Episode: 1636 Duration: 0:00:32.088143 Num steps: 728 Reward: 14.0 Training time per step: 0.035 Avg Reward (Last 100): 8.010 Epsilon: 0.146
Episode: 1637 Duration: 0:00:16.449366 Num steps: 378 Reward: 4.0 Training time per step: 0.035 Avg Reward (Last 100): 8.000 Epsilon: 0.145
Episode: 1638 Duration: 0:00:29.079600 Num steps: 659 Reward: 10.0 Training time per step: 0.035 Avg Reward (Last 100): 8.069 Epsilon: 0.144
Episode: 1639 Duration: 0:00:27.257327 Num steps: 627 Reward: 9.0 Training time per step: 0.035 Avg Reward (Last 100): 8.089 Epsilon: 0.143
Episode: 1640 Du

Episode: 1690 Duration: 0:00:34.095542 Num steps: 769 Reward: 15.0 Training time per step: 0.035 Avg Reward (Last 100): 10.525 Epsilon: 0.100
Episode: 1691 Duration: 0:00:29.107632 Num steps: 656 Reward: 11.0 Training time per step: 0.035 Avg Reward (Last 100): 10.525 Epsilon: 0.100
Episode: 1692 Duration: 0:00:33.005616 Num steps: 746 Reward: 10.0 Training time per step: 0.035 Avg Reward (Last 100): 10.465 Epsilon: 0.100
Episode: 1693 Duration: 0:00:34.066159 Num steps: 770 Reward: 13.0 Training time per step: 0.035 Avg Reward (Last 100): 10.535 Epsilon: 0.100
Episode: 1694 Duration: 0:00:26.108655 Num steps: 589 Reward: 13.0 Training time per step: 0.035 Avg Reward (Last 100): 10.634 Epsilon: 0.100
Episode: 1695 Duration: 0:00:25.830442 Num steps: 584 Reward: 8.0 Training time per step: 0.035 Avg Reward (Last 100): 10.584 Epsilon: 0.100
Episode: 1696 Duration: 0:00:35.538731 Num steps: 805 Reward: 16.0 Training time per step: 0.035 Avg Reward (Last 100): 10.673 Epsilon: 0.100
Episode

Episode: 1746 Duration: 0:00:30.942846 Num steps: 691 Reward: 10.0 Training time per step: 0.036 Avg Reward (Last 100): 12.267 Epsilon: 0.100
Episode: 1747 Duration: 0:00:25.269015 Num steps: 542 Reward: 7.0 Training time per step: 0.037 Avg Reward (Last 100): 12.238 Epsilon: 0.100
Episode: 1748 Duration: 0:00:35.037932 Num steps: 794 Reward: 13.0 Training time per step: 0.035 Avg Reward (Last 100): 12.238 Epsilon: 0.100
Episode: 1749 Duration: 0:00:34.845219 Num steps: 789 Reward: 16.0 Training time per step: 0.035 Avg Reward (Last 100): 12.307 Epsilon: 0.100
Episode: 1750 Duration: 0:00:31.631103 Num steps: 715 Reward: 14.0 Training time per step: 0.035 Avg Reward (Last 100): 12.337 Epsilon: 0.100
Episode: 1751 Duration: 0:00:28.073386 Num steps: 632 Reward: 9.0 Training time per step: 0.035 Avg Reward (Last 100): 12.317 Epsilon: 0.100
Episode: 1752 Duration: 0:00:40.435371 Num steps: 921 Reward: 17.0 Training time per step: 0.035 Avg Reward (Last 100): 12.406 Epsilon: 0.100
Episode:

Episode: 1802 Duration: 0:00:29.703430 Num steps: 673 Reward: 11.0 Training time per step: 0.035 Avg Reward (Last 100): 12.960 Epsilon: 0.100
Episode: 1803 Duration: 0:00:25.101425 Num steps: 570 Reward: 9.0 Training time per step: 0.035 Avg Reward (Last 100): 12.921 Epsilon: 0.100
Episode: 1804 Duration: 0:00:34.775670 Num steps: 789 Reward: 12.0 Training time per step: 0.035 Avg Reward (Last 100): 12.901 Epsilon: 0.100
Episode: 1805 Duration: 0:00:41.484816 Num steps: 937 Reward: 18.0 Training time per step: 0.035 Avg Reward (Last 100): 12.990 Epsilon: 0.100
Episode: 1806 Duration: 0:00:22.990758 Num steps: 520 Reward: 7.0 Training time per step: 0.035 Avg Reward (Last 100): 12.941 Epsilon: 0.100
Episode: 1807 Duration: 0:00:32.609419 Num steps: 739 Reward: 11.0 Training time per step: 0.035 Avg Reward (Last 100): 12.941 Epsilon: 0.100
Episode: 1808 Duration: 0:00:25.722140 Num steps: 588 Reward: 9.0 Training time per step: 0.035 Avg Reward (Last 100): 12.970 Epsilon: 0.100
Copied mo

Episode: 1858 Duration: 0:00:44.570082 Num steps: 1012 Reward: 24.0 Training time per step: 0.035 Avg Reward (Last 100): 13.881 Epsilon: 0.100
Episode: 1859 Duration: 0:00:31.743000 Num steps: 717 Reward: 16.0 Training time per step: 0.035 Avg Reward (Last 100): 13.911 Epsilon: 0.100
Copied model parameters to target network. total_t = 640000, period = 10000
Episode: 1860 Duration: 0:00:51.611088 Num steps: 1157 Reward: 25.0 Training time per step: 0.036 Avg Reward (Last 100): 14.030 Epsilon: 0.100
Episode: 1861 Duration: 0:00:41.495537 Num steps: 932 Reward: 16.0 Training time per step: 0.036 Avg Reward (Last 100): 14.000 Epsilon: 0.100
Episode: 1862 Duration: 0:00:28.721654 Num steps: 650 Reward: 11.0 Training time per step: 0.035 Avg Reward (Last 100): 14.010 Epsilon: 0.100
Episode: 1863 Duration: 0:00:39.256770 Num steps: 892 Reward: 21.0 Training time per step: 0.035 Avg Reward (Last 100): 14.149 Epsilon: 0.100
Episode: 1864 Duration: 0:00:23.283634 Num steps: 535 Reward: 12.0 Tra

Episode: 1914 Duration: 0:00:40.438697 Num steps: 912 Reward: 19.0 Training time per step: 0.035 Avg Reward (Last 100): 14.703 Epsilon: 0.100
Episode: 1915 Duration: 0:00:32.377309 Num steps: 732 Reward: 16.0 Training time per step: 0.035 Avg Reward (Last 100): 14.703 Epsilon: 0.100
Episode: 1916 Duration: 0:00:40.178702 Num steps: 922 Reward: 18.0 Training time per step: 0.035 Avg Reward (Last 100): 14.733 Epsilon: 0.100
Episode: 1917 Duration: 0:00:29.973401 Num steps: 680 Reward: 10.0 Training time per step: 0.035 Avg Reward (Last 100): 14.703 Epsilon: 0.100
Episode: 1918 Duration: 0:00:33.722213 Num steps: 757 Reward: 14.0 Training time per step: 0.036 Avg Reward (Last 100): 14.713 Epsilon: 0.100
Episode: 1919 Duration: 0:00:47.809766 Num steps: 1081 Reward: 19.0 Training time per step: 0.035 Avg Reward (Last 100): 14.762 Epsilon: 0.100
Episode: 1920 Duration: 0:00:25.606638 Num steps: 588 Reward: 12.0 Training time per step: 0.035 Avg Reward (Last 100): 14.733 Epsilon: 0.100
Copie