In [None]:
# Environment setup cell: every line below is a shell command run through the notebook's `!` escape.
# You will need to run this block twice to make it effective
# Refresh the apt index and install cmake (presumably needed to build some gym extras — confirm).
!apt-get update 
!apt-get install cmake 
# Python packaging prerequisites.
!pip install --upgrade setuptools 
!pip install ez_setup 
# Pin gym to 0.24.1, then install the optional extras for it.
!pip install gym==0.24.1
!pip install gym[all]

# Headless rendering / video dependencies used by the cells below (pyvirtualdisplay, xvfb, ffmpeg).
!pip install gym pyvirtualdisplay 
!apt-get install -y xvfb python-opengl ffmpeg 

In [None]:
from importlib import reload
import utils
reload(utils)
from utils import*

import gym
from gym.wrappers.monitoring import video_recorder
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

import torch
import torch.nn as nn
import numpy as np
from torch import nn
import copy
from collections import deque
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import math

from torch import randint
from time import sleep
import pickle
import statistics as st
from gym.core import RewardWrapper
import gc



# Start a non-visible 1400x900 virtual display (pyvirtualdisplay); presumably required so the
# environment can render frames on a machine without a physical screen.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)""
"""


def simulate(agent=None,env=None,epsilon=0,memory=None,render=False):
  """Play one episode with an epsilon-greedy policy and return ``(score, ep_len)``.

  Args:
    agent: policy object exposing ``eval()``, ``get_action(state)`` and
      ``convert_action(action, state)``.
    env: wrapped environment exposing ``reset()``, ``skip_episodes(n, action)``,
      ``step(action)`` and the attributes ``env``, ``ep_len`` and ``real_rew``.
    epsilon: probability of replacing the agent's action with a uniformly
      random action index in ``{0, 1, 2}`` at each step.
    memory: optional replay buffer with a ``collect(experience)`` method. When
      given, every non-terminal transition is stored as
      ``[state, action, reward, next_state]``.
    render: when True, wrap the inner environment in ``RecordVideo`` (output in
      ``./video``), print the result and show the video afterwards.

  Returns:
    ``(score, ep_len)`` where ``score`` is ``env.real_rew`` after the episode
    and ``ep_len`` is the last ``env.ep_len`` read after a non-terminal step
    (0 if the episode ended before any such step).
  """
  agent.eval()
  env.reset()
  if render:
    env.env = RecordVideo(env.env, './video')
    env.env.render()
  # NOTE(review): skip_episodes is defined outside this file; presumably it repeats
  # the given action for 70 steps to get past the start of the episode — confirm.
  state,rew,done,info = env.skip_episodes(70,[0,0.1,0])
  ep_len = 0
  while not done:
    # exploitation(0) vs exploration(1)
    sample = torch.bernoulli(torch.tensor(epsilon).float())
    if sample == 1:
      A = torch.randint(0,3,(1,))
    else:
      A = agent.get_action(state)

    # progress a time step
    next_state, rew, done, info = env.step(agent.convert_action(A,state))

    # the terminal transition is intentionally not stored (no done flag in the buffer)
    if done:
      break

    # collect memory; identity check so a buffer that overloads ==/!= is still used
    if memory is not None:
      memory.collect([state, A, rew, next_state])
    state = next_state

    ep_len = env.ep_len
    # stop criteria
    if ep_len > 2000:
      break

  # the environment wrapper tracks the episode score itself; just read it back
  score = env.real_rew
  if render:
    print("score",score,"ep_len",ep_len)
    env.env.close()
    show_video()

  return score,ep_len


def test_model(agent, env, episodes=1):
  rewards = []
  ep_lens = []
  for i in range(0,episodes):
    rew,ep_len = simulate(agent,env)
    rewards.append(rew)
    ep_lens.append(ep_len)
    print("Test "+str(i+1)+"/"+str(episodes)+": reward =",rew," episode len =",ep_len)
  print("\nAverage Reward = ",sum(rewards)/len(rewards),"Average Ep_len = ",sum(ep_lens)/len(ep_lens),"\n")
  return rewards,ep_lens


class ExperienceReplay(object):
  """Fixed-capacity FIFO replay buffer.

  One entry is ``[state, action, reward, next_state]``. Once ``length`` entries
  are stored, appending a new one drops the oldest (``deque(maxlen=length)``).
  """
  def __init__(self, length):
    self.experience_replay = deque(maxlen=length)

  def collect(self,experience):
    """Append one ``[state, action, reward, next_state]`` entry to the buffer."""
    self.experience_replay.append(experience)
    return

  def sample_from_experience(self, sample_size):
    """Draw a random minibatch without replacement.

    Args:
      sample_size: requested batch size; clamped to the number of stored entries.

    Returns:
      ``(state, action, reward, next_state)`` as float32 tensors whose first
      dimension is the batch.
    """
    sample_size = min(sample_size,len(self.experience_replay))
    sample = random.sample(self.experience_replay,sample_size)
    # Stack the per-step states with NumPy first: building a tensor straight from a
    # Python list of ndarrays converts element by element and is very slow.
    state = torch.as_tensor(np.asarray([entry[0] for entry in sample])).float()
    action = torch.tensor([entry[1] for entry in sample]).float()
    reward = torch.tensor([entry[2] for entry in sample]).float()
    next_state = torch.as_tensor(np.asarray([entry[3] for entry in sample])).float()

    return state,action,reward,next_state



class DQN_Network(nn.Module):
  """Convolutional Q-network producing 3 Q-values per single-channel frame.

  The three outputs correspond to the discrete actions used by
  ``convert_action``: steer left, go straight, steer right.

  Args:
    gamma: discount factor stored on the module for use by the training code.
  """
  def __init__(self,gamma = None):
    super().__init__()
    #layers (attribute names and creation order are kept stable so saved state dicts still load)
    self.LeakyReLU = nn.LeakyReLU()
    self.conv1 = nn.Conv2d(1,8,kernel_size = 7, stride = 4,padding = 0)
    self.conv2 = nn.Conv2d(8,16,kernel_size = 3, stride = 1,padding = 2)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    # 577 = 16*6*6 flattened conv features (for an 84x96 crop) + 1 scalar "speed" feature
    self.fc1 = nn.Linear(577,256)
    self.fc2 = nn.Linear(256,50)
    self.fc3 = nn.Linear(50,3)
    self.batchnormCNN1 = nn.BatchNorm2d(num_features = 8)
    self.batchnormCNN2 = nn.BatchNorm2d(num_features = 16)
    self.batchnormFC1 = nn.BatchNorm1d(num_features = 256)
    self.flatten = nn.Flatten()
    self.gamma = gamma

  def forward(self,x):
    """Return Q-values of shape ``(batch, 3)``.

    Args:
      x: a single frame ``(96, 96)`` or a batch ``(BS, 96, 96)``, given as a
        NumPy array (or array-like) or as a torch tensor.
    """
    # reformat image (input = BS,96,96, or 96,96) (output = BS,1,96,96)
    if torch.is_tensor(x):
      # use tensors directly; avoids a NumPy round-trip that fails for tensors requiring grad
      x = x.float()
    else:
      x = torch.from_numpy(np.ascontiguousarray(x)).float()
    if x.dim() == 2:
      x = torch.unsqueeze(x,dim=0)
      x = torch.unsqueeze(x,dim=0)
    elif x.dim() == 3:
      x = torch.unsqueeze(x,dim=1)
    # Scalar feature summed from a 12x1 strip in the bottom rows of the frame.
    # NOTE(review): presumably this strip is the speed indicator of the HUD — confirm.
    subimage = (x[:,:,84:96,13:14]-0.495)*10
    speed = torch.sum(subimage,dim=(2,3))
    # keep only the top 84 rows for the convolutional stack
    x = x[:,:,:84,:]

    x = self.batchnormCNN1(self.LeakyReLU(self.conv1(x)))
    x = self.pool(x)
    x = self.batchnormCNN2(self.LeakyReLU(self.conv2(x)))
    x = self.pool(x)
    x = self.flatten(x)
    # append the scalar feature to the flattened conv features
    x = torch.cat((x,speed),dim=1)
    x = self.batchnormFC1(self.LeakyReLU(self.fc1(x)))
    x = self.LeakyReLU(self.fc2(x))
    x = self.fc3(x)
    return x

  def get_action(self,state):
    """Return the index of the highest Q-value per batch element (shape ``(batch,)``)."""
    # action selection never back-propagates, so skip building the autograd graph
    with torch.no_grad():
      qvals = self.forward(state)
    return torch.argmax(qvals,1)

  def convert_action(self,action,state):
    """Map a discrete action index to ``[steering, gas, brake]`` for the environment.

    Args:
      action: one-element tensor holding the action index (0, 1 or 2).
      state: current observation, passed to ``get_speed`` to decide the throttle.

    Returns:
      ``[steering, accel, 0]`` with steering in ``{-0.3, 0, 0.3}``.

    Raises:
      ValueError: if the action index is not 0, 1 or 2.
    """
    # determine if you are going too fast: no throttle above the speed threshold
    speed = get_speed(state).item()
    accel = 0 if speed > 2.5 else 0.1
    # convert action from index, to a list of turning,engine,breaking strengths
    action = action.item()
    # Discretized action space (left-forward,straight-forward,right-forward)
    steering_by_action = {0: -0.3, 1: 0, 2: 0.3}
    if action not in steering_by_action:
      raise ValueError("unknown action index: " + str(action))
    return [steering_by_action[action],accel,0]


### Training

In [None]:
def load_memory(new,epsilon,exp_replay_size,initial_size=None):
  """Create a replay buffer and pre-fill it by simulating episodes.

  Args:
    new: when False, the acting agent is initialised from ``car-racing-dqn.pth``;
      when True a freshly initialised ``DQN_Network`` is used.
    epsilon: exploration rate passed to ``simulate`` while filling the buffer.
    exp_replay_size: capacity of the replay buffer (also the maximum number of
      episodes that will be simulated).
    initial_size: stop filling once the buffer holds at least this many entries.
      Defaults to ``exp_replay_size``.

  Returns:
    The filled ``ExperienceReplay`` instance.
  """
  if initial_size is None:
    initial_size = exp_replay_size
  # Create the model
  env = wrap_env(gym.make("CarRacing-v1").unwrapped)
  agent = DQN_Network()
  if not new:
    # NOTE: torch.load unpickles the file; only load checkpoints you produced yourself.
    agent.load_state_dict(torch.load("car-racing-dqn.pth"))
  memory = ExperienceReplay(exp_replay_size)

  # initialize experience replay
  for _ in range(exp_replay_size):
    # NOTE(review): simulate() resets the environment again, so this reset looks redundant.
    env.reset()
    simulate(agent,env,epsilon = epsilon, memory = memory)
    if len(memory.experience_replay) >= initial_size:
      break
    # progress report: number of transitions collected so far
    print(len(memory.experience_replay))

  return memory


def update(agent,optimizer,loss_func,target_agent,memory,batch_size):
  """Perform one Q-learning gradient step on a minibatch sampled from ``memory``.

  The regression target is ``reward + agent.gamma * max_a Q_target(next_state, a)``
  and the prediction is ``Q_agent(state, action)``.

  Args:
    agent: online network being trained (must expose ``gamma``).
    optimizer: optimizer over ``agent``'s parameters.
    loss_func: callable ``loss_func(target, prediction)`` returning a scalar loss.
    target_agent: frozen network used to evaluate the next state.
    memory: buffer exposing ``sample_from_experience(batch_size)``.
    batch_size: requested minibatch size.
  """
  agent.train()
  target_agent.eval()
  state,action,reward,next_state = memory.sample_from_experience(batch_size)

  # Start from clean gradients so anything left in .grad by earlier code
  # cannot leak into this step.
  optimizer.zero_grad()

  # current (S,A) Qval: pick the Q-value of the action actually taken in each row
  Qvals = agent(state)
  curr_Qval = Qvals[torch.arange(Qvals.size(0)),action.long()]

  # best next (S,A) Qval, evaluated by the target network without tracking gradients
  with torch.no_grad():
    next_Qval = torch.max(target_agent(next_state),dim=1).values

  # update agent
  loss = loss_func(reward + agent.gamma*next_Qval, curr_Qval)
  loss.backward()
  optimizer.step()
  # leave no gradients behind between calls
  optimizer.zero_grad()


def _save_training_state(agent,reward_hist,ep_len_hist,epsilon_hist,lr_hist):
  """Write the model weights and the four history lists to their fixed file names."""
  torch.save(agent.state_dict(),"car-racing-dqn.pth")
  save_list(reward_hist,"reward_hist.data")
  save_list(ep_len_hist,"ep_len_hist.data")
  save_list(epsilon_hist,"epsilon_hist.data")
  save_list(lr_hist,"lr_hist.data")


def train(new,num_ep,lr_start,epsilon_start,gamma,memory):
  """Train a ``DQN_Network`` for ``num_ep`` episodes.

  Per episode: decay the learning rate and epsilon exponentially, play one
  epsilon-greedy episode (adding its transitions to ``memory``), run 30
  minibatch updates, then copy the online weights into the target network.
  Every 3rd episode a greedy test episode is run and logged to the history
  lists; every 30th episode, and once more after the last episode, the weights
  and histories are saved to disk.

  Args:
    new: True starts from fresh weights and empty histories; False resumes
      from ``car-racing-dqn.pth`` and the saved ``*_hist.data`` files.
    num_ep: number of training episodes.
    lr_start: learning rate at episode 0.
    epsilon_start: exploration rate at episode 0.
    gamma: discount factor.
    memory: replay buffer that is sampled from and appended to.
  """
  # set hyperparameters
  agent = DQN_Network(gamma=gamma)

  # start new run
  if new:
    reward_hist = []
    ep_len_hist = []
    lr_hist = []
    epsilon_hist = []
  # load previous runs
  else:
    # NOTE: torch.load unpickles the file; only load checkpoints you produced yourself.
    agent.load_state_dict(torch.load("car-racing-dqn.pth"))
    reward_hist = load_list("reward_hist.data")
    ep_len_hist = load_list("ep_len_hist.data")
    epsilon_hist = load_list("epsilon_hist.data")
    lr_hist = load_list("lr_hist.data")

  # initialize models
  target_agent = DQN_Network(agent.gamma)
  target_agent.load_state_dict(agent.state_dict())
  env = wrap_env(gym.make("CarRacing-v1").unwrapped)
  optimizer = torch.optim.SGD(agent.parameters(),lr_start)
  MSELoss = torch.nn.MSELoss()

  # training loop
  for ep_num in tqdm(range(0,num_ep)):
    # exponential decay schedules for learning rate and exploration
    lr = lr_start*(0.99042**ep_num)
    epsilon = epsilon_start*(0.99424**ep_num)

    for param_group in optimizer.param_groups:
      param_group['lr'] = lr

    # NOTE(review): simulate() resets the environment again, so this reset looks redundant.
    env.reset()
    simulate(agent,env,epsilon = epsilon, memory = memory)

    for _ in range(0,30):
      update(agent,optimizer,MSELoss,target_agent,memory,batch_size=32)
    # sync the target network with the online network once per episode
    target_agent.load_state_dict(agent.state_dict())
    gc.collect(generation=2)

    if ep_num%3==0:
      test_rewards, test_ep_lens = test_model(agent=agent,env=env, episodes=1)
      print("Settings: lr =",lr,"epsilon =",epsilon)
      print("Test Result: reward =",test_rewards[0],"episode length =",test_ep_lens[0])
      reward_hist.append(test_rewards[0])
      ep_len_hist.append(test_ep_lens[0])
      lr_hist.append(lr)
      epsilon_hist.append(epsilon)

    if ep_num%30==0:
      # save results
      _save_training_state(agent,reward_hist,ep_len_hist,epsilon_hist,lr_hist)

  # Save the final state too, unless the last episode already triggered a save;
  # otherwise everything learned after the last multiple of 30 would be lost.
  if num_ep > 0 and (num_ep-1)%30 != 0:
    _save_training_state(agent,reward_hist,ep_len_hist,epsilon_hist,lr_hist)

In [None]:
# Pre-fill a 2000-entry replay buffer with a fully random policy (epsilon=1), then train a
# fresh network for 360 episodes using that buffer.
memory = load_memory(new=True,epsilon=1,exp_replay_size=2000)
train(new=True,num_ep = 360,lr_start=0.0003,epsilon_start=0.8,gamma=0.92,memory=memory)

  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "


12
20
27
35
44
55
64
74
90
99
108
119
136
157
169
185
205
219
227
237
254
268
297
305
321
333
352
360
367
383
394
405
423
444
473
487
499
511
520
535
547
555
563
571
587
595
613
630
638
653
664
686
699
706
719
728
738
748
768
786
795
806
821
847
869
885
905
919
928
945
957
968
980
988
1010
1018
1029
1042
1051
1064
1083
1091
1100
1137
1164
1183
1195
1209
1225
1237
1254
1265
1276
1288
1299
1312
1329
1338
1346
1364
1377
1386
1394
1402
1411
1420
1432
1445
1457
1466
1479
1490
1501
1510
1520
1539
1562
1570
1580
1588
1597
1608
1625
1640
1667
1677
1690
1710
1733
1746
1758
1768
1777
1794
1802
1810
1824
1832
1846
1863
1878
1897
1913
1933
1945
1962
1979
1990


  0%|          | 1/360 [00:05<35:45,  5.98s/it]

Test 1/1: reward = 34.98156996587023  episode len = 125

Average Reward =  34.98156996587023 Average Ep_len =  125.0 

Settings: lr = 0.0003 epsilon = 0.8
Test Result: reward = 34.98156996587023 episode length = 125


  1%|          | 4/360 [00:20<31:12,  5.26s/it]

Test 1/1: reward = 37.741516245487276  episode len = 125

Average Reward =  37.741516245487276 Average Ep_len =  125.0 

Settings: lr = 0.00029146033499462634 epsilon = 0.7862554733576193
Test Result: reward = 37.741516245487276 episode length = 125


  2%|▏         | 7/360 [00:34<29:10,  4.96s/it]

Test 1/1: reward = 30.26666666666661  episode len = 110

Average Reward =  30.26666666666661 Average Ep_len =  110.0 

Settings: lr = 0.0002831637562505994 epsilon = 0.7727470867310173
Test Result: reward = 30.26666666666661 episode length = 110


  3%|▎         | 10/360 [00:49<30:18,  5.19s/it]

Test 1/1: reward = 38.65901639344254  episode len = 135

Average Reward =  38.65901639344254 Average Ep_len =  135.0 

Settings: lr = 0.0002751033441837881 epsilon = 0.7594707830792716
Test Result: reward = 38.65901639344254 episode length = 135


  4%|▎         | 13/360 [01:03<29:01,  5.02s/it]

Test 1/1: reward = 32.77353951890027  episode len = 115

Average Reward =  32.77353951890027 Average Ep_len =  115.0 

Settings: lr = 0.00026727237617982957 epsilon = 0.7464225750640932
Test Result: reward = 32.77353951890027 episode length = 115


  4%|▍         | 16/360 [01:19<32:54,  5.74s/it]

Test 1/1: reward = 109.59352517985647  episode len = 230

Average Reward =  109.59352517985647 Average Ep_len =  230.0 

Settings: lr = 0.0002596643209872764 epsilon = 0.7335985438522895
Test Result: reward = 109.59352517985647 episode length = 230


  5%|▌         | 19/360 [01:38<36:15,  6.38s/it]

Test 1/1: reward = 180.5170648464175  episode len = 340

Average Reward =  180.5170648464175 Average Ep_len =  340.0 

Settings: lr = 0.0002522728332703459 epsilon = 0.7209948379388026
Test Result: reward = 180.5170648464175 episode length = 340


  6%|▌         | 22/360 [01:55<34:52,  6.19s/it]

Test 1/1: reward = 123.67593984962474  episode len = 265

Average Reward =  123.67593984962474 Average Ep_len =  265.0 

Settings: lr = 0.0002450917483167285 epsilon = 0.7086076719899665
Test Result: reward = 123.67593984962474 episode length = 265


  7%|▋         | 25/360 [02:18<45:16,  8.11s/it]

Test 1/1: reward = 541.7079136690567  episode len = 730

Average Reward =  541.7079136690567 Average Ep_len =  730.0 

Settings: lr = 0.00023811507689604114 epsilon = 0.6964333257066396
Test Result: reward = 541.7079136690567 episode length = 730


  8%|▊         | 28/360 [02:42<48:10,  8.71s/it]

Test 1/1: reward = 341.7481481481474  episode len = 560

Average Reward =  341.7481481481474 Average Ep_len =  560.0 

Settings: lr = 0.00023133700026463792 epsilon = 0.6844681427068687
Test Result: reward = 341.7481481481474 episode length = 560


  9%|▊         | 31/360 [03:17<1:08:10, 12.43s/it]

Test 1/1: reward = 878.5328621907947  episode len = 1140

Average Reward =  878.5328621907947 Average Ep_len =  1140.0 

Settings: lr = 0.00022475186531261112 epsilon = 0.6727085294277494
Test Result: reward = 878.5328621907947 episode length = 1140


  9%|▉         | 34/360 [03:45<1:02:52, 11.57s/it]

Test 1/1: reward = 571.1999999999912  episode len = 910

Average Reward =  571.1999999999912 Average Ep_len =  910.0 

Settings: lr = 0.0002183541798489359 epsilon = 0.6611509540461538
Test Result: reward = 571.1999999999912 episode length = 910


 10%|█         | 37/360 [04:17<1:06:19, 12.32s/it]

Test 1/1: reward = 866.9818181818008  episode len = 1110

Average Reward =  866.9818181818008 Average Ep_len =  1110.0 

Settings: lr = 0.00021213860802082586 epsilon = 0.6497919454180002
Test Result: reward = 866.9818181818008 episode length = 1110


 11%|█         | 40/360 [04:45<1:01:25, 11.52s/it]

Test 1/1: reward = 525.8511705685553  episode len = 790

Average Reward =  525.8511705685553 Average Ep_len =  790.0 

Settings: lr = 0.00020609996586347878 epsilon = 0.6386280920357476
Test Result: reward = 525.8511705685553 episode length = 790


 12%|█▏        | 43/360 [05:08<50:56,  9.64s/it]

Test 1/1: reward = 259.93039513677945  episode len = 500

Average Reward =  259.93039513677945 Average Ep_len =  500.0 

Settings: lr = 0.00020023321697650197 epsilon = 0.6276560410037999
Test Result: reward = 259.93039513677945 episode length = 500


 13%|█▎        | 46/360 [05:36<56:36, 10.82s/it]

Test 1/1: reward = 664.6731629392862  episode len = 1020

Average Reward =  664.6731629392862 Average Ep_len =  1020.0 

Settings: lr = 0.0001945334683234099 epsilon = 0.6168724970315149
Test Result: reward = 664.6731629392862 episode length = 1020


 14%|█▎        | 49/360 [05:59<48:39,  9.39s/it]

Test 1/1: reward = 329.93262411347433  episode len = 490

Average Reward =  329.93262411347433 Average Ep_len =  490.0 

Settings: lr = 0.00018899596615069196 epsilon = 0.606274221443513
Test Result: reward = 329.93262411347433 episode length = 490


 14%|█▍        | 52/360 [06:29<57:54, 11.28s/it]

Test 1/1: reward = 817.6283018867798  episode len = 1065

Average Reward =  817.6283018867798 Average Ep_len =  1065.0 

Settings: lr = 0.0001836160920230458 epsilon = 0.5958580312069891
Test Result: reward = 817.6283018867798 episode length = 1065


 15%|█▌        | 55/360 [06:54<51:02, 10.04s/it]

Test 1/1: reward = 411.9100671140908  episode len = 645

Average Reward =  411.9100671140908 Average Ep_len =  645.0 

Settings: lr = 0.00017838935897147027 epsilon = 0.5856207979757379
Test Result: reward = 411.9100671140908 episode length = 645


 16%|█▌        | 58/360 [07:13<40:37,  8.07s/it]

Test 1/1: reward = 149.7303030303036  episode len = 280

Average Reward =  149.7303030303036 Average Ep_len =  280.0 

Settings: lr = 0.00017331140775100459 epsilon = 0.5755594471506006
Test Result: reward = 149.7303030303036 episode length = 280


 17%|█▋        | 61/360 [07:36<41:36,  8.35s/it]

Test 1/1: reward = 287.90617283950667  episode len = 515

Average Reward =  287.90617283950667 Average Ep_len =  515.0 

Settings: lr = 0.00016837800320499362 epsilon = 0.5656709569560564
Test Result: reward = 287.90617283950667 episode length = 515


 18%|█▊        | 64/360 [08:06<52:41, 10.68s/it]

Test 1/1: reward = 749.0315789473523  episode len = 1035

Average Reward =  749.0315789473523 Average Ep_len =  1035.0 

Settings: lr = 0.00016358503073284568 epsilon = 0.5559523575326769
Test Result: reward = 749.0315789473523 episode length = 1035


 19%|█▊        | 67/360 [08:33<50:58, 10.44s/it]

Test 1/1: reward = 547.6733788395808  episode len = 765

Average Reward =  547.6733788395808 Average Ep_len =  765.0 

Settings: lr = 0.0001589284928583382 epsilon = 0.5464007300451741
Test Result: reward = 547.6733788395808 episode length = 765


 19%|█▉        | 70/360 [09:06<59:39, 12.34s/it]

Test 1/1: reward = 889.4999999999827  episode len = 1100

Average Reward =  889.4999999999827 Average Ep_len =  1100.0 

Settings: lr = 0.0001544045058956078 epsilon = 0.5370132058057714
Test Result: reward = 889.4999999999827 episode length = 1100


 20%|██        | 73/360 [09:38<1:01:10, 12.79s/it]

Test 1/1: reward = 885.3999999999819  episode len = 1145

Average Reward =  885.3999999999819 Average Ep_len =  1145.0 

Settings: lr = 0.00015000929671004533 epsilon = 0.5277869654126367
Test Result: reward = 885.3999999999819 episode length = 1145


 21%|██        | 76/360 [10:05<54:20, 11.48s/it]

Test 1/1: reward = 490.1118644067733  episode len = 725

Average Reward =  490.1118644067733 Average Ep_len =  725.0 

Settings: lr = 0.00014573919957139373 epsilon = 0.5187192379031176
Test Result: reward = 490.1118644067733 episode length = 725


 22%|██▏       | 79/360 [10:36<54:56, 11.73s/it]

Test 1/1: reward = 549.4042071197321  episode len = 845

Average Reward =  549.4042071197321 Average Ep_len =  845.0 

Settings: lr = 0.00014159065309642376 epsilon = 0.5098072999215241
Test Result: reward = 549.4042071197321 episode length = 845


 23%|██▎       | 82/360 [10:59<45:23,  9.80s/it]

Test 1/1: reward = 337.748829431437  episode len = 565

Average Reward =  337.748829431437 Average Ep_len =  565.0 

Settings: lr = 0.00013756019727863866 epsilon = 0.5010484749012095
Test Result: reward = 337.748829431437 episode length = 565


 24%|██▎       | 85/360 [11:27<49:58, 10.90s/it]

Test 1/1: reward = 835.7312977099123  episode len = 1030

Average Reward =  835.7312977099123 Average Ep_len =  1030.0 

Settings: lr = 0.00013364447060252972 epsilon = 0.49244013226070465
Test Result: reward = 835.7312977099123 episode length = 1030


 24%|██▍       | 88/360 [12:00<56:18, 12.42s/it]

Test 1/1: reward = 891.7999999999902  episode len = 1080

Average Reward =  891.7999999999902 Average Ep_len =  1080.0 

Settings: lr = 0.00012984020723997604 epsilon = 0.48397968661366114
Test Result: reward = 891.7999999999902 episode length = 1080


 25%|██▌       | 91/360 [12:28<49:14, 10.98s/it]

Test 1/1: reward = 279.25652173913005  episode len = 575

Average Reward =  279.25652173913005 Average Ep_len =  575.0 

Settings: lr = 0.00012614423432645043 epsilon = 0.4756645969923705
Test Result: reward = 279.25652173913005 episode length = 575


 26%|██▌       | 94/360 [12:52<44:22, 10.01s/it]

Test 1/1: reward = 291.0061855670103  episode len = 490

Average Reward =  291.0061855670103 Average Ep_len =  490.0 

Settings: lr = 0.00012255346931475963 epsilon = 0.46749236608462175
Test Result: reward = 291.0061855670103 episode length = 490


 27%|██▋       | 97/360 [13:15<40:08,  9.16s/it]

Test 1/1: reward = 363.4725631768924  episode len = 550

Average Reward =  363.4725631768924 Average Ep_len =  550.0 

Settings: lr = 0.00011906491740411169 epsilon = 0.45946053948367216
Test Result: reward = 363.4725631768924 episode length = 550


 28%|██▊       | 100/360 [13:46<48:44, 11.25s/it]

Test 1/1: reward = 656.2221476509955  episode len = 920

Average Reward =  656.2221476509955 Average Ep_len =  920.0 

Settings: lr = 0.00011567566904236637 epsilon = 0.4515667049511022
Test Result: reward = 656.2221476509955 episode length = 920


 29%|██▊       | 103/360 [14:07<37:21,  8.72s/it]

Test 1/1: reward = 193.87142857142987  episode len = 345

Average Reward =  193.87142857142987 Average Ep_len =  345.0 

Settings: lr = 0.00011238289749938545 epsilon = 0.4438084916923365
Test Result: reward = 193.87142857142987 episode length = 345


 29%|██▉       | 106/360 [14:40<49:34, 11.71s/it]

Test 1/1: reward = 865.1385964912097  episode len = 1100

Average Reward =  865.1385964912097 Average Ep_len =  1100.0 

Settings: lr = 0.0001091838565094588 epsilon = 0.43618356964461136
Test Result: reward = 865.1385964912097 episode length = 1100


 30%|███       | 109/360 [15:17<57:58, 13.86s/it]

Test 1/1: reward = 854.9919614147725  episode len = 1220

Average Reward =  854.9919614147725 Average Ep_len =  1220.0 

Settings: lr = 0.00010607587798084027 epsilon = 0.42868964877717497
Test Result: reward = 854.9919614147725 episode length = 1220


 31%|███       | 112/360 [15:36<38:11,  9.24s/it]

Test 1/1: reward = 130.21584158415905  episode len = 280

Average Reward =  130.21584158415905 Average Ep_len =  280.0 

Settings: lr = 0.0001030563697704827 epsilon = 0.42132447840351156
Test Result: reward = 130.21584158415905 episode length = 280


 32%|███▏      | 115/360 [16:11<49:27, 12.11s/it]

Test 1/1: reward = 885.4999999999801  episode len = 1140

Average Reward =  885.4999999999801 Average Ep_len =  1140.0 

Settings: lr = 0.00010012281352211659 epsilon = 0.4140858465053812
Test Result: reward = 885.4999999999801 episode length = 1140


 33%|███▎      | 118/360 [16:45<51:42, 12.82s/it]

Test 1/1: reward = 735.8838283828281  episode len = 1020

Average Reward =  735.8838283828281 Average Ep_len =  1020.0 

Settings: lr = 9.72727625658687e-05 epsilon = 0.4069715790684737
Test Result: reward = 735.8838283828281 episode length = 1020


 34%|███▎      | 121/360 [17:07<38:33,  9.68s/it]

Test 1/1: reward = 218.92899022801447  episode len = 380

Average Reward =  218.92899022801447 Average Ep_len =  380.0 

Settings: lr = 9.450383987766947e-05 epsilon = 0.39997953942947573
Test Result: reward = 218.92899022801447 episode length = 380


 34%|███▍      | 124/360 [17:41<47:23, 12.05s/it]

Test 1/1: reward = 883.6999999999787  episode len = 1160

Average Reward =  883.6999999999787 Average Ep_len =  1160.0 

Settings: lr = 9.181373609674692e-05 epsilon = 0.3931076276343562
Test Result: reward = 883.6999999999787 episode length = 1160


 35%|███▌      | 127/360 [18:18<53:26, 13.76s/it]

Test 1/1: reward = 874.9142857142679  episode len = 1105

Average Reward =  874.9142857142679 Average Ep_len =  1105.0 

Settings: lr = 8.92002075995536e-05 epsilon = 0.3863537798076768
Test Result: reward = 874.9142857142679 episode length = 1105


 36%|███▌      | 130/360 [18:55<54:46, 14.29s/it]

Test 1/1: reward = 887.7999999999843  episode len = 1120

Average Reward =  887.7999999999843 Average Ep_len =  1120.0 

Settings: lr = 8.666107462852034e-05 epsilon = 0.37971596753273795
Test Result: reward = 887.7999999999843 episode length = 1120


 37%|███▋      | 133/360 [19:29<51:06, 13.51s/it]

Test 1/1: reward = 885.3664310953891  episode len = 1110

Average Reward =  885.3664310953891 Average Ep_len =  1110.0 

Settings: lr = 8.419421947407619e-05 epsilon = 0.373192197242374
Test Result: reward = 885.3664310953891 episode length = 1110


 38%|███▊      | 136/360 [19:59<43:07, 11.55s/it]

Test 1/1: reward = 352.8591240875895  episode len = 555

Average Reward =  352.8591240875895 Average Ep_len =  555.0 

Settings: lr = 8.179758470841781e-05 epsilon = 0.36678050962021597
Test Result: reward = 352.8591240875895 episode length = 555


 39%|███▊      | 139/360 [20:30<44:11, 12.00s/it]

Test 1/1: reward = 884.2734693877386  episode len = 990

Average Reward =  884.2734693877386 Average Ep_len =  990.0 

Settings: lr = 7.946917146955593e-05 epsilon = 0.3604789790122397
Test Result: reward = 884.2734693877386 episode length = 990


 39%|███▉      | 142/360 [20:53<33:53,  9.33s/it]

Test 1/1: reward = 184.21428571428703  episode len = 385

Average Reward =  184.21428571428703 Average Ep_len =  385.0 

Settings: lr = 7.720703779420727e-05 epsilon = 0.3542857128484247
Test Result: reward = 184.21428571428703 episode length = 385


 40%|████      | 145/360 [21:29<44:43, 12.48s/it]

Test 1/1: reward = 866.6814432989497  episode len = 1125

Average Reward =  866.6814432989497 Average Ep_len =  1125.0 

Settings: lr = 7.500929699814144e-05 epsilon = 0.34819885107434967
Test Result: reward = 866.6814432989497 episode length = 1125


 41%|████      | 148/360 [21:54<37:46, 10.69s/it]

Test 1/1: reward = 377.06370106761347  episode len = 570

Average Reward =  377.06370106761347 Average Ep_len =  570.0 

Settings: lr = 7.28741161026324e-05 epsilon = 0.3422165655925525
Test Result: reward = 377.06370106761347 episode length = 570


 42%|████▏     | 151/360 [22:20<34:53, 10.02s/it]

Test 1/1: reward = 241.6117647058839  episode len = 375

Average Reward =  241.6117647058839 Average Ep_len =  375.0 

Settings: lr = 7.079971430570178e-05 epsilon = 0.3363370597134889
Test Result: reward = 241.6117647058839 episode length = 375


 43%|████▎     | 154/360 [22:47<33:16,  9.69s/it]

Test 1/1: reward = 199.77500000000106  episode len = 345

Average Reward =  199.77500000000106 Average Ep_len =  345.0 

Settings: lr = 6.878436149687895e-05 epsilon = 0.3305585676159238
Test Result: reward = 199.77500000000106 episode length = 345


 44%|████▎     | 157/360 [23:10<27:45,  8.20s/it]

Test 1/1: reward = 60.07589576547214  episode len = 180

Average Reward =  60.07589576547214 Average Ep_len =  180.0 

Settings: lr = 6.68263768142394e-05 epsilon = 0.32487935381659344
Test Result: reward = 60.07589576547214 episode length = 180


 44%|████▍     | 160/360 [23:39<34:05, 10.23s/it]

Test 1/1: reward = 546.4863481228571  episode len = 810

Average Reward =  546.4863481228571 Average Ep_len =  810.0 

Settings: lr = 6.492412724251782e-05 epsilon = 0.3192977126489789
Test Result: reward = 546.4863481228571 episode length = 810


 45%|████▌     | 163/360 [24:01<28:52,  8.79s/it]

Test 1/1: reward = 273.38953068592065  episode len = 440

Average Reward =  273.38953068592065 Average Ep_len =  440.0 

Settings: lr = 6.307602625112664e-05 epsilon = 0.313811967751035
Test Result: reward = 273.38953068592065 episode length = 440


 46%|████▌     | 166/360 [24:23<27:45,  8.58s/it]

Test 1/1: reward = 268.29090909090917  episode len = 405

Average Reward =  268.29090909090917 Average Ep_len =  405.0 

Settings: lr = 6.128053247094406e-05 epsilon = 0.30842047156172
Test Result: reward = 268.29090909090917 episode length = 405


 47%|████▋     | 169/360 [24:54<35:18, 11.09s/it]

Test 1/1: reward = 864.7486590038212  episode len = 1045

Average Reward =  864.7486590038212 Average Ep_len =  1045.0 

Settings: lr = 5.953614840876811e-05 epsilon = 0.30312160482617534
Test Result: reward = 864.7486590038212 episode length = 1045


 48%|████▊     | 172/360 [25:28<39:01, 12.46s/it]

Test 1/1: reward = 826.5215946843729  episode len = 1165

Average Reward =  826.5215946843729 Average Ep_len =  1165.0 

Settings: lr = 5.784141919836449e-05 epsilon = 0.2979137761094071
Test Result: reward = 826.5215946843729 episode length = 1165


 49%|████▊     | 175/360 [25:53<32:08, 10.42s/it]

Test 1/1: reward = 340.0174496644283  episode len = 525

Average Reward =  340.0174496644283 Average Ep_len =  525.0 

Settings: lr = 5.619493138706643e-05 epsilon = 0.2927954213183221
Test Result: reward = 340.0174496644283 episode length = 525


 49%|████▉     | 178/360 [26:14<26:12,  8.64s/it]

Test 1/1: reward = 276.18469750889744  episode len = 440

Average Reward =  276.18469750889744 Average Ep_len =  440.0 

Settings: lr = 5.4595311756914746e-05 epsilon = 0.2877650032319761
Test Result: reward = 276.18469750889744 episode length = 440


 50%|█████     | 181/360 [26:36<24:31,  8.22s/it]

Test 1/1: reward = 192.6581699346417  episode len = 360

Average Reward =  192.6581699346417 Average Ep_len =  360.0 

Settings: lr = 5.304122617935478e-05 epsilon = 0.28282101103989277
Test Result: reward = 192.6581699346417 episode length = 360


 51%|█████     | 184/360 [26:58<23:58,  8.17s/it]

Test 1/1: reward = 270.6014084507052  episode len = 460

Average Reward =  270.6014084507052 Average Ep_len =  460.0 

Settings: lr = 5.153137850253497e-05 epsilon = 0.2779619598883142
Test Result: reward = 270.6014084507052 episode length = 460


 52%|█████▏    | 187/360 [27:24<26:47,  9.29s/it]

Test 1/1: reward = 314.24966887417145  episode len = 530

Average Reward =  314.24966887417145 Average Ep_len =  530.0 

Settings: lr = 5.00645094702791e-05 epsilon = 0.27318639043424753
Test Result: reward = 314.24966887417145 episode length = 530


 53%|█████▎    | 190/360 [27:47<24:09,  8.52s/it]

Test 1/1: reward = 163.07586206896647  episode len = 350

Average Reward =  163.07586206896647 Average Ep_len =  350.0 

Settings: lr = 4.863939567183063e-05 epsilon = 0.26849286840717335
Test Result: reward = 163.07586206896647 episode length = 350


 54%|█████▎    | 193/360 [28:03<18:47,  6.75s/it]

Test 1/1: reward = 45.340590405903946  episode len = 135

Average Reward =  45.340590405903946 Average Ep_len =  135.0 

Settings: lr = 4.7254848521493125e-05 epsilon = 0.2638799841782838
Test Result: reward = 45.340590405903946 episode length = 135


 54%|█████▍    | 196/360 [28:20<16:39,  6.09s/it]

Test 1/1: reward = 43.037588652482164  episode len = 135

Average Reward =  43.037588652482164 Average Ep_len =  135.0 

Settings: lr = 4.5909713267315705e-05 epsilon = 0.259346352337122
Test Result: reward = 43.037588652482164 episode length = 135


 55%|█████▌    | 199/360 [28:47<22:43,  8.47s/it]

Test 1/1: reward = 309.66946107784383  episode len = 555

Average Reward =  309.66946107784383 Average Ep_len =  555.0 

Settings: lr = 4.4602868027996924e-05 epsilon = 0.25489061127549467
Test Result: reward = 309.66946107784383 episode length = 555


 56%|█████▌    | 202/360 [29:27<33:44, 12.81s/it]

Test 1/1: reward = 866.5292418772465  episode len = 1080

Average Reward =  866.5292418772465 Average Ep_len =  1080.0 

Settings: lr = 4.3333222857203645e-05 epsilon = 0.2505114227785338
Test Result: reward = 866.5292418772465 episode length = 1080


 57%|█████▋    | 205/360 [29:57<31:29, 12.19s/it]

Test 1/1: reward = 761.1061302681903  episode len = 930

Average Reward =  761.1061302681903 Average Ep_len =  930.0 

Settings: lr = 4.209971883452459e-05 epsilon = 0.24620747162278345
Test Result: reward = 761.1061302681903 episode length = 930


 58%|█████▊    | 208/360 [30:24<26:46, 10.57s/it]

Test 1/1: reward = 224.42352941176537  episode len = 340

Average Reward =  224.42352941176537 Average Ep_len =  340.0 

Settings: lr = 4.090132718230039e-05 epsilon = 0.24197746518119276
Test Result: reward = 224.42352941176537 episode length = 340


 59%|█████▊    | 211/360 [30:51<25:58, 10.46s/it]

Test 1/1: reward = 499.5747603833802  episode len = 785

Average Reward =  499.5747603833802 Average Ep_len =  785.0 

Settings: lr = 3.973704840759363e-05 epsilon = 0.2378201330348944
Test Result: reward = 499.5747603833802 episode length = 785


 59%|█████▉    | 214/360 [31:14<22:05,  9.08s/it]

Test 1/1: reward = 170.25479452054867  episode len = 315

Average Reward =  170.25479452054867 Average Ep_len =  315.0 

Settings: lr = 3.8605911468583084e-05 epsilon = 0.2337342265916536
Test Result: reward = 170.25479452054867 episode length = 315


 60%|██████    | 217/360 [31:35<19:08,  8.03s/it]

Test 1/1: reward = 163.14628975265083  episode len = 310

Average Reward =  163.14628975265083 Average Ep_len =  310.0 

Settings: lr = 3.7506972964687045e-05 epsilon = 0.22971851871087204
Test Result: reward = 163.14628975265083 episode length = 310


 61%|██████    | 220/360 [31:59<19:37,  8.41s/it]

Test 1/1: reward = 205.2181184668999  episode len = 350

Average Reward =  205.2181184668999 Average Ep_len =  350.0 

Settings: lr = 3.643931634974026e-05 epsilon = 0.22577180333503477
Test Result: reward = 205.2181184668999 episode length = 350


 62%|██████▏   | 223/360 [32:22<19:03,  8.35s/it]

Test 1/1: reward = 217.99706744868195  episode len = 425

Average Reward =  217.99706744868195 Average Ep_len =  425.0 

Settings: lr = 3.540205116756821e-05 epsilon = 0.22189289512748883
Test Result: reward = 217.99706744868195 episode length = 425


 63%|██████▎   | 226/360 [32:42<16:52,  7.56s/it]

Test 1/1: reward = 141.91379310344908  episode len = 300

Average Reward =  141.91379310344908 Average Ep_len =  300.0 

Settings: lr = 3.4394312309321114e-05 epsilon = 0.21808062911644535
Test Result: reward = 141.91379310344908 episode length = 300


 64%|██████▎   | 229/360 [33:01<14:49,  6.79s/it]

Test 1/1: reward = 53.444444444444315  episode len = 155

Average Reward =  53.444444444444315 Average Ep_len =  155.0 

Settings: lr = 3.3415259291948445e-05 epsilon = 0.2143338603450977
Test Result: reward = 53.444444444444315 episode length = 155


 64%|██████▍   | 232/360 [33:17<12:59,  6.09s/it]

Test 1/1: reward = 97.77394636015346  episode len = 205

Average Reward =  97.77394636015346 Average Ep_len =  205.0 

Settings: lr = 3.246407555721199e-05 epsilon = 0.21065146352775077
Test Result: reward = 97.77394636015346 episode length = 205


 65%|██████▌   | 235/360 [33:32<11:43,  5.62s/it]

Test 1/1: reward = 56.42727272727255  episode len = 160

Average Reward =  56.42727272727255 Average Ep_len =  160.0 

Settings: lr = 3.153996779065289e-05 epsilon = 0.20703233271185872
Test Result: reward = 56.42727272727255 episode length = 160


 66%|██████▌   | 238/360 [33:47<10:55,  5.37s/it]

Test 1/1: reward = 59.271698113207414  episode len = 160

Average Reward =  59.271698113207414 Average Ep_len =  160.0 

Settings: lr = 3.064216525994472e-05 epsilon = 0.20347538094586823
Test Result: reward = 59.271698113207414 episode length = 160


 67%|██████▋   | 241/360 [34:04<11:58,  6.04s/it]

Test 1/1: reward = 67.00858085808581  episode len = 185

Average Reward =  67.00858085808581 Average Ep_len =  185.0 

Settings: lr = 2.976991917208064e-05 epsilon = 0.1999795399527694
Test Result: reward = 67.00858085808581 episode length = 185


 68%|██████▊   | 244/360 [34:23<12:50,  6.64s/it]

Test 1/1: reward = 261.0000000000002  episode len = 385

Average Reward =  261.0000000000002 Average Ep_len =  385.0 

Settings: lr = 2.892250204885858e-05 epsilon = 0.19654375980925454
Test Result: reward = 261.0000000000002 episode length = 385


 69%|██████▊   | 247/360 [34:38<10:44,  5.71s/it]

Test 1/1: reward = 56.00216606498182  episode len = 160

Average Reward =  56.00216606498182 Average Ep_len =  160.0 

Settings: lr = 2.809920712014363e-05 epsilon = 0.19316700863038955
Test Result: reward = 56.00216606498182 episode length = 160


 69%|██████▉   | 250/360 [34:56<11:17,  6.16s/it]

Test 1/1: reward = 57.73309608540911  episode len = 165

Average Reward =  57.73309608540911 Average Ep_len =  165.0 

Settings: lr = 2.729934773440151e-05 epsilon = 0.18984827225970283
Test Result: reward = 57.73309608540911 episode length = 165


 70%|███████   | 253/360 [35:14<11:08,  6.25s/it]

Test 1/1: reward = 81.97519379844977  episode len = 185

Average Reward =  81.97519379844977 Average Ep_len =  185.0 

Settings: lr = 2.652225678601153e-05 epsilon = 0.18658655396459853
Test Result: reward = 81.97519379844977 episode length = 185


 71%|███████   | 256/360 [35:30<10:10,  5.87s/it]

Test 1/1: reward = 52.74126984126972  episode len = 170

Average Reward =  52.74126984126972 Average Ep_len =  170.0 

Settings: lr = 2.5767286158881407e-05 epsilon = 0.18338087413700296
Test Result: reward = 52.74126984126972 episode length = 170


 72%|███████▏  | 259/360 [35:47<10:27,  6.22s/it]

Test 1/1: reward = 46.299999999999876  episode len = 160

Average Reward =  46.299999999999876 Average Ep_len =  160.0 

Settings: lr = 2.5033806185899917e-05 epsilon = 0.18023026999915406
Test Result: reward = 46.299999999999876 episode length = 160


 73%|███████▎  | 262/360 [36:04<09:37,  5.90s/it]

Test 1/1: reward = 51.537809187278995  episode len = 155

Average Reward =  51.537809187278995 Average Ep_len =  155.0 

Settings: lr = 2.4321205123776464e-05 epsilon = 0.1771337953144455
Test Result: reward = 51.537809187278995 episode length = 155


 74%|███████▎  | 265/360 [36:21<09:17,  5.86s/it]

Test 1/1: reward = 49.806600660065854  episode len = 160

Average Reward =  49.806600660065854 Average Ep_len =  160.0 

Settings: lr = 2.3628888642829706e-05 epsilon = 0.17409052010323875
Test Result: reward = 49.806600660065854 episode length = 160


 74%|███████▍  | 268/360 [36:35<08:14,  5.38s/it]

Test 1/1: reward = 53.386411149825626  episode len = 160

Average Reward =  53.386411149825626 Average Ep_len =  160.0 

Settings: lr = 2.295627933129956e-05 epsilon = 0.17109953036355763
Test Result: reward = 53.386411149825626 episode length = 160


 75%|███████▌  | 271/360 [36:51<08:01,  5.41s/it]

Test 1/1: reward = 45.98831168831155  episode len = 155

Average Reward =  45.98831168831155 Average Ep_len =  155.0 

Settings: lr = 2.2302816213769293e-05 epsilon = 0.16815992779658168
Test Result: reward = 45.98831168831155 episode length = 155


 76%|███████▌  | 274/360 [37:06<07:43,  5.39s/it]

Test 1/1: reward = 53.24444444444431  episode len = 160

Average Reward =  53.24444444444431 Average Ep_len =  160.0 

Settings: lr = 2.166795428329594e-05 epsilon = 0.1652708295368555
Test Result: reward = 53.24444444444431 episode length = 160


 77%|███████▋  | 277/360 [37:23<07:58,  5.76s/it]

Test 1/1: reward = 52.52852233676964  episode len = 160

Average Reward =  52.52852233676964 Average Ep_len =  160.0 

Settings: lr = 2.1051164046858948e-05 epsilon = 0.16243136788713342
Test Result: reward = 52.52852233676964 episode length = 160


 78%|███████▊  | 280/360 [37:39<07:29,  5.62s/it]

Test 1/1: reward = 51.881818181818026  episode len = 160

Average Reward =  51.881818181818026 Average Ep_len =  160.0 

Settings: lr = 2.045193108374781e-05 epsilon = 0.1596406900577796
Test Result: reward = 51.881818181818026 episode length = 160


 79%|███████▊  | 283/360 [37:55<07:08,  5.57s/it]

Test 1/1: reward = 54.645945945945826  episode len = 160

Average Reward =  54.645945945945826 Average Ep_len =  160.0 

Settings: lr = 1.9869755616520497e-05 epsilon = 0.15689795791064556
Test Result: reward = 54.645945945945826 episode length = 160


 79%|███████▉  | 286/360 [38:10<06:41,  5.43s/it]

Test 1/1: reward = 70.1321543408361  episode len = 195

Average Reward =  70.1321543408361 Average Ep_len =  195.0 

Settings: lr = 1.9304152094174742e-05 epsilon = 0.1542023477073481
Test Result: reward = 70.1321543408361 episode length = 195


 80%|████████  | 289/360 [38:26<06:27,  5.45s/it]

Test 1/1: reward = 29.433546325878535  episode len = 120

Average Reward =  29.433546325878535 Average Ep_len =  120.0 

Settings: lr = 1.875464878718463e-05 epsilon = 0.15155304986187143
Test Result: reward = 29.433546325878535 episode length = 120


 81%|████████  | 292/360 [38:39<05:37,  4.96s/it]

Test 1/1: reward = 25.80087463556845  episode len = 120

Average Reward =  25.80087463556845 Average Ep_len =  120.0 

Settings: lr = 1.822078739406465e-05 epsilon = 0.14894926869742073
Test Result: reward = 25.80087463556845 episode length = 120


 82%|████████▏ | 295/360 [38:53<05:11,  4.79s/it]

Test 1/1: reward = 34.49498207885296  episode len = 120

Average Reward =  34.49498207885296 Average Ep_len =  120.0 

Settings: lr = 1.770212265913316e-05 epsilon = 0.1463902222074522
Test Result: reward = 34.49498207885296 episode length = 120


 83%|████████▎ | 298/360 [39:10<05:52,  5.69s/it]

Test 1/1: reward = 72.56770186335412  episode len = 205

Average Reward =  72.56770186335412 Average Ep_len =  205.0 

Settings: lr = 1.719822200115639e-05 epsilon = 0.14387514182080924
Test Result: reward = 72.56770186335412 episode length = 205


 84%|████████▎ | 301/360 [39:31<06:38,  6.75s/it]

Test 1/1: reward = 155.25338078291895  episode len = 295

Average Reward =  155.25338078291895 Average Ep_len =  295.0 

Settings: lr = 1.6708665152563318e-05 epsilon = 0.1414032721708937
Test Result: reward = 155.25338078291895 episode length = 295


 84%|████████▍ | 304/360 [39:50<06:25,  6.89s/it]

Test 1/1: reward = 110.83333333333371  episode len = 220

Average Reward =  110.83333333333371 Average Ep_len =  220.0 

Settings: lr = 1.623304380893048e-05 epsilon = 0.13897387086880286
Test Result: reward = 110.83333333333371 episode length = 220


 85%|████████▌ | 307/360 [40:09<05:57,  6.74s/it]

Test 1/1: reward = 106.37756653992437  episode len = 225

Average Reward =  106.37756653992437 Average Ep_len =  225.0 

Settings: lr = 1.577096128844441e-05 epsilon = 0.13658620828036405
Test Result: reward = 106.37756653992437 episode length = 225


 86%|████████▌ | 310/360 [40:28<05:36,  6.73s/it]

Test 1/1: reward = 83.00622837370257  episode len = 205

Average Reward =  83.00622837370257 Average Ep_len =  205.0 

Settings: lr = 1.5322032201057643e-05 epsilon = 0.134239567307
Test Result: reward = 83.00622837370257 episode length = 205


 87%|████████▋ | 313/360 [40:49<05:43,  7.32s/it]

Test 1/1: reward = 175.9219931271485  episode len = 335

Average Reward =  175.9219931271485 Average Ep_len =  335.0 

Settings: lr = 1.4885882127062377e-05 epsilon = 0.13193324317035912
Test Result: reward = 175.9219931271485 episode length = 335


 88%|████████▊ | 316/360 [41:09<05:26,  7.42s/it]

Test 1/1: reward = 169.1042253521136  episode len = 315

Average Reward =  169.1042253521136 Average Ep_len =  315.0 

Settings: lr = 1.4462147304813739e-05 epsilon = 0.12966654320064572
Test Result: reward = 169.1042253521136 episode length = 315


 89%|████████▊ | 319/360 [41:26<04:27,  6.54s/it]

Test 1/1: reward = 33.383850931676946  episode len = 130

Average Reward =  33.383850931676946 Average Ep_len =  130.0 

Settings: lr = 1.4050474327342151e-05 epsilon = 0.12743878662858737
Test Result: reward = 33.383850931676946 episode length = 130


 89%|████████▉ | 322/360 [41:40<03:23,  5.36s/it]

Test 1/1: reward = 36.872180451127726  episode len = 115

Average Reward =  36.872180451127726 Average Ep_len =  115.0 

Settings: lr = 1.3650519847601804e-05 epsilon = 0.12524930438097576
Test Result: reward = 36.872180451127726 episode length = 115


 90%|█████████ | 325/360 [41:55<03:06,  5.32s/it]

Test 1/1: reward = 75.82033898305087  episode len = 170

Average Reward =  75.82033898305087 Average Ep_len =  170.0 

Settings: lr = 1.3261950292109393e-05 epsilon = 0.12309743887972079
Test Result: reward = 75.82033898305087 episode length = 170


 91%|█████████ | 328/360 [42:10<02:49,  5.29s/it]

Test 1/1: reward = 37.305050505050424  episode len = 130

Average Reward =  37.305050505050424 Average Ep_len =  130.0 

Settings: lr = 1.2884441582734289e-05 epsilon = 0.12098254384435683
Test Result: reward = 37.305050505050424 episode length = 130


 92%|█████████▏| 331/360 [42:23<02:22,  4.91s/it]

Test 1/1: reward = 36.304950495049404  episode len = 130

Average Reward =  36.304950495049404 Average Ep_len =  130.0 

Settings: lr = 1.2517678866408098e-05 epsilon = 0.11890398409794212
Test Result: reward = 36.304950495049404 episode length = 130


 93%|█████████▎| 334/360 [42:39<02:22,  5.49s/it]

Test 1/1: reward = 42.17840531561451  episode len = 140

Average Reward =  42.17840531561451 Average Ep_len =  140.0 

Settings: lr = 1.2161356252528197e-05 epsilon = 0.11686113537629289
Test Result: reward = 42.17840531561451 episode length = 140


 94%|█████████▎| 337/360 [42:53<01:54,  4.99s/it]

Test 1/1: reward = 36.52280701754378  episode len = 125

Average Reward =  36.52280701754378 Average Ep_len =  125.0 

Settings: lr = 1.1815176557836208e-05 epsilon = 0.11485338414049498
Test Result: reward = 36.52280701754378 episode length = 125


 94%|█████████▍| 340/360 [43:10<01:56,  5.81s/it]

Test 1/1: reward = 164.99844961240373  episode len = 285

Average Reward =  164.99844961240373 Average Ep_len =  285.0 

Settings: lr = 1.147885105855866e-05 epsilon = 0.11288012739263671
Test Result: reward = 164.99844961240373 episode length = 285


 95%|█████████▌| 343/360 [43:27<01:40,  5.90s/it]

Test 1/1: reward = 24.89870550161807  episode len = 105

Average Reward =  24.89870550161807 Average Ep_len =  105.0 

Settings: lr = 1.1152099249603095e-05 epsilon = 0.11094077249470742
Test Result: reward = 24.89870550161807 episode length = 105


 96%|█████████▌| 346/360 [43:48<01:38,  7.05s/it]

Test 1/1: reward = 240.16348122867046  episode len = 395

Average Reward =  240.16348122867046 Average Ep_len =  395.0 

Settings: lr = 1.0834648610608797e-05 epsilon = 0.10903473699060767
Test Result: reward = 240.16348122867046 episode length = 395


 97%|█████████▋| 349/360 [44:11<01:30,  8.20s/it]

Test 1/1: reward = 331.28291814946516  episode len = 490

Average Reward =  331.28291814946516 Average Ep_len =  490.0 

Settings: lr = 1.0526234378657011e-05 epsilon = 0.10716144843121718
Test Result: reward = 331.28291814946516 episode length = 490


 98%|█████████▊| 352/360 [44:27<00:53,  6.68s/it]

Test 1/1: reward = 158.11118881118955  episode len = 305

Average Reward =  158.11118881118955 Average Ep_len =  305.0 

Settings: lr = 1.0226599327451084e-05 epsilon = 0.10532034420246845
Test Result: reward = 158.11118881118955 episode length = 305


 99%|█████████▊| 355/360 [44:46<00:33,  6.71s/it]

Test 1/1: reward = 112.48411552346606  episode len = 245

Average Reward =  112.48411552346606 Average Ep_len =  245.0 

Settings: lr = 9.93549355278238e-06 epsilon = 0.10351087135637403
Test Result: reward = 112.48411552346606 episode length = 245


 99%|█████████▉| 358/360 [45:01<00:12,  6.09s/it]

Test 1/1: reward = 142.33170731707372  episode len = 280

Average Reward =  142.33170731707372 Average Ep_len =  280.0 

Settings: lr = 9.65267426410301e-06 epsilon = 0.10173248644495686
Test Result: reward = 142.33170731707372 episode length = 280


100%|██████████| 360/360 [45:11<00:00,  7.53s/it]


In [None]:
# Rebuild the environment and a fresh network, restore the trained weights,
# then run one episode with render=True so simulate() records it to ./video
# (simulate's default epsilon=0 means no random actions are injected).
env = wrap_env(gym.make("CarRacing-v1").unwrapped)
agent = DQN_Network()
# map_location="cpu" lets a checkpoint that was saved from a CUDA session be
# deserialized on a CPU-only runtime; load_state_dict then copies the values
# into the module's existing parameters, wherever those live.
agent.load_state_dict(torch.load("car-racing-dqn.pth", map_location="cpu"))

simulate(agent=agent,env=env,render=True)
#test_model(agent,env,episodes=10)

In [None]:
# Report the saved training histories, one section per "<name>.data" file.
# Each entry is (file stem, per-stage summary); a summary of None prints the
# stage exactly as stored, otherwise the summary of the stage is printed.
history_reports = [
    ("lr_hist", None),
    ("epsilon_hist", None),
    ("ep_len_hist", st.mean),
    ("reward_hist", st.mean),
]
for idx, (name, summarize) in enumerate(history_reports):
    # Same layout as before: a blank line separates every section but the first.
    print(name if idx == 0 else "\n" + name)
    try:
        hist = load_list(name + ".data")
    except FileNotFoundError:
        # A missing history file previously aborted the whole cell, hiding
        # every later section; report it and carry on with the rest.
        print("(" + name + ".data not found, skipped)")
        continue
    for stage in hist:
        print(stage if summarize is None else summarize(stage))

lr_hist
0.0005
0.0005
0.00025
0.00025
0.000125
0.000125
6.2e-05
6.2e-05
3.1e-05
3.1e-05
1.6e-05
1.6e-05

epsilon_hist


FileNotFoundError: ignored