In [None]:
# Environment setup cell: installs system packages and Python dependencies via
# shell escapes (`!`). Presumably intended for a hosted notebook runtime such
# as Colab -- TODO confirm.
# You will need to run this block twice to make it effective
!apt-get update 
# Build tooling / packaging helpers.
!apt-get install cmake 
!pip install --upgrade setuptools 
!pip install ez_setup 
# gym is pinned to 0.24.1 first; the following line adds the optional extras.
!pip install gym==0.24.1
!pip install gym[all]

# Headless rendering and video encoding support.
!pip install gym pyvirtualdisplay 
!apt-get install -y xvfb python-opengl ffmpeg 

In [None]:
from importlib import reload
import utils
reload(utils)
from utils import*

import gym
from gym.wrappers.monitoring import video_recorder
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

import torch
import torch.nn as nn
import numpy as np
from torch import nn
import copy
from collections import deque
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import math

from torch import randint
from time import sleep
import pickle
import statistics as st
from gym.core import RewardWrapper
import gc



# Start a non-visible (visible=0) 1400x900 virtual display via pyvirtualdisplay.
# Presumably needed so the environment can render without a physical screen.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)""
"""


def simulate(agent=None,env=None,epsilon=0,memory=None,render=False):
  """Run one episode with an epsilon-greedy policy and return its statistics.

  Args:
    agent: policy object providing `eval()`, `get_action(state)` and
      `convert_action(action, state)`.
    env: wrapped environment providing `reset()`, `skip_episodes(...)`,
      `step(action)` and the attributes `ep_len`, `real_rew` and `env`.
    epsilon: probability of taking a uniformly random action (index 0..2)
      instead of the agent's greedy action.
    memory: optional replay buffer; when given, every non-terminal transition
      `[state, action, reward, next_state]` is passed to `memory.collect`.
    render: when True, the inner env is wrapped in `RecordVideo`, and the
      score is printed and the video shown at the end.

  Returns:
    Tuple `(score, ep_len)` where `score` is `env.real_rew` and `ep_len` is the
    last `env.ep_len` observed after a non-terminal step (0 if there was none).
  """
  agent.eval()
  env.reset()
  if render:
    # NOTE(review): every rendered call wraps env.env in another RecordVideo
    # layer and closes it at the end -- confirm the env is not reused afterwards.
    env.env = RecordVideo(env.env, './video')
    env.env.render()
  # Helper defined outside this file; presumably plays 70 warm-up steps with a
  # fixed "gentle throttle" action -- TODO confirm against utils.
  state,rew,done,info = env.skip_episodes(70,[0,0.1,0])
  ep_len = 0
  while not done:
    # exploitation(0) vs exploration(1)
    sample = torch.bernoulli(torch.tensor(epsilon).float())
    if sample == 1:
      A = torch.randint(0,3,(1,))
    else:
      # Pure inference: skip autograd bookkeeping so no graph is built per step.
      with torch.no_grad():
        A = agent.get_action(state)

    # progress a time step
    next_state, rew, done, info = env.step(agent.convert_action(A,state))

    # The terminal transition is intentionally not stored (no done flag in memory).
    if done:
      break

    # collect memory
    if memory is not None:
      memory.collect([state, A, rew, next_state])
    state = next_state

    ep_len = env.ep_len
    # stop criteria
    if ep_len > 2000:
      break

  # The score is read as-is from the env wrapper; no adjustment is applied here.
  score = env.real_rew
  if render:
    print("score",score,"ep_len",ep_len)
    env.env.close()
    show_video()

  return score,ep_len


def test_model(agent, env, episodes=1):
  """Evaluate `agent` greedily for a number of episodes and report the results.

  Each episode is run through `simulate(agent, env)` (default epsilon, no
  replay memory). Per-episode results and the averages are printed.

  Returns:
    Tuple `(rewards, ep_lens)` of per-episode lists.
  """
  rewards, ep_lens = [], []
  for trial in range(1, episodes + 1):
    episode_reward, episode_len = simulate(agent,env)
    rewards.append(episode_reward)
    ep_lens.append(episode_len)
    header = "Test "+str(trial)+"/"+str(episodes)+": reward ="
    print(header,episode_reward," episode len =",episode_len)
  mean_reward = sum(rewards)/len(rewards)
  mean_len = sum(ep_lens)/len(ep_lens)
  print("\nAverage Reward = ",mean_reward,"Average Ep_len = ",mean_len,"\n")
  return rewards,ep_lens


class ExperienceReplay(object):
  """Fixed-capacity FIFO replay buffer of transitions.

  One entry is `[state, action, reward, next_state]`. Once `length` entries
  are stored, appending a new one discards the oldest (deque `maxlen`).
  """
  def  __init__(self, length):
    self.experience_replay = deque(maxlen=length)

  def collect(self,experience):
    """Append one `[state, action, reward, next_state]` entry to the buffer."""
    self.experience_replay.append(experience)
    return

  def sample_from_experience(self, sample_size):
    """Draw a random batch (without replacement) as float32 tensors.

    Args:
      sample_size: requested batch size; clamped to the number of stored entries.

    Returns:
      Tuple `(state, action, reward, next_state)` of float tensors whose first
      dimension is the batch size.
    """
    sample_size = min(sample_size,len(self.experience_replay))
    sample = random.sample(self.experience_replay,sample_size)
    # Stack image-like states with numpy first: building a tensor directly from
    # a Python list of ndarrays is the slow path in PyTorch (and warns about it).
    state = torch.tensor(np.array([episode[0] for episode in sample])).float()
    # Actions/rewards are left on the original path; actions may be
    # single-element tensors, which torch.tensor flattens to scalars.
    action = torch.tensor([episode[1] for episode in sample]).float()
    reward = torch.tensor([episode[2] for episode in sample]).float()
    next_state = torch.tensor(np.array([episode[3] for episode in sample])).float()

    return state,action,reward,next_state



class DQN_Network_Simple(nn.Module):
  """Small convolutional Q-network with three discrete actions.

  The input is a single-channel frame (or a batch of frames). Rows 0..83 go
  through two conv + max-pool stages; one extra scalar, summed from the pixel
  strip at rows 84..95 / column 13, is appended before the fully connected
  layers. The comment in the original code gives the frame size as 96x96.
  """
  def __init__(self,gamma = None):
    super().__init__()
    # Attribute names and creation order are kept stable so saved state dicts
    # keep loading. The batch-norm layers are registered but not used in forward().
    self.LeakyReLU = nn.LeakyReLU()
    self.conv1 = nn.Conv2d(1, 4, kernel_size=7, stride=4, padding=0)
    self.conv2 = nn.Conv2d(4, 8, kernel_size=3, stride=1, padding=2)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.fc1 = nn.Linear(289, 100)
    self.fc2 = nn.Linear(100, 3)
    self.batchnormCNN1 = nn.BatchNorm2d(num_features=4)
    self.batchnormCNN2 = nn.BatchNorm2d(num_features=8)
    self.flatten = nn.Flatten()
    # Discount factor; stored here and read by the training code.
    self.gamma = gamma

  def forward(self,x):
    """Return Q-values of shape (batch, 3) for a frame or batch of frames.

    Accepts (H, W) or (batch, H, W) input and reshapes it to (batch, 1, H, W);
    input that already has another rank is passed through unchanged.
    """
    frames = torch.from_numpy(np.ascontiguousarray(x)).float()
    rank = frames.dim()
    if rank == 2:
      frames = frames.unsqueeze(0).unsqueeze(0)
    elif rank == 3:
      frames = frames.unsqueeze(1)

    # Scalar side feature taken from the strip below the playfield
    # (presumably the dashboard speed bar -- TODO confirm).
    strip = frames[:, :, 84:96, 13:14]
    speed = ((strip - 0.495) * 10).sum(dim=(2, 3))

    # Only the top 84 rows are fed to the convolutional stack.
    features = frames[:, :, :84, :]
    for conv in (self.conv1, self.conv2):
      features = self.pool(self.LeakyReLU(conv(features)))

    joined = torch.cat((self.flatten(features), speed), dim=1)
    hidden = self.LeakyReLU(self.fc1(joined))
    return self.fc2(hidden)

  def get_action(self,state):
    """Return the index of the highest Q-value for each input frame."""
    return self.forward(state).argmax(dim=1)

  def convert_action(self,action,state):
    """Map a discrete action index to `[steering, gas, brake]`.

    Gas is 0 when the measured speed exceeds 2.5 and 0.1 otherwise. Steering
    is -0.3 / 0 / 0.3 for index 0 / 1 / 2; any other index yields None.
    """
    # determine if you are going too fast
    speed = get_speed(state).item()
    accel = 0 if speed > 2.5 else 0.1
    # Discretized action space (left-forward,straight-forward,right-forward)
    steering_by_index = {0: -0.3, 1: 0, 2: 0.3}
    index = action.item()
    if index not in steering_by_index:
      return None
    return [steering_by_index[index], accel, 0]


### Training

In [None]:
def load_memory(new,epsilon,exp_replay_size,initial_size=None):
  """Create a replay buffer and pre-fill it by simulating episodes.

  Args:
    new: when False, the acting agent is loaded from "car-racing-dqn.pth";
      otherwise a freshly initialised network is used.
    epsilon: exploration probability forwarded to `simulate`.
    exp_replay_size: capacity of the buffer (also the maximum number of
      episodes simulated here).
    initial_size: number of stored transitions at which filling stops;
      defaults to `exp_replay_size`.

  Returns:
    The filled `ExperienceReplay` instance.
  """
  fill_target = exp_replay_size if initial_size is None else initial_size

  # Create the model
  env = wrap_env(gym.make("CarRacing-v1").unwrapped)
  agent = DQN_Network_Simple()
  if not new:
    agent.load_state_dict(torch.load("car-racing-dqn.pth"))
  memory = ExperienceReplay(exp_replay_size)

  # Initialize the experience replay; the current fill level is printed after
  # every episode that does not yet reach the target.
  for _ in range(exp_replay_size):
    env.reset()
    simulate(agent,env,epsilon = epsilon, memory = memory)
    stored = len(memory.experience_replay)
    if stored >= fill_target:
      break
    print(stored)

  return memory


def update(agent,optimizer,loss_func,target_agent,memory,batch_size):
  """Perform one gradient step on `agent` from a sampled replay batch.

  The regression target is `reward + agent.gamma * max_a Q_target(next_state, a)`,
  computed with `target_agent` under `no_grad`; the prediction is the agent's
  Q-value for the action that was actually taken.
  """
  agent.train()
  target_agent.eval()

  # Q(s, a) of the online network for the sampled actions
  states, actions, rewards, next_states = memory.sample_from_experience(batch_size)
  q_all = agent(states)
  batch_rows = torch.arange(q_all.size(0))
  q_taken = q_all[batch_rows, actions.long()]

  # max_a' Q_target(s', a'), excluded from the autograd graph
  with torch.no_grad():
    best_next_q = torch.max(target_agent(next_states), dim=1)[0]

  # Gradient step; gradients are cleared afterwards, ready for the next call.
  td_target = rewards + agent.gamma*best_next_q
  loss = loss_func(td_target, q_taken)
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()


def train(new,num_ep,lr_start,epsilon_start,gamma,memory):
  """Train the DQN agent on CarRacing and periodically checkpoint it.

  Per episode: one epsilon-greedy rollout adds transitions to `memory`, then
  30 mini-batch updates (batch size 32) are applied and the target network is
  synchronised with the online network. Every 3rd episode a greedy test
  episode is run and its result recorded in the history lists.

  Args:
    new: True starts from scratch with empty histories; False resumes from
      "car-racing-dqn.pth" and the saved "*.data" history files.
    num_ep: number of training episodes.
    lr_start: initial learning rate; multiplied by 0.99042 per episode.
    epsilon_start: initial exploration rate; multiplied by 0.99424 per episode.
    gamma: discount factor stored on the networks.
    memory: replay buffer that is sampled from and appended to.

  Side effects:
    Writes the model and history files every 30 episodes and after the final
    episode, so the end state of a run is always persisted.
  """
  # set hyperparameters
  agent = DQN_Network_Simple(gamma=gamma)

  if new:
    # start new run
    reward_hist = []
    ep_len_hist = []
    lr_hist = []
    epsilon_hist = []
  else:
    # load previous runs
    agent.load_state_dict(torch.load("car-racing-dqn.pth"))
    reward_hist = load_list("reward_hist.data")
    ep_len_hist = load_list("ep_len_hist.data")
    epsilon_hist = load_list("epsilon_hist.data")
    lr_hist = load_list("lr_hist.data")

  # initialize models
  target_agent = DQN_Network_Simple(agent.gamma)
  target_agent.load_state_dict(agent.state_dict())
  env = wrap_env(gym.make("CarRacing-v1").unwrapped)
  optimizer = torch.optim.SGD(agent.parameters(),lr_start)
  MSELoss = torch.nn.MSELoss()

  # training loop
  for ep_num in tqdm(range(0,num_ep)):
    # exponential decay schedules for learning rate and exploration
    lr = lr_start*(0.99042**ep_num)
    epsilon = epsilon_start*(0.99424**ep_num)

    for param_group in optimizer.param_groups:
      param_group['lr'] = lr

    # Kept from the original flow: the env is reset here and again in simulate().
    env.reset()
    simulate(agent,env,epsilon = epsilon, memory = memory)

    for _ in range(0,30):
      update(agent,optimizer,MSELoss,target_agent,memory,batch_size=32)
    target_agent.load_state_dict(agent.state_dict())
    gc.collect(generation=2)

    if ep_num%3 == 0:
      reward, ep_len = test_model(agent=agent,env=env, episodes=1)
      print("Settings: lr =",lr,"epsilon =",epsilon)
      print("Test Result: reward =",reward[0],"episode length =",ep_len[0])
      reward_hist.append(reward[0])
      ep_len_hist.append(ep_len[0])
      lr_hist.append(lr)
      epsilon_hist.append(epsilon)

    # Save every 30 episodes and also after the last one; previously progress
    # made after the last multiple of 30 was never written to disk.
    if ep_num%30 == 0 or ep_num == num_ep-1:
      torch.save(agent.state_dict(),"car-racing-dqn.pth")
      save_list(reward_hist,"reward_hist.data")
      save_list(ep_len_hist,"ep_len_hist.data")
      save_list(epsilon_hist,"epsilon_hist.data")
      save_list(lr_hist,"lr_hist.data")

In [None]:
# Pre-fill a 500-entry replay buffer using a fresh network and epsilon=1
# (every action is drawn at random), printing the fill level as it grows.
memory = load_memory(new=True,epsilon=1,exp_replay_size=500)
# Train from scratch for 360 episodes; lr and epsilon decay from these start values.
train(new=True,num_ep = 360,lr_start=0.0003,epsilon_start=0.8,gamma=0.92,memory=memory)

  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "


12
20
43
62
71
103
111
134
148
159
167
176
211
220
233
241
261
272
280
310
320
328
340
351
364
372
384
391
405
414
433
445
457
473
490


  0%|          | 1/360 [00:05<33:38,  5.62s/it]

Test 1/1: reward = 27.494444444444394  episode len = 105

Average Reward =  27.494444444444394 Average Ep_len =  105.0 

Settings: lr = 0.0003 epsilon = 0.8
Test Result: reward = 27.494444444444394 episode length = 105


  1%|          | 4/360 [00:19<30:02,  5.06s/it]

Test 1/1: reward = 30.211846689895395  episode len = 115

Average Reward =  30.211846689895395 Average Ep_len =  115.0 

Settings: lr = 0.00029146033499462634 epsilon = 0.7862554733576193
Test Result: reward = 30.211846689895395 episode length = 115


  2%|▏         | 7/360 [00:36<32:32,  5.53s/it]

Test 1/1: reward = 52.427450980392045  episode len = 160

Average Reward =  52.427450980392045 Average Ep_len =  160.0 

Settings: lr = 0.0002831637562505994 epsilon = 0.7727470867310173
Test Result: reward = 52.427450980392045 episode length = 160


  3%|▎         | 10/360 [00:51<31:06,  5.33s/it]

Test 1/1: reward = 54.4754385964911  episode len = 155

Average Reward =  54.4754385964911 Average Ep_len =  155.0 

Settings: lr = 0.0002751033441837881 epsilon = 0.7594707830792716
Test Result: reward = 54.4754385964911 episode length = 155


  4%|▎         | 13/360 [01:05<29:46,  5.15s/it]

Test 1/1: reward = 55.228571428571314  episode len = 160

Average Reward =  55.228571428571314 Average Ep_len =  160.0 

Settings: lr = 0.00026727237617982957 epsilon = 0.7464225750640932
Test Result: reward = 55.228571428571314 episode length = 160


  4%|▍         | 16/360 [01:20<30:32,  5.33s/it]

Test 1/1: reward = 57.48421052631566  episode len = 160

Average Reward =  57.48421052631566 Average Ep_len =  160.0 

Settings: lr = 0.0002596643209872764 epsilon = 0.7335985438522895
Test Result: reward = 57.48421052631566 episode length = 160


  5%|▌         | 19/360 [01:36<32:03,  5.64s/it]

Test 1/1: reward = 101.9379310344831  episode len = 220

Average Reward =  101.9379310344831 Average Ep_len =  220.0 

Settings: lr = 0.0002522728332703459 epsilon = 0.7209948379388026
Test Result: reward = 101.9379310344831 episode length = 220


  6%|▌         | 22/360 [01:53<33:44,  5.99s/it]

Test 1/1: reward = 109.18530351437742  episode len = 245

Average Reward =  109.18530351437742 Average Ep_len =  245.0 

Settings: lr = 0.0002450917483167285 epsilon = 0.7086076719899665
Test Result: reward = 109.18530351437742 episode length = 245


  7%|▋         | 25/360 [02:08<30:31,  5.47s/it]

Test 1/1: reward = 62.4249999999999  episode len = 155

Average Reward =  62.4249999999999 Average Ep_len =  155.0 

Settings: lr = 0.00023811507689604114 epsilon = 0.6964333257066396
Test Result: reward = 62.4249999999999 episode length = 155


  8%|▊         | 28/360 [02:23<29:28,  5.33s/it]

Test 1/1: reward = 54.26979865771797  episode len = 160

Average Reward =  54.26979865771797 Average Ep_len =  160.0 

Settings: lr = 0.00023133700026463792 epsilon = 0.6844681427068687
Test Result: reward = 54.26979865771797 episode length = 160


  9%|▊         | 31/360 [02:40<32:36,  5.95s/it]

Test 1/1: reward = 163.80344827586268  episode len = 290

Average Reward =  163.80344827586268 Average Ep_len =  290.0 

Settings: lr = 0.00022475186531261112 epsilon = 0.6727085294277494
Test Result: reward = 163.80344827586268 episode length = 290


  9%|▉         | 34/360 [02:56<31:22,  5.77s/it]

Test 1/1: reward = 63.95267175572517  episode len = 160

Average Reward =  63.95267175572517 Average Ep_len =  160.0 

Settings: lr = 0.0002183541798489359 epsilon = 0.6611509540461538
Test Result: reward = 63.95267175572517 episode length = 160


 10%|█         | 37/360 [03:10<28:02,  5.21s/it]

Test 1/1: reward = 48.415384615384525  episode len = 160

Average Reward =  48.415384615384525 Average Ep_len =  160.0 

Settings: lr = 0.00021213860802082586 epsilon = 0.6497919454180002
Test Result: reward = 48.415384615384525 episode length = 160


 11%|█         | 40/360 [03:24<26:53,  5.04s/it]

Test 1/1: reward = 46.01081081081071  episode len = 145

Average Reward =  46.01081081081071 Average Ep_len =  145.0 

Settings: lr = 0.00020609996586347878 epsilon = 0.6386280920357476
Test Result: reward = 46.01081081081071 episode length = 145


 12%|█▏        | 43/360 [03:38<26:03,  4.93s/it]

Test 1/1: reward = 46.8999999999999  episode len = 130

Average Reward =  46.8999999999999 Average Ep_len =  130.0 

Settings: lr = 0.00020023321697650197 epsilon = 0.6276560410037999
Test Result: reward = 46.8999999999999 episode length = 130


 13%|█▎        | 46/360 [03:56<32:20,  6.18s/it]

Test 1/1: reward = 96.2050359712233  episode len = 220

Average Reward =  96.2050359712233 Average Ep_len =  220.0 

Settings: lr = 0.0001945334683234099 epsilon = 0.6168724970315149
Test Result: reward = 96.2050359712233 episode length = 220


 14%|█▎        | 49/360 [04:12<29:46,  5.74s/it]

Test 1/1: reward = 50.83356643356631  episode len = 155

Average Reward =  50.83356643356631 Average Ep_len =  155.0 

Settings: lr = 0.00018899596615069196 epsilon = 0.606274221443513
Test Result: reward = 50.83356643356631 episode length = 155


 14%|█▍        | 52/360 [04:36<41:10,  8.02s/it]

Test 1/1: reward = 384.8675675675654  episode len = 575

Average Reward =  384.8675675675654 Average Ep_len =  575.0 

Settings: lr = 0.0001836160920230458 epsilon = 0.5958580312069891
Test Result: reward = 384.8675675675654 episode length = 575


 15%|█▌        | 55/360 [04:53<34:36,  6.81s/it]

Test 1/1: reward = 99.37662337662367  episode len = 235

Average Reward =  99.37662337662367 Average Ep_len =  235.0 

Settings: lr = 0.00017838935897147027 epsilon = 0.5856207979757379
Test Result: reward = 99.37662337662367 episode length = 235


 16%|█▌        | 58/360 [05:28<57:13, 11.37s/it]

Test 1/1: reward = 795.4141479099499  episode len = 1240

Average Reward =  795.4141479099499 Average Ep_len =  1240.0 

Settings: lr = 0.00017331140775100459 epsilon = 0.5755594471506006
Test Result: reward = 795.4141479099499 episode length = 1240


 17%|█▋        | 61/360 [05:49<44:03,  8.84s/it]

Test 1/1: reward = 183.13898305084842  episode len = 370

Average Reward =  183.13898305084842 Average Ep_len =  370.0 

Settings: lr = 0.00016837800320499362 epsilon = 0.5656709569560564
Test Result: reward = 183.13898305084842 episode length = 370


 18%|█▊        | 64/360 [06:20<56:23, 11.43s/it]

Test 1/1: reward = 701.1117437722309  episode len = 995

Average Reward =  701.1117437722309 Average Ep_len =  995.0 

Settings: lr = 0.00016358503073284568 epsilon = 0.5559523575326769
Test Result: reward = 701.1117437722309 episode length = 995


 19%|█▊        | 67/360 [06:52<59:47, 12.24s/it]

Test 1/1: reward = 688.1820512820382  episode len = 1130

Average Reward =  688.1820512820382 Average Ep_len =  1130.0 

Settings: lr = 0.0001589284928583382 epsilon = 0.5464007300451741
Test Result: reward = 688.1820512820382 episode length = 1130


 19%|█▉        | 70/360 [07:15<46:40,  9.66s/it]

Test 1/1: reward = 274.8178082191781  episode len = 470

Average Reward =  274.8178082191781 Average Ep_len =  470.0 

Settings: lr = 0.0001544045058956078 epsilon = 0.5370132058057714
Test Result: reward = 274.8178082191781 episode length = 470


 20%|██        | 73/360 [07:37<42:40,  8.92s/it]

Test 1/1: reward = 392.54285714285425  episode len = 645

Average Reward =  392.54285714285425 Average Ep_len =  645.0 

Settings: lr = 0.00015000929671004533 epsilon = 0.5277869654126367
Test Result: reward = 392.54285714285425 episode length = 645


 21%|██        | 76/360 [07:54<33:15,  7.03s/it]

Test 1/1: reward = 106.68788927335677  episode len = 245

Average Reward =  106.68788927335677 Average Ep_len =  245.0 

Settings: lr = 0.00014573919957139373 epsilon = 0.5187192379031176
Test Result: reward = 106.68788927335677 episode length = 245


 22%|██▏       | 79/360 [08:11<30:09,  6.44s/it]

Test 1/1: reward = 130.78823529411818  episode len = 270

Average Reward =  130.78823529411818 Average Ep_len =  270.0 

Settings: lr = 0.00014159065309642376 epsilon = 0.5098072999215241
Test Result: reward = 130.78823529411818 episode length = 270


 23%|██▎       | 82/360 [08:34<37:55,  8.19s/it]

Test 1/1: reward = 355.00322580645025  episode len = 575

Average Reward =  355.00322580645025 Average Ep_len =  575.0 

Settings: lr = 0.00013756019727863866 epsilon = 0.5010484749012095
Test Result: reward = 355.00322580645025 episode length = 575


 24%|██▎       | 85/360 [08:55<36:28,  7.96s/it]

Test 1/1: reward = 216.96451612903365  episode len = 410

Average Reward =  216.96451612903365 Average Ep_len =  410.0 

Settings: lr = 0.00013364447060252972 epsilon = 0.49244013226070465
Test Result: reward = 216.96451612903365 episode length = 410


 24%|██▍       | 88/360 [09:28<52:21, 11.55s/it]

Test 1/1: reward = 843.4816793893009  episode len = 1065

Average Reward =  843.4816793893009 Average Ep_len =  1065.0 

Settings: lr = 0.00012984020723997604 epsilon = 0.48397968661366114
Test Result: reward = 843.4816793893009 episode length = 1065


 25%|██▌       | 91/360 [10:05<1:00:22, 13.47s/it]

Test 1/1: reward = 793.3769230769053  episode len = 1295

Average Reward =  793.3769230769053 Average Ep_len =  1295.0 

Settings: lr = 0.00012614423432645043 epsilon = 0.4756645969923705
Test Result: reward = 793.3769230769053 episode length = 1295


 26%|██▌       | 94/360 [10:26<43:18,  9.77s/it]

Test 1/1: reward = 137.00821917808292  episode len = 305

Average Reward =  137.00821917808292 Average Ep_len =  305.0 

Settings: lr = 0.00012255346931475963 epsilon = 0.46749236608462175
Test Result: reward = 137.00821917808292 episode length = 305


 27%|██▋       | 97/360 [10:47<35:25,  8.08s/it]

Test 1/1: reward = 91.00683229813694  episode len = 235

Average Reward =  91.00683229813694 Average Ep_len =  235.0 

Settings: lr = 0.00011906491740411169 epsilon = 0.45946053948367216
Test Result: reward = 91.00683229813694 episode length = 235


 28%|██▊       | 100/360 [11:11<37:17,  8.61s/it]

Test 1/1: reward = 219.81538461538582  episode len = 445

Average Reward =  219.81538461538582 Average Ep_len =  445.0 

Settings: lr = 0.00011567566904236637 epsilon = 0.4515667049511022
Test Result: reward = 219.81538461538582 episode length = 445


 29%|██▊       | 103/360 [11:44<49:18, 11.51s/it]

Test 1/1: reward = 832.8289855072352  episode len = 1090

Average Reward =  832.8289855072352 Average Ep_len =  1090.0 

Settings: lr = 0.00011238289749938545 epsilon = 0.4438084916923365
Test Result: reward = 832.8289855072352 episode length = 1090


 29%|██▉       | 106/360 [12:09<40:26,  9.55s/it]

Test 1/1: reward = 180.58627450980504  episode len = 350

Average Reward =  180.58627450980504 Average Ep_len =  350.0 

Settings: lr = 0.0001091838565094588 epsilon = 0.43618356964461136
Test Result: reward = 180.58627450980504 episode length = 350


 30%|███       | 109/360 [12:43<50:38, 12.11s/it]

Test 1/1: reward = 877.5398523985123  episode len = 1075

Average Reward =  877.5398523985123 Average Ep_len =  1075.0 

Settings: lr = 0.00010607587798084027 epsilon = 0.42868964877717497
Test Result: reward = 877.5398523985123 episode length = 1075


 31%|███       | 112/360 [13:05<37:14,  9.01s/it]

Test 1/1: reward = 92.5230769230773  episode len = 240

Average Reward =  92.5230769230773 Average Ep_len =  240.0 

Settings: lr = 0.0001030563697704827 epsilon = 0.42132447840351156
Test Result: reward = 92.5230769230773 episode length = 240


 32%|███▏      | 115/360 [13:24<31:06,  7.62s/it]

Test 1/1: reward = 96.40529801324541  episode len = 225

Average Reward =  96.40529801324541 Average Ep_len =  225.0 

Settings: lr = 0.00010012281352211659 epsilon = 0.4140858465053812
Test Result: reward = 96.40529801324541 episode length = 225


 33%|███▎      | 118/360 [13:44<29:35,  7.34s/it]

Test 1/1: reward = 118.41085271317877  episode len = 245

Average Reward =  118.41085271317877 Average Ep_len =  245.0 

Settings: lr = 9.72727625658687e-05 epsilon = 0.4069715790684737
Test Result: reward = 118.41085271317877 episode length = 245


 34%|███▎      | 121/360 [14:06<31:03,  7.80s/it]

Test 1/1: reward = 107.03513513513555  episode len = 280

Average Reward =  107.03513513513555 Average Ep_len =  280.0 

Settings: lr = 9.450383987766947e-05 epsilon = 0.39997953942947573
Test Result: reward = 107.03513513513555 episode length = 280


 34%|███▍      | 124/360 [14:30<31:53,  8.11s/it]

Test 1/1: reward = 178.0287671232885  episode len = 340

Average Reward =  178.0287671232885 Average Ep_len =  340.0 

Settings: lr = 9.181373609674692e-05 epsilon = 0.3931076276343562
Test Result: reward = 178.0287671232885 episode length = 340


 35%|███▌      | 127/360 [14:47<27:17,  7.03s/it]

Test 1/1: reward = 129.5937694704058  episode len = 320

Average Reward =  129.5937694704058 Average Ep_len =  320.0 

Settings: lr = 8.92002075995536e-05 epsilon = 0.3863537798076768
Test Result: reward = 129.5937694704058 episode length = 320


 36%|███▌      | 130/360 [15:09<28:34,  7.45s/it]

Test 1/1: reward = 115.09347079037849  episode len = 255

Average Reward =  115.09347079037849 Average Ep_len =  255.0 

Settings: lr = 8.666107462852034e-05 epsilon = 0.37971596753273795
Test Result: reward = 115.09347079037849 episode length = 255


 37%|███▋      | 133/360 [15:27<26:18,  6.95s/it]

Test 1/1: reward = 100.53943661971861  episode len = 225

Average Reward =  100.53943661971861 Average Ep_len =  225.0 

Settings: lr = 8.419421947407619e-05 epsilon = 0.373192197242374
Test Result: reward = 100.53943661971861 episode length = 225


 38%|███▊      | 136/360 [15:55<35:28,  9.50s/it]

Test 1/1: reward = 374.51631205673397  episode len = 650

Average Reward =  374.51631205673397 Average Ep_len =  650.0 

Settings: lr = 8.179758470841781e-05 epsilon = 0.36678050962021597
Test Result: reward = 374.51631205673397 episode length = 650


 39%|███▊      | 139/360 [16:13<27:36,  7.49s/it]

Test 1/1: reward = 102.10000000000028  episode len = 225

Average Reward =  102.10000000000028 Average Ep_len =  225.0 

Settings: lr = 7.946917146955593e-05 epsilon = 0.3604789790122397
Test Result: reward = 102.10000000000028 episode length = 225


 39%|███▉      | 142/360 [16:42<35:43,  9.83s/it]

Test 1/1: reward = 252.35172413793222  episode len = 440

Average Reward =  252.35172413793222 Average Ep_len =  440.0 

Settings: lr = 7.720703779420727e-05 epsilon = 0.3542857128484247
Test Result: reward = 252.35172413793222 episode length = 440


 40%|████      | 145/360 [17:02<28:12,  7.87s/it]

Test 1/1: reward = 158.98102189781113  episode len = 305

Average Reward =  158.98102189781113 Average Ep_len =  305.0 

Settings: lr = 7.500929699814144e-05 epsilon = 0.34819885107434967
Test Result: reward = 158.98102189781113 episode length = 305


 41%|████      | 148/360 [17:18<23:27,  6.64s/it]

Test 1/1: reward = 113.27284768211975  episode len = 255

Average Reward =  113.27284768211975 Average Ep_len =  255.0 

Settings: lr = 7.28741161026324e-05 epsilon = 0.3422165655925525
Test Result: reward = 113.27284768211975 episode length = 255


 42%|████▏     | 151/360 [17:38<23:31,  6.75s/it]

Test 1/1: reward = 99.30215827338164  episode len = 225

Average Reward =  99.30215827338164 Average Ep_len =  225.0 

Settings: lr = 7.079971430570178e-05 epsilon = 0.3363370597134889
Test Result: reward = 99.30215827338164 episode length = 225


 43%|████▎     | 154/360 [18:03<28:00,  8.16s/it]

Test 1/1: reward = 194.48688524590293  episode len = 380

Average Reward =  194.48688524590293 Average Ep_len =  380.0 

Settings: lr = 6.878436149687895e-05 epsilon = 0.3305585676159238
Test Result: reward = 194.48688524590293 episode length = 380


 44%|████▎     | 157/360 [18:26<28:02,  8.29s/it]

Test 1/1: reward = 195.24328358209053  episode len = 360

Average Reward =  195.24328358209053 Average Ep_len =  360.0 

Settings: lr = 6.68263768142394e-05 epsilon = 0.32487935381659344
Test Result: reward = 195.24328358209053 episode length = 360


 44%|████▍     | 160/360 [18:52<30:37,  9.19s/it]

Test 1/1: reward = 248.59589041095992  episode len = 420

Average Reward =  248.59589041095992 Average Ep_len =  420.0 

Settings: lr = 6.492412724251782e-05 epsilon = 0.3192977126489789
Test Result: reward = 248.59589041095992 episode length = 420


 45%|████▌     | 163/360 [19:15<28:21,  8.64s/it]

Test 1/1: reward = 120.70000000000054  episode len = 290

Average Reward =  120.70000000000054 Average Ep_len =  290.0 

Settings: lr = 6.307602625112664e-05 epsilon = 0.313811967751035
Test Result: reward = 120.70000000000054 episode length = 290


 46%|████▌     | 166/360 [19:37<25:56,  8.03s/it]

Test 1/1: reward = 93.4470588235297  episode len = 240

Average Reward =  93.4470588235297 Average Ep_len =  240.0 

Settings: lr = 6.128053247094406e-05 epsilon = 0.30842047156172
Test Result: reward = 93.4470588235297 episode length = 240


 47%|████▋     | 169/360 [20:00<26:13,  8.24s/it]

Test 1/1: reward = 226.92706270627193  episode len = 435

Average Reward =  226.92706270627193 Average Ep_len =  435.0 

Settings: lr = 5.953614840876811e-05 epsilon = 0.30312160482617534
Test Result: reward = 226.92706270627193 episode length = 435


 48%|████▊     | 172/360 [20:21<24:22,  7.78s/it]

Test 1/1: reward = 192.66390977443726  episode len = 325

Average Reward =  192.66390977443726 Average Ep_len =  325.0 

Settings: lr = 5.784141919836449e-05 epsilon = 0.2979137761094071
Test Result: reward = 192.66390977443726 episode length = 325


 49%|████▊     | 175/360 [20:41<23:07,  7.50s/it]

Test 1/1: reward = 134.63946587537168  episode len = 340

Average Reward =  134.63946587537168 Average Ep_len =  340.0 

Settings: lr = 5.619493138706643e-05 epsilon = 0.2927954213183221
Test Result: reward = 134.63946587537168 episode length = 340


 49%|████▉     | 178/360 [21:08<25:58,  8.56s/it]

Test 1/1: reward = 164.8000000000009  episode len = 350

Average Reward =  164.8000000000009 Average Ep_len =  350.0 

Settings: lr = 5.4595311756914746e-05 epsilon = 0.2877650032319761
Test Result: reward = 164.8000000000009 episode length = 350


 50%|█████     | 181/360 [21:31<24:57,  8.37s/it]

Test 1/1: reward = 177.68628158844837  episode len = 315

Average Reward =  177.68628158844837 Average Ep_len =  315.0 

Settings: lr = 5.304122617935478e-05 epsilon = 0.28282101103989277
Test Result: reward = 177.68628158844837 episode length = 315


 51%|█████     | 184/360 [21:56<24:59,  8.52s/it]

Test 1/1: reward = 157.0791519434636  episode len = 300

Average Reward =  157.0791519434636 Average Ep_len =  300.0 

Settings: lr = 5.153137850253497e-05 epsilon = 0.2779619598883142
Test Result: reward = 157.0791519434636 episode length = 300


 52%|█████▏    | 187/360 [22:14<20:40,  7.17s/it]

Test 1/1: reward = 124.11515151515201  episode len = 270

Average Reward =  124.11515151515201 Average Ep_len =  270.0 

Settings: lr = 5.00645094702791e-05 epsilon = 0.27318639043424753
Test Result: reward = 124.11515151515201 episode length = 270


 53%|█████▎    | 190/360 [22:36<22:06,  7.80s/it]

Test 1/1: reward = 237.57777777777892  episode len = 400

Average Reward =  237.57777777777892 Average Ep_len =  400.0 

Settings: lr = 4.863939567183063e-05 epsilon = 0.26849286840717335
Test Result: reward = 237.57777777777892 episode length = 400


 54%|█████▎    | 193/360 [23:03<27:10,  9.77s/it]

Test 1/1: reward = 562.5467432950119  episode len = 770

Average Reward =  562.5467432950119 Average Ep_len =  770.0 

Settings: lr = 4.7254848521493125e-05 epsilon = 0.2638799841782838
Test Result: reward = 562.5467432950119 episode length = 770


 54%|█████▍    | 196/360 [23:26<24:41,  9.03s/it]

Test 1/1: reward = 175.59420849420954  episode len = 325

Average Reward =  175.59420849420954 Average Ep_len =  325.0 

Settings: lr = 4.5909713267315705e-05 epsilon = 0.259346352337122
Test Result: reward = 175.59420849420954 episode length = 325


 55%|█████▌    | 199/360 [23:49<23:31,  8.77s/it]

Test 1/1: reward = 252.45263157894848  episode len = 435

Average Reward =  252.45263157894848 Average Ep_len =  435.0 

Settings: lr = 4.4602868027996924e-05 epsilon = 0.25489061127549467
Test Result: reward = 252.45263157894848 episode length = 435


 56%|█████▌    | 202/360 [24:15<23:28,  8.92s/it]

Test 1/1: reward = 163.88027210884442  episode len = 365

Average Reward =  163.88027210884442 Average Ep_len =  365.0 

Settings: lr = 4.3333222857203645e-05 epsilon = 0.2505114227785338
Test Result: reward = 163.88027210884442 episode length = 365


 57%|█████▋    | 205/360 [24:34<19:27,  7.53s/it]

Test 1/1: reward = 111.64713804713854  episode len = 260

Average Reward =  111.64713804713854 Average Ep_len =  260.0 

Settings: lr = 4.209971883452459e-05 epsilon = 0.24620747162278345
Test Result: reward = 111.64713804713854 episode length = 260


 58%|█████▊    | 208/360 [24:54<19:09,  7.56s/it]

Test 1/1: reward = 160.53505976095698  episode len = 305

Average Reward =  160.53505976095698 Average Ep_len =  305.0 

Settings: lr = 4.090132718230039e-05 epsilon = 0.24197746518119276
Test Result: reward = 160.53505976095698 episode length = 305


 59%|█████▊    | 211/360 [25:36<34:11, 13.77s/it]

Test 1/1: reward = 863.8714285714099  episode len = 1145

Average Reward =  863.8714285714099 Average Ep_len =  1145.0 

Settings: lr = 3.973704840759363e-05 epsilon = 0.2378201330348944
Test Result: reward = 863.8714285714099 episode length = 1145


 59%|█████▉    | 214/360 [26:13<34:48, 14.30s/it]

Test 1/1: reward = 891.0999999999833  episode len = 1085

Average Reward =  891.0999999999833 Average Ep_len =  1085.0 

Settings: lr = 3.8605911468583084e-05 epsilon = 0.2337342265916536
Test Result: reward = 891.0999999999833 episode length = 1085


 60%|██████    | 217/360 [26:48<33:07, 13.90s/it]

Test 1/1: reward = 867.3800766283422  episode len = 1055

Average Reward =  867.3800766283422 Average Ep_len =  1055.0 

Settings: lr = 3.7506972964687045e-05 epsilon = 0.22971851871087204
Test Result: reward = 867.3800766283422 episode length = 1055


 61%|██████    | 220/360 [27:30<36:42, 15.73s/it]

Test 1/1: reward = 856.0297297297076  episode len = 1235

Average Reward =  856.0297297297076 Average Ep_len =  1235.0 

Settings: lr = 3.643931634974026e-05 epsilon = 0.22577180333503477
Test Result: reward = 856.0297297297076 episode length = 1235


 62%|██████▏   | 223/360 [28:19<40:24, 17.70s/it]

Test 1/1: reward = 876.0824561403316  episode len = 1165

Average Reward =  876.0824561403316 Average Ep_len =  1165.0 

Settings: lr = 3.540205116756821e-05 epsilon = 0.22189289512748883
Test Result: reward = 876.0824561403316 episode length = 1165


 63%|██████▎   | 226/360 [28:44<26:51, 12.03s/it]

Test 1/1: reward = 120.90989399293333  episode len = 270

Average Reward =  120.90989399293333 Average Ep_len =  270.0 

Settings: lr = 3.4394312309321114e-05 epsilon = 0.21808062911644535
Test Result: reward = 120.90989399293333 episode length = 270


 64%|██████▎   | 229/360 [29:19<28:29, 13.05s/it]

Test 1/1: reward = 850.2979238754208  episode len = 1150

Average Reward =  850.2979238754208 Average Ep_len =  1150.0 

Settings: lr = 3.3415259291948445e-05 epsilon = 0.2143338603450977
Test Result: reward = 850.2979238754208 episode length = 1150


 64%|██████▍   | 232/360 [29:48<24:08, 11.32s/it]

Test 1/1: reward = 121.60546448087479  episode len = 310

Average Reward =  121.60546448087479 Average Ep_len =  310.0 

Settings: lr = 3.246407555721199e-05 epsilon = 0.21065146352775077
Test Result: reward = 121.60546448087479 episode length = 310


 65%|██████▌   | 235/360 [30:23<26:33, 12.75s/it]

Test 1/1: reward = 381.20593471809843  episode len = 725

Average Reward =  381.20593471809843 Average Ep_len =  725.0 

Settings: lr = 3.153996779065289e-05 epsilon = 0.20703233271185872
Test Result: reward = 381.20593471809843 episode length = 725


 66%|██████▌   | 238/360 [30:48<20:31, 10.09s/it]

Test 1/1: reward = 97.87707509881447  episode len = 205

Average Reward =  97.87707509881447 Average Ep_len =  205.0 

Settings: lr = 3.064216525994472e-05 epsilon = 0.20347538094586823
Test Result: reward = 97.87707509881447 episode length = 205


 67%|██████▋   | 241/360 [31:15<18:31,  9.34s/it]

Test 1/1: reward = 125.8777777777785  episode len = 265

Average Reward =  125.8777777777785 Average Ep_len =  265.0 

Settings: lr = 2.976991917208064e-05 epsilon = 0.1999795399527694
Test Result: reward = 125.8777777777785 episode length = 265


 68%|██████▊   | 244/360 [31:39<17:44,  9.18s/it]

Test 1/1: reward = 297.0207612456747  episode len = 485

Average Reward =  297.0207612456747 Average Ep_len =  485.0 

Settings: lr = 2.892250204885858e-05 epsilon = 0.19654375980925454
Test Result: reward = 297.0207612456747 episode length = 485


 69%|██████▊   | 247/360 [32:01<15:16,  8.11s/it]

Test 1/1: reward = 118.02209737827759  episode len = 240

Average Reward =  118.02209737827759 Average Ep_len =  240.0 

Settings: lr = 2.809920712014363e-05 epsilon = 0.19316700863038955
Test Result: reward = 118.02209737827759 episode length = 240


 69%|██████▉   | 250/360 [32:26<16:20,  8.91s/it]

Test 1/1: reward = 355.1202898550709  episode len = 540

Average Reward =  355.1202898550709 Average Ep_len =  540.0 

Settings: lr = 2.729934773440151e-05 epsilon = 0.18984827225970283
Test Result: reward = 355.1202898550709 episode length = 540


 70%|███████   | 253/360 [32:49<15:07,  8.48s/it]

Test 1/1: reward = 129.80000000000072  episode len = 325

Average Reward =  129.80000000000072 Average Ep_len =  325.0 

Settings: lr = 2.652225678601153e-05 epsilon = 0.18658655396459853
Test Result: reward = 129.80000000000072 episode length = 325


 71%|███████   | 256/360 [33:13<15:07,  8.72s/it]

Test 1/1: reward = 136.3844155844161  episode len = 290

Average Reward =  136.3844155844161 Average Ep_len =  290.0 

Settings: lr = 2.5767286158881407e-05 epsilon = 0.18338087413700296
Test Result: reward = 136.3844155844161 episode length = 290


 72%|███████▏  | 259/360 [33:35<14:09,  8.41s/it]

Test 1/1: reward = 119.25384615384662  episode len = 265

Average Reward =  119.25384615384662 Average Ep_len =  265.0 

Settings: lr = 2.5033806185899917e-05 epsilon = 0.18023026999915406
Test Result: reward = 119.25384615384662 episode length = 265


 73%|███████▎  | 262/360 [33:57<13:07,  8.04s/it]

Test 1/1: reward = 122.45072463768153  episode len = 260

Average Reward =  122.45072463768153 Average Ep_len =  260.0 

Settings: lr = 2.4321205123776464e-05 epsilon = 0.1771337953144455
Test Result: reward = 122.45072463768153 episode length = 260


 74%|███████▎  | 265/360 [34:21<13:52,  8.76s/it]

Test 1/1: reward = 168.48027210884422  episode len = 320

Average Reward =  168.48027210884422 Average Ep_len =  320.0 

Settings: lr = 2.3628888642829706e-05 epsilon = 0.17409052010323875
Test Result: reward = 168.48027210884422 episode length = 320


 74%|███████▍  | 268/360 [34:51<16:20, 10.66s/it]

Test 1/1: reward = 461.20793650793155  episode len = 750

Average Reward =  461.20793650793155 Average Ep_len =  750.0 

Settings: lr = 2.295627933129956e-05 epsilon = 0.17109953036355763
Test Result: reward = 461.20793650793155 episode length = 750


 75%|███████▌  | 271/360 [35:18<14:53, 10.04s/it]

Test 1/1: reward = 207.1000000000015  episode len = 425

Average Reward =  207.1000000000015 Average Ep_len =  425.0 

Settings: lr = 2.2302816213769293e-05 epsilon = 0.16815992779658168
Test Result: reward = 207.1000000000015 episode length = 425


 76%|███████▌  | 274/360 [35:42<12:57,  9.05s/it]

Test 1/1: reward = 146.77407407407478  episode len = 270

Average Reward =  146.77407407407478 Average Ep_len =  270.0 

Settings: lr = 2.166795428329594e-05 epsilon = 0.1652708295368555
Test Result: reward = 146.77407407407478 episode length = 270


 77%|███████▋  | 276/360 [35:55<10:56,  7.81s/it]


KeyboardInterrupt: ignored

In [None]:
# Evaluate the saved DQN checkpoint on a single rendered CarRacing episode.
# NOTE(review): wrap_env and DQN_Network_Simple are defined outside this cell
# (presumably pulled in via `from utils import*`) — confirm before relying on them.
env = wrap_env(gym.make("CarRacing-v1").unwrapped)
agent = DQN_Network_Simple()
# map_location="cpu" lets a checkpoint that was written from a GPU session be
# restored on a CPU-only runtime as well; load_state_dict then copies the
# stored values into the agent's own parameters.
agent.load_state_dict(torch.load("car-racing-dqn.pth", map_location="cpu"))

# render=True makes simulate() wrap the environment in RecordVideo (./video).
# The call is kept as the last expression so the cell displays its return value.
simulate(agent=agent,env=env,render=True)
#test_model(agent,env,episodes=10)

  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "
  f"Overwriting existing videos at {self.video_folder} folder "


score 453.7528052805233 ep_len 740


(453.7528052805233, 740)

In [None]:
# Print the training histories saved on disk, one section per metric.
# Each row is (header to print, file to load, optional per-stage reducer):
# learning-rate and epsilon stages are printed as stored, while episode-length
# and reward stages are reduced to their mean with statistics.mean.
# NOTE(review): load_list is defined outside this cell; it is assumed to return
# an iterable of stages — confirm against its definition.
_history_sections = (
  ("lr_hist", "lr_hist.data", None),
  ("\nepsilon_hist", "epsilon_hist.data", None),
  ("\nep_len_hist", "ep_len_hist.data", st.mean),
  ("\nreward_hist", "reward_hist.data", st.mean),
)
for _section_header, _section_file, _stage_reducer in _history_sections:
  print(_section_header)
  hist = load_list(_section_file)
  for stage in hist:
    print(stage if _stage_reducer is None else _stage_reducer(stage))