Import Torch Packages

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.nn.functional as F
import torch.optim as optim

#### Import Gym Packages

In [2]:
from ale_py import ALEInterface
ale = ALEInterface()

A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]


In [3]:
from ale_py.roms import Breakout
ale.loadROM(Breakout)

Game console created:
  ROM file:  /home/beegass/.virtualenvs/dl_1/lib/python3.8/site-packages/AutoROM/roms/breakout.bin
  Cart Name: Breakout - Breakaway IV (1978) (Atari)
  Cart MD5:  f34f08e5eb96e500e851a80be3277a56
  Display Format:  AUTO-DETECT ==> NTSC
  ROM Size:        2048
  Bankswitch Type: AUTO-DETECT ==> 2K

Running ROM file...
Random seed is 1648049376


In [4]:
import gym
from gym.wrappers import FrameStack
from gym.wrappers.monitoring.video_recorder import VideoRecorder

#### All Other Packages

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange
import random
import copy
from loguru import logger
import wandb
import math

  def _figure_formats_changed(self, name, old, new):


In [6]:
wandb.login(key="5966d774f384473f3d7ed674ef762b1a26a54d63")

[34m[1mwandb[0m: Currently logged in as: [33mbeegass[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/beegass/.netrc


True

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # check if GPU is available

##### The Meat And Potatoes 

In [8]:
class Gym_Env():
    def __init__(self, env_name, max_steps=1000, max_episodes=10000):
        self.le_env = FrameStack(gym.make(env_name, render_mode='rgb_array'), 4)
        self.max_steps = max_steps
        self.max_episodes = max_episodes
        

In [9]:
class Replay_Buffer():
    def __init__(self, capacity, mini_batch_size=128):
        self.rb = []
        self.capacity = capacity
        self.mini_batch_size = mini_batch_size
        self.current_batch = None

    def get_rb_batch(self):
        sample = random.sample(self.rb, self.mini_batch_size)
        states, actions, next_states, rewards, done = zip(*sample[0: (self.mini_batch_size - 1)])
        return states, actions, next_states, rewards, done
    
    def add_to_rb(self, new_transition):
        if len(self.rb) >= self.capacity:
            del self.rb[0] 
        self.rb.append(new_transition)

In [10]:
class Deep_Q_Network(nn.Module):
    def __init__(self, in_channels=4, num_actions=4):
        super(Deep_Q_Network, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc4 = nn.Linear(7 * 7 * 64, 512)
        self.fc5 = nn.Linear(512, num_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc4(x.view(x.size(0), -1)))
        return self.fc5(x)

In [11]:
class Agent(nn.Module):
    def __init__(self, pred_model):
        super(Agent, self).__init__()
        self.agent = pred_model
        self.target = None
        
    def agent_policy(self, state, pred_model=True, grad=False):
        q_val = -1
        state = state.to(device)
        if pred_model:
            if grad:
                q_val = self.agent(state)
            else:
                with torch.no_grad():
                    q_val = self.agent(state)
        else:
            with torch.no_grad():
                q_val = self.target(state)
        return q_val
    
    def copy_pred_to_target(self):
        self.target = copy.deepcopy(self.agent)
    

##### Epsilon

In [12]:
def epsilon_greedy(environment, the_agent, state, epsilon=0.1):
    prob = random.random()
    action = environment.le_env.action_space.sample() # pick action from action space
    if prob < 1 - epsilon:
        q_val = the_agent.agent_policy(state) # retrieve best action, based off its action-value action
        action = torch.argmax(q_val)
    return action

In [13]:
def epsilon_decay(environment, episode_num, p_init=0.7, p_end=0.1):
    episode_num += 1
    current_episode_rate = (environment.max_episodes - episode_num) / environment.max_episodes
    epsilon_decay_rate = max(current_episode_rate, 0)
    return ((p_init - p_end) * (epsilon_decay_rate) + p_end)

##### Preprocess

In [14]:
def preprocess(state):
    
    # convert state to numpy array and then to torch tensor
    frame = torch.from_numpy(np.array(state).astype(np.float32))
    
    # reshape so that grayscaling is possible
    reshaped_frame = frame.reshape(4, 3, 210, 160)
    
    # grayscale image
    gray_frame = T.Grayscale()(reshaped_frame)
    
    # reshape image so network can process it
    reshaped_gray_frame = gray_frame.reshape(1, 4, 210, 160)
    
    # downscale image to 84x84
    small_gray_frame = T.Resize((84, 84))(reshaped_gray_frame)
    
    return small_gray_frame
    

##### Global Variables

In [15]:
def config_tune() -> None:
    cfg = {
        'method': 'bayes', #grid, random
        'metric': {
            'name': 'mean_reward',
            'goal': 'maximize'   
        },
        'parameters': {
            'alpha': {'distribution': 'uniform',
                                      'min': 0.00001,
                                      'max': 1},
            'gamma': {'distribution': 'uniform',
                                      'min': 0.01,
                                      'max': 1},
            'epsilon': {'distribution': 'uniform',
                                      'min': 0.01,
                                      'max': 1},
            'decay_rate': {'distribution': 'uniform',
                                      'min': 0.01,
                                      'max': 1},
            'delta': {'distribution': 'normal',
                                      'mu': 3,
                                      'sigma': 1},
            'target_freq': {'distribution': 'int_uniform',
                                      'min': 1,
                                      'max': 9999},
            'batch_size': {
                'value': 64
            },
            'replay_buffer_size': {
                'value': 10000
            },
            'max_episodes': {
                'value': 1000 
            },
            'max_steps': {
                'value': 10000
            }
        }
    }
    
    return cfg

In [16]:
def vanilla_config() -> None:
    cfg = {
        'alpha': 0.95,
        'gamma': 0.23,
        'epsilon': 0.95,
        'batch_size': 32,
        'replay_buffer_size': 5000,
        'max_episodes': 10000,
        'max_steps': 10000,
    }
    
    return cfg

##### Buidling Optimizers and Schedulers

In [17]:
def build_optimizer(model, optimizer_name='adam', learning_rate=0.01, weight_decay=0.01, momentum=0.9):
    try:
        optimizer = None
        if optimizer_name == "sgd":
            optimizer = optim.SGD(model.parameters(), 
                                  lr=learning_rate, 
                                  momentum=momentum)
            
        elif optimizer_name == "adam":
            optimizer = optim.Adam(model.parameters(), 
                                   lr=learning_rate, 
                                   weight_decay=weight_decay)
               
        return optimizer
    except:
        print("Error: Invalid optimizer specified.")
        sys.exit(1)

In [18]:
def build_scheduler(optimizer, sched_name='reduce_lr', patience=5, verbose=True):
    try: 
        sched = None
        if sched_name == "reduce_lr":
            sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                         mode='min', 
                                                         patience=patience, 
                                                         verbose=verbose)
        elif sched_name == 'TODO':
            pass
            #TODO: add other scheduler
            
        return sched
    except:
        logger.error("Error: Invalid scheduler specified.")
        sys.exit(1)

##### Training Loop

In [19]:
def train(episode_tuple, replay_buffer, the_agent, loss_fn, optimizer, scheduler, gamma=0.95):
    #1. copy weights from prediction network to target target network
    (episode_num, target_freq) = episode_tuple
    if the_agent.target is not None:
        if episode_num % target_freq == 0:
            the_agent.copy_pred_to_target() 
    else:
        the_agent.copy_pred_to_target()
    
    #2. retrieve (s, a, r, s') from mini_batch
    states, actions, next_states, rewards, done = replay_buffer.get_rb_batch()
    
    states = torch.vstack(states)
    actions = torch.tensor(actions, dtype = torch.long, device = device).detach()
    next_states = torch.vstack(next_states)
    rewards = torch.tensor(rewards, dtype = torch.float, device = device)
    done = torch.tensor(done, dtype=torch.int32, device = device)
    
    pred_q_val_matrix = the_agent.agent_policy(states, grad=True)
    pred_q_val = torch.gather(pred_q_val_matrix, 1, actions.unsqueeze(1)).squeeze(1)
    target_q_val_matrix = the_agent.agent_policy(next_states, pred_model=False)
    target_q_val = torch.max(target_q_val_matrix, dim=1)[0]
    #zero_or_one = torch.ones(done.shape, dtype=torch.float, device = device) - done
    y_j = (rewards + (gamma * target_q_val)) * (1 - done)
    
    output = loss_fn(pred_q_val, y_j.detach())
    optimizer.zero_grad()
    output.backward()
    optimizer.step()

##### Episode Specific Loop

In [20]:
def run_episode(video, render, episode_tuple, environment, the_agent, replay_buffer, state, epsilon, gamma, loss_fn, optimizer, scheduler):
    step_count = 0
    cumulative_reward = 0
    while True:
        
        #0. either explore or exploit
        action = epsilon_greedy(environment=environment,
                                the_agent=the_agent,
                                state=state,
                                epsilon=epsilon)
        
        #1. render environment only when flag is set to True
        if render:
            video.capture_frame()
        
        #2. pass action to environment
        (next_state, reward, done, metadata) = environment.le_env.step(action)
        
        #3. get s' back from environment and preprocess (s' -> preprocessed_s')
        preprocessed_next_state = preprocess(next_state)
        
        #4. add transition (s, a, s', r) to replay buffer
        replay_buffer.add_to_rb((state, action, preprocessed_next_state, reward, done))
        
        #5. if replay buffer is full, sample mini batch and update model
        if len(replay_buffer.rb) > replay_buffer.mini_batch_size and not epsilon <= 0:
            train(episode_tuple, replay_buffer, the_agent, loss_fn, optimizer, scheduler, gamma)
            
        
        cumulative_reward += reward
        
        #6. check max number of time steps has been reached or if game is complete
        if step_count >= environment.max_steps or done:
            step_count += 1
            return cumulative_reward, step_count, video
        
        state = preprocessed_next_state
        
        step_count += 1
        
        

##### Episodic Loop  

In [21]:
def run():
    cfg = vanilla_config()
    with wandb.init(project="BeeGass-Agents", entity="beegass", config=cfg, monitor_gym=True):
        config = wandb.config
        
        # initialize gym environment
        environment = Gym_Env(env_name='ALE/Breakout-v5', max_steps=config.max_steps, max_episodes=config.max_episodes)
        
        # initialize prediction network
        #pred_net = Deep_Q_Network(environment.le_env.action_space.n).to(device)
        pred_net = Deep_Q_Network().to(device)
        
        # initialize agent that contains both prediction network and target network
        the_agent = Agent(pred_model=pred_net)
        
        # define loss function
        loss_fn = nn.HuberLoss(reduction='mean', delta=config.delta)
        
        # define optimizer
        optimizer = build_optimizer(model=the_agent.agent, 
                                    optimizer_name='adam', 
                                    learning_rate=config.alpha, 
                                    weight_decay=0.01, 
                                    momentum=0.9)
        
        # define scheduler
        scheduler = build_scheduler(optimizer, 
                                    sched_name='reduce_lr', 
                                    patience=5, 
                                    verbose=True)
        
        # initialize replay buffer
        replay_buffer = Replay_Buffer(capacity=config.replay_buffer_size, mini_batch_size=config.batch_size)
        
        render_flag = False
        epsilon = config.epsilon
        episode_cumulative_reward = 0
        total_steps = 0
        for e in range(environment.max_episodes):
            video = None
            if e + 1 % 10000 == 0:
                render_flag = True
                video = VideoRecorder(environment.le_env, f"./videos/breakout/episode{e}.mp4", enabled=True)
            else:
                render_flag = False
            
            # 0. get initial state, s_{0}, and preprocess it (s_{0} -> preprocessed_s)
            state = environment.le_env.reset()
            
            # 0.1: preprocess state
            preprocessed_state = preprocess(state)
            
            # 1. iterate over steps in episode
            cumulative_reward, step_count, video = run_episode(video=video,
                                                render=render_flag,
                                                environment=environment,
                                                episode_tuple=(e, config.target_freq), 
                                                the_agent=the_agent,
                                                replay_buffer=replay_buffer,
                                                state=preprocessed_state, 
                                                epsilon=epsilon,
                                                gamma=config.gamma,
                                                loss_fn=loss_fn,
                                                optimizer=optimizer, 
                                                scheduler=scheduler)
            
            episode_cumulative_reward += cumulative_reward
            total_steps += step_count
            wandb.log({"episode": e, "total_reward": episode_cumulative_reward, "total_steps": total_steps}, step=e)
            wandb.log({"episode": e, "total_reward": episode_cumulative_reward}, step=e)
            wandb.log({"episode": e, "mean_reward": (episode_cumulative_reward/(e+1))}, step=e)
            wandb.log({"episode": e, "reward": cumulative_reward}, step=e)
            wandb.log({"episode": e, "step_count": step_count}, step=e)
            wandb.log({"episode": e, "step_count": total_steps}, step=e)
            
            if render_flag:
                wandb.log({"video": wandb.Video(f"./videos/breakout/episode{e}.mp4", fps=4, format="mp4")})
                video.close
            
            # 3. decay epsilon
            epsilon = config.decay_rate * epsilon
            wandb.log({"episode": e, "epsilon": epsilon}, step=e)

In [22]:
# run()

In [23]:
sweep_id = wandb.sweep(config_tune(), project="dqn-sweeps")

Create sweep with ID: irapekpn
Sweep URL: https://wandb.ai/beegass/dqn-sweeps/sweeps/irapekpn


In [24]:
wandb.agent(sweep_id, run, count=20)

[34m[1mwandb[0m: Agent Starting Run: dnk03p6q with config:
[34m[1mwandb[0m: 	alpha: 0.8074247877577279
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.5796111193697523
[34m[1mwandb[0m: 	delta: 4.490587113067569
[34m[1mwandb[0m: 	epsilon: 0.2945653667272183
[34m[1mwandb[0m: 	gamma: 0.9522693204600168
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]
  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,█▇▆▃▂▂▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▂▂▂▂▂▁▁▁▁▂▁▂▂
reward,▃▆▃▃▃▅▂▃▇▁▃▂▂▁▁▂▂▁▃▁▂▁▃▃█▂▅▃▆▃▂▁▂▁▃▃▅▁▃▃
step_count,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
total_reward,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
total_steps,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
episode,999.0
epsilon,0.0
mean_reward,1.477
reward,3.0
step_count,203324.0
total_reward,1477.0
total_steps,203324.0


[34m[1mwandb[0m: Agent Starting Run: cabbvr9r with config:
[34m[1mwandb[0m: 	alpha: 0.009648365332802826
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.745921892258523
[34m[1mwandb[0m: 	delta: 3.85730649755779
[34m[1mwandb[0m: 	epsilon: 0.4782102231285333
[34m[1mwandb[0m: 	gamma: 0.4148933537576313
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,▂▁▇▆▆▆▇▆▅▅▅▆▆▆▆▆▆▆▇██████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
reward,█▄▁▁▅▄▄▄▂▁▂▅▄▁▄▅▄▅▄▄▁▄▄▁▄▂█▁▁▂▄▇▄▂▄▁▅▂▄▅
step_count,▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████
total_reward,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
total_steps,▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████

0,1
episode,999.0
epsilon,0.0
mean_reward,2.105
reward,3.0
step_count,287196.0
total_reward,2105.0
total_steps,287196.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4qs112cc with config:
[34m[1mwandb[0m: 	alpha: 0.9814171722967366
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.4615484594183751
[34m[1mwandb[0m: 	delta: 3.553558683010013
[34m[1mwandb[0m: 	epsilon: 0.8199924630598746
[34m[1mwandb[0m: 	gamma: 0.29160595831824426
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,▄▇█▇▆▅▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
reward,█▄▃▁▂▁▂▂▃▄▃▂▁▄▁▁▅▄▃▂▂▂▁▄▁▃▂▂▁▂▅▃▃▄▂▂▁▁▁▁
step_count,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅█
total_reward,▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
total_steps,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅█

0,1
episode,999.0
epsilon,0.0
mean_reward,1.554
reward,0.0
step_count,568119.0
total_reward,1554.0
total_steps,568119.0


[34m[1mwandb[0m: Agent Starting Run: 6rrv5jxc with config:
[34m[1mwandb[0m: 	alpha: 0.04070038499582011
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.6317389853786282
[34m[1mwandb[0m: 	delta: 3.834270353255744
[34m[1mwandb[0m: 	epsilon: 0.5846041121665984
[34m[1mwandb[0m: 	gamma: 0.4754705213868177
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,▅▄██▅▄▅▅▆▅▅▅▃▄▄▅▅▅▅▅▄▄▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
reward,▅▅▇▄▄▁▇▁▁▂▇▄▂▄▁▄▄▅▄▅▂▁▂▁▁▄▅█▄▁▄▂▂▂▄▁▄▅▂▄
step_count,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
total_reward,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
total_steps,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████

0,1
episode,999.0
epsilon,0.0
mean_reward,1.716
reward,0.0
step_count,228057.0
total_reward,1716.0
total_steps,228057.0


[34m[1mwandb[0m: Agent Starting Run: mu65z6gg with config:
[34m[1mwandb[0m: 	alpha: 0.09949199131261176
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.7280591931037893
[34m[1mwandb[0m: 	delta: 3.918747613713569
[34m[1mwandb[0m: 	epsilon: 0.34962903355724617
[34m[1mwandb[0m: 	gamma: 0.29757915208029334
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,█▂▁▄▂▁▃▄▄▄▅▅▅▅▅▄▄▅▅▅▆▆▆▆▆▅▅▅▅▅▅▅▅▄▄▄▄▄▄▄
reward,▃▃▁▃▂▁▃▆▃▂▂▁▄▂▃▂▃▇█▂▃▂▄▃▂▂▂▄▄▄▂▃▂▄▁▃▁▁▃▂
step_count,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
total_reward,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
total_steps,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
episode,999.0
epsilon,0.0
mean_reward,1.815
reward,1.0
step_count,222864.0
total_reward,1815.0
total_steps,222864.0


[34m[1mwandb[0m: Agent Starting Run: 2t4q45ky with config:
[34m[1mwandb[0m: 	alpha: 0.5115322438406518
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.20446788497553017
[34m[1mwandb[0m: 	delta: 3.905408738604125
[34m[1mwandb[0m: 	epsilon: 0.46993510629904983
[34m[1mwandb[0m: 	gamma: 0.28227734606823274
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,▃██▇▇█▇▇▇▇▆▆▆▆▆▆▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁
reward,▁▅▃▄▄▁█▄▂▂▁▂▁▃▃▃▂▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step_count,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇██
total_reward,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇███████████████████████
total_steps,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇██

0,1
episode,999.0
epsilon,0.0
mean_reward,0.761
reward,0.0
step_count,5405225.0
total_reward,761.0
total_steps,5405225.0


[34m[1mwandb[0m: Agent Starting Run: w4pvssiy with config:
[34m[1mwandb[0m: 	alpha: 0.09929158166099952
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.8511896931477622
[34m[1mwandb[0m: 	delta: 3.873262879616561
[34m[1mwandb[0m: 	epsilon: 0.4668353228218358
[34m[1mwandb[0m: 	gamma: 0.34563830190497935
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,▂█▅▅▃▃▄▆▅▇▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
reward,▄▅▁▅▄▄▅▄▄▂▄▅▅▁▅▁▄▁▁▂▂▂▄▄▄▁▄▁▅▅▄█▄▅▄▁▄▄▁▁
step_count,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
total_reward,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
total_steps,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
episode,999.0
epsilon,0.0
mean_reward,1.713
reward,2.0
step_count,242417.0
total_reward,1713.0
total_steps,242417.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7y5fsau9 with config:
[34m[1mwandb[0m: 	alpha: 0.07691775987689141
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.6189665904829935
[34m[1mwandb[0m: 	delta: 3.2837908365266952
[34m[1mwandb[0m: 	epsilon: 0.3202308068478132
[34m[1mwandb[0m: 	gamma: 0.5440796202070324
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,▆▃▁▃▅█▆▅▄▅▄▄▄▄▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▄▄▄▄▄▄▄▃▃▄▄
reward,▁▆▅▆▁█▁▁▁▃▆▅▃▁▆▅▆▆▅▆▅▁▁▅▆▅▃▅▅▁▅▆▅▅▃█▁█▁▅
step_count,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
total_reward,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
total_steps,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
episode,999.0
epsilon,0.0
mean_reward,1.644
reward,2.0
step_count,222285.0
total_reward,1644.0
total_steps,222285.0


[34m[1mwandb[0m: Agent Starting Run: tgmrat5q with config:
[34m[1mwandb[0m: 	alpha: 0.13318994424339095
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.768138760462493
[34m[1mwandb[0m: 	delta: 3.807195969202033
[34m[1mwandb[0m: 	epsilon: 0.42057950627416274
[34m[1mwandb[0m: 	gamma: 0.6413366651477639
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,▅▁▄▆▇▇▇▇██▇▇▇█████▇▇▇▇▆▆▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄
reward,▂▃▄▁▁▃▂▃▃█▁▂▃▄▁▃▃▄▁▁▅▂▁▄▁▂▂▂▂▂▃▂▁▃▁▁▃▄▁▁
step_count,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
total_reward,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
total_steps,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███

0,1
episode,999.0
epsilon,0.0
mean_reward,1.567
reward,0.0
step_count,203912.0
total_reward,1567.0
total_steps,203912.0


[34m[1mwandb[0m: Agent Starting Run: 2eocfu51 with config:
[34m[1mwandb[0m: 	alpha: 0.0168088119157564
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decay_rate: 0.6987549253786745
[34m[1mwandb[0m: 	delta: 4.083356225890131
[34m[1mwandb[0m: 	epsilon: 0.4745287005916566
[34m[1mwandb[0m: 	gamma: 0.19885097752688832
[34m[1mwandb[0m: 	max_episodes: 1000
[34m[1mwandb[0m: 	max_steps: 10000
[34m[1mwandb[0m: 	replay_buffer_size: 10000


  deprecation(
  deprecation(





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_reward,█▄▂▁▁▁▁▁▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
reward,▂▅▆▃▂▂▂▃▃▂▃▄▂▁▃▂▁▅▃▄▂▃▁▄▁▄▂▁▅▁█▃▃▄▄▃▂▅▃▁
step_count,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
total_reward,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
total_steps,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████

0,1
episode,999.0
epsilon,0.0
mean_reward,1.928
reward,1.0
step_count,277876.0
total_reward,1928.0
total_steps,277876.0
