Import Torch Packages

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.nn.functional as F
import torch.optim as optim

#### Import Gym Packages

In [2]:
from ale_py import ALEInterface
ale = ALEInterface()

A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]


In [3]:
from ale_py.roms import Breakout
ale.loadROM(Breakout)

Game console created:
  ROM file:  /home/beegass/.virtualenvs/dl_1/lib/python3.8/site-packages/AutoROM/roms/breakout.bin
  Cart Name: Breakout - Breakaway IV (1978) (Atari)
  Cart MD5:  f34f08e5eb96e500e851a80be3277a56
  Display Format:  AUTO-DETECT ==> NTSC
  ROM Size:        2048
  Bankswitch Type: AUTO-DETECT ==> 2K

Running ROM file...
Random seed is 1650306551


In [4]:
import gym
from gym.wrappers import (
    FrameStack, 
    AtariPreprocessing, 
    RecordEpisodeStatistics
)

from stable_baselines3.common.atari_wrappers import (
    ClipRewardEnv,
    EpisodicLifeEnv,
    FireResetEnv,
    MaxAndSkipEnv,
    NoopResetEnv
)

#### All Other Packages

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange
import random
import copy
from loguru import logger
import wandb
import time
import math

In [6]:
wandb.login(key="5966d774f384473f3d7ed674ef762b1a26a54d63")

[34m[1mwandb[0m: Currently logged in as: [33mbeegass[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/beegass/.netrc


True

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # check if GPU is available

##### The Meat And Potatoes 

In [8]:
def make_env(env_name, seed=42):
    env = gym.make(env_name, frameskip=1, repeat_action_probability=0)
    env = AtariPreprocessing(env)
    env = RecordEpisodeStatistics(env)
    env = ClipRewardEnv(env)
    env = EpisodicLifeEnv(env)
    env = FrameStack(env, 4)
    env.seed(seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    return env 
        

In [9]:
class Replay_Buffer():
    def __init__(self, capacity, mini_batch_size=128):
        self.rb = []
        self.capacity = capacity
        self.mini_batch_size = mini_batch_size

    def get_rb_batch(self):
        sample = random.sample(self.rb, self.mini_batch_size)
        states, actions, next_states, rewards, done = zip(*sample[:(self.mini_batch_size)])
        preprocessed_states = preprocess_two(states)
        preprocessed_next_states = preprocess_two(next_states)
        return preprocessed_states, actions, preprocessed_next_states, rewards, done
    
    def add_to_rb(self, new_transition):
        if len(self.rb) >= self.capacity:
            del self.rb[0] 
        self.rb.append(new_transition)

In [10]:
# class DQN(nn.Module):
#     def __init__(self, in_channels=4, num_actions=4):
#         super(DQN, self).__init__()
#         self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
#         self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
#         self.fc4 = nn.Linear(7 * 7 * 64, 512)
#         self.fc5 = nn.Linear(512, num_actions)

#     def forward(self, x):
#         x = F.relu(self.conv1(x))
#         x = F.relu(self.conv2(x))
#         x = F.relu(self.conv3(x))
#         x = F.relu(self.fc4(x.view(x.size(0), -1)))
#         return self.fc5(x)

In [11]:
class DQN(nn.Module):
    def __init__(self, in_channels=4, num_actions=4, img_h=84, img_w=84):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size = 5, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride  + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(img_w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(img_h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, num_actions)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))

In [12]:
class Agent(nn.Module):
    def __init__(self, pred_model, target_model):
        super(Agent, self).__init__()
        self.prediction_net = pred_model
        self.target_net = target_model 
        
    def epsilon_greedy(self, env, state, epsilon):
        prob = random.random()
        q_val = torch.zeros(1, 4).to(device)
        action = env.action_space.sample() # pick action from action space
        if prob < 1 - epsilon.val:
            q_val = self.agent_policy(state=state, pred_model=True, grad=False) # retrieve best action, based off its action-value 
            action = torch.argmax(q_val, 1)
        return action, (torch.max(q_val, dim=1)[0]).detach()
        
    def agent_policy(self, state, pred_model=True, grad=False):
        q_val = None
        # 0.1: preprocess state
        preprocessed_state = preprocess_two(state) # preprocess(state)
        preprocessed_state = preprocessed_state.to(device)
        if pred_model:
            if grad:
                q_val = self.prediction_net(preprocessed_state)
            else:
                with torch.no_grad():
                    q_val = self.prediction_net(preprocessed_state)
        else:
            with torch.no_grad():
                q_val = self.target_net(preprocessed_state)
        return q_val
    
    def copy_pred_to_target(self):
        self.target_net.load_state_dict(self.prediction_net.state_dict())
        self.target_net.eval()

##### Epsilon

In [13]:
class Epsilon():
    def __init__(self, epsilon_start=1, p_init=0.9, p_end=0.05, decay_rate=200, max_episodes=10000, max_steps=10000):
        self.val = epsilon_start
        self.p_init = p_init
        self.p_end = p_end
        self.decay = decay_rate
        self.max_episodes = max_episodes
        self.max_steps = max_steps 

    def linear_epsilon_decay(self, episode_num):
        epsilon_decay_rate = max(((self.max_episodes - episode_num) / self.max_episodes), 0)
        self.val = ((self.p_init - self.p_end) * (epsilon_decay_rate)) + self.p_end

    def quad_epsilon_decay(self, episode_num):
        epsilon_decay_rate = max(math.exp(-1. * episode_num / self.decay), 0)
        self.val = ((self.p_init - self.p_end) * (epsilon_decay_rate)) + self.p_end


##### Preprocess

In [14]:
def preprocess(state):
    
    # convert state to numpy array and then to torch tensor
    frame = torch.from_numpy(np.array(state).astype(np.float32))
    
    # reshape so that grayscaling is possible
    reshaped_frame = frame.reshape(4, 3, 210, 160)
    
    # grayscale image
    gray_frame = T.Grayscale()(reshaped_frame)
    
    # reshape image so network can process it
    reshaped_gray_frame = gray_frame.reshape(1, 4, 210, 160)
    
    # downscale image to 84x84
    small_gray_frame = T.Resize((84, 84))(reshaped_gray_frame)
    
    return small_gray_frame
    

In [15]:
def preprocess_two(state):
    convert_arr = None
    if len(state) > 4:
        # add additional dimension to numpy array, so we can add batch then lower dimensionality
        convert_arr = np.squeeze(np.array(np.expand_dims(state, 0)).astype(np.float32), axis=0)
    else: 
        # add additional dimension to numpy array
        convert_arr = np.array(np.expand_dims(state, 0)).astype(np.float32)
    
    # convert state to numpy array and then to torch tensor
    return torch.from_numpy(convert_arr)
    
    

##### Global Variables

In [16]:
def config_tune() -> None:
    cfg = {
        'method': 'bayes', #grid, random
        'metric': {
            'name': 'Mean Episodic Reward',
            'goal': 'maximize' #minimize, maximize  
        },
        'parameters': {
            'lr': {'distribution': 'uniform',
                                      'min': 0.000001,
                                      'max': 0.001},
            'weight_decay': {'distribution': 'uniform',
                                      'min': 0.1,
                                      'max': 0.99},
            'replay_buffer_size': {
                'value': 1000000    
            },
            'gamma': {
                'value': 0.99    
            },
            'target_freq': {'distribution': 'int_uniform',
                                      'min': 1,
                                      'max': 5000},
            'p_end': {
                'value': 0.01
            },
            'p_init': {
                'value': 0.999
            },
            'epsilon_decay_rate': {'distribution': 'int_uniform',
                                      'min': 1,
                                      'max': 1500},
            'epsilon_policy': {
                'values': ['linear', 'quad']
            },
            'batch_size': {
                'value': 128
            },
            'max_episodes': {
                'value': 100000
            },
            'max_steps': {
                'value': 100000
            }
        }
    }
    
    return cfg

In [17]:
def vanilla_config() -> None:
    cfg = {
        'batch_size': 128,
        'gamma': 0.999,
        'lr': 0.1929,
        'max_episodes': 1000,
        'max_steps': 100000,
        'p_end': 0.05,
        'p_init': 0.9,
        'replay_buffer_size': 1000000,
        'target_freq': 70,
        'weight_decay': 0.5424,
        'epsilon_decay_rate': 23,
        'epsilon_policy': 'linear',
    }
    
    return cfg

##### Buidling Optimizers and Schedulers

In [18]:
def build_optimizer(model, optimizer_name='adam', learning_rate=0.01, weight_decay=0.01, momentum=0.9):
    try:
        optimizer = None
        if optimizer_name == "sgd":
            optimizer = optim.SGD(model.parameters(), 
                                  lr=learning_rate, 
                                  momentum=momentum)
            
        elif optimizer_name == "adam":
            optimizer = optim.Adam(model.parameters(), 
                                   lr=learning_rate, 
                                   weight_decay=weight_decay)
               
        return optimizer
    except:
        print("Error: Invalid optimizer specified.")
        sys.exit(1)

In [19]:
def build_scheduler(optimizer, sched_name='reduce_lr', patience=5, verbose=True):
    try: 
        sched = None
        if sched_name == "reduce_lr":
            sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                         mode='min', 
                                                         patience=patience, 
                                                         verbose=verbose)
        elif sched_name == 'TODO':
            pass
            #TODO: add other scheduler
            
        return sched
    except:
        logger.error("Error: Invalid scheduler specified.")
        sys.exit(1)

##### Training Loop

In [20]:
def train(replay_buffer, the_agent, loss_fn, optimizer, scheduler, gamma=0.95):
    #1. retrieve (s, a, r, s') from mini_batch
    states, actions, next_states, rewards, done = replay_buffer.get_rb_batch()
    
    actions = torch.tensor(actions, dtype = torch.long, device = device)
    rewards = torch.tensor(rewards, dtype = torch.float, device = device).detach()
    done = torch.tensor(done, dtype=torch.int32, device = device).detach()
    
    pred_q_val_matrix = the_agent.agent_policy(states, pred_model=True, grad=True)
    # print(f"action {actions}")
    pred_q_val = torch.gather(pred_q_val_matrix, 1, actions.unsqueeze(1)).squeeze(1)
    # print(f"pred_q_val {pred_q_val}")
    target_q_val_matrix = the_agent.agent_policy(next_states.detach(), pred_model=False, grad=False).detach()
    # print(f"target_q_val_matrix {target_q_val_matrix}")
    target_q_val = torch.max(target_q_val_matrix, dim=1)[0]
    # print(f"target_q_val {target_q_val}")
    #zero_or_one = torch.ones(done.shape, dtype=torch.float, device = device) - done
    y_j = rewards + ((gamma * target_q_val) * (1 - done))
    # print(f"y_j {y_j}")
    loss = loss_fn(pred_q_val, y_j.detach())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return loss

##### Episode Specific Loop

In [21]:
def run_episode(env, the_agent, replay_buffer, epsilon, gamma, loss_fn, optimizer, scheduler):
    step_count = 0
    cumulative_reward = 0
    cumulative_loss = 0
    cumulative_q_val = 0
    
    # 0. get initial state, s_{0}
    state = env.reset(seed=42)
    
    while True:
        #1. either explore or exploit
        action, action_q_val = the_agent.epsilon_greedy(env=env, 
                                          state=state, 
                                          epsilon=epsilon)
        
        #2. pass action to environment
        (next_state, reward, done, info) = env.step(action)
        
        #3. add transition (s, a, s', r) to replay buffer
        replay_buffer.add_to_rb((state, action, next_state, reward, done))
        
        #4. if replay buffer is full, sample mini batch and update model
        if len(replay_buffer.rb) > replay_buffer.mini_batch_size and not epsilon.val <= 0.000001:
            loss = train(replay_buffer, the_agent, loss_fn, optimizer, scheduler, gamma)
            cumulative_loss += loss
            
        
        cumulative_reward += reward
        cumulative_q_val += action_q_val
        state = next_state
        
        #5. check max number of time steps has been reached or if game is complete
        if step_count >= epsilon.max_steps or done:
            step_count += 1
            return cumulative_loss, cumulative_reward, cumulative_q_val, step_count, info
        
        step_count += 1
        
        

##### Episodic Loop  

In [22]:
def run():
    cfg = vanilla_config()
    with wandb.init(project="BeeGass-Agents", entity="beegass", config=cfg, monitor_gym=True, mode="offline"):
        config = wandb.config
        
        # initialize gym environment
        env = make_env(env_name='ALE/Breakout-v5', seed=42)
        
        # set values for epsilon 
        epsilon = Epsilon(epsilon_start=1, 
                          p_init=config.p_init, 
                          p_end=config.p_end, 
                          decay_rate=config.epsilon_decay_rate, 
                          max_episodes=config.max_episodes, 
                          max_steps=config.max_steps)
        
        eps_policy = config.epsilon_policy
        
        # initialize prediction network
        #pred_net = Deep_Q_Network(environment.le_env.action_space.n).to(device)
        pred_net = DQN(4, 4).to(device)
        target_net = DQN(4, 4).to(device)
        
        # initialize agent that contains both prediction network and target network
        the_agent = Agent(pred_model=pred_net, target_model=target_net)
        the_agent.copy_pred_to_target()
        
        # define loss function
        loss_fn = nn.SmoothL1Loss() #nn.HuberLoss(reduction='mean', delta=config.delta)
        
        # define optimizer
        optimizer = build_optimizer(model=the_agent.prediction_net, 
                                    optimizer_name='adam', 
                                    learning_rate=config.lr,
                                    weight_decay=config.weight_decay)
        
        # define scheduler
        scheduler = build_scheduler(optimizer, 
                                    sched_name='reduce_lr', 
                                    patience=5, 
                                    verbose=True)
        
        # initialize replay buffer
        replay_buffer = Replay_Buffer(capacity=config.replay_buffer_size, mini_batch_size=config.batch_size)
        
        episode_cumulative_reward = 0
        episode_cumulative_loss = 0 
        episode_cumulative_q_val = 0
        total_steps = 0
        
        for e in range(epsilon.max_episodes):
            
            # 1. iterate over steps in episode
            cumulative_loss, cumulative_reward, cumulative_q_val, step_count, episode_info  = run_episode(env=env, 
                                                                                                          the_agent=the_agent,
                                                                                                          replay_buffer=replay_buffer, 
                                                                                                          epsilon=epsilon,
                                                                                                          gamma=config.gamma,
                                                                                                          loss_fn=loss_fn,
                                                                                                          optimizer=optimizer, 
                                                                                                          scheduler=scheduler)
            
            env.close()
            
            # 3. decay epsilon
            # epsilon = config.decay_rate * epsilon
            if eps_policy == "linear":
                epsilon.linear_epsilon_decay(e+1)
            else:
                epsilon.quad_epsilon_decay(e+1)
                
            if e % config.target_freq == 0:
                the_agent.copy_pred_to_target()
            
            if not e+1 <= 5:
                episode_cumulative_reward += cumulative_reward
                episode_cumulative_loss += cumulative_loss
                episode_cumulative_q_val += cumulative_q_val
                total_steps += step_count
                
                wandb.log({"episode": e, "Episode Info": episode_info}, step=e)
                wandb.log({"episode": e, "Mean Episodic Action Value ": episode_cumulative_q_val/(e+1)}, step=e)
                wandb.log({"episode": e, "Mean Episodic Reward": (episode_cumulative_reward/(e+1))}, step=e)
                wandb.log({"episode": e, "Reward Per Episode": cumulative_reward}, step=e)
                wandb.log({"episode": e, "Step Count": step_count}, step=e)
                wandb.log({"episode": e, "Total Step Count": total_steps}, step=e)
                wandb.log({"episode": e, "Loss Per Episode": cumulative_loss}, step=e)
                wandb.log({"episode": e, "Mean Episodic Loss": episode_cumulative_loss/(e+1)}, step=e)
                wandb.log({"episode": e, "Epsilon": epsilon.val}, step=e)
                
            if (episode_cumulative_loss/(e+1)) >= 1000:
                break 

In [23]:
# run()

In [24]:
sweep_id = wandb.sweep(config_tune(), project="dqn-sweeps")

Create sweep with ID: c1dd5f8z
Sweep URL: https://wandb.ai/beegass/dqn-sweeps/sweeps/c1dd5f8z


In [25]:
wandb.agent(sweep_id, run, count=12)

[34m[1mwandb[0m: Agent Starting Run: zsalnpxe with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epsilon_decay_rate: 1212
[34m[1mwandb[0m: 	epsilon_policy: linear
[34m[1mwandb[0m: 	gamma: 0.99
[34m[1mwandb[0m: 	lr: 0.0002201643888647923
[34m[1mwandb[0m: 	max_episodes: 100000
[34m[1mwandb[0m: 	max_steps: 100000
[34m[1mwandb[0m: 	p_end: 0.01
[34m[1mwandb[0m: 	p_init: 0.999
[34m[1mwandb[0m: 	replay_buffer_size: 1000000
[34m[1mwandb[0m: 	target_freq: 4280
[34m[1mwandb[0m: 	weight_decay: 0.8803563469687061


A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]
  deprecation(
  deprecation(
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.





VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Epsilon,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
Loss Per Episode,▂▁▂▃█▃▂▃▂▂▃▃▂▃▂▂▃▄▂▂▂▂▃▂▂▂█▂▃▃▃▂▃▃▃▃▂▂▃▂
Mean Episodic Action Value,▂▁▁▁▁▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██
Mean Episodic Loss,▁▁▂▃▄▅▆▇▇▇▇▇▇███████████████████████████
Mean Episodic Reward,█▃▂▂▁▃▃▄▄▃▃▃▃▃▃▃▂▂▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
Reward Per Episode,▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁
Step Count,▂▁▁▂█▁▁▁▁▁▂▂▁▂▁▁▁▃▁▁▁▁▂▁▁▁█▁▂▃▂▁▁▁▃▃▁▁▂▁
Total Step Count,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
Epsilon,0.97318
Loss Per Episode,2.14172
Mean Episodic Action Value,0.72054
Mean Episodic Loss,2.52398
Mean Episodic Reward,0.19954
Reward Per Episode,0.0
Step Count,29.0
Total Step Count,90680.0
episode,2610.0
