Import Torch Packages

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.optim as optim

#### Import Gym Packages

In [2]:
from ale_py import ALEInterface
# Create a standalone Arcade Learning Environment interface; a ROM is loaded into it in the next cell.
ale = ALEInterface()

A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]


In [3]:
from ale_py.roms import Breakout
# Load the Breakout ROM into the ALE interface created above.
ale.loadROM(Breakout)

Game console created:
  ROM file:  /home/beegass/.virtualenvs/dl_1/lib/python3.8/site-packages/AutoROM/roms/breakout.bin
  Cart Name: Breakout - Breakaway IV (1978) (Atari)
  Cart MD5:  f34f08e5eb96e500e851a80be3277a56
  Display Format:  AUTO-DETECT ==> NTSC
  ROM Size:        2048
  Bankswitch Type: AUTO-DETECT ==> 2K

Running ROM file...
Random seed is 1647638730


In [4]:
import gym
from gym.wrappers import FrameStack

#### All Other Packages

In [5]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange
import random
import copy
import typing
from typing import Callable
import PIL 
from PIL import Image
from abc import ABC, abstractmethod
from loguru import logger

  def _figure_formats_changed(self, name, old, new):


In [6]:
# Global compute device: the first CUDA GPU when PyTorch reports one, otherwise the CPU.
# Read by Agent.agent_policy (to move states) and run (to move the network).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # check if GPU is available

##### The Meat And Potatoes 

In [7]:
class Gym_Env():
    """Bundle a frame-stacked Gym environment with the run's step/episode limits."""

    def __init__(self, env_name, max_steps=1000, max_episodes=10000):
        """Create the environment and remember the training limits.

        Args:
            env_name: Gym environment id forwarded to ``gym.make``.
            max_steps: Step limit stored on the instance for the episode loop.
            max_episodes: Episode limit stored on the instance for the outer loop.
        """
        # Limits are plain attributes; the training loops read them directly.
        self.max_episodes = max_episodes
        self.max_steps = max_steps

        # Build the raw environment first, then wrap it so each observation
        # is a stack of the 4 most recent frames.
        base_env = gym.make(env_name, render_mode='rgb_array')
        self.le_env = FrameStack(base_env, 4)
        

In [8]:
class Replay_Buffer():
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity, mini_batch_size=128):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions kept; the oldest one is
                dropped when a new transition arrives at capacity.
            mini_batch_size: Number of transitions requested by ``sample_rb``.
        """
        self.rb = []  # stored transitions, oldest first
        self.capacity = capacity
        self.mini_batch_size = mini_batch_size
        self.current_batch = None  # kept for compatibility; not used by this class

    def sample_rb(self):
        """Return a list of distinct transitions drawn uniformly at random.

        At most ``mini_batch_size`` transitions are returned; if the buffer
        holds fewer than that, every stored transition is returned (in random
        order) instead of raising.
        """
        # random.sample's size argument is named ``k`` (there is no
        # ``batch_size`` keyword), and k must not exceed the population size.
        sample_size = min(self.mini_batch_size, len(self.rb))
        return random.sample(self.rb, sample_size)

    def add_to_rb(self, new_transition):
        """Append a transition, evicting the oldest one when at capacity."""
        if len(self.rb) >= self.capacity:
            del self.rb[0]
        self.rb.append(new_transition)

In [9]:
# ripped from https://nn.labml.ai/rl/dqn/index.html
class Deep_Q_Network(nn.Module):
    """Dueling Q-network: conv feature extractor + separate value/advantage heads.

    The flatten size (7 * 7 * 64) is what the three convolutions produce for
    84x84 inputs, so inputs are expected to be ``(batch, in_channels, 84, 84)``.
    """

    def __init__(self, num_actions=4, in_channels=4):
        """Build the network.

        Args:
            num_actions: Number of discrete actions, i.e. the width of the
                returned Q-value vector. Defaults to 4 (the previous
                hard-coded value).
            in_channels: Number of stacked frames fed to the first
                convolution. Defaults to 4 (the previous hard-coded value).
        """
        super(Deep_Q_Network, self).__init__()

        # Convolutional feature extractor.
        self.network = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        # Shared fully connected layer on the flattened conv features.
        self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)
        self.activation = nn.ReLU()

        # Head producing one scalar state value V(s) per sample.
        self.state_value = nn.Sequential(
            nn.Linear(in_features=512, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=1)
            )

        # Head producing one score per action.
        self.action_value = nn.Sequential(
            nn.Linear(in_features=512, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_actions),
        )

    def forward(self, x):
        """Return Q-values of shape ``(batch, num_actions)`` for input ``x``."""
        x = self.network(x)
        x = x.reshape((-1, 7 * 7 * 64))
        x = self.activation(self.lin(x))
        action_value = self.action_value(x)
        state_value = self.state_value(x)
        # Centre the action scores so they have zero mean across actions;
        # the state value then carries the mean Q-value.
        action_score_centered = action_value - action_value.mean(dim=-1, keepdim=True)
        q = state_value + action_score_centered
        return q
        
    

In [10]:
class Agent(nn.Module):
    """Holds the prediction network and an optional frozen copy used as target."""

    def __init__(self, pred_model):
        """Store ``pred_model`` as the prediction network; no target exists yet."""
        super(Agent, self).__init__()
        self.agent = pred_model
        self.target = None  # populated by copy_pred_to_target()

    def agent_policy(self, state, pred_model=True, grad=False):
        """Evaluate Q-values for ``state`` and pick the greedy action.

        Args:
            state: Input tensor for the network; it is moved to the global
                ``device`` before the forward pass.
            pred_model: When True use the prediction network, otherwise the
                target network.
            grad: Only honoured for the prediction network; when True the
                forward pass tracks gradients. The target network is always
                evaluated without gradients.

        Returns:
            ``(argmax, q_val)`` where ``q_val`` is the network output and
            ``argmax`` is ``torch.argmax(q_val)`` over the flattened output.

        Raises:
            RuntimeError: If the target network is requested before
                ``copy_pred_to_target`` has been called.
        """
        # Fail early with a clear message instead of "'NoneType' object is
        # not callable" when no target copy exists yet.
        if not pred_model and self.target is None:
            raise RuntimeError(
                "Target network is not initialised; call copy_pred_to_target() first."
            )

        state = state.to(device)
        if pred_model:
            # Gradients are tracked only when the caller asks for them.
            with torch.set_grad_enabled(bool(grad)):
                q_val = self.agent(state)
        else:
            with torch.no_grad():
                q_val = self.target(state)
        return torch.argmax(q_val), q_val

    def copy_pred_to_target(self):
        """Replace the target network with a deep copy of the prediction network."""
        self.target = copy.deepcopy(self.agent)
    

##### Epsilon

In [11]:
def epsilon_greedy(environment, the_agent, state, epsilon=0.1):
    """Choose an action: greedy with probability ``1 - epsilon``, else random.

    Args:
        environment: Object exposing ``le_env.action_space.sample()``.
        the_agent: Object exposing ``agent_policy(state)`` that returns
            ``(best_action, q_values)``.
        state: Current state, forwarded unchanged to ``agent_policy``.
        epsilon: Probability of taking a random (exploratory) action.

    Returns:
        The chosen action. The greedy action is converted to a plain ``int``
        so both branches hand the environment a Python scalar rather than
        (in the greedy case) a 0-dim tensor.
    """
    if random.random() < 1 - epsilon:
        # Exploit: the greedy action according to the prediction network.
        best_action, _ = the_agent.agent_policy(state)
        return int(best_action)
    # Explore: only sample a random action when it is actually used.
    return environment.le_env.action_space.sample()

In [12]:
def epsilon_decay(environment, episode_num, p_init=0.7, p_end=0.1):
    """Linearly interpolate epsilon from ``p_init`` towards ``p_end``.

    Args:
        environment: Object exposing ``max_episodes``.
        episode_num: Zero-based index of the episode that just finished.
        p_init: Value approached when no episodes have elapsed.
        p_end: Value returned once ``max_episodes`` episodes have elapsed.

    Returns:
        ``p_end`` plus the remaining fraction of the ``p_init - p_end`` span.
    """
    total_episodes = environment.max_episodes

    # Fraction of the schedule still ahead, counting this episode as done.
    remaining_fraction = (total_episodes - (episode_num + 1)) / total_episodes
    if remaining_fraction < 0:
        # Past the end of the schedule: stay at the floor.
        remaining_fraction = 0

    span = p_init - p_end
    return span * remaining_fraction + p_end

##### Preprocess

In [13]:
def preprocess(state):
    """Turn a stack of RGB frames into one grayscale 84x84 network input.

    Args:
        state: Array-like stack of RGB frames, either channel-last
            ``(frames, H, W, 3)`` or channel-first ``(frames, 3, H, W)``.
            # NOTE(review): the FrameStack-wrapped Atari env is expected to
            # yield channel-last (4, 210, 160, 3) observations — confirm.

    Returns:
        Float32 tensor of shape ``(1, frames, 84, 84)``.

    Raises:
        ValueError: If ``state`` is not a 4-D stack of 3-channel frames.
    """
    # convert state to numpy array and then to torch tensor
    frames = torch.from_numpy(np.array(state).astype(np.float32))

    if frames.ndim != 4:
        raise ValueError(f"expected a 4-D stack of RGB frames, got shape {tuple(frames.shape)}")

    # Grayscale needs channel-first input (..., 3, H, W). Moving the channel
    # axis must be done with permute: a plain reshape keeps the memory order
    # and would interleave colour channels with pixel columns.
    if frames.shape[-1] == 3:
        frames = frames.permute(0, 3, 1, 2)
    elif frames.shape[1] != 3:
        raise ValueError(f"expected 3 colour channels, got shape {tuple(frames.shape)}")

    # grayscale each frame: (frames, 3, H, W) -> (frames, 1, H, W)
    gray_frames = T.Grayscale()(frames)

    # treat the frame stack as the channel axis of a single sample:
    # (frames, 1, H, W) -> (1, frames, H, W)
    stacked_frames = gray_frames.permute(1, 0, 2, 3)

    # downscale image to 84x84
    return T.Resize((84, 84))(stacked_frames)
    

##### Rendering

In [14]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

In [15]:
def show_state(img, step=0, info=""):
    """Render the RGB frame stored under ``img['rgb']`` in the notebook output.

    Args:
        img: Mapping that holds the frame under the ``'rgb'`` key. ``None`` or
            a mapping without that key is ignored, so the call is safe before
            the first environment step has produced any metadata.
        step: Step counter shown in the figure title.
        info: Optional extra text appended to the figure title.
    """
    if img is None or 'rgb' not in img:
        return

    # Clear the previous frame first. Calling display(plt.gcf()) after
    # plt.show() would display a new, empty figure, because the inline
    # backend closes the figure once it has been shown.
    display.clear_output(wait=True)

    plt.figure(3)
    plt.clf()
    plt.imshow(img['rgb'])
    plt.title(f"step: {step} {info}")
    plt.axis('off')
    plt.show()

##### Global Variables

In [16]:
# Number of episodes the outer training loop runs (passed to Gym_Env as max_episodes).
MAX_EPISODES = 10000
# Per-episode step limit (passed to Gym_Env as max_steps).
MAX_STEPS = 10000
# Capacity of the replay buffer created in run_episode.
REPLAY_BUFFER_SIZE = 100000
# Number of transitions drawn per call to Replay_Buffer.sample_rb.
MINI_BATCH_SIZE = 128
# Default exploration rate used as the initial epsilon of run().
EPSILON = 0.1
# Discount factor applied to the target network's Q-value in train().
GAMMA = 0.99

##### Building Optimizers and Schedulers

In [17]:
def build_optimizer(model, optimizer_name='adam', learning_rate=0.01, weight_decay=0.01, momentum=0.9):
    """Create an optimizer over ``model.parameters()``.

    Args:
        model: Module whose parameters will be optimised.
        optimizer_name: ``"sgd"`` or ``"adam"``.
        learning_rate: Learning rate for either optimizer.
        weight_decay: Weight decay; only applied to Adam.
        momentum: Momentum; only applied to SGD.

    Returns:
        The constructed ``torch.optim`` optimizer.

    Raises:
        ValueError: If ``optimizer_name`` is not recognised. Previously an
            unknown name silently returned ``None``, and the bare ``except``
            referenced ``sys`` without importing it.
    """
    if optimizer_name == "sgd":
        return optim.SGD(model.parameters(),
                         lr=learning_rate,
                         momentum=momentum)

    if optimizer_name == "adam":
        return optim.Adam(model.parameters(),
                          lr=learning_rate,
                          weight_decay=weight_decay)

    raise ValueError(f"Error: Invalid optimizer specified: {optimizer_name!r}")

In [18]:
def build_scheduler(optimizer, sched_name='reduce_lr', patience=5, verbose=True):
    """Create a learning-rate scheduler for ``optimizer``.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        sched_name: ``"reduce_lr"`` builds a ``ReduceLROnPlateau`` in ``min``
            mode. ``"TODO"`` is a placeholder for a future scheduler and
            yields ``None``.
        patience: Forwarded to ``ReduceLROnPlateau``.
        verbose: Forwarded to ``ReduceLROnPlateau``.
            # NOTE(review): newer PyTorch releases appear to deprecate this
            # argument — confirm against the installed version.

    Returns:
        The scheduler, or ``None`` for the ``"TODO"`` placeholder.

    Raises:
        ValueError: If ``sched_name`` is not recognised. Previously an unknown
            name silently returned ``None``, and the bare ``except``
            referenced ``sys`` without importing it.
    """
    if sched_name == "reduce_lr":
        return optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                    mode='min',
                                                    patience=patience,
                                                    verbose=verbose)

    if sched_name == 'TODO':
        #TODO: add other scheduler
        return None

    raise ValueError(f"Error: Invalid scheduler specified: {sched_name!r}")

##### Training Loop

In [19]:
def train(replay_buffer, the_agent, optimizer, scheduler):
    """Run one gradient step per transition of a sampled mini-batch.

    For every transition ``(state, action, next_state, reward, done)`` the
    prediction network's Q-value of the action actually taken is regressed
    towards ``reward + GAMMA * max_a' Q_target(next_state, a')`` (just
    ``reward`` for terminal transitions).

    Args:
        replay_buffer: Object exposing ``sample_rb()``.
        the_agent: Agent providing ``agent_policy`` and ``copy_pred_to_target``.
        optimizer: Optimizer over the prediction network's parameters.
        scheduler: Accepted for interface compatibility; not stepped here.
    """
    mini_batch = replay_buffer.sample_rb()

    #1. copy weights from prediction network to target network
    the_agent.copy_pred_to_target()

    # nn.MSELoss is a module: build it once, then call it on (input, target).
    # Passing the tensors to the constructor does not compute a loss.
    loss_fn = nn.MSELoss()

    #2. retrieve (s, a, s', r, done) from mini_batch
    for state, action, next_state, reward, done in mini_batch:
        # Q-values of the prediction network, with gradients enabled.
        _, pred_q_values = the_agent.agent_policy(state, grad=True)
        # Only the Q-value of the action that was taken is trained.
        # Assumes one state per transition, i.e. q-values of shape (1, actions).
        pred_q = pred_q_values.reshape(-1)[int(action)]

        #3. build the target y_j on the same device/dtype as the prediction
        y_j = torch.tensor(float(reward), dtype=pred_q.dtype, device=pred_q.device)
        if not done:
            _, target_q_values = the_agent.agent_policy(next_state, pred_model=False)
            # Bootstrap from the best next action under the target network.
            y_j = y_j + GAMMA * target_q_values.max()

        loss = loss_fn(pred_q, y_j.detach())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Drop the temporary target copy until the next call.
    the_agent.target = None

##### Episode Specific Loop

In [20]:
def run_episode(render, environment, the_agent, state, epsilon, optimizer, scheduler, replay_buffer=None):
    """Play one episode, storing transitions and training when the buffer is full.

    Args:
        render: When True, show the latest frame before each step.
        environment: ``Gym_Env`` wrapper exposing ``le_env`` and ``max_steps``.
        the_agent: Agent used for action selection and training.
        state: Preprocessed initial state of the episode.
        epsilon: Exploration probability for ``epsilon_greedy``.
        optimizer: Forwarded to ``train``.
        scheduler: Forwarded to ``train``.
        replay_buffer: Optional buffer to reuse across episodes. When omitted
            a fresh buffer is created, as before. Note that a fresh buffer
            only triggers training if its capacity can be reached within a
            single episode; pass a shared buffer to accumulate transitions
            across episodes.
    """
    if replay_buffer is None:
        replay_buffer = Replay_Buffer(capacity=REPLAY_BUFFER_SIZE, mini_batch_size=MINI_BATCH_SIZE)

    step_count = 0
    metadata = None
    while True:

        #0. either explore or exploit
        action = epsilon_greedy(environment=environment,
                                the_agent=the_agent,
                                state=state,
                                epsilon=epsilon)
        # The greedy branch may hand back a 0-dim tensor; store and step
        # with a plain Python int in either case.
        action = int(action)

        #1. render environment only when flag is set to True.
        #   No metadata exists before the first step, so skip that frame.
        if render and metadata is not None:
            show_state(metadata, step=step_count)

        #2. pass action to environment
        (next_state, reward, done, metadata) = environment.le_env.step(action)

        #3. get s' back from environment and preprocess (s' -> preprocessed_s')
        preprocessed_next_state = preprocess(next_state)

        #4. add transition (s, a, s', r, done) to replay buffer
        replay_buffer.add_to_rb((state, action, preprocessed_next_state, reward, done))

        #5. if replay buffer is full, sample mini batch and update model
        if len(replay_buffer.rb) >= replay_buffer.capacity:
            train(replay_buffer, the_agent, optimizer, scheduler)

        #6. count the step just taken, then stop at the step limit or game end
        #   (counting first keeps the episode at exactly max_steps steps)
        step_count += 1
        if done or step_count >= environment.max_steps:
            break

        state = preprocessed_next_state
        
        

##### Episodic Loop  

In [21]:
def run(epsilon=EPSILON, render_every=None, log_every=100):
    """Train the agent on ``ALE/Breakout-v5`` for ``MAX_EPISODES`` episodes.

    Args:
        epsilon: Exploration probability for the first episode; afterwards it
            is replaced by ``epsilon_decay`` after every episode.
        render_every: Render every N-th episode. Defaults to ``None`` (never),
            which matches the previous effective behaviour: the old condition
            ``e + 1 % 100 == 0`` parsed as ``e + (1 % 100) == 0`` and was
            never true.
        log_every: Log a progress message every N-th episode (falsy disables).
    """
    # initialize gym environment
    environment = Gym_Env(env_name='ALE/Breakout-v5', max_steps=MAX_STEPS, max_episodes=MAX_EPISODES)

    # initialize prediction network
    pred_net = Deep_Q_Network().to(device)

    # initialize agent that contains both prediction network and target network
    the_agent = Agent(pred_model=pred_net)

    # define optimizer
    optimizer = build_optimizer(model=the_agent.agent,
                                optimizer_name='adam',
                                learning_rate=0.01,
                                weight_decay=0.01,
                                momentum=0.9)

    # define scheduler
    scheduler = build_scheduler(optimizer,
                                sched_name='reduce_lr',
                                patience=5,
                                verbose=True)

    try:
        for e in range(environment.max_episodes):
            episode_number = e + 1

            # Parenthesised on purpose: % binds tighter than +.
            render_flag = bool(render_every) and episode_number % render_every == 0

            # 0. get initial state, s_{0}, and preprocess it (s_{0} -> preprocessed_s)
            state = environment.le_env.reset()

            # 0.1: preprocess state
            preprocessed_state = preprocess(state)

            # 1. iterate over steps in episode
            run_episode(render=render_flag,
                        environment=environment,
                        the_agent=the_agent,
                        state=preprocessed_state,
                        epsilon=epsilon,
                        optimizer=optimizer,
                        scheduler=scheduler)

            # 2. report progress
            if log_every and episode_number % log_every == 0:
                logger.info(f"Episode {e+1} complete.")

            # 3. decay epsilon
            epsilon = epsilon_decay(environment=environment, episode_num=e)
    finally:
        # Close the environment once, after the last episode (or on
        # interruption), instead of closing it inside the loop and then
        # resetting a closed environment.
        environment.le_env.close()

    logger.info("Training complete.")

In [22]:
run()

A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]
  deprecation(
  deprecation(


KeyboardInterrupt: 