Import Torch Packages

In [182]:
import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.optim as optim

#### Import Gym Packages

In [183]:
import gym
from gym.wrappers import FrameStack

#### All Other Packages

In [184]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange
import random
import copy
import typing
from typing import Callable
import PIL 
from PIL import Image
from abc import ABC, abstractmethod

##### The Meat And Potatoes 

In [185]:
class Gym_Env():
    def __init__(self, env_name, max_steps=1000, max_episodes=10000):
        self.env = gym.make(env_name)
        self.max_steps = max_steps
        self.max_episodes = max_episodes
        

In [186]:
class Replay_Buffer():
    def __init__(self, capacity, mini_batch_size):
        self.rb = []
        self.capacity = capacity
        self.mini_batch_size = mini_batch_size
        self.current_batch = None

    def sample_rb(self):
        self.current_batch = random.sample(self.rb, batch_size=self.mini_batch_size)
    
    def add_to_rb(self, new_transition):
        if len(self.rb) >= self.capacity:
            del self.rb[0] 
        self.rb.append(new_transition)

In [187]:
class Deep_Q_Network(nn.Module):
    def __init__(self):
        super(Deep_Q_Network, self).__init__()
        
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=5, stride=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=5, stride=2),
            nn.BatchNorm2d(32),
            nn.ReLU()
        )
    
    def forward(self, x):
        return self.network(x)
    

In [188]:
class Agent(nn.Module):
    def __init__(self, pred_model):
        super(Agent, self).__init__()
        self.agent = pred_model
        self.target = None
        
    def get_action_val(self, state, no_grad=True):
        if no_grad:
            with torch.no_grad():
                q_val = self.agent(state)
        else:
            q_val = self.agent(state)
        return torch.argmax(q_val), q_val
    
    def copy_pred_to_target(self):
        self.target = copy.deepcopy(self.agent)
    

In [189]:
resize = T.Compose([T.ToPILImage(),
                            T.Resize(40, interpolation = Image.CUBIC),
                            T.ToTensor()])

def get_cart_location(screen_width, state, environment):
    world_width = environment.env.x_threshold * 2
    scale = screen_width / world_width
    # return middle/center location of the cart body
    return int(state * scale + screen_width / 2.0)

def get_screen(state, environment):
    screen = environment.env.render(mode='rgb_array').transpose((2, 0, 1))
    # crop image as cart is in lower half, top and bottom screen unimportant
    _, screen_height, screen_width = screen.shape
    screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)]
    view_width = int(screen_width * 0.6)
    cart_location = get_cart_location(screen_width, state, environment)
    if cart_location < view_width // 2:
        slice_range = slice(view_width)
    elif cart_location > (screen_width - view_width // 2):
        slice_range = slice(-view_width, None)
    else:
        slice_range = slice(cart_location - view_width // 2,
                            cart_location + view_width // 2)
    # crop so that we have square image centered on cart
    screen = screen[:, :, slice_range]
    # convert to float, rescale and conver 
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    # convert to tensor
    screen = torch.from_numpy(screen)
    # screen capture is of size 3x160x360
    # insert singleton dim at index 0 
    # to change size to 1x3x40x90
    # size follows from (160) or (360) / (1 img + 3 channels)
    return resize(screen).unsqueeze(0)

##### Global Variables

In [190]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # check if GPU is available
MAX_EPISODES = 10000
MAX_STEPS = 10000
REPLAY_BUFFER_SIZE = 5000
MINI_BATCH_SIZE = 36
EPSILON = 0.95

##### Buidling Optimizers and Schedulers

In [191]:
def build_optimizer(model, optimizer_name='adam', learning_rate=0.01, weight_decay=0.01, momentum=0.9):
    try:
        optimizer = None
        if optimizer_name == "sgd":
            optimizer = optim.SGD(model.parameters(), 
                                  lr=learning_rate, 
                                  momentum=momentum)
            
        elif optimizer_name == "adam":
            optimizer = optim.Adam(model.parameters(), 
                                   lr=learning_rate, 
                                   weight_decay=weight_decay)
               
        return optimizer
    except:
        print("Error: Invalid optimizer specified.")
        sys.exit(1)

In [192]:
def build_scheduler(optimizer, sched_name='reduce_lr', patience=5, verbose=True):
    try: 
        sched = None
        if sched_name == "reduce_lr":
            sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                         mode='min', 
                                                         patience=patience, 
                                                         verbose=verbose)
        elif sched_name == 'TODO':
            pass
            #TODO: add other scheduler
            
        return sched
    except:
        print("Error: Invalid scheduler specified.")
        sys.exit(1)

##### Training Loop

In [193]:
def get_best_q_val(state, is_pred=True, no_grad=False):
    if is_pred:
        action_val, q_val_arr  = the_agent.agent.get_action_val(state, no_grad=no_grad)
    else: 
        action_val, q_val_arr  = the_agent.target.get_action_val(state)
    return pred_q_val_arr[pred_action_val]

In [194]:
def train(replay_buffer, the_agent, loss_fn, optimizer, scheduler):
    replay_buffer.sample_rb()
    mini_batch = replay_buffer.current_batch\
    
    #1. copy weights from pred to target
    the_agent.copy_pred_to_target()
    
    #2. retrieve (s, a, r, s') from mini_batch
    for transition in mini_batch:
        (state, action, next_state, reward, done) = transition
        pred_highest_q_val = get_best_q_val(state)
        y_j = torch.FloatTensor([reward])
        if not done:
            target_highest_q_val = get_best_q_val(state, is_pred=False, no_grad=True)
            y_j += GAMMA * target_highest_q_val
        loss = loss_fn(pred_highest_q_val, y_j.detach())
        loss.backward()
        optimizer.step() 
    the_agent.target = None

In [195]:
def run_episode(environment, the_agent, loss_fn, optimizer, scheduler):
    replay_buffer = Replay_Buffer(capacity=REPLAY_BUFFER_SIZE, mini_batch_size=MINI_BATCH_SIZE)
    pred_agent = the_agent.agent
    #0. get initial state, s_{0}, and preprocess it (s_{0} -> preprocessed_s)
    state = environment.env.reset()
    state = get_screen(state, environment)
    action = 0
    next_state = state
    reward = 0
    done = False
    
    episode_active = True
    step_count = 0
    while episode_active and (step_count < environment.max_steps):
        random_action_prob = random.uniform(0.0, 1.0)
        if random_action_prob < EPSILON:
            action = environment.env.action_space.sample()
        else:
            action, _ = pred_agent.get_action_val(state)
        
        #1. pass action to environment
        next_state, reward, done, _ = environment.env.step(action)
        
        #2. get s' back from environment and preprocess (s' -> preprocessed_s')
        next_state = get_screen(next_state, environment)
        
        #3. add transition (s, a, s', r) to replay buffer
        replay_buffer.add_to_rb((state, action, next_state, reward, done))
        state = next_state
        
        #4. if replay buffer is full, sample mini batch and update model
        if len(replay_buffer.rb) == replay_buffer.capacity:
            train(replay_buffer, the_agent, loss_fn, optimizer, scheduler)
        
        #5. check max number of time steps has been reached or if game is complete
        if step_count >= environment.max_steps or done:
            episode_active = False
            environment.env.reset()
            
        step_count += 1
        
        

In [196]:
def run():
    # initialize variables for epsilon decay
    p_init = 0.7
    p_end = 0.1
    
    # initialize gym environment
    environment = Gym_Env(env_name='CartPole-v1', max_steps=MAX_STEPS, max_episodes=MAX_EPISODES)
    
    # initialize prediction network
    pred_net = Deep_Q_Network()
    
    # initialize agent that contains both prediction network and target network
    the_agent = Agent(pred_model=pred_net)
    
    # define loss function
    loss_fn = nn.MSELoss()
    
    # define optimizer
    optimizer = build_optimizer(model=the_agent.agent, 
                                optimizer_name='adam', 
                                learning_rate=0.01, 
                                weight_decay=0.01, 
                                momentum=0.9)
    
    # define scheduler
    scheduler = build_scheduler(optimizer, 
                                sched_name='reduce_lr', 
                                patience=5, 
                                verbose=True)
    
    episode = 0 # episode counter
    for e in trange(environment.max_episodes):
        run_episode(environment, the_agent, loss_fn, optimizer, scheduler)
        episode = e + 1
        current_episode_rate = (environment.max_episodes - episode) / environment.max_episodes
        epsilon_decay_rate = max(current_episode_rate, 0)
        EPSILON = ((p_init - p_end) * epsilon_decay_rate) + p_end

In [197]:
run()

  0%|          | 0/10000 [00:00<?, ?it/s]


TypeError: only size-1 arrays can be converted to Python scalars