Import Torch Packages

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

#### Import Gym Packages

In [None]:
import gym
from gym.wrappers import FrameStack

#### All Other Packages

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange
import random
import copy
import typing
from typing import Callable 
from abc import ABC, abstractmethod

In [None]:
class Gym_Env():
    """Bundle a Gym environment together with the run limits used for training.

    Attributes:
        env: The environment object returned by ``gym.make(env_name)``.
        max_steps: Step limit stored for each episode.
        max_episodes: Episode limit stored for the whole run.
    """

    def __init__(self, env_name, max_steps=1000, max_episodes=10000):
        # Record the limits first, then build the environment itself.
        self.max_episodes = max_episodes
        self.max_steps = max_steps
        self.env = gym.make(env_name)
        

In [None]:
class Transition():
    """A single environment step stored in the replay buffer.

    Attributes:
        state: Observation before the action was taken.
        action: Action that was taken.
        reward: Reward received for the action.
        next_state: Observation after the action was taken.
        game_complete: The ``done`` flag passed to the constructor.
        transition: Tuple ``(state, action, next_state, reward)``; rebuilt
            whenever ``change_state`` replaces the next state.
    """

    def __init__(self, state, action, next_state, reward, done):
        self.state = state
        self.action = action
        self.reward = reward
        self.next_state = next_state
        self.game_complete = done
        self.transition = (self.state, self.action, self.next_state, self.reward)

    def change_state(self, state):
        """Replace the stored next state with ``state``.

        Args:
            state: The new next-state value.
        """
        self.next_state = state
        # Keep the cached tuple in sync so it never reports a stale next state.
        self.transition = (self.state, self.action, self.next_state, self.reward)

In [None]:
class Replay_Buffer():
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Attributes:
        rb: List of stored transitions, oldest first.
        capacity: Maximum number of transitions kept in ``rb``.
        mini_batch_size: Number of transitions drawn by ``sample_rb``.
        current_batch: Result of the most recent ``sample_rb`` call, or
            ``None`` if nothing has been sampled yet.
    """

    def __init__(self, capacity, mini_batch_size):
        self.rb = []
        self.capacity = capacity
        self.mini_batch_size = mini_batch_size
        self.current_batch = None

    def sample_rb(self):
        """Draw ``mini_batch_size`` distinct transitions into ``current_batch``.

        Raises:
            ValueError: If the buffer holds fewer than ``mini_batch_size``
                transitions (raised by ``random.sample``).
        """
        # random.sample takes the sample size as ``k``; ``batch_size`` is not
        # a valid keyword and raised TypeError on every call.
        self.current_batch = random.sample(self.rb, k=self.mini_batch_size)

    def add_to_rb(self, new_transition):
        """Append a transition, evicting the oldest one when the buffer is full.

        Args:
            new_transition: The item to store.
        """
        if len(self.rb) >= self.capacity:
            del self.rb[0]
        self.rb.append(new_transition)

In [None]:
class Deep_Q_Network(nn.Module):
    """Convolutional network used as the prediction model.

    Three ``Conv2d -> BatchNorm2d -> ReLU`` stages (kernel 5, stride 2) that
    take a 3-channel input and produce a 32-channel feature map.
    """

    def __init__(self):
        # The original named a non-existent class in super(), which raised
        # NameError on construction.
        super().__init__()

        # NOTE(review): the stack ends on a convolutional feature map; there is
        # no flatten/linear head mapping features to per-action values here.
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=5, stride=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=5, stride=2),
            nn.BatchNorm2d(32),
            nn.ReLU()
        )

    def forward(self, x):
        """Run ``x`` through the convolutional stack and return the result.

        Args:
            x: Input tensor with 3 channels in the channel dimension
                (required by the first ``Conv2d``).
        """
        return self.network(x)
    

In [None]:
class Agent(nn.Module):
    """Pair a prediction network with a deep-copied target network.

    Attributes:
        agent: The prediction model passed to the constructor.
        target: Deep copy of ``agent`` made by ``copy_pred_to_target``.
    """

    def __init__(self, pred_model):
        # nn.Module must be initialised before sub-modules can be assigned.
        super().__init__()
        self.agent = pred_model
        # copy_pred_to_target() sets self.target itself and returns None, so
        # its result must not be assigned to self.target.
        self.copy_pred_to_target()

    def get_action_val(self, state):
        """Return the argmax of the prediction network's output for ``state``.

        The forward pass runs under ``torch.no_grad()``.

        Args:
            state: Input passed directly to the prediction network.

        Returns:
            A tensor holding the index of the maximum value in the flattened
            network output.
        """
        with torch.no_grad():
            q_val = self.agent(state)
        return torch.argmax(q_val)

    def copy_pred_to_target(self):
        """Overwrite ``self.target`` with a fresh deep copy of ``self.agent``."""
        self.target = copy.deepcopy(self.agent)
    

In [None]:
class Data_Preprocess(ABC):
    """Abstract interface for state preprocessing.

    Concrete subclasses must implement ``preprocess_state``.
    """

    # The original declared ``preprocess_state`` twice; the second definition
    # simply replaced the first, so a single declaration is equivalent.
    # TODO: if a second, differently named hook was intended, add it here.
    @abstractmethod
    def preprocess_state(self):
        """Preprocess a state; must be overridden by subclasses."""
        pass

##### Global Variables

In [6]:
# Use the first CUDA device when a GPU is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Hyperparameter placeholders: every value starts as None and has to be set
# before training is started.
EPSILON = None             # compared against a uniform(0, 1) draw in run_episode
MINI_BATCH_SIZE = None     # passed to Replay_Buffer as mini_batch_size
REPLAY_BUFFER_SIZE = None  # passed to Replay_Buffer as capacity
MAX_STEPS = None           # passed to Gym_Env as max_steps
MAX_EPISODES = None        # passed to Gym_Env as max_episodes

##### Building Optimizers and Schedulers

In [None]:
def build_optimizer(model, optimizer_name='adam', learning_rate=0.01, weight_decay=0.01, momentum=0.9):
    """Create an optimizer over ``model.parameters()``.

    Args:
        model: Module whose parameters will be optimized.
        optimizer_name: ``"sgd"`` or ``"adam"``.
        learning_rate: Learning rate passed to the optimizer.
        weight_decay: Weight decay; only used for ``"adam"``.
        momentum: Momentum; only used for ``"sgd"``.

    Returns:
        The constructed ``optim.SGD`` or ``optim.Adam`` instance.

    Raises:
        ValueError: If ``optimizer_name`` is not one of the supported names.
    """
    if optimizer_name == "sgd":
        return optim.SGD(model.parameters(),
                         lr=learning_rate,
                         momentum=momentum)

    if optimizer_name == "adam":
        return optim.Adam(model.parameters(),
                          lr=learning_rate,
                          weight_decay=weight_decay)

    # The original returned None for an unknown name (its error path was only
    # reachable through a bare ``except`` and referenced an un-imported
    # ``sys``); fail loudly instead so the mistake surfaces at the call site.
    raise ValueError(f"Error: Invalid optimizer specified: {optimizer_name!r}")

In [None]:
def build_scheduler(optimizer, sched_name='reduce_lr', patience=5, verbose=True):
    """Create a learning-rate scheduler for ``optimizer``.

    Args:
        optimizer: Optimizer whose learning rate will be scheduled.
        sched_name: Scheduler to build; currently only ``"reduce_lr"``
            (``ReduceLROnPlateau`` in ``min`` mode) is supported.
        patience: Passed to ``ReduceLROnPlateau`` as ``patience``.
        verbose: Forwarded to ``ReduceLROnPlateau`` when the installed
            PyTorch still accepts a ``verbose`` argument; ignored otherwise.

    Returns:
        The constructed scheduler.

    Raises:
        ValueError: If ``sched_name`` is not a supported scheduler name.
    """
    import inspect

    if sched_name == "reduce_lr":
        scheduler_cls = optim.lr_scheduler.ReduceLROnPlateau
        kwargs = {"mode": "min", "patience": patience}
        # ``verbose`` is deprecated in recent PyTorch releases; only forward it
        # when the constructor still declares the parameter.
        if "verbose" in inspect.signature(scheduler_cls).parameters:
            kwargs["verbose"] = verbose
        return scheduler_cls(optimizer, **kwargs)

    # TODO: add other schedulers

    # The original returned None for an unknown name (its error path was only
    # reachable through a bare ``except`` and referenced an un-imported
    # ``sys``); fail loudly instead.
    raise ValueError(f"Error: Invalid scheduler specified: {sched_name!r}")

##### Training Loop

In [None]:
def train(replay_buffer, the_agent=None, loss_fn=None, optimizer=None, scheduler=None):
    """Sample a mini batch from the replay buffer for one model update.

    Only the sampling step is implemented; the numbered steps below outline
    the update that still has to be written.

    Args:
        replay_buffer: Buffer providing ``sample_rb()`` and ``current_batch``.
        the_agent: Agent holding the prediction and target models (not used
            yet). Accepted so the five-argument call in ``run_episode`` works.
        loss_fn: Loss function (not used yet).
        optimizer: Optimizer (not used yet).
        scheduler: Learning-rate scheduler (not used yet).
    """
    replay_buffer.sample_rb()
    mini_batch = replay_buffer.current_batch

    #1. copy weights from pred to target

    #2. retrieve (s, a, r, s') from mini_batch

    #3.1 pass to pred model s from (s, a, r, s')

    #3.2 pass to target model s' from (s, a, r, s')
    #hint: make sure target model is not performing back propagation

    #4. pass values generated from pred model to target model to bellman equation

    #5. use loss to perform back propagation on pred model
    #hint: use bellman equation to evaluate loss

    #6. destroy target model

In [None]:
def run_episode(environment, the_agent, loss_fn, optimizer, scheduler):
    """Play one epsilon-greedy episode, storing transitions and training.

    The episode ends when the environment reports completion or after
    ``environment.max_steps`` steps, whichever comes first.

    Args:
        environment: ``Gym_Env`` wrapper providing ``env`` and ``max_steps``.
        the_agent: ``Agent`` used to pick greedy actions.
        loss_fn: Loss function forwarded to ``train``.
        optimizer: Optimizer forwarded to ``train``.
        scheduler: Scheduler forwarded to ``train``.
    """
    # NOTE(review): a new buffer is created per episode, so experience is not
    # carried over between episodes - confirm this is intended.
    replay_buffer = Replay_Buffer(capacity=REPLAY_BUFFER_SIZE, mini_batch_size=MINI_BATCH_SIZE)

    #0. get initial state, s_{0}, and preprocess it (s_{0} -> preprocessed_s)
    # Resetting here (instead of only after a finished episode) guarantees a
    # fresh environment even when the previous episode ended on the step cap.
    reset_result = environment.env.reset()
    # Newer Gym releases return (observation, info); older ones return only
    # the observation.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    # TODO: preprocess s_{0}

    step_count = 0
    while step_count < environment.max_steps:
        if random.uniform(0.0, 1.0) < EPSILON:
            action = environment.env.action_space.sample()
        else:
            # get_action_val lives on the Agent (not on the raw network) and
            # returns a tensor; the environment needs a plain integer.
            action = int(the_agent.get_action_val(state))

        #1. pass action to environment
        step_result = environment.env.step(action)
        if len(step_result) == 5:
            # Newer Gym API: (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            # Older Gym API: (obs, reward, done, info).
            next_state, reward, done, _ = step_result

        #2. get s' back from environment and preprocess (s' -> preprocessed_s')
        # TODO: preprocess s'

        #3. add transition (s, a, s', r) to replay buffer
        replay_buffer.add_to_rb(Transition(state=state, action=action, next_state=next_state, reward=reward, done=done))

        #4. if replay buffer is full, sample mini batch and update model
        if len(replay_buffer.rb) == replay_buffer.capacity:
            train(replay_buffer, the_agent, loss_fn, optimizer, scheduler)

        step_count += 1

        #5. stop when the game is complete; the loop condition handles the step cap
        if done:
            break

        # Advance so the next action is chosen from the new observation.
        state = next_state
        
        

In [None]:
def run():
    """Build the environment, agent, optimizer and scheduler, then train.

    Runs ``environment.max_episodes`` episodes and linearly decays the
    module-level ``EPSILON`` from ``p_init`` towards ``p_end`` after each one.
    """
    # run_episode reads the module-level EPSILON, so the decayed value has to
    # be written there rather than into a local variable.
    global EPSILON

    # initialize variables for epsilon decay
    p_init = 0.7
    p_end = 0.1

    # initialize gym environment
    environment = Gym_Env(env_name='CartPole-v1', max_steps=MAX_STEPS, max_episodes=MAX_EPISODES)

    # initialize prediction network
    pred_net = Deep_Q_Network()

    # initialize agent that contains both prediction network and target network
    # (the original passed an undefined name, ``pred_model``)
    the_agent = Agent(pred_model=pred_net)

    # define loss function
    loss_fn = nn.MSELoss()

    # define optimizer
    optimizer = build_optimizer(model=the_agent.agent,
                                optimizer_name='adam',
                                learning_rate=0.01,
                                weight_decay=0.01,
                                momentum=0.9)

    # define scheduler
    scheduler = build_scheduler(optimizer,
                                sched_name='reduce_lr',
                                patience=5,
                                verbose=True)

    # the first episode explores with the initial probability
    EPSILON = p_init
    for e in trange(environment.max_episodes):
        run_episode(environment, the_agent, loss_fn, optimizer, scheduler)
        episode = e + 1  # number of episodes completed so far
        current_episode_rate = (environment.max_episodes - episode) / environment.max_episodes
        epsilon_decay_rate = max(current_episode_rate, 0)
        EPSILON = ((p_init - p_end) * epsilon_decay_rate) + p_end