# Deep Q-Network
Source: https://github.com/higgsfield/RL-Adventure/blob/master/1.dqn.ipynb

In [2]:
import math, random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F

In [3]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

## Custom Environment
References: https://towardsdatascience.com/creating-a-custom-openai-gym-environment-for-stock-trading-be532be3910e

`action_space`: all possible actions an agent can take in the environment <br>
`observation_space`: all of the environments data to be observed by the agent <br>
`reset`: resets environment to an initial state <br>
`step`: step in the environment where an action will be provided by the model which must be executed and is followed by an observation <br>
`render`: human interpretable rendition of the environment

In [18]:
from gym import spaces
import gym

import random

In [178]:
MIN_REWARD_SCORE = 0

class RAMEnv(gym.Env):
    """A RAM Environment for OpenAI gym"""
    metadata = {'render.modes': ['human']}
    
    def __init__(self):
        super(RAMEnv, self).__init__()
        
        self.reward_range = (MIN_REWARD_SCORE, 1)    # availability * (1-cost incurred due to modifications)
        
        # Actions (very basic)
        # 1. do nothing
        # 2. remove 1 machine
        self.action_space = spaces.MultiBinary(1)
        # Observation 
        # Availability of system
        # Add cost later etc.
        self.observation_space = spaces.Discrete(10)

    def reset(self):
        # Reset the state of the environment to an initial state
        self.no_machines = 10
        self.availability = 0.85
        
        self.current_step = 0
        
        return self._next_observation()
    
    def _next_observation(self):
        
        if self.no_machines > 5:
            self.availability = random.uniform(0,0.85)
        else:
            self.availability = random.uniform(0,0.95)
        
        obs = [self.no_machines, self.availability]
        
        return obs

    def step(self, action):
        # Execute one time step within the environment
        self._take_action(action)
        
        self.current_step += 1
        done = True if self.no_machines == 0 or self.availability <= 0 else False
        
        reward = self.availability
        
        obs = self._next_observation()
        
        return obs, reward, done, {}
    
    def _take_action(self, action):
        
        if action == 0:
            # do nothing
            pass
        elif action == 1:
            self.no_machines -= 1
    
    def render(self, mode='human'):
        print(f'No Machines: {self.no_machines} - Availability: {self.availability}')

## Replay Buffer

In [198]:
from collections import deque

class ReplayBuffer(object):
    """ Replay buffer - captures history of DQN 
    
    Arguments
    ---------
        capacity : int
            Capacity of buffer (steps?)
    """
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done
    
    def __len__(self):
        return len(self.buffer)

In [199]:
env = RAMEnv()

In [211]:
env.observation_space

Discrete(10)

## Epsilon Greedy Exploration

In [None]:
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500

# The following function modifies the exploration coefficient based on frame idx
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

In [None]:
plt.plot([epsilon_by_frame(i) for i in range(10000)])

## Deep Q-Network

In [212]:
class DQN(nn.Module):
    """ Deep Q-Network
    
    Arguments
    ---------
        num_inputs : int
            Number of inputs equals the observation space shape, this will be a flattened tensor from a 2D image
        num_actions : int
            NUmber of actions agent can take
    """
    def __init__(self, num_inputs, num_actions):
        super(DQN, self).__init__()
        
        self.layers = nn.Sequential(
            nn.Linear(env.observation_space.n, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, env.action_space.n)
        )
        
    def forward(self, x):
        return self.layers(x)
    
    def act(self, state, epsilon):
        if random.random() > epsilon:
            state   = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)
            q_value = self.forward(state)
            action  = q_value.max(1)[1].item()
        else:
            action = random.randrange(env.action_space.n)
        return action

In [213]:
model = DQN(env.observation_space.n, env.action_space.n)

In [214]:
print(model)

DQN(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [215]:
if USE_CUDA:
    model = model.cuda()
    
optimizer = optim.Adam(model.parameters())

replay_buffer = ReplayBuffer(1000)

## Computing Temporal Difference Loss

In [None]:
def compute_td_loss(batch_size):
    """ Computes the temporal difference loss
    
    Arguments
    ---------
        batch_size : int
            TODO
    Returns
    -------
        loss : float(?)
            TODO
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = Variable(torch.FloatTensor(np.float32(state)))
    next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
    action     = Variable(torch.LongTensor(action))
    reward     = Variable(torch.FloatTensor(reward))
    done       = Variable(torch.FloatTensor(done))

    q_values      = model(state)
    next_q_values = model(next_state)

    q_value          = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    next_q_value     = next_q_values.max(1)[0]
    expected_q_value = reward + gamma * next_q_value * (1 - done)
    
    loss = (q_value - Variable(expected_q_value.data)).pow(2).mean()
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return loss

In [None]:
def plot(frame_idx, rewards, losses):
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:])))
    plt.plot(rewards)
    plt.subplot(132)
    plt.title('loss')
    plt.plot(losses)
    plt.show()

## Training

In [None]:
num_frames = 10000
batch_size = 32
gamma      = 0.99

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()
for frame_idx in range(1, num_frames + 1):
    epsilon = epsilon_by_frame(frame_idx)
    action = model.act(state, epsilon)
    print(f'Action: {action}')
    
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        print(f'Loss: {loss.item()}')
        losses.append(loss.item())
        
    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)