# DQN with NAF (Normalized Advantage Function) for Continuous Action Spaces

In [1]:
import copy
import gym
import random
import torch

import numpy as np
import torch.nn.functional as F

from collections import deque, namedtuple
from IPython.display import HTML
from base64 import b64encode

from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
from torch.optim import AdamW

from pytorch_lightning import LightningModule, Trainer

from gym.wrappers import RecordVideo, RecordEpisodeStatistics , TimeLimit


# Number of CUDA devices torch can see.
num_gpus = torch.cuda.device_count()

# Device string used for the networks and for every tensor created by hand:
# the first GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [2]:
## Creating Q Network :
class NafDQN(nn.Module):
    """Q-network for continuous actions (Normalized Advantage Function).

    A two-layer MLP trunk feeds three linear heads:

    * ``linear_mu``     -- the greedy action ``mu(s)``, squashed with ``tanh``
      and scaled by ``max_action``;
    * ``linear_value``  -- the state value ``V(s)``;
    * ``linear_matrix`` -- the ``action_dims * (action_dims + 1) / 2`` entries
      of a lower-triangular matrix ``L(s)`` used to build ``P(s) = L L^T``.

    The Q-value is ``Q(s, a) = V(s) - 0.5 * (a - mu)^T P (a - mu)``.

    Args:
        hidden_size: Width of both hidden layers of the trunk.
        obs_size: Length of one observation vector.
        action_dims: Number of action components.
        max_action: Array-like with the per-component action magnitude used
            to scale the ``tanh`` output of the ``mu`` head.
    """

    def __init__(self, hidden_size, obs_size, action_dims, max_action):
        super().__init__()
        self.action_dims = action_dims
        # Stored as a non-persistent buffer so it follows the module through
        # ``.to(...)`` without adding a key to ``state_dict``.
        self.register_buffer(
            'max_action', torch.as_tensor(max_action).clone(), persistent=False
        )
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        )

        self.linear_mu = nn.Linear(hidden_size, action_dims)
        self.linear_value = nn.Linear(hidden_size, 1)
        # Only the lower triangle of L is predicted; P = L L^T is then
        # symmetric by construction.
        self.linear_matrix = nn.Linear(
            hidden_size, action_dims * (action_dims + 1) // 2
        )

    @torch.no_grad()
    def mu(self, x):
        """Return the action with the highest Q-value for each state in ``x``.

        Runs without gradient tracking.
        """
        features = self.net(x)
        # tanh keeps the raw output in [-1, 1]; max_action rescales it.
        return torch.tanh(self.linear_mu(features)) * self.max_action

    @torch.no_grad()
    def value(self, x):
        """Return the state value ``V(s)`` for each state in ``x``.

        Runs without gradient tracking.
        """
        return self.linear_value(self.net(x))

    def forward(self, x, a):
        """Return ``Q(s, a)`` with one row per (state, action) pair.

        Args:
            x: Batch of states.
            a: Batch of actions, one per state.
        """
        features = self.net(x)
        mu = torch.tanh(self.linear_mu(features)) * self.max_action
        value = self.linear_value(features)

        # Scatter the head outputs into the lower triangle of one
        # action_dims x action_dims matrix per batch row.
        entries = torch.tanh(self.linear_matrix(features))
        L = torch.zeros(
            (features.shape[0], self.action_dims, self.action_dims),
            dtype=entries.dtype,
            device=entries.device,
        )
        tril_indices = torch.tril_indices(
            row=self.action_dims, col=self.action_dims, device=entries.device
        )
        L[:, tril_indices[0], tril_indices[1]] = entries
        # A strictly positive diagonal makes L invertible, hence L L^T
        # positive definite.
        L.diagonal(dim1=1, dim2=2).exp_()
        # Matrix product (not element-wise): an element-wise product of a
        # lower- and an upper-triangular matrix keeps only the diagonal.
        P = L @ L.transpose(2, 1)

        u_mu = (a - mu).unsqueeze(dim=1)
        u_mu_t = u_mu.transpose(1, 2)

        adv = -0.5 * u_mu @ P @ u_mu_t
        adv = adv.squeeze(dim=-1)

        return value + adv
        

In [3]:
## Creating Policy
def noisy_policy(state, env, net, epsilon=0.0):
    """Return the network's greedy action perturbed by Gaussian noise.

    Args:
        state: A single observation (array-like).
        env: Environment whose ``action_space.low`` and ``action_space.high``
            arrays bound the returned action.
        net: ``torch.nn.Module`` with at least one parameter and a
            ``mu(states)`` method mapping a batch of states to a batch of
            actions.
        epsilon: Standard deviation of the zero-mean exploration noise.

    Returns:
        A numpy array with one entry per action component, clipped to the
        bounds of the action space.
    """
    # Build the input on the same device and dtype as the network weights.
    reference = next(net.parameters())
    state = torch.as_tensor(
        np.asarray(state), dtype=reference.dtype, device=reference.device
    ).unsqueeze(0)

    mu = net.mu(state)  # estimated best action, batch of one
    # Exploration noise: larger epsilon means wider exploration.
    noise = torch.normal(0, epsilon, mu.size(), dtype=mu.dtype, device=mu.device)
    mu = mu + noise

    amin = torch.as_tensor(env.action_space.low, dtype=mu.dtype, device=mu.device)
    amax = torch.as_tensor(env.action_space.high, dtype=mu.dtype, device=mu.device)
    action = mu.clamp(amin, amax)

    # Drop only the batch axis so a one-component action keeps shape (1,).
    return action.squeeze(0).detach().cpu().numpy()

In [4]:
## Creating Replay Buffer:
class ReplayBuffer:
    """Bounded store of experiences with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with a maximum length drops its oldest entry whenever a new
        # one is appended to a full buffer.
        self.buffer = deque([], capacity)

    def append(self, experience):
        """Add one experience at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random."""
        picked = random.sample(self.buffer, batch_size)
        return picked

    def __len__(self):
        """Number of experiences currently stored."""
        stored = len(self.buffer)
        return stored

In [5]:
class RLDataset(IterableDataset):
    """Iterable dataset that yields a fresh random draw from a replay buffer.

    Args:
        buffer: Object with a ``sample(n)`` method returning ``n`` experiences.
        sample_size: Number of experiences drawn for each pass over the dataset.
    """

    def __init__(self, buffer, sample_size=200):
        self.sample_size = sample_size
        self.buffer = buffer

    def __iter__(self):
        # The draw happens when iteration actually starts, and the
        # experiences are handed out one at a time.
        yield from self.buffer.sample(self.sample_size)

In [6]:
class RepeatActionWrapper(gym.Wrapper):
    """Apply every selected action for up to ``n`` consecutive steps.

    Repeating an action makes the choice of actions more consistent and
    simplifies learning, because the agent has to choose an action less often.

    Args:
        env: Environment to wrap.
        n: Number of times each action is applied; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """

    def __init__(self, env, n):
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        super().__init__(env)
        self.env = env
        self.n = n

    def step(self, action):
        """Step the wrapped environment with ``action`` up to ``n`` times.

        Stops early as soon as the wrapped environment reports ``done``.

        Returns:
            ``(next_state, total_reward, done, info)`` where ``total_reward``
            is the sum of the rewards of all steps taken and the other items
            come from the last step taken.
        """
        total_reward = 0.0

        for _ in range(self.n):
            next_state, reward, done, info = self.env.step(action)
            # Accumulate so no reward from the repeated steps is lost.
            total_reward += reward
            if done:
                break

        return next_state, total_reward, done, info

In [7]:
## Creating Environment
def create_environment(name):
    """Build the gym environment ``name`` with the training wrappers applied.

    Wrappers, innermost to outermost: ``TimeLimit`` (400 steps per episode),
    ``RecordVideo`` (every 50th episode, written to ``./videos``),
    ``RepeatActionWrapper`` (each action applied 8 times) and
    ``RecordEpisodeStatistics``.
    """
    timed = TimeLimit(gym.make(name), max_episode_steps=400)
    recorded = RecordVideo(
        timed,
        video_folder='./videos',
        episode_trigger=lambda x: x % 50 == 0,
    )
    repeated = RepeatActionWrapper(recorded, n=8)
    return RecordEpisodeStatistics(repeated)

In [8]:
def polyak_average(net, target_net, tau=0.01):
    """Move every parameter of ``target_net`` a fraction ``tau`` towards ``net``.

    Each target parameter becomes ``tau * source + (1 - tau) * target``.
    ``target_net`` is updated in place; ``net`` is left untouched.
    """
    keep = 1 - tau
    pairs = zip(net.parameters(), target_net.parameters())
    for source, target in pairs:
        blended = tau * source.data + keep * target.data
        target.data.copy_(blended)

In [9]:
class NAFDeepQLearning(LightningModule):
    """Lightning module that trains a ``NafDQN`` on a gym environment.

    One training epoch runs gradient steps on experiences sampled from the
    replay buffer; at the end of the epoch one episode is played with the
    exploration policy and the target network is moved towards the online
    network with ``polyak_average``.

    Args:
        env_name: Name passed to ``create_environment``.
        policy: Callable ``policy(state, env, net, epsilon=...)`` returning an
            action; used for the episode played after each epoch.
        capacity: Maximum number of experiences kept in the replay buffer.
        batch_size: Batch size of the training dataloader.
        lr: Learning rate handed to the optimizer.
        hidden_size: Hidden layer width of the Q-network.
        gamma: Discount factor of the TD target.
        loss_fn: Loss applied to ``(Q(s, a), target)``.
        optim: Optimizer class, called as ``optim(params, lr=lr)``.
        eps_start: Initial exploration noise level.
        eps_end: Lower bound of the exploration noise level.
        eps_last_episode: Divisor of the per-epoch noise decay.
        samples_per_epoch: Experiences drawn from the buffer per epoch; also
            the number of experiences collected before training starts.
        tau: Interpolation factor of the target network update.
    """

    def __init__(self, env_name, policy=noisy_policy, capacity=100_000, batch_size=256, lr=1e-4,
                 hidden_size=512, gamma=0.99, loss_fn=F.smooth_l1_loss, optim=AdamW, eps_start=2.0,
                 eps_end=0.1, eps_last_episode=1000, samples_per_epoch=1000, tau=0.01):
        super().__init__()
        self.env = create_environment(env_name)
        obs_size = self.env.observation_space.shape[0]
        action_dims = self.env.action_space.shape[0]
        max_action = self.env.action_space.high

        self.q_net = NafDQN(hidden_size, obs_size, action_dims, max_action).to(device)
        # The target network starts as an exact copy of the online network.
        self.target_q_net = copy.deepcopy(self.q_net)

        self.policy = policy
        self.buffer = ReplayBuffer(capacity=capacity)

        self.save_hyperparameters()

        # Warm-up: no policy is passed, so these episodes use random actions.
        while len(self.buffer) < self.hparams.samples_per_epoch:
            self.play_episode(epsilon=self.hparams.eps_start)

    @torch.no_grad()
    def play_episode(self, policy=None, epsilon=0):
        """Play one episode and append every transition to the replay buffer.

        Args:
            policy: Callable used to pick actions; when falsy, actions are
                sampled from the action space instead.
            epsilon: Noise level forwarded to ``policy``.
        """
        state = self.env.reset()
        done = False
        while not done:
            if policy:
                action = policy(state, self.env, self.q_net, epsilon=epsilon)
            else:
                action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)
            exp = (state, action, reward, done, next_state)
            self.buffer.append(exp)
            state = next_state

    def forward(self, x):
        """Return the greedy action of the online network for states ``x``."""
        output = self.q_net.mu(x)
        return output

    def configure_optimizers(self):
        """Create the optimizer; only the online network is optimized."""
        q_net_optimizer = self.hparams.optim(self.q_net.parameters(), lr=self.hparams.lr)
        return [q_net_optimizer]

    def train_dataloader(self):
        """Return a dataloader over a fresh sample of the replay buffer."""
        dataset = RLDataset(self.buffer, self.hparams.samples_per_epoch)
        dataloader = DataLoader(dataset=dataset, batch_size=self.hparams.batch_size)
        return dataloader

    def training_step(self, batch, batch_idx):
        """Compute the TD loss for one batch of transitions.

        The target is ``reward + gamma * V_target(next_state)``, with the
        bootstrap term zeroed for transitions flagged as done.
        """
        states, actions, rewards, dones, next_states = batch

        action_values = self.q_net(states, actions)

        # Rewards collated from Python/NumPy floats can arrive in a wider
        # dtype than the network output; cast them so the target and the
        # prediction handed to the loss share one dtype.
        rewards = rewards.unsqueeze(1).to(action_values.dtype)
        # Force a boolean mask so the assignment below can never be read as
        # integer indexing.
        dones = dones.unsqueeze(1).bool()

        next_state_values = self.target_q_net.value(next_states)
        next_state_values[dones] = 0.0

        target = rewards + self.hparams.gamma * next_state_values

        loss = self.hparams.loss_fn(action_values, target)
        self.log('episode/Q-loss', loss)
        return loss

    def training_epoch_end(self, training_step_outputs):
        """Play one exploratory episode, update the target net, log the return."""
        # NOTE(review): epsilon falls by 1 / eps_last_episode per epoch, so it
        # reaches eps_end after (eps_start - eps_end) * eps_last_episode
        # epochs -- confirm this is the intended schedule.
        epsilon = max(
            self.hparams.eps_end,
            self.hparams.eps_start - self.current_epoch / self.hparams.eps_last_episode,
        )
        self.play_episode(policy=self.policy, epsilon=epsilon)
        polyak_average(self.q_net, self.target_q_net, tau=self.hparams.tau)

        self.log('episode/Return', self.env.return_queue[-1])



In [2]:
#!rm -r lightning_logs/
#!rm -r videos/
%load_ext tensorboard
%tensorboard --logdir lightning_logs/version_0

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [14]:
# Train the NAF agent on the continuous LunarLander task for 3000 epochs,
# using every GPU counted in num_gpus.
algo = NAFDeepQLearning(env_name='LunarLanderContinuous-v2', lr=1e-3)
trainer = Trainer(max_epochs=3_000, gpus=num_gpus)
trainer.fit(algo)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: C:\Users\Ali\Documents\RLwithPhil\code\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type   | Params
----------------------------------------
0 | q_net        | NafDQN | 270 K 
1 | target_q_net | NafDQN | 270 K 
----------------------------------------
540 K     Trainable params
0         Non-trainable params
540 K     Total params
2.163     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Exception ignored in: <function Viewer.__del__ at 0x000002AAE6A010D0>
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\vrep\lib\site-packages\gym\envs\classic_control\rendering.py", line 185, in __del__
    self.close()
  File "C:\ProgramData\Anaconda3\envs\vrep\lib\site-packages\gym\envs\classic_control\rendering.py", line 101, in close
    self.window.close()
  File "C:\ProgramData\Anaconda3\envs\vrep\lib\site-packages\pyglet\window\win32\__init__.py", line 332, in close
    super(Win32Window, self).close()
  File "C:\ProgramData\Anaconda3\envs\vrep\lib\site-packages\pyglet\window\__init__.py", line 858, in close
    app.windows.remove(self)
  File "C:\ProgramData\Anaconda3\envs\vrep\lib\_weakrefset.py", line 114, in remove
    self.data.remove(ref(item))
KeyError: <weakref at 0x000002AB262A9B80; to 'Win32Window' at 0x000002AAD3956550>
