<a href="https://colab.research.google.com/github/EVSoaress/rl_studies/blob/main/rl_normalized_advantage_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## DQN for continuous action spaces: Normalized Advantage Function (NAF)

In [1]:
# System package: xvfb (X virtual framebuffer), presumably the backend for
# the pyvirtualdisplay Display started in the next cell.
!apt-get install -y xvfb

# Python packages; gym and pytorch-lightning are pinned to exact versions.
!pip install \
    gym==0.22 \
    gym[box2d] \
    pytorch-lightning==1.6.0 \
    pyvirtualdisplay

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 5 not upgraded.
Need to get 785 kB of archives.
After this operation, 2,271 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.11 [785 kB]
Fetched 785 kB in 1s (627 kB/s)
Selecting previously unselected package xvfb.
(Reading database ... 123941 files and directories currently installed.)
Preparing to unpack .../xvfb_2%3a1.19.6-1ubuntu4.11_amd64.deb ...
Unpacking xvfb (2:1.19.6-1ubuntu4.11) ...
Setting up xvfb (2:1.19.6-1ubuntu4.11) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/p

In [2]:
from pyvirtualdisplay import Display
# Start a non-visible 1400x900 virtual display; presumably needed so the
# environments can render video frames on a headless machine.
Display(visible=False, size=(1400, 900)).start()

<pyvirtualdisplay.display.Display at 0x7fcbb59f0d50>

In [3]:
import copy
import gym
import torch
import random

import numpy as np
import torch.nn.functional as F

from collections import deque, namedtuple
from IPython.display import HTML
from base64 import b64encode

from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
from torch.optim import AdamW

from pytorch_lightning import LightningModule, Trainer

from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit


# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Number of CUDA devices reported by PyTorch.
num_gpus = torch.cuda.device_count()




In [4]:
def display_video(episode=0, video_folder='/content/videos'):
  """Return an inline HTML video player for a recorded episode.

  Args:
    episode: Index of the episode; the file read is
      ``<video_folder>/rl-video-episode-<episode>.mp4``.
    video_folder: Directory that holds the recordings. Defaults to the
      location this notebook originally hard-coded.

  Returns:
    An ``IPython.display.HTML`` object embedding the video as a base64
    ``data:`` URL.

  Raises:
    OSError: If the video file cannot be opened (e.g. it does not exist).
  """
  video_path = f'{video_folder}/rl-video-episode-{episode}.mp4'
  # Read-only binary mode inside a context manager: the handle is always
  # closed and no write permission on the file is required.
  with open(video_path, "rb") as video_file:
    encoded = b64encode(video_file.read()).decode()
  video_url = f"data:video/mp4;base64,{encoded}"
  return HTML(f"<video width=600 controls><source src='{video_url}'></video>")

### Create the Deep Q-Network

In [5]:
class NafDQN(nn.Module):
  """Q-network for continuous actions (Normalized Advantage Function).

  The Q-value is decomposed as ``Q(s, a) = V(s) + A(s, a)`` with a quadratic
  advantage ``A(s, a) = -1/2 (a - mu(s))^T P(s) (a - mu(s))`` where
  ``P(s) = L(s) L(s)^T`` and ``L(s)`` is lower triangular with a strictly
  positive diagonal. The advantage is therefore never positive and is zero at
  ``a = mu(s)``, so ``mu(s)`` is the greedy action.

  Args:
    hidden_size: Width of the two hidden layers of the shared trunk.
    obs_size: Number of features in an observation.
    action_dims: Number of action dimensions.
    max_action: Per-dimension action magnitude (array-like of length
      ``action_dims``); ``mu`` is scaled into ``[-max_action, max_action]``.
  """

  def __init__(self, hidden_size, obs_size, action_dims, max_action):
    super().__init__()
    self.action_dims = action_dims
    # Registered as a (non-persistent) buffer so it follows the module across
    # devices with ``.to(...)`` without changing the saved state dict.
    self.register_buffer(
        'max_action',
        torch.as_tensor(max_action, dtype=torch.float32),
        persistent=False,
    )
    self.net = nn.Sequential(
        nn.Linear(obs_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU()
    )
    self.linear_mu = nn.Linear(hidden_size, action_dims)
    self.linear_value = nn.Linear(hidden_size, 1)
    # One output per entry of the lower triangle of an
    # action_dims x action_dims matrix.
    self.linear_matrix = nn.Linear(hidden_size,
                                   action_dims * (action_dims + 1) // 2)

  @torch.no_grad()
  def mu(self, x):
    """Return the action with the highest Q-value for each state in ``x``.

    Runs without gradient tracking. Output shape: (batch, action_dims).
    """
    x = self.net(x)
    x = self.linear_mu(x)
    x = torch.tanh(x) * self.max_action
    return x

  @torch.no_grad()
  def value(self, x):
    """Return the state value V(s) for each state in ``x``.

    Runs without gradient tracking. Output shape: (batch, 1).
    """
    x = self.net(x)
    x = self.linear_value(x)
    return x

  def forward(self, x, a):
    """Return Q(s, a) for a batch of states ``x`` and actions ``a``.

    Args:
      x: States, shape (batch, obs_size).
      a: Actions, shape (batch, action_dims).

    Returns:
      Tensor of shape (batch, 1).
    """
    x = self.net(x)
    mu = torch.tanh(self.linear_mu(x)) * self.max_action
    value = self.linear_value(x)

    # Build L(s): fill the lower triangle with the network output, then
    # exponentiate the diagonal so it is strictly positive.
    matrix = torch.tanh(self.linear_matrix(x))
    L = torch.zeros(x.shape[0], self.action_dims, self.action_dims,
                    dtype=matrix.dtype, device=x.device)
    tril_indices = torch.tril_indices(row=self.action_dims,
                                      col=self.action_dims,
                                      device=x.device)
    L[:, tril_indices[0], tril_indices[1]] = matrix
    L.diagonal(dim1=1, dim2=2).exp_()
    # Matrix product (not element-wise) so P = L L^T is positive definite.
    P = L @ L.transpose(2, 1)

    u_mu = (a - mu).unsqueeze(dim=1)   # (batch, 1, action_dims)
    u_mu_t = u_mu.transpose(1, 2)      # (batch, action_dims, 1)

    adv = -1 / 2 * u_mu @ P @ u_mu_t   # (batch, 1, 1)
    adv = adv.squeeze(dim=1)           # (batch, 1), same shape as value

    return adv + value

### Create the policy

In [6]:
def noisy_policy(state, env, net, epsilon=0.0):
  """Pick the network's greedy action and perturb it with Gaussian noise.

  Args:
    state: A single observation (array-like, no batch dimension).
    env: Environment whose ``action_space.low`` / ``action_space.high``
      (NumPy arrays) bound the returned action.
    net: Object exposing ``mu(states)`` that maps a batch of states to a
      batch of actions.
    epsilon: Standard deviation of the zero-mean Gaussian noise added to the
      greedy action; 0.0 yields the greedy action itself.

  Returns:
    NumPy array of shape ``(action_dims,)`` clipped to the action bounds.
  """
  # as_tensor avoids the slow "list of ndarrays" path of torch.tensor([...]);
  # unsqueeze adds the batch dimension the network expects.
  state = torch.as_tensor(state).unsqueeze(0).to(device)
  amin = torch.from_numpy(env.action_space.low).to(device)
  amax = torch.from_numpy(env.action_space.high).to(device)
  mu = net.mu(state)
  mu = mu + torch.normal(0, epsilon, mu.size(), device=device)
  action = mu.clamp(amin, amax)
  # Drop only the batch dimension: a bare squeeze() would also collapse a
  # one-dimensional action into a 0-d array.
  action = action.squeeze(0).cpu().numpy()

  return action

### Replay buffer


In [7]:
class ReplayBuffer:
  """Bounded store of experiences with uniform random sampling.

  Once ``capacity`` items are held, every new item evicts the oldest one.
  """

  def __init__(self, capacity):
    # A deque with maxlen drops items from the left when it is full.
    self.buffer = deque([], maxlen=capacity)

  def __len__(self):
    """Number of experiences currently stored."""
    stored = len(self.buffer)
    return stored

  def append(self, experience):
    """Add one experience at the newest end of the buffer."""
    self.buffer.append(experience)

  def sample(self, batch_size):
    """Return ``batch_size`` distinct stored experiences, chosen at random."""
    drawn = random.sample(self.buffer, batch_size)
    return drawn

In [8]:
class RLDataset(IterableDataset):
  """Iterable dataset that draws a fresh sample from a buffer per pass.

  Args:
    buffer: Object exposing ``sample(n)`` that returns ``n`` experiences.
    sample_size: Number of experiences requested on each iteration pass.
  """

  def __init__(self, buffer, sample_size=200):
    self.sample_size = sample_size
    self.buffer = buffer

  def __iter__(self):
    # Each call to __iter__ asks the buffer for a new sample.
    yield from self.buffer.sample(self.sample_size)

#### Create the environment

In [9]:
class RepeatActionWrapper(gym.Wrapper):
  """Environment wrapper that applies each action for up to ``n`` steps.

  Args:
    env: Environment to wrap.
    n: Number of consecutive inner steps per outer ``step`` call (>= 1).

  Raises:
    ValueError: If ``n`` is smaller than 1.
  """

  def __init__(self, env, n):
    super().__init__(env)
    if n < 1:
      # With n < 1 the loop in step() would never run and there would be
      # no transition to return.
      raise ValueError(f'n must be at least 1, got {n}')
    self.n = n

  def step(self, action):
    """Repeat ``action`` up to ``n`` times, stopping early when done.

    Returns:
      Tuple ``(next_state, total_reward, done, info)`` where ``next_state``,
      ``done`` and ``info`` come from the last inner step and
      ``total_reward`` is the sum of the rewards of all inner steps taken.
    """
    done = False
    total_reward = 0.0

    for _ in range(self.n):
      next_state, reward, done, info = self.env.step(action)
      total_reward += reward
      if done:
        break

    return next_state, total_reward, done, info

In [10]:
def create_enviroment(name):
  """Build the gym environment ``name`` wrapped for training.

  Wrappers, innermost first: video recording into ``./videos`` for every
  50th episode, repetition of each action for 8 steps, and episode
  statistics tracking.
  """

  def record_every_50th(x):
    # Episode indices 0, 50, 100, ... are recorded.
    return x% 50 == 0

  base_env = gym.make(name)
  recorded_env = RecordVideo(base_env, video_folder='./videos',
                             episode_trigger=record_every_50th)
  repeated_env = RepeatActionWrapper(recorded_env, n=8)
  return RecordEpisodeStatistics(repeated_env)

### Update the target network

In [12]:
def polyak_fuction(net, target_net, tau=0.01):
  """Soft-update ``target_net`` towards ``net`` (Polyak averaging).

  Every target parameter is replaced in place by
  ``tau * online + (1 - tau) * target``.

  Args:
    net: Online network providing the new parameter values.
    target_net: Network updated in place; its parameters are paired with
      those of ``net`` in iteration order.
    tau: Interpolation factor; 1.0 copies ``net`` exactly, 0.0 leaves
      ``target_net`` untouched.
  """
  # no_grad keeps the in-place update out of the autograd graph without
  # going through the legacy ``.data`` attribute.
  with torch.no_grad():
    for qp, tp in zip(net.parameters(), target_net.parameters()):
      tp.copy_(tau * qp + (1 - tau) * tp)


# The training loop in this notebook calls the update as `polyak_average`;
# expose that spelling as an alias while keeping the original name.
polyak_average = polyak_fuction

### Create the Deep Q-Learning algorithm

In [14]:
class NafDeepQLearning(LightningModule):
  """Deep Q-Learning with a Normalized Advantage Function network.

  Each training epoch draws ``samples_per_epoch`` transitions from the
  replay buffer, fits the online Q-network against targets computed with a
  separate target network, then plays one more episode with the exploration
  policy and soft-updates the target network.

  Args:
    env_name: Id of the gym environment to create.
    policy: Callable ``policy(state, env, net, epsilon=...)`` returning an
      action; used to collect experience after every epoch.
    capacity: Maximum number of transitions held in the replay buffer.
    batch_size: Batch size of the training dataloader.
    lr: Learning rate passed to the optimizer.
    hidden_size: Hidden layer width of the Q-network.
    gamma: Discount factor applied to the next-state value.
    loss_fn: Loss between predicted and target Q-values.
    optim: Optimizer class, called as ``optim(params, lr=lr)``.
    eps_start: Initial ``epsilon`` handed to the policy.
    eps_end: Lower bound for ``epsilon``.
    eps_last_episode: Divisor of the epoch index in the epsilon decay.
    samples_per_epoch: Transitions sampled per epoch; also the number of
      transitions collected before training starts.
    tau: Interpolation factor of the target-network soft update.
  """

  def __init__(self, env_name, policy=noisy_policy, capacity=100_000,
               batch_size=256, lr=1e-4, hidden_size=512, gamma=0.99,
               loss_fn=F.smooth_l1_loss, optim=AdamW, eps_start=2.0,
               eps_end=0.2, eps_last_episode=1_000, samples_per_epoch=1_000,
               tau=0.01):

    super().__init__()

    self.env = create_enviroment(env_name)

    obs_size = self.env.observation_space.shape[0]
    action_dims = self.env.action_space.shape[0]
    max_action = self.env.action_space.high

    self.q_net = NafDQN(hidden_size, obs_size, action_dims, max_action).to(device)
    # Slowly-moving copy of the online network used to compute the targets.
    self.target_q_net = copy.deepcopy(self.q_net)
    self.policy = policy

    self.buffer = ReplayBuffer(capacity=capacity)
    self.save_hyperparameters()

    # Warm-up: no policy is passed, so play_episode samples random actions.
    while len(self.buffer) < self.hparams.samples_per_epoch:
      print(f'{len(self.buffer)} sample in experience buffer. Filling...')
      self.play_episode(epsilon=self.hparams.eps_start)

  @torch.no_grad()
  def play_episode(self, policy=None, epsilon=0.):
    """Play one full episode and store every transition in the buffer.

    Args:
      policy: Callable used to pick actions; when falsy, actions are sampled
        uniformly from the environment's action space.
      epsilon: Forwarded to ``policy``; ignored when no policy is given.
    """
    state = self.env.reset()
    done = False

    while not done:
      if policy:
        action = policy(state, self.env, self.q_net, epsilon=epsilon)
      else:
        action = self.env.action_space.sample()
      next_state, reward, done, info = self.env.step(action)
      exp = (state, action, reward, done, next_state)
      self.buffer.append(exp)
      state = next_state

  def forward(self, x):
    """Return the greedy action of the online network for states ``x``."""
    # The Q-network's own forward also needs an action; the action that
    # maximises Q is what a state-only call can return.
    return self.q_net.mu(x)

  def configure_optimizers(self):
    """Create the optimizer for the online Q-network's parameters."""
    q_net_optimizer = self.hparams.optim(self.q_net.parameters(), lr=self.hparams.lr)
    return [q_net_optimizer]

  def train_dataloader(self):
    """Return a dataloader over a fresh sample of the replay buffer."""
    dataset = RLDataset(self.buffer, self.hparams.samples_per_epoch)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=self.hparams.batch_size)
    return dataloader

  def training_step(self, batch, batch_idx):
    """Compute the temporal-difference loss for one batch of transitions."""
    states, actions, rewards, dones, next_states = batch
    dones = dones.unsqueeze(1).bool()

    action_values = self.q_net(states, actions)
    # Python-float rewards collate to float64; match the network's dtype so
    # the target and the prediction agree.
    rewards = rewards.unsqueeze(1).to(action_values.dtype)

    next_state_values = self.target_q_net.value(next_states)
    # Terminal transitions have no future value.
    next_state_values[dones] = 0.0

    target = rewards + self.hparams.gamma * next_state_values

    loss = self.hparams.loss_fn(action_values, target)
    self.log("episode/loss", loss)
    return loss

  def training_epoch_end(self, training_step_outputs):
    """Collect one more episode and soft-update the target network."""
    # Linear decay of the exploration noise, floored at eps_end.
    epsilon = max(
        self.hparams.eps_end,
        self.hparams.eps_start - self.current_epoch / self.hparams.eps_last_episode
    )

    self.play_episode(policy=self.policy, epsilon=epsilon)

    polyak_fuction(self.q_net, self.target_q_net, tau=self.hparams.tau)

    