##### Copyright 2021 The TF-Agents Authors.

## Setup

To render video you will need to have ffmpeg and xvfb installed.
Typically, installing is done with the command

`sudo apt-get install -y xvfb ffmpeg`

Then, if you haven't installed the following dependencies, run:

In [None]:
# System packages needed for rendering video (see the Setup text above).
!sudo apt-get install -y xvfb ffmpeg
# Python packages used by the cells below; imageio is pinned to 2.4.0.
!pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay
# NOTE(review): the code in this notebook unpacks `env.step(...)` into four
# values and calls `env.render(mode='rgb_array')`; presumably that requires an
# older gym release, so an unpinned install may break it - TODO confirm and pin.
!pip install gym[classic_control]
!pip install gym[box2d]

In [None]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay
import gym

In [None]:
# Set up a virtual display for rendering OpenAI gym environments.
# The display is created non-visible (visible=0) at 1400x900 and started
# immediately; the started handle is kept in `display`.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

## Environment


In [None]:
# Select the gym environment by leaving exactly one `env_name` line un-commented.
# NOTE(review): the agent configuration cells further down hard-code
# size_state = 8 and size_actions = 2; presumably those match
# LunarLanderContinuous-v2 and must be changed for the other options - verify.
# env_name = 'CartPole-v0'                # discrete action space
# env_name = 'LunarLander-v2'             # discrete action space
env_name = 'LunarLanderContinuous-v2'   # continuous action space
# env_name = 'BipedalWalker-v3'           # continuous action space
# env_name = 'BipedalWalkerHardcore-v3'     # continuous action space
env = gym.make(env_name)


You can render this environment to see how it looks:

In [None]:
# Reset the environment first, then show one rendered RGB frame as a PIL image
# (being the last expression of the cell, the image is displayed by the notebook).
env.reset()
PIL.Image.fromarray(env.render(mode='rgb_array'))

In [None]:
# Print the observation space followed by its `high` and `low` attributes.
print('Observation Spec:')
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

In [None]:
# Print the action space of the selected environment.
print('Action Spec:')
print(env.action_space)

In [None]:
# Print the environment's `reward_range` attribute.
print('Reward Spec:')
print(env.reward_range)

## Metrics and Evaluation

The most common metric used to evaluate a policy is the average return. The return is the sum of rewards obtained while running a policy in an environment for an episode. Several episodes are run, creating an average return.

The following function computes the average return of a policy, given the policy, environment, and a number of episodes.


In [None]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Return the average undiscounted episode return of ``policy``.

    Args:
        environment: Object exposing ``reset()`` (returns the first
            observation) and ``step(action)`` (returns the 4-tuple
            ``(observation, reward, done, info)``).
        policy: Callable mapping an observation to an action.
        num_episodes: Number of complete episodes to run.

    Returns:
        The sum of the per-episode returns divided by ``num_episodes``.

    Raises:
        ZeroDivisionError: If ``num_episodes`` is 0.
    """
    total_return = 0.0
    for _ in range(num_episodes):
        observation = environment.reset()
        episode_return = 0.0
        done = False
        while not done:
            action = policy(observation)
            observation, reward, done, info = environment.step(action)
            episode_return += reward
        # Accumulate once per finished episode. Doing this inside the step
        # loop would add every partial running sum and inflate the average.
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return

In [None]:
def random_policy(observation):
    """Baseline policy: ignore ``observation`` and draw a random action.

    The action is sampled from the action space of the module-level ``env``.
    """
    sampled_action = env.action_space.sample()
    return sampled_action

Running this computation on the `random_policy` shows a baseline performance in the environment.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
import random
from functools import partial

# Use the GPU when PyTorch reports that CUDA is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
class Residual(nn.Module):
    """Two linear layers with a ReLU in between and an identity skip connection.

    Computes ``l2(relu(l1(x))) + x``. Both layers map ``input_size`` features
    to ``input_size`` features, so the output has the same shape as the input.
    """

    def __init__(self, input_size):
        super().__init__()
        width = input_size
        self.l1 = nn.Linear(width, width, bias=True)
        self.l2 = nn.Linear(width, width, bias=True)
        # The activation overwrites its argument in place; it is only ever
        # applied to the freshly computed output of l1, never to the input x.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        hidden = self.relu(self.l1(x))
        return self.l2(hidden) + x

In [None]:
class QNet(nn.Module):
    """Q-value network: residual block, hidden linear layer, linear output head.

    The forward pass is ``l3(relu(l2(relu(l1(x)))))`` where ``l1`` is a
    ``Residual`` block over ``input_size`` features, ``l2`` maps to
    ``hidden_size`` features and ``l3`` maps to ``output_size`` values.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.l1 = Residual(input_size)
        self.l2 = nn.Linear(input_size, hidden_size, bias=True)
        self.l3 = nn.Linear(hidden_size, output_size, bias=True)
        # One shared in-place activation module, reused after l1 and after l2.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        features = self.relu(self.l1(x))
        hidden = self.relu(self.l2(features))
        return self.l3(hidden)

In [None]:
class ReplayBuffer():
    """Capacity-bounded store of transitions kept as four parallel tensors.

    ``states``, ``actions``, ``rewards`` and ``endstates`` are concatenated
    along dimension 0; row ``i`` of each tensor belongs to the same
    transition. When more than ``size`` rows are held, the oldest rows (the
    lowest indices) are dropped. All four attributes are ``None`` until the
    first call to ``add``.
    """

    def __init__(self, max_size, device):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of rows kept per tensor.
            device: Device on which ``sample`` creates its random indices.
        """
        self.size = max_size
        self.device = device
        self.states = None
        self.actions = None
        self.rewards = None
        self.endstates = None

    def sample(self, batchsize):
        """Return up to ``batchsize`` distinct random transitions.

        Returns:
            Tuple ``(states, actions, rewards, endstates)`` indexed by the
            same random rows. Fewer than ``batchsize`` rows are returned when
            the buffer holds fewer transitions.

        Raises:
            ValueError: If nothing has been added yet.
        """
        if self.states is None:
            raise ValueError("cannot sample from an empty replay buffer")
        # A random permutation truncated to batchsize samples without replacement.
        randidxs = torch.randperm(self.states.shape[0], device = self.device)[:batchsize]
        return self.states[randidxs], self.actions[randidxs], self.rewards[randidxs], self.endstates[randidxs]

    def add_to_array(self, array1, array2):
        """Append ``array2`` to ``array1`` and keep only the newest rows.

        Args:
            array1: Tensor already stored, or ``None`` when nothing is stored.
            array2: Tensor of new rows to append along dimension 0.

        Returns:
            The concatenated tensor, truncated from the front so that it has
            at most ``self.size`` rows. The inputs are not modified; the
            caller must store the returned tensor.
        """
        # `is None` rather than truthiness: bool() of a multi-element tensor raises.
        if array1 is None:
            merged = array2
        else:
            merged = torch.cat((array1, array2), dim = 0)
        overflow = merged.shape[0] - self.size
        if overflow > 0:
            merged = merged[overflow:]
        return merged

    def add(self, states, actions, rewards, endstates):
        """Append a batch of transitions, evicting the oldest rows on overflow.

        The capacity is enforced on every call, including the first one.
        """
        self.states = self.add_to_array(self.states, states)
        self.actions = self.add_to_array(self.actions, actions)
        self.rewards = self.add_to_array(self.rewards, rewards)
        self.endstates = self.add_to_array(self.endstates, endstates)



# Variant B

In [None]:
class DDQNAgent:
    """Double-Q agent over a discretised action grid (state-action network).

    The Q-network receives a state concatenated with one candidate action and
    is expected to emit a single value for that pair. ``q1`` is the network
    being trained; ``q2`` is a copy that is periodically synchronised with
    ``q1`` and used to evaluate the bootstrap target. Candidate actions are
    the cartesian product of ``action_quantize`` evenly spaced levels in
    ``[-torque_factor, torque_factor]`` for each of ``action_size`` dimensions.

    NOTE: ``epsilon`` is stored and checkpointed, but no method of this class
    reads it; exploration comes from the softmax temperature in ``policy_t``.
    """

    def __init__(self, torque_factor, action_size, state_size, action_quantize, q_module, q_init, gamma = 1.0, epsilon = 0.1, device = "cpu", buffersize = 10000):
        """Build the action grid, both Q-networks and the replay buffer.

        Args:
            torque_factor: Scale applied to the ``[-1, 1]`` action levels.
            action_size: Number of action dimensions.
            state_size: Number of state features (stored, checkpointed).
            action_quantize: Number of levels per action dimension.
            q_module: Callable/class that builds a Q-network.
            q_init: Keyword arguments passed to ``q_module``.
            gamma: Discount factor applied to the bootstrap value.
            epsilon: Stored only; see the class note.
            device: Device for the networks, the action grid and the buffer.
            buffersize: Capacity of the replay buffer.
        """
        self.torque_factor = torque_factor
        self.action_size = action_size
        self.action_quantize = action_quantize
        self.state_size = state_size
        elems = [torch.linspace(-1, 1, action_quantize, dtype=torch.float) * torque_factor] * action_size
        # cartesian_prod returns a 1-D tensor for a single input; the reshape
        # keeps the grid 2-D ([n_actions, action_size]) for action_size == 1.
        self.action_space = torch.cartesian_prod(*elems).reshape(-1, action_size).to(device)
        self.q_module = q_module
        self.q_init = q_init
        self.q1 = q_module(**q_init)
        self.q2 = copy.deepcopy(self.q1)
        self.gamma = gamma
        self.epsilon = epsilon
        self.device = device
        self.buffer = ReplayBuffer(buffersize, self.device)
        self.buffersize = buffersize
        self.q1.to(device)
        self.q2.to(device)
        # str() so that torch.device objects are accepted as well as strings.
        if "cuda" in str(device):
            print("Using GPU")

    # First network
    # state should be a tensor of shape [BATCH, state_size]
    # action should be a tensor of shape [BATCH, x, action_size]
    # return tensor will be size [BATCH, x]
    def Q1(self, state, action):
        """Evaluate the trained network ``q1`` on state/action pairs."""
        return self._Q(state, action, self.q1)

    # Second network
    def Q2(self, state, action):
        """Evaluate the target network ``q2`` on state/action pairs."""
        return self._Q(state, action, self.q2)

    def _Q(self, state, action, q):
        """Run network ``q`` on the concatenated inputs and drop the last dim.

        The trailing ``squeeze(-1)`` assumes the network emits one value per
        (state, action) pair.
        """
        inputs = self.shape_inputs(state, action)
        return q(inputs).squeeze(-1)

    # state should be a tensor with size [BATCH, state_size]
    # action should be a tensor with size [BATCH, x, action_size]
    # output will be a tensor with size [BATCH, x, state_size + action_size]
    def shape_inputs(self, state, action):
        """Pair every candidate action of a batch row with that row's state."""
        state = state.unsqueeze(1).repeat(1, action.shape[1], 1)
        out = torch.cat((state, action), dim=2)
        return out

    # Given a state tensor we compute the action with the highest q value
    # Input shape is [BATCH, state_size] for sdash
    # Output shape is [BATCH, action_size]
    def max_action(self, q, state):
        """Return, for each state, the grid action that maximises ``q``.

        Args:
            q: Callable ``q(state, action)`` such as ``self.Q1``.
            state: Tensor of shape [BATCH, state_size].
        """
        action = self.action_space.unsqueeze(0).repeat(state.shape[0], 1, 1)
        vals = q(state, action)
        _, idxs = torch.max(vals, dim=1)
        return self.action_space[idxs]

    # Given a state tensor we choose an action with the probability of chosing that action determined by the softmax of the action value.
    def soft_action(self, q, state, temperature = 1.0):
        """Sample one grid action per state from softmax(Q / temperature).

        Returns:
            Tensor of shape [BATCH, action_size].
        """
        action = self.action_space.unsqueeze(0).repeat(state.shape[0], 1, 1)
        vals = q(state, action) / temperature
        probs = F.softmax(vals, dim=1)
        idxs = torch.multinomial(probs, 1, replacement=True)
        soft_action = self.action_space[idxs].squeeze(1)
        return soft_action

    # Computes new action value targets given reward and target state
    # reward shape should be [BATCH]
    # state should be shape [BATCH, state_size]
    def compute_targets(self, rewards, sdash):
        """Return ``rewards + gamma * Q2(sdash, argmax_a Q1(sdash, a))``.

        TODO: terminal transitions are not masked; the bootstrap term is
        added for every transition.
        """
        action = self.max_action(self.Q1, sdash).unsqueeze(1)
        # squeeze(1) removes only the single-candidate axis, so a batch of
        # one stays one-dimensional.
        action_estimate = self.Q2(sdash, action).squeeze(1)
        targets = rewards + self.gamma * action_estimate
        return targets

    def get_experiences(self, env, n_episodes, policy):
        """Roll out ``n_episodes`` full episodes and return the transitions.

        Args:
            env: Environment with ``reset()`` and a 4-tuple ``step(action)``.
            n_episodes: Number of episodes to run to completion.
            policy: Callable mapping a [1, state_size] tensor to an action tensor.

        Returns:
            ``(states, actions, rewards, endstates)``: states and endstates
            are concatenated along dim 0, actions are stacked, and rewards is
            a 1-D tensor on ``self.device``.
        """
        states = []
        actions = []
        rewards = []
        endstates = []
        # Acting needs no gradients; skipping the autograd graph is cheaper
        # and does not change any returned value.
        with torch.no_grad():
            for _ in range(n_episodes):
                state = env.reset()
                done = False
                while not done:
                    state = torch.Tensor(state).unsqueeze(0).to(self.device)
                    states.append(state)
                    action = policy(state)
                    actions.append(action)
                    state, reward, done, info = env.step(action.squeeze().cpu().numpy())
                    endstate = torch.Tensor(state).unsqueeze(0).to(self.device)
                    rewards.append(reward)
                    endstates.append(endstate)
        states = torch.cat(states, dim=0)
        actions = torch.stack(actions)
        rewards = torch.Tensor(rewards).to(self.device)
        endstates = torch.cat(endstates, dim=0)
        return states, actions, rewards, endstates

    # Do the training loop for some number of gathered experiences
    def train(self, env, train_steps, n_episodes, batch_per_step = 100, batch_size = 32, epochs = 10, lr = 0.1, t_0 = 5, t_decay = 0.99, phi_update = 5):
        """Alternate between collecting episodes and fitting ``q1``.

        Args:
            env: Environment used to collect experience.
            train_steps: Number of collect-then-fit iterations.
            n_episodes: Episodes collected per iteration.
            batch_per_step: Outer repetitions of the update loop per iteration.
            batch_size: Transitions sampled per gradient step.
            epochs: Inner repetitions; ``batch_per_step * epochs`` gradient
                steps are taken per iteration.
            lr: Adam learning rate.
            t_0: Initial softmax temperature of the behaviour policy.
            t_decay: Factor applied to the temperature after each iteration.
            phi_update: ``q2`` is synchronised with ``q1`` whenever the
                iteration index is a multiple of this value (including 0).
        """
        self.q1.train() # update only q1, not q2
        self.q2.eval()

        temperature = t_0
        optimizer = torch.optim.Adam(self.q1.parameters(), lr=lr)

        for i in range(train_steps):
            print("Training step {}...".format(i+1))
            print("Grabbing experiences...")
            policy = partial(self.policy_t, temperature=temperature)
            self.buffer.add(*self.get_experiences(env, n_episodes, policy))

            print("Training... Current experience buffer size : {}".format(self.buffer.states.shape[0]))
            for _ in range(batch_per_step):
                for _ in range(epochs):
                    states_b, actions_b, rewards_b, endstates_b = self.buffer.sample(batch_size)
                    # Targets are constants with respect to the optimisation.
                    with torch.no_grad():
                        targets_b = self.compute_targets(rewards_b, endstates_b)
                    actual_b = self.Q1(states_b, actions_b).squeeze(1)

                    loss = F.mse_loss(actual_b, targets_b)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                # A progress print used to follow here, guarded by
                # `e+1 % 10 == 0`. That parses as `e + 1 == 0`, so it never
                # fired (and raised NameError for epochs == 0); it is dropped
                # rather than enabled to avoid one line per update group.
            temperature = temperature * t_decay
            if i % phi_update == 0:
                # update phi network
                self.q2.load_state_dict(self.q1.state_dict())
        self.q1.eval()
        self.q2.eval()

    # Given the current q functions, state, etc., give me the next action.
    def policy_t(self, state, temperature = 1.0):
        """Tensor policy: sample an action from the softmax over ``Q1`` values."""
        action = self.soft_action(self.Q1, state, temperature)
        return action

    # like policy, but takes numpy arrays
    def policy(self, state, temperature = 1.0):
        """Array policy: take one un-batched observation, return a numpy action."""
        state = torch.Tensor(state).unsqueeze(0).to(self.device)
        out = self.policy_t(state, temperature).squeeze().cpu().numpy()
        return out

    def to(self, device):
        """Move the action grid, both networks and the replay buffer to ``device``."""
        self.device = device
        self.action_space = self.action_space.to(device)
        self.q1 = self.q1.to(device)
        self.q2 = self.q2.to(device)
        # The buffer creates its sample indices on its own `device`, so its
        # tensors and that attribute must follow the agent.
        self.buffer.device = device
        for name in ("states", "actions", "rewards", "endstates"):
            stored = getattr(self.buffer, name)
            if stored is not None:
                setattr(self.buffer, name, stored.to(device))

    def save(self, path):
        """Write constructor parameters, both state dicts and the buffer to ``path``.

        ``device`` and ``buffersize`` are not part of the saved parameters.
        """
        params = {
            "torque_factor" : self.torque_factor,
            "action_size" : self.action_size,
            "action_quantize": self.action_quantize,
            "state_size" : self.state_size,
            "q_module" : self.q_module,
            "q_init" : self.q_init,
            "gamma" : self.gamma,
            "epsilon" : self.epsilon
        }
        q1_state_dict = self.q1.state_dict()
        q2_state_dict = self.q2.state_dict()
        torch.save({"q1_state_dict" : q1_state_dict,
                    "q2_state_dict" : q2_state_dict,
                    "params" : params,
                    "buffer" : self.buffer}, path)

    @staticmethod
    def load(path):
        """Rebuild an agent from a checkpoint written by ``save``.

        The agent is constructed with the default ``device`` and
        ``buffersize``; call ``to(device)`` afterwards to move it.

        NOTE(review): the checkpoint holds pickled Python objects (the
        network class and the replay buffer), so only load trusted files.
        """
        checkpoint = torch.load(path)
        params = checkpoint["params"]
        newAgent = DDQNAgent(**params)
        newAgent.buffer = checkpoint["buffer"]
        newAgent.q1.load_state_dict(checkpoint["q1_state_dict"])
        newAgent.q2.load_state_dict(checkpoint["q2_state_dict"])
        return newAgent

    # TODO: Terminal states?



In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Dimensions handed to the agent; they must match the selected environment.
size_actions = 2
size_state = 8
torque_factor = 1 # motor torque is scaled from [-1, 1] using this factor

# Levels per action dimension; the agent's grid is the cartesian product over
# size_actions dimensions, i.e. 7 * 7 candidate actions here.
action_quantize = 7
gamma = 1.0
# Passed to the agent and stored there; DDQNAgent.policy_t samples from a
# softmax and does not read it.
epsilon = 0.5
buffer_size = 1000000

# Variant B scores (state, action) pairs: the network input is the state
# concatenated with one action, and the output is a single value.
qnet_init_params = {"input_size" : size_state + size_actions, 
                    "hidden_size" : 64,
                    "output_size" : 1}

device = "cuda" if torch.cuda.is_available() else "cpu"

rl = DDQNAgent(torque_factor, size_actions, size_state, action_quantize, QNet, qnet_init_params, gamma = gamma, epsilon=epsilon, device=device, buffersize = buffer_size)

# Variant A

In [None]:
class DDQNAgentVA:
    """Double-Q agent over a discretised action grid (state-only network).

    The Q-network maps a state to one value per grid action; the agent adds
    ``output_size = number of grid actions`` to ``q_init`` itself. ``q1`` is
    the network being trained; ``q2`` is a copy that is periodically
    synchronised with ``q1`` and used to evaluate the bootstrap target.
    Candidate actions are the cartesian product of ``action_quantize`` evenly
    spaced levels in ``[-torque_factor, torque_factor]`` for each of
    ``action_size`` dimensions.

    NOTE: ``epsilon`` is stored and checkpointed, but no method of this class
    reads it; exploration comes from the softmax temperature in ``policy_t``.
    """

    def __init__(self, torque_factor, action_size, state_size, action_quantize, q_module, q_init, gamma = 1.0, epsilon = 0.1, device = "cpu", buffersize = 10000):
        """Build the action grid, both Q-networks and the replay buffer.

        Args:
            torque_factor: Scale applied to the ``[-1, 1]`` action levels.
            action_size: Number of action dimensions.
            state_size: Number of state features (stored, checkpointed).
            action_quantize: Number of levels per action dimension.
            q_module: Callable/class that builds a Q-network.
            q_init: Keyword arguments for ``q_module``; must not contain
                ``output_size``, which is supplied here.
            gamma: Discount factor applied to the bootstrap value.
            epsilon: Stored only; see the class note.
            device: Device for the networks, the action grid and the buffer.
            buffersize: Capacity of the replay buffer.
        """
        self.torque_factor = torque_factor
        self.action_size = action_size
        self.action_quantize = action_quantize
        self.state_size = state_size
        elems = [torch.linspace(-1, 1, action_quantize, dtype=torch.float) * torque_factor] * action_size
        # cartesian_prod returns a 1-D tensor for a single input; the reshape
        # keeps the grid 2-D ([n_actions, action_size]) for action_size == 1.
        self.action_space = torch.cartesian_prod(*elems).reshape(-1, action_size).to(device)
        self.q_module = q_module
        self.q_init = q_init
        self.q1 = q_module(**q_init, output_size = self.action_space.shape[0])
        self.q2 = copy.deepcopy(self.q1)
        self.gamma = gamma
        self.epsilon = epsilon
        self.device = device
        self.buffer = ReplayBuffer(buffersize, self.device)
        self.buffersize = buffersize
        self.q1.to(device)
        self.q2.to(device)
        # str() so that torch.device objects are accepted as well as strings.
        if "cuda" in str(device):
            print("Using GPU")

    # First network
    # state should be a tensor of shape [BATCH, state_size]
    # return tensor will be size [BATCH, action_space size]
    def Q1(self, state):
        """Evaluate the trained network ``q1`` on a batch of states."""
        return self._Q(state, self.q1)

    # Second network
    def Q2(self, state):
        """Evaluate the target network ``q2`` on a batch of states."""
        return self._Q(state, self.q2)

    def _Q(self, state, q):
        """Run network ``q`` on ``state`` and return its output unchanged."""
        out = q(state)
        return out

    # Given a state tensor we compute the action with the highest q value
    # Input shape is [BATCH, state_size] for state
    def max_action(self, q, state):
        """Return the grid index of the highest-valued action per state.

        Unlike ``soft_action`` this returns row indices into
        ``self.action_space`` (shape [BATCH]), not action vectors.
        """
        vals = q(state)
        _, idxs = torch.max(vals, dim=1)
        return idxs

    # Given a state tensor we choose an action with the probability of chosing that action determined by the softmax of the action value.
    def soft_action(self, q, state, temperature = 1.0):
        """Sample one grid action per state from softmax(Q / temperature).

        Returns:
            Tensor of shape [BATCH, action_size].
        """
        vals = q(state) / temperature
        probs = F.softmax(vals, dim=1)
        idxs = torch.multinomial(probs, 1, replacement=True).squeeze(1)
        soft_action = self.action_space[idxs]
        return soft_action

    def _action_indices(self, actions):
        """Map stored action vectors back to their row index in the action grid.

        Args:
            actions: Tensor whose trailing dimension is ``action_size``; any
                leading shape is flattened.

        Returns:
            1-D long tensor with, per action, the index of the nearest grid row.
        """
        flat = actions.reshape(-1, self.action_size)
        grid = self.action_space.reshape(-1, self.action_size)
        # Nearest-row matching instead of exact equality, so a float
        # round-trip through a checkpoint cannot break the lookup.
        sq_dist = (flat.unsqueeze(1) - grid.unsqueeze(0)).pow(2).sum(dim=2)
        return sq_dist.argmin(dim=1)

    def get_experiences(self, env, n_episodes, policy):
        """Roll out ``n_episodes`` full episodes and return the transitions.

        Args:
            env: Environment with ``reset()`` and a 4-tuple ``step(action)``.
            n_episodes: Number of episodes to run to completion.
            policy: Callable mapping a [1, state_size] tensor to an action tensor.

        Returns:
            ``(states, actions, rewards, endstates)``: states and endstates
            are concatenated along dim 0, actions are stacked, and rewards is
            a 1-D tensor on ``self.device``.
        """
        states = []
        actions = []
        rewards = []
        endstates = []
        # Acting needs no gradients; skipping the autograd graph is cheaper
        # and does not change any returned value.
        with torch.no_grad():
            for _ in range(n_episodes):
                state = env.reset()
                done = False
                while not done:
                    state = torch.Tensor(state).unsqueeze(0).to(self.device)
                    states.append(state)
                    action = policy(state)
                    actions.append(action)
                    state, reward, done, info = env.step(action.squeeze().cpu().numpy())
                    endstate = torch.Tensor(state).unsqueeze(0).to(self.device)
                    rewards.append(reward)
                    endstates.append(endstate)
        states = torch.cat(states, dim=0)
        actions = torch.stack(actions)
        rewards = torch.Tensor(rewards).to(self.device)
        endstates = torch.cat(endstates, dim=0)
        return states, actions, rewards, endstates

    # Do the training loop for some number of gathered experiences
    def train(self, 
              env, 
              train_steps, 
              n_episodes = 10, 
              batch_per_step = 100, 
              batch_size = 32, 
              epochs = 10, 
              lr = 0.1, 
              t_0 = 5, 
              t_decay = 0.99, 
              phi_update = 5, 
              save_every = 10, 
              save_dir = "/content/drive/MyDrive/Models/trial.tar"):
        """Alternate between collecting episodes and fitting ``q1``.

        Args:
            env: Environment used to collect experience and to evaluate.
            train_steps: Number of collect-then-fit iterations.
            n_episodes: Episodes collected per iteration.
            batch_per_step: Outer repetitions of the update loop per iteration.
            batch_size: Transitions sampled per gradient step.
            epochs: Inner repetitions; ``batch_per_step * epochs`` gradient
                steps are taken per iteration.
            lr: Adam learning rate.
            t_0: Initial softmax temperature of the behaviour policy.
            t_decay: Factor applied to the temperature after each iteration.
            phi_update: ``q2`` is synchronised with ``q1`` whenever the
                iteration index is a multiple of this value (including 0).
            save_every: A checkpoint is written whenever the iteration index
                is a multiple of this value (including 0).
            save_dir: Path passed to ``save`` for those checkpoints.
        """
        self.q1.train() # update only q1, not q2
        self.q2.eval()

        temperature = t_0
        optimizer = torch.optim.Adam(self.q1.parameters(), lr=lr)

        for i in range(train_steps):
            if i % 50 == 0:
                print("Training step {}...".format(i+1))
                # Evaluate this agent, not whichever object the notebook
                # global `rl` happens to reference.
                print("Current average return: {}".format(compute_avg_return(env, self.policy, num_episodes=10)))
            policy = partial(self.policy_t, temperature=temperature)
            self.buffer.add(*self.get_experiences(env, n_episodes, policy))

            for _ in range(batch_per_step):
                for _ in range(epochs):
                    states_b, actions_b, rewards_b, endstates_b = self.buffer.sample(batch_size)

                    # Double-Q target: q1 selects the next action, q2 scores
                    # it. Targets are constants with respect to the update.
                    with torch.no_grad():
                        next_action = self.max_action(self.Q1, endstates_b).unsqueeze(1)
                        next_value = self.Q2(endstates_b).gather(1, next_action)
                        targets_b = rewards_b.unsqueeze(1) + self.gamma * next_value

                    # The prediction must be Q1(s, a) for the action that was
                    # actually taken in s, not for the next state's greedy one.
                    taken = self._action_indices(actions_b).unsqueeze(1)
                    actual_b = self.Q1(states_b).gather(1, taken)

                    loss = F.mse_loss(actual_b, targets_b)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                # A progress print used to follow here, guarded by
                # `e+1 % 10 == 0`. That parses as `e + 1 == 0`, so it never
                # fired (and raised NameError for epochs == 0); it is dropped
                # rather than enabled to avoid one line per update group.
            temperature = temperature * t_decay
            if i % phi_update == 0:
                # update phi network
                self.q2.load_state_dict(self.q1.state_dict())
            if i % save_every == 0:
                self.save(save_dir)

        self.q1.eval()
        self.q2.eval()

    # Given the current q functions, state, etc., give me the next action.
    def policy_t(self, state, temperature = 1.0):
        """Tensor policy: sample an action from the softmax over ``Q1`` values."""
        action = self.soft_action(self.Q1, state, temperature)
        return action

    # like policy, but takes numpy arrays
    def policy(self, state, temperature = 1.0):
        """Array policy: take one un-batched observation, return a numpy action."""
        state = torch.Tensor(state).unsqueeze(0).to(self.device)
        out = self.policy_t(state, temperature).squeeze().cpu().numpy()
        return out

    def to(self, device):
        """Move the action grid, both networks and the replay buffer to ``device``."""
        self.device = device
        self.action_space = self.action_space.to(device)
        self.q1 = self.q1.to(device)
        self.q2 = self.q2.to(device)
        # The buffer creates its sample indices on its own `device`, so its
        # tensors and that attribute must follow the agent.
        self.buffer.device = device
        for name in ("states", "actions", "rewards", "endstates"):
            stored = getattr(self.buffer, name)
            if stored is not None:
                setattr(self.buffer, name, stored.to(device))

    def save(self, path):
        """Write constructor parameters, both state dicts and the buffer to ``path``.

        ``device`` and ``buffersize`` are not part of the saved parameters.
        """
        params = {
            "torque_factor" : self.torque_factor,
            "action_size" : self.action_size,
            "action_quantize": self.action_quantize,
            "state_size" : self.state_size,
            "q_module" : self.q_module,
            "q_init" : self.q_init,
            "gamma" : self.gamma,
            "epsilon" : self.epsilon
        }
        q1_state_dict = self.q1.state_dict()
        q2_state_dict = self.q2.state_dict()
        torch.save({"q1_state_dict" : q1_state_dict,
                    "q2_state_dict" : q2_state_dict,
                    "params" : params,
                    "buffer" : self.buffer}, path)

    @staticmethod
    def load(path):
        """Rebuild an agent from a checkpoint written by ``save``.

        The agent is constructed with the default ``device`` and
        ``buffersize``; call ``to(device)`` afterwards to move it.

        NOTE(review): the checkpoint holds pickled Python objects (the
        network class and the replay buffer), so only load trusted files.
        """
        checkpoint = torch.load(path)
        params = checkpoint["params"]
        newAgent = DDQNAgentVA(**params)
        newAgent.buffer = checkpoint["buffer"]
        newAgent.q1.load_state_dict(checkpoint["q1_state_dict"])
        newAgent.q2.load_state_dict(checkpoint["q2_state_dict"])
        return newAgent

    # TODO: Terminal states? The bootstrap term in train() is currently added
    # for every transition, including the last one of an episode.


In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Dimensions handed to the agent; they must match the selected environment.
size_actions = 2
size_state = 8
torque_factor = 1 # motor torque is scaled from [-1, 1] using this factor

# Levels per action dimension; the agent's grid is the cartesian product over
# size_actions dimensions, i.e. 7 * 7 candidate actions here.
action_quantize = 7
gamma = 1.0
# Passed to the agent and stored there; DDQNAgentVA.policy_t samples from a
# softmax and does not read it.
epsilon = 0.5
buffer_size = 1000000

# Variant A feeds only the state to the network. "output_size" is deliberately
# absent: DDQNAgentVA supplies it as the number of grid actions.
qnet_init_params = {"input_size" : size_state, 
                    "hidden_size" : 64}

device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: this rebinds `rl`, replacing any agent created by an earlier cell.
rl = DDQNAgentVA(torque_factor, size_actions, size_state, action_quantize, QNet, qnet_init_params, gamma = gamma, epsilon=epsilon, device=device, buffersize = buffer_size)

#fake_rewards = torch.randn((9,1), device = device)
#fake_state = torch.randn((9, 8), device = device)
#rl2.compute_targets(fake_rewards, fake_state)

#rl.train(env, train_steps = 500, n_episodes = 10, batch_per_step = 100, batch_size = 32, epochs = 10, lr = 0.5, t_0 = 3, t_decay=0.998, phi_update = 5)
#rl2.save("/content/drive/MyDrive/Models/trial.tar")

# Training

In [None]:
# Optional: resume from an earlier checkpoint instead of training from scratch.
#rl = DDQNAgentVA.load("/content/drive/MyDrive/Models/trial2.tar")
#rl.to(device)
# Checkpoint file on the mounted Drive; train() writes to it whenever the step
# index is a multiple of save_every, and a final save follows training.
dirpath = "/content/drive/MyDrive/Models/lunarlander.tar"
rl.train(env, train_steps = 500, n_episodes = 10, batch_per_step = 500, batch_size = 32, epochs = 10, lr = 0.5, t_0 = 5, t_decay=0.997, phi_update = 5, save_every = 10, save_dir = dirpath)
rl.save(dirpath)

In [None]:
# Compare the random baseline with the trained agent over 50 episodes each.
print(compute_avg_return(env, random_policy, num_episodes=50))
print(compute_avg_return(env, rl.policy, num_episodes=50))

## Visualization


### Videos

In [None]:
def embed_mp4(filename):
  """Embeds an mp4 file in the notebook.

  Args:
    filename: Path of the mp4 file to read.

  Returns:
    An `IPython.display.HTML` object holding a `<video>` tag with the file
    contents inlined as a base64 data URI.
  """
  # Read through a context manager so the file handle is closed right away.
  with open(filename, 'rb') as video_file:
    video = video_file.read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)

Now iterate through a few episodes of the selected environment with the agent. The gym environment provides a `render()` method, which outputs an image of the environment state. These can be collected into a video.

In [None]:
def create_policy_eval_video(policy, filename, num_episodes=5, fps=30):
    """Record ``policy`` acting in the module-level ``env`` and embed the video.

    One frame is written after every reset and after every step.

    Args:
        policy: Callable mapping an observation to an action.
        filename: Output path without extension; ".mp4" is appended.
        num_episodes: Number of episodes to record.
        fps: Frame rate passed to the video writer.

    Returns:
        The result of ``embed_mp4`` for the written file.
    """
    video_path = filename + ".mp4"

    def grab_frame():
        return env.render(mode='rgb_array')

    with imageio.get_writer(video_path, fps=fps) as writer:
        for _episode in range(num_episodes):
            obs = env.reset()
            finished = False
            writer.append_data(grab_frame())
            while not finished:
                obs, _reward, finished, _info = env.step(policy(obs))
                writer.append_data(grab_frame())
    return embed_mp4(video_path)


In [None]:
# Record one episode of the random baseline to "random-agent.mp4" and embed it.
create_policy_eval_video(random_policy, "random-agent", num_episodes=1)

In [None]:
# Record one episode of the trained agent's policy to "DDQN.mp4" and embed it.
create_policy_eval_video(rl.policy, "DDQN", num_episodes=1)