##### Copyright 2021 The TF-Agents Authors.

## Setup

To render video you will need to have ffmpeg and xvfb installed.
Typically, installing is done with the command

`sudo apt-get install -y xvfb ffmpeg`

Then, if you haven't installed the following dependencies, run:

In [None]:
!sudo apt-get install -y xvfb ffmpeg
!pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay
!pip install gym[classic_control]
!pip install gym[box2d]

In [None]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay
import gym

In [None]:
# Set up a virtual display for rendering OpenAI gym environments.
# The display is created non-visible (visible=0) at 1400x900 and started
# immediately; the started display object is kept in `display`.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

## Environment


In [None]:
# Choose the gym environment id. Exactly one `env_name` assignment should be
# active; the commented-out lines are alternatives that can be swapped in.
# env_name = 'CartPole-v0'                # discrete action space
# env_name = 'LunarLander-v2'             # discrete action space
# env_name = 'LunarLanderContinuous-v2'   # continuous action space
# env_name = 'BipedalWalker-v3'           # continuous action space
env_name = 'BipedalWalkerHardcore-v3'     # continuous action space
# Instantiate the selected environment; later cells use this global `env`.
env = gym.make(env_name)

You can render this environment to see how it looks:

In [None]:
# Reset the environment before rendering it.
env.reset()
# Render the current frame as an RGB array and wrap it in a PIL image; as the
# last expression of the cell, the image is what the notebook displays.
PIL.Image.fromarray(env.render(mode='rgb_array'))

In [None]:
# Show the observation space, followed by its `high` and `low` bound arrays.
print('Observation Spec:')
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

In [None]:
# Show the action space the environment accepts.
print('Action Spec:')
print(env.action_space)

In [None]:
# Show the reward range declared by the environment.
print('Reward Spec:')
print(env.reward_range)

## Metrics and Evaluation

The most common metric used to evaluate a policy is the average return. The return is the sum of rewards obtained while running a policy in an environment for an episode. Several episodes are run and their returns are averaged.

The following function computes the average return of a policy, given the policy, environment, and a number of episodes.


In [None]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Return the mean episode return of `policy` in `environment`.

    Args:
        environment: Object exposing `reset()` (returns the first observation)
            and `step(action)` (returns `(observation, reward, done, info)`).
        policy: Callable mapping an observation to an action.
        num_episodes: Number of complete episodes to run and average over.

    Returns:
        The sum of all episode returns divided by `num_episodes`.

    Raises:
        ZeroDivisionError: If `num_episodes` is 0.
    """
    total_return = 0.0
    for _ in range(num_episodes):
        observation = environment.reset()
        episode_return = 0.0
        done = False
        while not done:
            action = policy(observation)
            observation, reward, done, _ = environment.step(action)
            episode_return += reward
        # Accumulate once per finished episode. Doing this inside the step
        # loop would add every partial running sum and inflate the result.
        total_return += episode_return

    return total_return / num_episodes

In [None]:
def random_policy(observation):
    """Ignore `observation` and sample an action from the global `env`'s action space."""
    sampled_action = env.action_space.sample()
    return sampled_action

Running this computation on the `random_policy` shows a baseline performance in the environment.

In [None]:
# Average return of the random policy over 10 episodes; as the last
# expression of the cell, the value is displayed by the notebook.
compute_avg_return(env, random_policy, num_episodes=10)

In [None]:
import torch
import torch.nn as nn
import math
import copy

In [None]:
class Residual(nn.Module):
    """Square linear layer with an identity skip connection.

    The forward pass computes `linear(x) + x`. No activation is applied.

    Args:
        input_size: Size of the last dimension of the input; the output has
            the same size so that it can be added to the input.
    """

    def __init__(self, input_size):
        super().__init__()
        self.linear = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        """Return the linear projection of `x` plus `x` itself."""
        return self.linear(x) + x

In [None]:
class QNet(nn.Module):
    """Q-value network: one `Residual` block followed by a linear head.

    Args:
        input_size: Size of the last dimension of the input.
        output_size: Size of the last dimension of the output.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Construction order is kept (l1, then l2) so that parameter
        # initialisation draws from the RNG in the same sequence.
        self.l1 = Residual(input_size)
        self.l2 = nn.Linear(input_size, output_size, bias=True)

    def forward(self, x):
        """Apply the residual block, then the linear head."""
        hidden = self.l1(x)
        return self.l2(hidden)

In [None]:
class Agent:
    """Agent holding two Q-networks over a quantized action set.

    `action_space` is a tensor of shape [action_size, quantize_actions]: every
    row is the same `linspace(-1, 1, quantize_actions) * torque_factor`.
    Candidate action `i` is therefore column `i`, i.e. the same torque level
    applied to every action dimension.

    Args:
        torque_factor: Scale applied to the [-1, 1] torque levels.
        action_size: Number of action dimensions.
        state_size: Number of state dimensions.
        quantize_actions: Number of discrete torque levels.
        q_module: Callable building a Q-network; invoked as `q_module(**q_init)`.
            The network receives inputs whose last dimension is
            `state_size + action_size`.
        q_init: Keyword arguments for `q_module`.
        gamma: Discount factor used in `compute_targets`.
        batch_size: Stored for the training loop.
        epsilon: Exploration probability used by `policy`.
    """

    def __init__(self, torque_factor, action_size, state_size, quantize_actions, q_module, q_init, gamma = 1.0, batch_size = 32, epsilon = 0.1):
        self.torque_factor = torque_factor
        self.action_size = action_size
        self.state_size = state_size
        self.action_space = torch.stack(action_size*[torch.linspace(-1, 1, quantize_actions, dtype=torch.float) * torque_factor])
        self.q1 = q_module(**q_init)
        # The second network starts as an independent copy of the first.
        self.q2 = copy.deepcopy(self.q1)
        self.gamma = gamma
        # Previously accepted but dropped; `policy` needs epsilon and the
        # training loop needs the batch size.
        self.batch_size = batch_size
        self.epsilon = epsilon

    # First network
    # state should be a tensor of shape [BATCH, state_size]
    # action should be a tensor of shape [BATCH, action_size ,x]
    # return tensor will be size [BATCH, x]
    def Q1(self, state, action):
        return self._Q(state, action, self.q1)

    # Second network
    def Q2(self, state, action):
        return self._Q(state, action, self.q2)

    def _Q(self, state, action, q):
        inputs = self.shape_inputs(state, action)
        # Drop the trailing dimension of the network output.
        return q(inputs).squeeze(-1)

    # state should be a tensor with size [BATCH, state_size]
    # action should be a tensor with size [BATCH, action_size, x]
    # output will be a tensor with size [BATCH, x, state_size + action_size]
    def shape_inputs(self, state, action):
        state = state.unsqueeze(1).repeat(1, action.shape[2], 1)
        action = action.permute(0, 2, 1)
        out = torch.cat((state, action), dim=2)
        return out

    def _best(self, q, state):
        """Evaluate `q` on every candidate action; return (max values, indices).

        `q` is a callable like `self.Q1` / `self.Q2`; `state` has shape
        [BATCH, state_size]. Both results have shape [BATCH]; the indices
        select columns of `self.action_space`.
        """
        candidates = self.action_space.unsqueeze(0).repeat(state.shape[0], 1, 1)
        vals = q(state, candidates)
        return torch.max(vals, dim=1)

    # Given a state tensor we compute the highest q value over the candidate
    # actions for each state. Despite the name this returns the VALUES, not
    # the actions; use `greedy_action` for the maximizing actions.
    # Input shape is [BATCH, state_size]
    # Output shape is [BATCH]
    def max_action(self, q, state):
        best_vals, _ = self._best(q, state)
        return best_vals

    # Given a state tensor we compute the candidate action with the highest
    # q value for each state.
    # Input shape is [BATCH, state_size]
    # Output shape is [BATCH, action_size, 1], ready to be passed to Q1 / Q2
    def greedy_action(self, q, state):
        _, best_idxs = self._best(q, state)
        # [action_size, BATCH] -> [BATCH, action_size] -> [BATCH, action_size, 1]
        return self.action_space[:, best_idxs].t().unsqueeze(-1)

    # Computes new action value targets given reward and target state
    # reward shape should be [BATCH]
    # state should be shape [BATCH, state_size]
    # returns a tensor of shape [BATCH]
    # `s` is currently unused; it is kept so existing callers keep working.
    def compute_targets(self, rewards, s, sdash):
        # Q2 selects the next action, Q1 evaluates it.
        next_actions = self.greedy_action(self.Q2, sdash)
        # Q1 yields [BATCH, 1] for a single action per state; squeeze so the
        # sum with `rewards` stays [BATCH] instead of broadcasting.
        next_values = self.Q1(sdash, next_actions).squeeze(-1)
        # NOTE: gradients are not blocked here; detach the result if it is
        # used as a fixed regression target.
        targets = rewards + self.gamma * next_values
        return targets

    def get_experiences(self, env, n_episodes):
        """Run `n_episodes` episodes with `self.policy` and record transitions.

        `env` must expose `reset()` and a 4-tuple `step(action)`.

        Returns:
            Four parallel lists `(states, actions, rewards, endstates)` with
            one entry per environment step; `endstates[i]` is the observation
            returned by the step taken from `states[i]`.
        """
        states = []
        actions = []
        rewards = []
        endstates = []
        for _ in range(n_episodes):
            state = env.reset()
            done = False
            while not done:
                states.append(state)
                action = self.policy(state)
                actions.append(action)
                state, reward, done, info = env.step(action)
                rewards.append(reward)
                # The observation after the step is the transition's end state.
                endstates.append(state)
                print(state, action, reward, done, info)
        return states, actions, rewards, endstates

    # Do the training loop for some number of gathered experiences
    def train(self, states, actions, rewards, endstates):
        # Not implemented yet. Planned steps:
        # permute experiences and iterate by batch_size
        # get targets
        # compute loss
        # do update
        pass

    # Given the current q functions, state, etc., give me the next action.
    # Start with simple epsilon-greedy
    def policy(self, state):
        """Choose an action epsilon-greedily with respect to Q1.

        Args:
            state: One state of shape [state_size] or a batch of shape
                [BATCH, state_size]; anything `torch.as_tensor` accepts.

        Returns:
            A NumPy array of shape [action_size] for a single state, or
            [BATCH, action_size] for a batch. With probability `self.epsilon`
            (independently per state) a uniformly random candidate action is
            returned instead of the greedy one.
        """
        state_t = torch.as_tensor(state, dtype=torch.float)
        single = state_t.dim() == 1
        if single:
            state_t = state_t.unsqueeze(0)
        batch = state_t.shape[0]
        # Acting needs no gradients.
        with torch.no_grad():
            _, idxs = self._best(self.Q1, state_t)
        explore = torch.rand(batch) < self.epsilon
        random_idxs = torch.randint(self.action_space.shape[1], (batch,))
        idxs = torch.where(explore, random_idxs, idxs)
        chosen = self.action_space[:, idxs].t()
        if single:
            chosen = chosen[0]
        return chosen.numpy()

    # TODO: Function for saving/loading instances of this class
    # TODO: Terminal states?


# Sizes of the action and state vectors handed to the agent.
# NOTE(review): presumably these match the selected gym environment's
# action/observation dimensions - confirm against `env`.
size_actions = 4
size_state = 24
torque_factor = 1 # motor torque is scaled from [-1, 1] using this factor

# Number of discrete torque levels the agent builds between -1 and 1.
action_quantize = 11
batch_size = 1
gamma = 1.0

# The Q-network consumes a concatenated (state, action) vector and emits one
# value per input.
qnet_init_params = {"input_size" : size_actions + size_state, 
                    "output_size" : 1}

rl = Agent(torque_factor, size_actions, size_state, action_quantize, QNet, qnet_init_params, gamma = gamma, batch_size = batch_size)
# Smoke test with a random batch of 2 states and one random action per state.
fake_state = torch.randn((2, size_state))
fake_action = torch.randn((2, size_actions, 1))
rl.Q1(fake_state, fake_action)
# Last expression of the cell: its result is what the notebook displays.
rl.max_action(rl.Q1, fake_state)


## Visualization


### Videos

In [None]:
def embed_mp4(filename):
  """Embeds an mp4 file in the notebook.

  Args:
    filename: Path of the mp4 file to read.

  Returns:
    An `IPython.display.HTML` object containing a `<video>` tag whose source
    is the file's content inlined as a base64 data URI.
  """
  # Use a context manager so the file handle is closed deterministically
  # instead of being left to the garbage collector.
  with open(filename, 'rb') as video_file:
    video = video_file.read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)

Now iterate through a few episodes of the environment with a policy. The gym environment provides a `render()` method, which outputs an image of the environment state. These can be collected into a video.

In [None]:
def create_policy_eval_video(policy, filename, num_episodes=5, fps=30):
    """Record `policy` acting in the global `env` and embed the resulting video.

    Args:
        policy: Callable mapping an observation to an action.
        filename: Output name without extension; ".mp4" is appended.
        num_episodes: Number of episodes to record.
        fps: Frame rate passed to the video writer.

    Returns:
        The result of `embed_mp4` for the written file.
    """
    video_path = filename + ".mp4"
    with imageio.get_writer(video_path, fps=fps) as writer:
        for _episode in range(num_episodes):
            obs = env.reset()
            # Capture the initial frame before any action is taken.
            writer.append_data(env.render(mode='rgb_array'))
            finished = False
            while not finished:
                obs, _reward, finished, _info = env.step(policy(obs))
                writer.append_data(env.render(mode='rgb_array'))
    return embed_mp4(video_path)


In [None]:
# Record one episode of the random policy to "random-agent.mp4" and embed it
# in the notebook.
create_policy_eval_video(random_policy, "random-agent", num_episodes=1)