In [1]:
# # in google colab uncomment this

# import os

# os.system('apt-get install -y xvfb')
# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')
# os.system('apt-get install -y python-opengl ffmpeg')
# os.system('pip install pyglet==1.2.4')

# os.system('python -m pip install -U pygame --user')

# print('setup complete')

# XVFB will be launched if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    %env DISPLAY = : 1

Starting virtual X frame buffer: Xvfb.
env: DISPLAY=: 1


# Implementing Advantage-Actor Critic (A2C)

In this notebook you will implement the Advantage Actor-Critic algorithm, which trains on a batch of Atari 2600 environments running in parallel. 

Firstly, we will use environment wrappers implemented in the file `atari_wrappers.py`. These wrappers preprocess observations (resize, grayscale, take max between frames, skip frames and stack them together) and rewards. Some of the wrappers help to reset the environment and pass a `done` flag equal to `True` when the agent dies.
File `env_batch.py` includes implementation of `ParallelEnvBatch` class that allows to run multiple environments in parallel. To create an environment we can use `nature_dqn_env` function. Note that if you are using 
PyTorch and not using `tensorboardX` you will need to implement a wrapper that will log **raw** total rewards that the *unwrapped* environment returns and redefine the implementation of the `nature_dqn_env` function here. 



In [2]:
import tensorflow as tf

# Switch TensorFlow to eager execution. In this notebook TensorFlow is used only
# for summary logging; the model and optimizer below are PyTorch.
tf.enable_eager_execution()

# Summary file writer targeting the local `logs/` directory; it is re-entered with
# `summary_writer.as_default()` wherever summaries are recorded below.
summary_writer = tf.contrib.summary.create_file_writer('logs/', flush_millis=1_000)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [None]:
import numpy as np
from atari_wrappers import nature_dqn_env

# Batch of 8 Space Invaders environments built by the helper from `atari_wrappers.py`.
env = nature_dqn_env("SpaceInvadersNoFrameskip-v4", nenvs=8)

# Sanity-check the first observation batch: one 84x84x4 uint8 array per environment.
obs = env.reset()
assert obs.shape == (8, 84, 84, 4)
assert obs.dtype == np.uint8

Next, we will need to implement a model that predicts logits and values. It is suggested that you use the same model as in [Nature DQN paper](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf) with a modification that instead of having a single output layer, it will have two output layers taking as input the output of the last hidden layer. **Note** that this model is different from the model you used in homework where you implemented DQN. You can use your favorite deep learning framework here. We suggest that you use orthogonal initialization with parameter $\sqrt{2}$ for kernels and initialize biases with zeros. 

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [5]:
class Model(nn.Module):
    """Convolutional actor-critic network with a shared torso and two heads.

    Three convolutions followed by a 512-unit hidden layer feed two linear
    heads: one producing action logits, one producing a scalar state value.

    Parameters
    ----------
    n_actions : int
        Number of discrete actions, i.e. the width of the logits head.
    """

    def __init__(self, n_actions):
        super(Model, self).__init__()
        self.n_actions = n_actions
        self.conv_layers = nn.ModuleList([
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        ])

        # 3136 = 64 channels * 7 * 7 positions left after the convolutions on 84x84 input.
        self.linear1 = nn.Linear(in_features=3136, out_features=512)
        self.linear2 = nn.Linear(in_features=512, out_features=n_actions)  # action logits
        self.linear3 = nn.Linear(in_features=512, out_features=1)          # state value

        self._init_weights()

    def _init_weights(self):
        """Orthogonal kernels with gain sqrt(2) and zero biases, as the notebook text suggests."""
        gain = 2 ** 0.5
        for module in self.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                nn.init.orthogonal_(module.weight, gain=gain)
                nn.init.constant_(module.bias, 0.0)

    def forward(self, x):
        """Compute action logits and state values for a batch of observations.

        Defined as `forward` (not `__call__`) so that `nn.Module.__call__` keeps
        dispatching hooks; `model(x)` works exactly as before.

        Parameters
        ----------
        x : torch.Tensor
            Float tensor laid out as (batch, height, width, channels) with
            4 channels; 84x84 spatial size is required by `linear1`.

        Returns
        -------
        tuple of torch.Tensor
            `(logits, values)` with shapes (batch, n_actions) and (batch, 1).
        """
        # Channels-last -> channels-first. `permute` keeps height and width in place,
        # unlike `transpose(-1, 1)`, which also swaps the two spatial axes.
        x = x.permute(0, 3, 1, 2)

        for layer in self.conv_layers:
            x = F.relu(layer(x))

        x = x.reshape(x.size(0), -1)
        x = F.relu(self.linear1(x))
        return self.linear2(x), self.linear3(x)

You will also need to define and use a policy that wraps the model. While the model computes logits for all actions, the policy will sample actions and also compute their log probabilities.  `policy.act` should return a dictionary of all the arrays that are needed to interact with an environment and train the model.
 Note that actions must be an `np.ndarray` while the other
tensors need to have the type determined by your deep learning framework. 

In [6]:
class Policy:
    """Epsilon-greedy policy wrapped around an actor-critic model.

    Parameters
    ----------
    model : callable
        Maps a float observation tensor to `(logits, values)` and exposes
        an `n_actions` attribute.
    eps : float
        Probability of replacing the greedy action with a uniformly random one.
    """

    def __init__(self, model, eps=0.1):
        self.model = model
        self.eps = eps

    def act(self, inputs):
        """Choose actions for a batch of observations.

        Returns
        -------
        dict
            'actions'   -- np.ndarray of chosen action indices, one per observation
            'logits'    -- tensor of raw model logits
            'log_probs' -- tensor of log-softmax of the logits
            'values'    -- 1-D tensor of value predictions, one per observation
        """
        inputs = torch.as_tensor(inputs, dtype=torch.float)
        logits, values = self.model(inputs)

        actions = torch.argmax(logits, dim=-1).detach().numpy()
        # Use the wrapped model rather than a notebook-level global named `model`.
        random_actions = np.random.randint(0, self.model.n_actions, size=len(actions))

        # Eps-greedy
        # NOTE(review): the policy-gradient loss normally assumes actions sampled from
        # softmax(logits); eps-greedy exploration is kept here as in the original.
        mask = np.random.rand(len(actions)) < self.eps
        actions[mask] = random_actions[mask]

        log_probs = torch.log_softmax(logits, dim=-1)

        return {
            'actions': actions,
            'logits': logits,
            'log_probs': log_probs,
            # Flatten explicitly: a bare squeeze() would also drop the batch axis
            # when there is a single environment.
            'values': values.reshape(-1)
        }

Next, we will pass the environment and policy to a runner that collects partial trajectories from the environment. 
The class that does this is already implemented for you.

In [7]:
from runners import EnvRunner

This runner interacts with the environment for a given number of steps and returns a dictionary containing
keys 

* 'observations' 
* 'rewards' 
* 'resets'
* 'actions'
* all other keys that you defined in `Policy`

under each of these keys there is a python `list` of interactions with the environment of specified length $T$ &mdash; the size of partial trajectory. 

To train the part of the model that predicts state values you will need to compute the value targets. 
Any callable could be passed to `EnvRunner` to be applied to each partial trajectory after it is collected. 
Thus, we can implement and use `ComputeValueTargets` callable. 
The formula for the value targets is simple:

$$
\hat v(s_t) = \sum_{t'=0}^{T - 1}\gamma^{t'}r_{t+t'} + \gamma^T \hat{v}(s_{t+T}),
$$

In implementation, however, do not forget to use 
`trajectory['resets']` flags to check if you need to add the value targets at the next step when 
computing value targets for the current step. You can access `trajectory['state']['latest_observation']`
to get last observations in partial trajectory &mdash; $s_{t+T}$.

In [8]:
class ComputeValueTargets:
    """Trajectory transform that adds bootstrapped value targets and advantages.

    Parameters
    ----------
    policy : object
        Its `act(observations)` must return a dict with a 'values' tensor; it is
        used to estimate the value of the observation that follows the trajectory.
    gamma : float
        Discount factor.
    """

    def __init__(self, policy, gamma=0.99):
        self.policy = policy
        self.gamma = gamma

    def __call__(self, trajectory):
        """Write 'value_targets' and 'advantages' into `trajectory` in place.

        Reads 'rewards', 'resets' and 'values' (per-step sequences) plus
        `trajectory['state']['env_steps']` and
        `trajectory['state']['latest_observation']`. Both results are float32
        arrays with the same shape as the stacked rewards. Returns None.
        """
        n_steps = trajectory['state']['env_steps']
        # Force a float dtype so integer rewards cannot truncate the targets.
        rewards = np.asarray(trajectory['rewards'], dtype=np.float32)
        # 1 where the episode continues after the step, 0 where it was reset.
        not_done = 1.0 - np.asarray(trajectory['resets'], dtype=np.float32)
        values = torch.stack(trajectory['values']).detach().numpy()

        # Bootstrap value of the observation that follows the last step. Use the
        # policy given to the constructor (not a notebook global); no graph is needed.
        with torch.no_grad():
            act = self.policy.act(trajectory['state']['latest_observation'])
        value_last = act['values'].reshape(-1).detach().numpy()

        # Discounted returns, accumulated backwards and cut at episode boundaries.
        value_targets = np.zeros_like(rewards)
        value_next = value_last
        for i in range(n_steps - 1, -1, -1):
            value_next = rewards[i] + self.gamma * value_next * not_done[i]
            value_targets[i] = value_next

        trajectory['value_targets'] = value_targets

        # One-step TD advantages. The next-state value is masked by `not_done` so
        # that nothing is bootstrapped across an episode reset.
        next_values = np.concatenate([values[1:n_steps], value_last[None]], axis=0)
        advantages = np.zeros_like(rewards)
        advantages[:n_steps] = (
            rewards[:n_steps]
            + self.gamma * next_values * not_done[:n_steps]
            - values[:n_steps]
        )

        trajectory['advantages'] = advantages

After computing value targets we will transform lists of interactions into tensors
with the first dimension `batch_size` which is equal to `T * nenvs`, i.e. you essentially need
to flatten the first two dimensions. 

In [9]:
class MergeTimeBatch:
    """ Flattens the leading (time, env) axes of trajectory entries into one batch axis. """
    def __call__(self, trajectory):
        # Each entry pairs a concatenation routine with the keys it applies to:
        # numpy-backed entries first, then framework tensors.
        merge_plan = (
            (np.concatenate, ('actions', 'value_targets', 'rewards', 'advantages')),
            (torch.cat, ['logits', 'log_probs', 'values']),
        )
        for merge, names in merge_plan:
            for name in names:
                trajectory[name] = merge(trajectory[name])

In [10]:
# Build the actor-critic network (one logit per environment action) and wrap it in the policy.
model = Model(n_actions=env.action_space.n)
policy = Policy(model)

# The runner is constructed inside the summary-writer context. It is configured with
# nsteps=5 and two transforms: value targets / advantages, then merging of the time
# and env axes (presumably applied in list order -- see `runners.py`).
with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
    runner = EnvRunner(
        env=env, 
        policy=policy, 
        nsteps=5,
        transforms=[ComputeValueTargets(policy), MergeTimeBatch()]
    )

Now is the time to implement the advantage actor critic algorithm itself. You can look into your lecture,
[Mnih et al. 2016](https://arxiv.org/abs/1602.01783) paper, and [lecture](https://www.youtube.com/watch?v=Tol_jw5hWnI&list=PLkFD6_40KJIxJMR-j5A1mkxK26gh_qg37&index=20) by Sergey Levine.

In [11]:
def to_one_hot(y_tensor, ndims):
    """ helper: turn a vector of integer class ids into a float one-hot matrix
    with one row per id and `ndims` columns."""
    # Column vector of int64 indices, as required by scatter_.
    indices = y_tensor.type(torch.LongTensor).view(-1, 1)
    n_rows = indices.size()[0]
    one_hot = torch.zeros(n_rows, ndims)
    # Write a 1 into the column named by each row's index.
    one_hot.scatter_(1, indices, 1)
    return one_hot

In [12]:
class A2C:
    """Advantage actor-critic update rule.

    Parameters
    ----------
    policy : object
        Exposes the trainable network as `policy.model`.
    optimizer : torch.optim.Optimizer
        Optimizer over the model parameters.
    value_loss_coef : float
        Weight of the value regression term.
    entropy_coef : float
        Weight of the entropy regulariser.
    max_grad_norm : float
        Global gradient-norm clipping threshold.
    """

    def __init__(self,
                 policy,
                 optimizer,
                 value_loss_coef=0.25,
                 entropy_coef=0.01,
                 max_grad_norm=0.5):
        self.policy = policy
        self.optimizer = optimizer
        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm

    def policy_loss(self, trajectory):
        """Return `(policy_loss, entropy_loss)`, both summed over the batch.

        `policy_loss` is minus the advantage-weighted log-probability of the taken
        actions. `entropy_loss` is `entropy_coef * sum(p * log p)` over every action
        of every sample, i.e. minus the scaled policy entropy, so minimising it
        pushes the entropy up.
        """
        log_probs = trajectory['log_probs']
        # Pick the log-probability of each taken action directly instead of going
        # through a one-hot mask built from a notebook-level global.
        actions = torch.as_tensor(trajectory['actions'], dtype=torch.long).view(-1, 1)
        log_probs_for_actions = log_probs.gather(1, actions).squeeze(1)
        advantages = torch.as_tensor(trajectory['advantages'], dtype=torch.float)

        # Entropy needs the whole action distribution, not only the taken action.
        entropy_loss = self.entropy_coef * torch.sum(torch.exp(log_probs) * log_probs)
        policy_loss = -torch.sum(log_probs_for_actions * advantages)

        return policy_loss, entropy_loss

    def value_loss(self, trajectory):
        """Return the summed squared error between values and value targets, scaled by `value_loss_coef`."""
        value_targets = torch.as_tensor(trajectory['value_targets'], dtype=torch.float)
        value_loss = F.mse_loss(trajectory['values'], value_targets, reduction='sum')
        return self.value_loss_coef * value_loss

    def step(self, trajectory):
        """Run one optimisation step on `trajectory` and return scalar diagnostics.

        The returned dict keeps the original keys and adds 'grad_norm', the total
        gradient norm measured before clipping.
        """
        policy_loss, entropy_loss = self.policy_loss(trajectory)
        value_loss = self.value_loss(trajectory)

        # The entropy term has to be part of the optimised objective to have any effect.
        loss = policy_loss + value_loss + entropy_loss

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(self.policy.model.parameters(), self.max_grad_norm)
        self.optimizer.step()

        return {
            'policy_loss': policy_loss.detach().numpy(),
            # Report scalars, as the key names promise, so they can be logged directly.
            'mean_advantage': np.mean(trajectory['advantages']),
            'mean_values': trajectory['values'].detach().mean().numpy(),
            'entropy_loss': entropy_loss.detach().numpy(),
            'value_loss': value_loss.detach().numpy(),
            'loss': loss.detach().numpy(),
            'grad_norm': float(grad_norm)
        }

Now you can train your model. With reasonable hyperparameters training on a single GTX1080 for 10 million steps across all batched environments (which translates to about 5 hours of wall clock time)
it should be possible to achieve *average raw reward over last 100 episodes* (the average is taken over 100 last 
episodes in each environment in the batch) of about 600. You should plot this quantity with respect to 
`runner.step_var` &mdash; the number of interactions with all environments. It is highly 
encouraged to also provide plots of the following quantities (these are useful for debugging as well):

* [Coefficient of Determination](https://en.wikipedia.org/wiki/Coefficient_of_determination) between 
value targets and value predictions
* Entropy of the policy $\pi$
* Value loss
* Policy loss
* Value targets
* Value predictions
* Gradient norm
* Advantages
* A2C loss

For optimization we suggest you use RMSProp with learning rate starting from 7e-4 and linearly decayed to 0, smoothing constant (alpha in PyTorch and decay in TensorFlow) equal to 0.99 and epsilon equal to 1e-5.

In [13]:
# RMSProp with the hyperparameters suggested above. The learning rate given here is
# only the starting value; the training loop below overwrites it on every iteration.
optimizer = torch.optim.RMSprop(policy.model.parameters(), lr=7e-4, alpha=0.99, eps=1e-5)
a2c = A2C(policy, optimizer)

In [None]:
# Number of training iterations (one runner trajectory + one A2C update each).
# NOTE(review): every iteration presumably consumes nsteps * nenvs environment
# interactions, so this is not the "10 million steps across all batched environments"
# budget quoted in the text above -- confirm the intended budget.
n_steps = 10_000_000

# Linear learning-rate schedule: from lr_max at iteration 0 towards lr_min after n_steps iterations.
lr_max = 7e-4
lr_min = 0
lr_step = (lr_max - lr_min) / n_steps

with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
    for i in range(n_steps):
        # Apply the decayed learning rate to every parameter group.
        lr = lr_max - i * lr_step
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        
        # Collect the next partial trajectory and take one gradient step on it.
        trajectory = runner.get_next()
        losses = a2c.step(trajectory)
        
        # Log every diagnostic returned by the update under the `a2c/` prefix.
        for key, value in losses.items():
            tf.contrib.summary.scalar(f'a2c/{key}', value, step=i)
        
        tf.contrib.summary.scalar(f'a2c/lr', lr, step=i)
        # Advance the step counter stored on the wrapped environment by hand.
        # NOTE(review): this counts updates, not environment interactions -- verify
        # against how the wrapper that owns `step_var` uses it.
        runner.env.env.step_var += 1

In [15]:
# Progress report: the hand-maintained step counter and the reward averaged first
# within each entry of `reward_queues`, then across entries.
step = runner.env.env.step_var
mean_reward = np.mean([np.mean(q) for q in runner.env.env.reward_queues])

print(f'Step: {step}')
print(f'Mean reward: {mean_reward}')

Step: 10581
Mean reward: 183.9303355547355
