# Aula 2 - Parte Prática - Policy Gradients 

## Introdução

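Nesse segundo notebook vamos aprender a implementar políticas estocásticas com `tf.keras` e TensorFlow Probability e a construir um primeiro agente baseado no algoritmo REINFORCE.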

### Objetivos:

- Entender a abordagem de otimização de políticas como busca no espaço de parâmetros da política
- Implementar um primeiro agente baseado no algoritmo REINFORCE
- Familiarizar-se com a API básica de construção de modelos (i.e., redes neurais) em Keras
- Familiarizar-se com métodos de Deep Learning usando TensorFlow 2.X

### Imports

> **Atenção:** não se esqueça de executar todos os `imports` necessários antes de prosseguir com o tutorial.

In [None]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from utils.agent import RLAgent
from utils.memory import OnPolicyReplay

In [None]:
# Sanity check: fail fast unless the installed TensorFlow is exactly 2.1.0
# and eager execution is enabled.
assert tf.__version__ == '2.1.0'
assert tf.executing_eagerly()

## 1. Implementando políticas estocásticas em tf.Keras 

In [None]:
# Short alias for the TensorFlow Probability distributions module.
tfd = tfp.distributions

### 1.1 Caso discreto

In [None]:
# Toy stochastic policy for a discrete action space: two hidden ReLU layers,
# a 3-unit logits layer, and a final layer that wraps the logits in a
# Categorical distribution. Layers are added one at a time, in order.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(32, input_shape=(5,), activation="relu"))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.Dense(3))
model.add(tfp.layers.DistributionLambda(lambda t: tfd.Categorical(logits=t)))

In [None]:
# Random batch of 10 inputs with 5 features each (standard normal draws).
inputs = tf.random.normal(shape=(10, 5))
print(inputs)

In [None]:
# Calling the model yields the output of its last layer, a DistributionLambda,
# so `dist` is used as a distribution object in the following cells.
dist = model(inputs)
print(dist)

In [None]:
# Draw a sample from the distribution returned by the model.
sample = dist.sample()
print(sample)

In [None]:
# Log-probability of the values sampled above, under the same distribution.
sample_log_prob = dist.log_prob(sample)
print(sample_log_prob)

In [None]:
def build_discrete_policy(obs_space, action_space, hidden_layers, activation="relu"):
    """Build a Keras model mapping observations to a Categorical action distribution.

    Args:
        obs_space: observation space; its ``shape`` defines the input shape.
        action_space: discrete action space; its ``n`` sets the number of logits.
        hidden_layers: iterable with the number of units of each hidden layer.
        activation: activation used by every hidden layer.

    Returns:
        A ``tf.keras.Sequential`` whose last layer wraps the logits in a
        ``Categorical`` distribution.
    """
    keras = tf.keras

    # Input placeholder first, then the hidden stack, then the output head.
    stack = [keras.Input(shape=obs_space.shape, name="State")]
    stack.extend(
        keras.layers.Dense(units=width, activation=activation, name=f"Hidden{depth}")
        for depth, width in enumerate(hidden_layers, start=1)
    )
    stack.append(keras.layers.Dense(units=action_space.n, name="Logits"))
    stack.append(
        tfp.layers.DistributionLambda(
            lambda logits: tfd.Categorical(logits=logits),
            name="Action_Distribution_Categorical",
        )
    )

    return keras.Sequential(stack)


In [None]:
# Smoke test of build_discrete_policy on an environment with discrete actions.
env = gym.make("MountainCar-v0")

hidden_layers = [64, 64]
activation = "relu"

policy = build_discrete_policy(env.observation_space, env.action_space, hidden_layers, activation)

obs = env.observation_space.sample()

# obs[None,:] adds a leading axis of size 1 so a single observation is fed as a batch.
action_dist = policy(obs[None,:])
print(action_dist)
print(action_dist.sample())

# NOTE(review): predict() presumably returns values sampled from the output
# distribution - confirm. The assert only checks the first entry is a valid action.
pi = policy.predict(obs[None,:])
print(pi)
assert pi[0] in env.action_space

# An explicit sample from the distribution must also be a valid action.
action = action_dist.sample().numpy()
assert action[0] in env.action_space

In [None]:
!pip3 install pydot graphviz
import pydot

In [None]:
# Draw the policy's layer graph with shapes shown.
tf.keras.utils.plot_model(policy, show_shapes=True)

### 1.2 Caso contínuo 

In [None]:
# Toy stochastic policy for a continuous action space: two hidden tanh layers,
# a 10-unit layer giving the mean, and a final layer that wraps it in a
# diagonal Gaussian with a fixed scale of 1e-2 on each of the 10 dimensions.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(128, activation="tanh", input_shape=(10,)))
model.add(tf.keras.layers.Dense(128, activation="tanh"))
model.add(tf.keras.layers.Dense(10))
model.add(tfp.layers.DistributionLambda(lambda t: tfd.MultivariateNormalDiag(loc=t, scale_diag=[1e-2] * 10)))

In [None]:
# Random batch of 4 inputs with 10 features each (standard normal draws).
inputs = tf.random.normal(shape=(4, 10))
print(inputs)

In [None]:
# Calling the model yields the output of its last layer, a DistributionLambda,
# so `dist` is used as a distribution object in the following cells.
dist = model(inputs)
print(dist)

In [None]:
# Draw a sample from the distribution returned by the model.
sample = dist.sample()
print(sample)

In [None]:
# Log-probability of the values sampled above, under the same distribution.
sample_log_prob = dist.log_prob(sample)
print(sample_log_prob)

In [None]:
def build_continuous_policy(obs_space, action_space, hidden_layers, activation="relu", scale_diag=1e-2):
    """Build a Keras model mapping observations to a diagonal-Gaussian action distribution.

    Args:
        obs_space: observation space; its ``shape`` defines the input shape.
        action_space: continuous action space; ``shape[0]`` is the action dimension.
        hidden_layers: iterable with the number of units of each hidden layer.
        activation: activation used by every hidden layer.
        scale_diag: fixed scale applied to every action dimension.

    Returns:
        A ``tf.keras.Sequential`` whose last layer wraps the network output
        (used as the mean) in a ``MultivariateNormalDiag`` distribution.
    """
    keras = tf.keras

    # Input placeholder first, then the hidden stack, then the output head.
    stack = [keras.Input(shape=obs_space.shape, name="State")]
    stack.extend(
        keras.layers.Dense(units=width, activation=activation, name=f"Hidden{depth}")
        for depth, width in enumerate(hidden_layers, start=1)
    )
    stack.append(keras.layers.Dense(units=action_space.shape[0], name="Params"))
    stack.append(
        tfp.layers.DistributionLambda(
            lambda mean: tfd.MultivariateNormalDiag(
                loc=mean,
                scale_diag=[scale_diag] * action_space.shape[0],
            ),
            name="Action_Distribution_Gaussian",
        )
    )

    return keras.Sequential(stack)

In [None]:
# Smoke test of build_continuous_policy on an environment with continuous actions.
env = gym.make("MountainCarContinuous-v0")

hidden_layers = [32, 32, 32]
activation = "elu"

policy = build_continuous_policy(env.observation_space, env.action_space, hidden_layers, activation)

# Stack batch_size sampled observations into one array and feed them as a batch.
batch_size = 10
obs = np.array([env.observation_space.sample() for _ in range(batch_size)])
print(obs.shape)

action_dist = policy(obs)
print(action_dist)

# action = policy.predict(env.observation_space.sample()[None,:])
# print(action)
# assert action in env.action_space
# print(action_dist.sample())

# Only the first sampled action of the batch is checked against the action space.
action = action_dist.sample().numpy()
assert action[0] in env.action_space

In [None]:
# Draw the policy's layer graph with shapes shown.
tf.keras.utils.plot_model(policy, show_shapes=True)

## 2. Agente REINFORCE

In [None]:
# Environment (continuous actions) used in the REINFORCE section.
env = gym.make("MountainCarContinuous-v0")

### 2.1 Calculando retornos de episódios

$$
\hat{R}_t = \sum_{t'=t}^T r_{t'}
$$

In [None]:
def compute_returns(rewards):
    """Compute the undiscounted reward-to-go for every step of every episode.

    For each episode, the return at step t is the sum of that episode's
    rewards from step t through its last step.

    Args:
        rewards: sequence of episodes, each a sequence of per-step rewards.
            Episodes may have different lengths.

    Returns:
        If all episodes have the same length T, a float array of shape
        (n_episodes, T, 1). If lengths differ, a 1-D object array of length
        n_episodes whose i-th entry is a float array of shape (T_i, 1).
        Either way, ``returns[i]`` is the column of returns for episode i.
        With no episodes at all, an empty array of shape (0, 1).
    """
    per_episode = [
        # Reverse, accumulate, reverse back: a suffix sum over the episode.
        np.cumsum(np.asarray(episode_rewards, dtype=np.float64)[::-1], axis=0)[::-1][..., None]
        for episode_rewards in rewards
    ]

    if not per_episode:
        return np.empty((0, 1))

    if len({len(column) for column in per_episode}) == 1:
        return np.stack(per_episode)

    # Episodes of different lengths cannot be stacked into one rectangular
    # array, so each per-episode column is kept as an element of an object array.
    ragged = np.empty(len(per_episode), dtype=object)
    for index, column in enumerate(per_episode):
        ragged[index] = column
    return ragged

In [None]:
# Synthetic rewards: n_episodes rows of batch_size rewards each
# (batch_size plays the role of the episode length here).
n_episodes, batch_size = 32, 20
rewards = np.random.uniform(size=[n_episodes, batch_size])

returns = compute_returns(rewards)
print(returns.shape)

# One entry per episode, with a trailing axis of size 1.
assert len(returns) == n_episodes
assert returns.shape[-1] == 1
# Reference value: reversed cumulative sum of each episode's reversed rewards.
assert all(np.allclose(np.cumsum(rewards[i][::-1])[::-1][...,None], returns[i]) for i in range(n_episodes))

### 2.2 Implementação do *surrogate loss*: combinando log-prob e retornos 

$$
\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} \left [ \sum_{t=0}^T \nabla_\theta \log \pi_\theta(\mathbf{a}_t|\mathbf{s}_t) \hat{R}_t\right ]
$$

In [None]:
class REINFORCE(RLAgent):
    """Policy-gradient agent that trains a stochastic Keras policy with REINFORCE.

    The policy is built by ``build_discrete_policy`` when the action space is
    a ``gym`` ``Discrete`` space and by ``build_continuous_policy`` otherwise.

    Args:
        obs_space: observation space of the environment.
        action_space: action space of the environment.
        config: mapping with the policy settings. ``"hidden_layers"`` is
            required; ``"activation"`` defaults to ``"relu"``.
    """

    def __init__(self, obs_space, action_space, config=None):
        super(REINFORCE, self).__init__(obs_space, action_space, config)

        # Keep the settings on the instance so building the policy never
        # depends on a module-level `config` variable happening to exist.
        self._policy_config = dict(config) if config is not None else {}

        self._build_memory()
        self._build_policy()
        self._build_optimizer()

    def _build_memory(self):
        """Create the buffer that stores the transitions passed to ``observe``."""
        self.memory = OnPolicyReplay()

    def _build_policy(self):
        """Create the policy network that matches the type of the action space."""
        hidden_layers = self._policy_config["hidden_layers"]
        activation = self._policy_config.get("activation", "relu")

        if isinstance(self.action_space, gym.spaces.discrete.Discrete):
            builder = build_discrete_policy
        else:
            builder = build_continuous_policy

        self.policy = builder(self.obs_space, self.action_space, hidden_layers, activation)

    def _build_optimizer(self):
        """Create the Adam optimizer (default settings) used by ``learn``."""
        self.optimizer = tf.keras.optimizers.Adam()

    @tf.function
    def act(self, obs):
        """Sample an action for a single observation.

        A leading batch axis is added before calling the policy, and the
        first (only) sampled action is returned.
        """
        action_dist = self.policy(obs[None,:])
        return action_dist.sample()[0]

    def observe(self, obs, action, reward, next_obs, done):
        """Record one transition in the agent's memory."""
        self.memory.update(obs, action, reward, next_obs, done)

    def learn(self):
        """Run one gradient step on the surrogate loss and return that loss."""
        batch = self.memory.sample()

        # Only the loss computation needs to be recorded on the tape.
        with tf.GradientTape() as tape:
            loss = self._loss_fn(batch)

        gradients = tape.gradient(loss, self.policy.trainable_weights)
        self.optimizer.apply_gradients(zip(gradients, self.policy.trainable_weights))

        return loss

    def _loss_fn(self, batch):
        """Compute the REINFORCE surrogate loss for a batch of episodes.

        For each episode, the log-probability of every action taken is
        weighted by the return from that step onwards; the negated sum is
        averaged over the episodes.

        Args:
            batch: mapping with ``"states"``, ``"actions"`` and ``"rewards"``,
                each indexed by episode.
        """
        states, actions, rewards = batch["states"], batch["actions"], batch["rewards"]
        returns = compute_returns(rewards)

        n_episodes = len(states)

        loss = 0.0
        for episode in range(n_episodes):
            action_dist = self.policy(states[episode])
            # Trailing axis added so log-probs line up with the (T, 1) returns.
            log_prob = action_dist.log_prob(actions[episode])[...,None]
            loss += - tf.reduce_sum(log_prob * returns[episode])

        loss /= n_episodes

        return loss

In [None]:
# Policy network settings: hidden layer sizes and the activation they use.
config = {
    "hidden_layers": [32, 32],
    "activation": "relu"
}

agent = REINFORCE(env.observation_space, env.action_space, config=config)

In [None]:
# Interaction loop: run n_episodes episodes, storing every transition in the
# agent and performing one learning step at the end of each episode.
n_episodes = 10

for episode in range(n_episodes):
    total_reward = 0.0
    episode_length = 0

    obs = env.reset()

    while True:
        action = agent.act(obs)
        next_obs, reward, done, _ = env.step(action)
        agent.observe(obs, action, reward, next_obs, done)

        total_reward += reward
        episode_length += 1

        # Advance to the observation just returned by the environment, so the
        # next action is chosen from the current state rather than the initial one.
        obs = next_obs

        if done:
            break

    loss = agent.learn()

    print(f"episode = {episode:3d} -> loss = {loss:10.4f}, total_reward = {total_reward:10.4f}, episode_length = {episode_length:3d}")

## 3. Treinamento do agente

## 4. Experimentos 