# Aula 2 - Parte Prática - Policy Gradients 

## Introdução

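Nesse segundo notebook vamos aprender a representar políticas estocásticas com redes neurais em `tf.keras` e a treinar um primeiro agente de Policy Gradients com o algoritmo REINFORCE.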

### Objetivos:

- Entender a abordagem de otimização de políticas como busca no espaço de parâmetros da política
- Implementar um primeiro agente baseado no algoritmo REINFORCE
- Familiarizar-se com a API básica de construção de modelos (i.e., redes neurais) em Keras
- Familiarizar-se com métodos de Deep Learning usando TensorFlow 2.X

### Imports

> **Atenção:** não se esqueça de executar todos os `imports` necessários antes de prosseguir com o tutorial.

In [None]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from utils.agent import RLAgent
from utils.memory import OnPolicyReplay
from utils.runner import train, evaluate
from utils.viz import plot_returns, plot_action_distribution

# Sanity check: the notebook is pinned to TensorFlow 2.1.0 and requires eager
# execution to be active; fail fast here if either condition does not hold.
assert tf.__version__ == '2.1.0'
assert tf.executing_eagerly()

# Raise the TensorFlow logger threshold to ERROR so warnings are not printed.
tf.get_logger().setLevel('ERROR') # ignore TensorFlow warnings

## 1. Implementando políticas estocásticas em tf.Keras 

In [None]:
# Short alias for the TensorFlow Probability distributions module; the cells
# below refer to it as `tfd` (e.g. `tfd.Categorical`).
tfd = tfp.distributions

### 1.1 Caso discreto

In [None]:
# Toy stochastic policy: 5 input features -> two ReLU hidden layers with 32
# units each -> 3 logits. The final DistributionLambda layer wraps the logits
# in a Categorical distribution, so calling the model yields a distribution
# object rather than a plain tensor.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, input_shape=(5,), activation="relu"),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(3),
    tfp.layers.DistributionLambda(lambda t: tfd.Categorical(logits=t))
])

In [None]:
# Random batch of 10 rows with 5 features each, matching the model's input_shape=(5,).
inputs = tf.random.normal(shape=(10, 5))
print(inputs)

In [None]:
# Forward pass: because the last layer is a DistributionLambda, the result is
# the Categorical distribution built from the logits of the whole batch.
dist = model(inputs)
print(dist)

In [None]:
# Draw a sample from the distribution produced for the batch.
sample = dist.sample()
print(sample)

In [None]:
# Log-probability of the values sampled above under the same distribution.
sample_log_prob = dist.log_prob(sample)
print(sample_log_prob)

In [None]:
def build_discrete_policy(obs_space, action_space, hidden_layers, activation="relu"):
    """Assemble a Keras Sequential policy that outputs a categorical distribution.

    Args:
        obs_space: observation space; only its ``shape`` attribute is read and
            used as the shape of the network input.
        action_space: discrete action space; only its ``n`` attribute is read
            and used as the number of logits.
        hidden_layers: iterable with the number of units of each hidden Dense
            layer, in order.
        activation: activation passed to every hidden Dense layer.

    Returns:
        A ``tf.keras.Sequential`` whose last layer wraps the logits in a
        ``tfd.Categorical`` distribution.
    """
    keras = tf.keras

    # Network input, named so it is recognisable in model summaries/plots.
    state_input = keras.Input(shape=obs_space.shape, name="State")

    # One Dense layer per requested width, numbered from 1.
    hidden_stack = [
        keras.layers.Dense(units=units, activation=activation, name=f"Hidden{i+1}")
        for i, units in enumerate(hidden_layers)
    ]

    # Linear head producing one logit per discrete action.
    logits_head = keras.layers.Dense(units=action_space.n, name="Logits")

    # Turn the logits tensor into a Categorical distribution object.
    distribution_head = tfp.layers.DistributionLambda(
        lambda t: tfd.Categorical(logits=t),
        name="Action_Distribution_Categorical",
    )

    return keras.Sequential([state_input, *hidden_stack, logits_head, distribution_head])


In [None]:
# Smoke test of build_discrete_policy on a real environment.
env = gym.make("CartPole-v0")

# Two hidden layers of 64 units with ReLU activations.
hidden_layers = [64, 64]
activation = "relu"

policy = build_discrete_policy(env.observation_space, env.action_space, hidden_layers, activation)

# Random observation drawn from the observation space (not from an episode).
obs = env.observation_space.sample()

# obs[None,:] adds a leading axis so the single observation is fed as a batch of one.
action_dist = policy(obs[None,:])
print(action_dist)
print(action_dist.sample())

# Check that a sampled action, converted to NumPy, is a valid member of the
# environment's action space.
action = action_dist.sample().numpy()
assert action[0] in env.action_space

In [None]:
# Draw the policy network as a diagram, with shapes shown for each layer.
tf.keras.utils.plot_model(policy, show_shapes=True)

## 2. Agente REINFORCE

$$
\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} \left [ \sum_{t=0}^T \nabla_\theta \log \pi_\theta(\mathbf{a}_t|\mathbf{s}_t) \hat{R}(\tau)\right ]
$$

$$
\hat{R}(\tau) = \sum_{t=0}^T r_t
$$


In [None]:
class REINFORCE(RLAgent):
    """REINFORCE agent with a categorical policy network.

    Every time step of an episode is weighted by that episode's total
    (undiscounted) reward, i.e. the loss is the negative of
    ``sum_t log pi(a_t | s_t) * R(tau)`` averaged over the episodes in a batch.

    NOTE(review): ``self.obs_space``, ``self.action_space`` and ``self.config``
    are not assigned in this class; they are presumably set by
    ``RLAgent.__init__`` -- confirm against ``utils.agent``.
    """

    def __init__(self, obs_space, action_space, config=None):
        """Build the replay memory, the policy network and the optimizer.

        Args:
            obs_space: observation space forwarded to ``RLAgent``.
            action_space: discrete action space forwarded to ``RLAgent``.
            config: mapping that must provide ``"hidden_layers"``,
                ``"activation"`` and ``"learning_rate"``; ``learn`` additionally
                reads ``"train_batch_size"`` through ``self.config``.

        Raises:
            TypeError: if ``config`` is None (the default), since the required
                hyperparameters cannot be read from it.
        """
        super().__init__(obs_space, action_space, config)

        # The signature allows config=None, but the lookups below need a
        # mapping; fail with an explicit message instead of the opaque
        # "'NoneType' object is not subscriptable".
        if config is None:
            raise TypeError(
                "REINFORCE requires a config mapping with 'hidden_layers', "
                "'activation' and 'learning_rate'; got None"
            )

        self.memory = OnPolicyReplay()
        self.policy = build_discrete_policy(
            self.obs_space,
            self.action_space,
            config["hidden_layers"],
            config["activation"],
        )
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=config["learning_rate"])

    def act(self, obs):
        """Sample an action for a single observation and return it as a NumPy value."""
        return self._act(obs).numpy()

    @tf.function
    def _act(self, obs):
        """Graph-compiled sampling step behind ``act``.

        Adds a leading batch axis to ``obs``, runs the policy, samples from the
        resulting distribution and returns the sample for that single entry.
        """
        action_dist = self.policy(obs[None,:])
        return action_dist.sample()[0]

    def observe(self, obs, action, reward, next_obs, done):
        """Forward one transition to the replay memory."""
        self.memory.update(obs, action, reward, next_obs, done)

    def learn(self):
        """Run one policy-gradient update once enough data has been collected.

        Returns:
            The loss used for the update, or None when the memory holds fewer
            than ``config["train_batch_size"]`` entries and no update is done.
        """
        if self.memory.batch_size < self.config["train_batch_size"]:
            return None

        # NOTE(review): whether sample() also empties the memory is decided by
        # OnPolicyReplay and cannot be seen from here.
        batch = self.memory.sample()
        weights = self.policy.trainable_weights

        # Only the forward pass has to be recorded by the tape.
        with tf.GradientTape() as tape:
            loss = self._loss_fn(batch)

        # Differentiate after the tape context is closed so the backward pass
        # itself is not recorded.
        gradients = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

        return loss

    def _loss_fn(self, batch):
        """Compute the REINFORCE surrogate loss averaged over episodes.

        Args:
            batch: mapping with ``"states"``, ``"actions"`` and ``"rewards"``,
                each a sequence holding one entry per episode (the entries are
                consumed episode by episode).

        Returns:
            The mean over episodes of ``-sum_t log_prob_t * total_reward``.
        """
        states, actions, rewards = batch["states"], batch["actions"], batch["rewards"]
        n_episodes = len(states)

        loss = 0.0
        for ep_states, ep_actions, ep_rewards in zip(states, actions, rewards):
            action_dist = self.policy(ep_states)
            log_prob = action_dist.log_prob(ep_actions)
            # Every step of the episode is weighted by the same episode return.
            total_reward = np.sum(ep_rewards)
            loss += -tf.reduce_sum(log_prob * total_reward)

        return loss / n_episodes

In [None]:
# Environment used for the experiments (options noted here: CartPole-v0, CartPole-v1).
env = gym.make("CartPole-v0")
# Reward threshold registered in the environment spec.
print(env.spec.reward_threshold)

In [None]:
# Hyperparameters passed to the REINFORCE agent.
config = {
    # policy net (read in REINFORCE.__init__ to build the policy network)
    "hidden_layers": [64, 64],
    "activation": "relu",

    # optimization
    # NOTE(review): the REINFORCE class in this notebook always constructs an
    # Adam optimizer and does not read the "optimizer" entry.
    "optimizer": "adam",
    "learning_rate": 1e-3,

    # training (REINFORCE.learn skips the update while memory.batch_size is below this)
    "train_batch_size": 3000,
}

agent = REINFORCE(env.observation_space, env.action_space, config=config)

In [None]:
# Run `evaluate` for 100 episodes with rendering disabled. This cell comes
# before `train` is called, so it presumably gives a pre-training baseline.
n_episodes = 100
timesteps, total_rewards, avg_total_rewards = evaluate(agent, env, n_episodes, render=False)
# Helper from utils.viz; presumably plots the agent's action distribution -- see utils.viz.
plot_action_distribution(agent)

In [None]:
# Same evaluation helper for 3 episodes with rendering enabled; the returned
# values are discarded.
n_episodes = 3
_ = evaluate(agent, env, n_episodes, render=True)

## 3. Treinamento do agente

In [None]:
# Train the agent with a budget of one million time steps; `train` (from
# utils.runner) returns the three series that are plotted in the next cell.
total_timesteps = 1_000_000
timesteps, total_rewards, avg_total_rewards = train(agent, env, total_timesteps)

In [None]:
# Plot the series returned by `train` (helper from utils.viz).
plot_returns(timesteps, total_rewards, avg_total_rewards)

In [None]:
# Repeat the 100-episode evaluation without rendering, now after `train` has
# been called, for comparison with the earlier evaluation cell.
n_episodes = 100
timesteps, total_rewards, avg_total_rewards = evaluate(agent, env, n_episodes, render=False)
# Helper from utils.viz; presumably plots the agent's action distribution -- see utils.viz.
plot_action_distribution(agent)

In [None]:
# Run 5 episodes with rendering enabled after training; the returned values
# are discarded.
n_episodes = 5
_ = evaluate(agent, env, n_episodes, render=True)

## 4. Experimentos 