# Aula 3 - Parte Prática - Redução de Variância e Função Valor

## Introdução

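Neste terceiro notebook vamos aprender a reduzir a variância do estimador de gradiente de política: partimos do REINFORCE (Vanilla Policy Gradients), passamos a ignorar as recompensas passadas usando o *reward-to-go* e, por fim, adicionamos referências (*baselines*) para os retornos.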

### Objetivos:

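- Implementar o algoritmo REINFORCE (Vanilla Policy Gradients);
- Reduzir a variância do gradiente ignorando recompensas passadas (*reward-to-go*);
- Adicionar referências (*baselines*) para os retornos: normalização dos *reward-to-go* e função valor.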

### Imports

> **Atenção:** não se esqueça de executar todos os `imports` necessários antes de prosseguir com o tutorial.

In [None]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from utils.agent import RLAgent
from utils.memory import OnPolicyReplay
from utils.networks import build_discrete_policy, build_continuous_policy
from utils.runner import train, evaluate
from utils.viz import plot_returns, plot_action_distribution


tf.get_logger().setLevel('ERROR') # silence TensorFlow log messages below ERROR level (e.g. warnings)

## 0. Configurações 

In [None]:
# Environment used throughout the notebook; "CartPole-v1" is the other id the author lists as an option.
env = gym.make("CartPole-v0")
# Show the reward threshold recorded in the environment's spec.
print(env.spec.reward_threshold)

In [None]:
# Hyperparameters passed as `config=` to every agent instantiated in this notebook.
config = {
    # policy net: forwarded by REINFORCE.__init__ to build_discrete_policy
    "hidden_layers": [64, 64],
    "activation": "relu",

    # optimization
    # NOTE(review): REINFORCE.__init__ always builds tf.keras.optimizers.Adam and
    # never reads "optimizer"; only "learning_rate" is used.
    "optimizer": "adam",
    "learning_rate": 1e-3,

    # training: REINFORCE.learn returns without updating while
    # memory.batch_size is below this value
    "train_batch_size": 3000,
}

## 1. REINFORCE (Vanilla Policy Gradients) 

In [None]:
class REINFORCE(RLAgent):
    """REINFORCE (vanilla policy gradient) agent for discrete action spaces.

    Transitions are accumulated in an on-policy memory. Once enough data has
    been gathered, ``learn`` performs a single gradient step on the surrogate
    loss ``-(1/N) * sum_episodes sum_t log pi(a_t | s_t) * G``, where ``G`` is
    the episode's total reward by default, or whatever the ``postprocessing``
    hook returns for that episode.
    """

    def __init__(self, obs_space, action_space, postprocessing=None, config=None):
        """Build the policy network, the optimizer and the replay memory.

        Args:
            obs_space: observation space, forwarded to ``RLAgent.__init__``.
            action_space: action space, forwarded to ``RLAgent.__init__``.
            postprocessing: optional callable mapping the batch rewards to the
                per-episode weights ``G`` used in the loss (for example a
                reward-to-go function). ``None`` selects the total episode
                reward.
            config: dict of hyperparameters. The keys ``"hidden_layers"``,
                ``"activation"`` and ``"learning_rate"`` are read here;
                ``"train_batch_size"`` is read in ``learn`` through
                ``self.config``. Although the default is ``None``, a dict
                must be supplied because it is indexed below.
        """
        super().__init__(obs_space, action_space, config)

        self.memory = OnPolicyReplay()
        self.policy = build_discrete_policy(
            self.obs_space,
            self.action_space,
            config["hidden_layers"],
            config["activation"],
        )
        # NOTE(review): the optimizer is always Adam; an "optimizer" entry in
        # config is not consulted here.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=config["learning_rate"])

        self.postprocessing = postprocessing

    def act(self, obs):
        """Sample an action for a single observation and return it via ``.numpy()``."""
        return self._act(obs).numpy()

    @tf.function
    def _act(self, obs):
        """Sample one action from the policy's action distribution."""
        # The policy is called on a batch: add a leading axis here and strip
        # it again from the sampled action.
        action_dist = self.policy(obs[None, :])
        return action_dist.sample()[0]

    def observe(self, obs, action, reward, next_obs, done):
        """Record one transition in the on-policy memory."""
        self.memory.update(obs, action, reward, next_obs, done)

    def learn(self):
        """Run one policy-gradient step once enough experience is stored.

        Returns:
            The loss of the update, or ``None`` when ``memory.batch_size`` is
            still below ``config["train_batch_size"]`` and nothing was done.
        """
        if self.memory.batch_size < self.config["train_batch_size"]:
            return None

        batch = self.memory.sample()
        weights = self.policy.trainable_weights

        with tf.GradientTape() as tape:
            loss = self._loss_fn(batch)

        # Differentiate after leaving the tape context: only the forward pass
        # needs to be recorded, and calling tape.gradient inside the context
        # would make the tape trace the gradient computation as well.
        gradients = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

        return loss

    def _loss_fn(self, batch):
        """Compute the REINFORCE surrogate loss averaged over episodes.

        Args:
            batch: mapping with the keys ``"states"``, ``"actions"`` and
                ``"rewards"``, each indexed per episode.

        Returns:
            Scalar loss: the negative weighted log-likelihood of the taken
            actions, summed within each episode and divided by the number of
            episodes.
        """
        states, actions, rewards = batch["states"], batch["actions"], batch["rewards"]
        n_episodes = len(states)

        if self.postprocessing is not None:
            G = self.postprocessing(rewards)
        else:
            # Default weighting: every step of an episode is weighted by that
            # episode's total reward.
            G = [np.sum(episode_rewards) for episode_rewards in rewards]

        loss = 0.0
        for episode_states, episode_actions, episode_weight in zip(states, actions, G):
            action_dist = self.policy(episode_states)
            log_prob = action_dist.log_prob(episode_actions)
            loss += -tf.reduce_sum(log_prob * episode_weight)

        loss /= n_episodes

        return loss

In [None]:
# Baseline agent: no postprocessing hook, so the loss weights every step of an
# episode by that episode's total reward.
agent = REINFORCE(
    obs_space=env.observation_space,
    action_space=env.action_space,
    config=config,
)

In [None]:
# Interaction budget handed to train (the name suggests environment timesteps).
total_timesteps = 1_000_000
# The result of train is unpacked into three values, which the next cell plots.
timesteps, total_rewards, avg_total_rewards = train(agent, env, total_timesteps)

In [None]:
# Visualize the training history of the baseline REINFORCE agent.
plot_returns(timesteps, total_rewards, avg_total_rewards)

## 2. Policy Gradients: ignorando recompensas passadas

$$
\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} \left [ \sum_{t=0}^T \nabla_\theta \log \pi_\theta(\mathbf{a}_t|\mathbf{s}_t) \hat{R}_t \right ]
$$

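onde $\hat{R}_t$ é o *reward-to-go*, isto é, a soma das recompensas obtidas a partir do passo $t$ até o final do episódio:

$$
\hat{R}_t = \sum_{t'=t}^T r_{t'}
$$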


### 2.1 Reward-to-Go 

In [None]:
def compute_reward_to_go(rewards):
    """Return, for every episode, the suffix sums of its rewards.

    Entry ``t`` of an episode's result is ``r_t + r_{t+1} + ... + r_T``, i.e.
    the reward collected from step ``t`` until the end of that episode.

    Args:
        rewards: sequence of episodes, each a sequence of per-step rewards.

    Returns:
        A list with one NumPy array per episode, each as long as the episode.
    """
    returns = []

    for episode_rewards in rewards:
        horizon = len(episode_rewards)
        suffix_sums = [None] * horizon

        # Walk the episode backwards so each reward-to-go is the running sum
        # of everything seen so far, written straight into its final slot.
        running = 0.0
        for t in reversed(range(horizon)):
            running += episode_rewards[t]
            suffix_sums[t] = running

        returns.append(np.array(suffix_sums))

    return returns

In [None]:
# Sanity check: on random rewards, compute_reward_to_go must agree with a
# reversed cumulative sum computed by NumPy for every episode.
n_episodes, episode_length = 32, 20
rewards = np.random.uniform(size=[n_episodes, episode_length])

returns = compute_reward_to_go(rewards)

assert len(returns) == n_episodes

expected = [np.cumsum(row[::-1])[::-1] for row in rewards]
assert all(np.allclose(want, got) for want, got in zip(expected, returns))

### 2.2 REINFORCE + Reward-to-Go

In [None]:
# Same agent, but the postprocessing hook replaces the total episode reward
# with the per-step reward-to-go as the weight of each log-probability.
agent = REINFORCE(
    obs_space=env.observation_space,
    action_space=env.action_space,
    postprocessing=compute_reward_to_go,
    config=config,
)

In [None]:
# Same interaction budget as the baseline run, so the curves are comparable.
total_timesteps = 1_000_000
# The result of train is unpacked into three values, which the next cell plots.
timesteps, total_rewards, avg_total_rewards = train(agent, env, total_timesteps)

In [None]:
# Visualize the training history of the reward-to-go agent.
plot_returns(timesteps, total_rewards, avg_total_rewards)

In [None]:
# n_episodes = 5
# _ = evaluate(agent, env, n_episodes, render=True)

## 3. Policy Gradients: adicionando referências para os retornos (baseline)

###  3.1 Normalização dos *reward-to-go*

In [None]:
def compute_normalized_reward_to_go(rewards):
    """Return the reward-to-go of each episode, standardized per episode.

    Each episode's reward-to-go has its own mean subtracted and is divided by
    its own standard deviation. When that standard deviation is not positive
    (for example, a single-step episode) the values are only centered.

    Args:
        rewards: sequence of episodes, each a sequence of per-step rewards.

    Returns:
        A list with one NumPy array of standardized returns per episode.
    """
    normalized = []

    for episode_returns in compute_reward_to_go(rewards):
        center = np.mean(episode_returns)
        spread = np.std(episode_returns)

        # Fall back to a unit scale so a constant episode never divides by zero.
        scale = 1.0
        if spread > 0.0:
            scale = spread

        normalized.append((episode_returns - center) / scale)

    return normalized

In [None]:
# Same agent, now weighting each log-probability by the per-episode
# standardized reward-to-go.
agent = REINFORCE(
    obs_space=env.observation_space,
    action_space=env.action_space,
    postprocessing=compute_normalized_reward_to_go,
    config=config,
)

In [None]:
# Same interaction budget as the previous runs, so the curves are comparable.
total_timesteps = 1_000_000
# The result of train is unpacked into three values, which the next cell plots.
timesteps, total_rewards, avg_total_rewards = train(agent, env, total_timesteps)

In [None]:
# Visualize the training history of the normalized reward-to-go agent.
plot_returns(timesteps, total_rewards, avg_total_rewards)

In [None]:
# n_episodes = 5
# _ = evaluate(agent, env, n_episodes, render=True)

### 3.2 Função Valor 