# Aula 2 - Parte Prática - Policy Gradients 

## Introdução

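Neste segundo notebook vamos aprender a representar políticas estocásticas como redes neurais em tf.Keras e a treiná-las com o algoritmo REINFORCE.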

### Objetivos:

- Entender a abordagem de otimização de políticas como busca no espaço de parâmetros da política
- Implementar um primeiro agente baseado no algoritmo REINFORCE
- Familiarizar-se com a API básica de construção de modelos (i.e., redes neurais) em Keras
- Familiarizar-se com métodos de Deep Learning usando TensorFlow 2.X

### Imports

> **Atenção:** não se esqueça de executar todos os `imports` necessários antes de prosseguir com o tutorial.

In [2]:
import gym

import tensorflow as tf
import tensorflow_probability as tfp

In [12]:
# sanity check: the notebook is pinned to TensorFlow 2.1.0 in eager mode;
# the messages report what was actually found when a check fails
assert tf.__version__ == '2.1.0', (
    "expected TensorFlow 2.1.0, found {}".format(tf.__version__))
assert tf.executing_eagerly(), "TensorFlow is not executing eagerly"

## 1. Implementando políticas estocásticas em tf.Keras 

In [14]:
# Short alias for the TFP distributions module; the policy models below
# refer to distributions as tfd.Categorical / tfd.MultivariateNormalDiag.
tfd = tfp.distributions

### 1.1 Caso discreto

In [62]:
# Two hidden ReLU layers over 5 input features, followed by 3 linear outputs
# that the last layer interprets as the logits of a Categorical distribution.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(32, input_shape=(5,), activation="relu"))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.Dense(3))
# Wrap the logits so that calling the model returns a distribution object.
model.add(tfp.layers.DistributionLambda(lambda t: tfd.Categorical(logits=t)))

In [63]:
# Batch of 10 random input vectors with 5 features each, matching the
# input_shape=(5,) declared on the model's first layer.
inputs = tf.random.normal(shape=(10, 5))
print(inputs)

tf.Tensor(
[[ 0.32103857 -0.6080192   1.4877108  -0.12977777 -1.1860949 ]
 [ 0.02536139  1.1291535  -0.07863023 -0.9902698   0.8496303 ]
 [ 0.44752735 -1.3599901   0.12103056 -0.712829    0.9447562 ]
 [-0.9185843   0.947765    0.57675695 -0.04887169 -0.954858  ]
 [-0.6540133   0.8262241  -0.39020407 -0.9625042  -0.3470074 ]
 [ 1.4613252   2.0669453   0.17619659  0.09371976  0.04527772]
 [ 0.62663805 -0.09943198  0.40964445 -1.9779913   0.6893666 ]
 [-1.7095194   1.4237188   0.09787725  1.3101344   0.4518444 ]
 [-0.9198294  -0.6814246  -1.1695627  -1.6370764  -0.41971138]
 [ 0.2589838   1.6075752  -0.3090215   0.07801837  0.13695426]], shape=(10, 5), dtype=float32)


In [64]:
# The model ends in a DistributionLambda layer, so calling it yields a
# Categorical distribution (one per batch row) rather than a plain tensor.
dist = model(inputs)
print(dist)

tfp.distributions.Categorical("sequential_7_distribution_lambda_7_Categorical", batch_shape=[10], event_shape=[], dtype=int32)


In [65]:
# Draw one category index for each of the 10 batch rows.
sample = dist.sample()
print(sample)

tf.Tensor([0 2 1 0 2 2 0 1 2 2], shape=(10,), dtype=int32)


In [66]:
# Log-probability of each drawn category under its row's distribution.
sample_log_prob = dist.log_prob(sample)
print(sample_log_prob)

tf.Tensor(
[-0.9802021  -1.3386791  -1.0007324  -1.028725   -1.406725   -1.4265188
 -0.98318666 -1.045281   -1.8600569  -1.2781339 ], shape=(10,), dtype=float32)


In [46]:
def build_discrete_policy(obs_space, action_space, hidden_layers, activation):
    """Build a stochastic policy network over a discrete set of actions.

    The network is a stack of fully-connected hidden layers followed by a
    linear layer whose outputs are used as the logits of a
    `tfd.Categorical` distribution.

    Args:
        obs_space: observation space; only its `shape` attribute is read.
        action_space: action space; only its `n` attribute is read and used
            as the number of logits.
        hidden_layers: iterable with the number of units of each hidden
            layer, in order. May be empty, which gives a linear policy.
        activation: activation passed to every hidden `Dense` layer.

    Returns:
        A `tf.keras.Sequential` model whose call returns a
        `tfd.Categorical` distribution.
    """
    Dense = tf.keras.layers.Dense

    # Declare the input shape up front instead of on the first hidden layer,
    # so the model also gets its input shape when `hidden_layers` is empty.
    policy_net_layers = [tf.keras.layers.InputLayer(input_shape=obs_space.shape)]
    policy_net_layers.extend(
        Dense(units=units, activation=activation) for units in hidden_layers)

    # Final linear layer: one unnormalized logit per action.
    policy_net_layers.append(Dense(units=action_space.n))
    policy_net_layers.append(
        tfp.layers.DistributionLambda(lambda t: tfd.Categorical(logits=t)))

    return tf.keras.Sequential(policy_net_layers)


In [105]:
# Build a discrete policy for MountainCar and check that a sampled action
# is accepted by the environment's action space.
env = gym.make("MountainCar-v0")

hidden_layers = [64, 64]
activation = "relu"

policy = build_discrete_policy(
    obs_space=env.observation_space,
    action_space=env.action_space,
    hidden_layers=hidden_layers,
    activation=activation,
)

obs = env.observation_space.sample()

# Prepend a batch axis: the policy is called on a batch holding one observation.
action_dist = policy(obs[None])
print(action_dist)

# The sample keeps the batch axis, so element 0 is the single action drawn.
action = action_dist.sample().numpy()
assert env.action_space.contains(action[0])

tfp.distributions.Categorical("sequential_29_distribution_lambda_29_Categorical", batch_shape=[1], event_shape=[], dtype=int32)


### 1.2 Caso contínuo 

In [82]:
# Two hidden tanh layers over 10 input features, followed by 10 linear
# outputs used as the mean of a diagonal Gaussian with a fixed scale of 1e-2.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(128, activation="tanh", input_shape=(10,)))
model.add(tf.keras.layers.Dense(128, activation="tanh"))
model.add(tf.keras.layers.Dense(10))
# Wrap the means so that calling the model returns a distribution object.
model.add(tfp.layers.DistributionLambda(
    lambda t: tfd.MultivariateNormalDiag(loc=t, scale_diag=[1e-2] * 10)))

In [83]:
# Batch of 4 random input vectors with 10 features each, matching the
# input_shape=(10,) declared on the model's first layer.
inputs = tf.random.normal(shape=(4, 10))
print(inputs)

tf.Tensor(
[[-0.8419557   0.70250314 -1.5014496   0.18716766 -0.75787574 -1.0289555
   0.01592047 -0.6249459   1.0384505  -1.184295  ]
 [ 1.556421    0.5781917  -1.6744963  -0.31686938 -0.7645374   1.3680733
  -0.24784766 -1.4356192   0.09560852 -1.3406086 ]
 [ 0.5428561  -0.4105987   0.79216933  0.85922605 -2.2289503   0.35503837
  -1.2095629   1.2945596   1.7793185   0.1548357 ]
 [-0.77329165  0.33763334  0.46752357 -0.3361535  -0.14248116  0.06044088
   0.5058375  -0.56888855 -0.08033823 -0.12951943]], shape=(4, 10), dtype=float32)


In [85]:
# Calling the model yields a MultivariateNormalDiag distribution: one
# 10-dimensional Gaussian for each of the 4 batch rows.
dist = model(inputs)
print(dist)

tfp.distributions.MultivariateNormalDiag("sequential_14_distribution_lambda_14_MultivariateNormalDiag", batch_shape=[4], event_shape=[10], dtype=float32)


In [86]:
# Draw one 10-dimensional vector for each of the 4 batch rows.
sample = dist.sample()
print(sample)

tf.Tensor(
[[ 0.02107014  0.67111677 -0.05717062 -0.10963654  0.30163962 -0.20284596
  -0.31478062  0.4405437  -0.45328838  0.4712263 ]
 [ 0.7728581   0.47813922  0.01443296 -0.24912257 -0.9230962   0.01782654
  -0.25555795  0.18761869 -0.2212125   0.21795002]
 [ 0.2327961  -0.4189296   0.47730008  0.10519981 -0.3640216   0.29068133
   0.0499337  -0.77466166  0.03433217 -0.42330003]
 [-0.16805093  0.09478584  0.15372992  0.00448452  0.26795706 -0.2142506
  -0.00713969  0.39221483 -0.07863823  0.16443925]], shape=(4, 10), dtype=float32)


In [87]:
# One log-density per batch row, covering the whole 10-dimensional sample.
# These are densities, not probabilities, so the values may be positive.
sample_log_prob = dist.log_prob(sample)
print(sample_log_prob)

tf.Tensor([31.728992 35.46901  32.874004 32.92938 ], shape=(4,), dtype=float32)


In [90]:
def build_continuous_policy(obs_space, action_space, hidden_layers, activation, scale_diag=1e-2):
    """Build a stochastic policy network over a continuous action vector.

    The network is a stack of fully-connected hidden layers followed by a
    linear layer whose outputs are used as the mean of a
    `tfd.MultivariateNormalDiag` distribution. The scale is a constant
    given by `scale_diag`; it is not an output of the network.

    Args:
        obs_space: observation space; only its `shape` attribute is read.
        action_space: action space; only `shape[0]` is read and used as the
            number of action dimensions.
        hidden_layers: iterable with the number of units of each hidden
            layer, in order. May be empty, which gives a linear policy.
        activation: activation passed to every hidden `Dense` layer.
        scale_diag: scalar repeated along the diagonal scale of the
            distribution, one copy per action dimension.

    Returns:
        A `tf.keras.Sequential` model whose call returns a
        `tfd.MultivariateNormalDiag` distribution.
    """
    Dense = tf.keras.layers.Dense

    action_dim = action_space.shape[0]
    # Same scale for every action dimension; built once here rather than on
    # every forward pass of the distribution layer.
    scale = [scale_diag] * action_dim

    # Declare the input shape up front instead of on the first hidden layer,
    # so the model also gets its input shape when `hidden_layers` is empty.
    policy_net_layers = [tf.keras.layers.InputLayer(input_shape=obs_space.shape)]
    policy_net_layers.extend(
        Dense(units, activation=activation) for units in hidden_layers)

    # Final linear layer: one mean per action dimension.
    policy_net_layers.append(Dense(units=action_dim))
    policy_net_layers.append(tfp.layers.DistributionLambda(
        lambda t: tfd.MultivariateNormalDiag(loc=t, scale_diag=scale)))

    return tf.keras.Sequential(policy_net_layers)

In [106]:
# Build a continuous policy for MountainCarContinuous and check that a
# sampled action is accepted by the environment's action space.
env = gym.make("MountainCarContinuous-v0")

hidden_layers = [32, 32, 32]
activation = "elu"

policy = build_continuous_policy(
    obs_space=env.observation_space,
    action_space=env.action_space,
    hidden_layers=hidden_layers,
    activation=activation,
)

obs = env.observation_space.sample()

# Prepend a batch axis: the policy is called on a batch holding one observation.
action_dist = policy(obs[None])
print(action_dist)

# The sample keeps the batch axis, so element 0 is the single action drawn.
# NOTE(review): a Gaussian sample is unbounded, so this membership check is
# presumably not guaranteed if the action space has finite bounds - confirm.
action = action_dist.sample().numpy()
assert env.action_space.contains(action[0])

tfp.distributions.MultivariateNormalDiag("sequential_30_distribution_lambda_30_MultivariateNormalDiag", batch_shape=[1], event_shape=[1], dtype=float32)


## 2. Função objetivo em Policy Gradients (*surrogate loss*) 

### 2.1 Calculando log-prob da escolha da ação 

### 2.2 Calculando retornos de episódios 

### 2.3 Implementação do *surrogate loss*: combinando log-prob e retornos 

## 3. Agente REINFORCE 

## 4. Treinamento do agente

## 5. Experimentos 