In [2]:
# Avoid warning from TF

# Full imports
import gym
import mlflow

# Aliased imports
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Partial Import
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from tqdm.notebook import tqdm, trange
from IPython.display import clear_output
from collections import namedtuple
from pathlib import Path
from typing import Any, List, Sequence, Tuple

2022-11-17 20:52:23.315984: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-17 20:52:23.526100: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-17 20:52:23.601694: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-17 20:52:24.298894: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: li

In [3]:
# On Linux, remember to run: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/
# To drop the NUMA log messages, run in a terminal:
#   for a in /sys/bus/pci/devices/*; do echo 0 | sudo tee -a $a/numa_node; done

# List the GPUs that TensorFlow can see
tf.config.list_physical_devices('GPU')

2022-11-17 20:52:26.188635: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 20:52:26.210212: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 20:52:26.210333: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## META CONSTANTS

In [4]:
# Gym environment id handed to `gym.make`.
DEFAULT_ENV = "CartPole-v1"

# Run metadata. Presumably intended for experiment tracking (e.g. MLflow);
# not referenced by the visible cells -- TODO confirm.
EXPERIMENT_NAME = "cart_pole_a2c"
TAGS = {
    "type": "RL",
    "env": "Discrete Cart Pole",
    "algorithm": "A2C",
    "sub-algorithm": "Hybrid Model",
}

In [5]:
# float32 machine epsilon as a plain Python float; added to denominators to avoid division by zero.
EPS = float(np.finfo(np.float32).eps)

## GLOBAL AUX DEFS

In [6]:
# Shared environment instance; the step helper reads this module-level global.
env = gym.make(DEFAULT_ENV)

# Huber loss that sums (rather than averages) the per-element losses; used for the critic term.
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)


## IMPLEMENTATION

In [7]:
class ActorCritic(tf.keras.Model):
    """Actor-critic network with a shared LSTM trunk and two dense heads.

    The actor head emits one logit per discrete action and the critic head
    emits a single state-value estimate.

    NOTE: every call feeds the LSTM a sequence of length one and no recurrent
    state is passed between calls, so the trunk does not see earlier
    observations of the episode.
    """

    def __init__(self, env: gym.Env) -> None:
        """Create the layers, sized from `env`'s observation and action spaces.

        Args:
            env: Environment whose `observation_space.shape[0]` gives the number
                of input features and whose `action_space.n` gives the number
                of discrete actions.
        """
        # Call super to properly init
        super().__init__()

        # Number of features in a single observation
        self.n_input = env.observation_space.shape[0]

        # Define model
        # Keras `input_shape` excludes the batch axis: (timesteps, features)
        self.base_1 = tf.keras.layers.LSTM(16, input_shape=(1, self.n_input))
        self.actor_out = tf.keras.layers.Dense(env.action_space.n)
        self.critic_out = tf.keras.layers.Dense(1)

    def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """Sample an action and estimate the state value.

        Args:
            inputs: Observation(s) whose elements can be arranged as
                `(batch, n_input)`; each observation is treated as a sequence
                of length one.

        Returns:
            `(action, log_prob, value)`: the sampled action (as float32), the
            log-probability of that action under the actor's categorical
            distribution, and the critic's value estimate.
        """
        # Arrange as (batch, timesteps=1, features); -1 keeps any batch size valid
        inputs = tf.reshape(inputs, (-1, 1, self.n_input))
        x = self.base_1(inputs)

        # Build the action distribution from the actor logits and sample from it
        action_dist = tfp.distributions.Categorical(logits=self.actor_out(x), dtype=tf.float32)
        action = action_dist.sample()

        # Compute estimate for value
        value = self.critic_out(x)

        return action, action_dist.log_prob(action), value
        

In [8]:
# Wrap the environment step so it can be called from inside a tf.function graph

def aux_np_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Advance the module-level `env` by one step and return float32 arrays.

    Args:
        action: Scalar array holding the action; converted with `int()` before
            being passed to `env.step`.

    Returns:
        `(state, reward, done)` as float32 NumPy arrays. `done` is 1.0 when the
        step ended the episode, either by termination or by truncation, and
        0.0 otherwise.
    """
    # `env.step` yields (observation, reward, terminated, truncated, info)
    state, reward, terminated, truncated, _ = env.step(int(action))

    # Both flags end the episode; ignoring `truncated` would keep stepping an
    # environment that has already hit its time limit.
    done = bool(terminated) or bool(truncated)

    # Cast to 0/1 to exploit tensor repr
    return (
        state.astype(np.float32),
        np.array(reward, np.float32),
        np.array(done, np.float32),
    )

def aux_tf_step(action: tf.Tensor) -> List[tf.Tensor]:
    """Run `aux_np_step` through `tf.numpy_function`.

    Args:
        action: Tensor holding the action to apply.

    Returns:
        The `[state, reward, done]` tensors, all declared as float32.
    """
    output_dtypes = [tf.float32] * 3
    return tf.numpy_function(aux_np_step, [action], output_dtypes)

In [9]:
def compute_episode(init_state: tf.Tensor, model: tf.keras.Model, max_steps: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """Roll out one episode with `model` and collect per-step training data.

    Args:
        init_state: Observation the episode starts from.
        model: Callable returning `(action, log_prob, value)` for a batched state.
        max_steps: Upper bound on the number of environment steps.

    Returns:
        `(log_probs, values, rewards)`: stacked tensors with one entry per
        executed step.
    """
    # Define buffers (dynamically sized, one slot is written per step)
    log_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True, clear_after_read=False)
    values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True, clear_after_read=False)
    rewards = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True, clear_after_read=False)

    # Init state; remember its static shape so it can be re-applied after each step
    state = init_state
    initial_state_shape = init_state.shape

    # Make loop
    for i in tf.range(max_steps):
        # Make batch from state (adds a leading axis of size one)
        state = tf.expand_dims(state, axis=0)

        # Pick an action
        action, log_prob, value = model(state)

        # Perform step with the first (only) action of the batch
        state, reward, done = aux_tf_step(action[0])

        # Store result in buffer
        log_probs = log_probs.write(i, tf.squeeze(log_prob))
        values = values.write(i, tf.squeeze(value))
        rewards = rewards.write(i, tf.squeeze(reward))

        # Re-attach the static shape to the new state so it matches the state
        # entering the next loop iteration
        state.set_shape(initial_state_shape)

        # Bail-out if episode is finished
        if tf.cast(done, tf.bool): break

    # Stack to get proper tensors
    log_probs = log_probs.stack()
    values = values.stack()
    rewards = rewards.stack() 

    return log_probs, values, rewards

In [10]:
def compute_returns(rewards: tf.Tensor, gamma: float, std: bool = True) -> tf.Tensor:
    """Compute the discounted return of every step of an episode.

    The return is defined recursively as `G_t = r_t + gamma * G_{t+1}` with the
    return after the last step equal to zero.

    Args:
        rewards: 1-D tensor with the reward of each step, in time order.
        gamma: Discount factor.
        std: When True, standardize the returns to zero mean and unit standard
            deviation (with `EPS` added to the denominator).

    Returns:
        float32 tensor with one return per step, in the same order as `rewards`.
    """
    rewards = tf.cast(rewards, dtype=tf.float32)

    # Accumulate backwards from the last step. Unlike scaling by `gamma ** t`
    # and dividing it out again, this never divides by a power of gamma, so it
    # stays finite for gamma == 0 and for episodes long enough that
    # `gamma ** t` underflows in float32.
    g = tf.scan(
        lambda running, reward: reward + gamma * running,
        rewards,
        initializer=tf.constant(0.0, dtype=tf.float32),
        reverse=True,
    )

    # Std if needed
    if std:
        g = (g - tf.reduce_mean(g)) / (tf.math.reduce_std(g) + EPS)

    # Return expected returns
    return g

In [11]:
def compute_loss(log_probs: tf.Tensor, values: tf.Tensor, returns: tf.Tensor, critic_weight: float = 0.5) -> tf.Tensor:
    """Combine the actor and critic terms into one scalar loss.

    Args:
        log_probs: Log-probabilities of the actions that were taken.
        values: Critic value estimates for the visited states.
        returns: Returns used as the critic target.
        critic_weight: Factor applied to the critic term before adding it.

    Returns:
        `actor_loss + critic_loss * critic_weight`.
    """
    # Critic term: Huber loss is more robust to outliers than a squared error
    critic_term = huber_loss(values, returns)

    # Actor term: negative sum of log-probabilities weighted by the advantage.
    # NOTE(review): the advantage is not wrapped in tf.stop_gradient, so this
    # term also back-propagates into the critic output -- confirm intended.
    advantage = returns - values
    actor_term = -tf.math.reduce_sum(log_probs * advantage)

    return actor_term + critic_term * critic_weight

In [12]:
@tf.function
def train(init_state, model: tf.keras.Model, optimizer: tf.keras.optimizers.Optimizer, gamma: float, max_steps: int, critic_weight: float = 0.5) -> tf.Tensor:
    """Run one episode and apply a single gradient update to `model`.

    Args:
        init_state: Observation the episode starts from.
        model: Actor-critic model to roll out and update.
        optimizer: Optimizer used to apply the gradients.
        gamma: Discount factor for the returns.
        max_steps: Upper bound on the number of environment steps.
        critic_weight: Weight of the critic term in the combined loss.

    Returns:
        Scalar tensor with the sum of the rewards collected in the episode.
    """
    with tf.GradientTape() as tape:
        # Collect samples
        log_probs, values, rewards = compute_episode(init_state, model, max_steps)

        returns = compute_returns(rewards, gamma)

        # Keep in mind to expand to columns,
        # huber loss reduces differently based on this fact
        log_probs, values, returns = [
            tf.expand_dims(x, 1) for x in [log_probs, values, returns]
        ]

        # Compute loss; forward `critic_weight` so the caller's value is
        # actually used instead of the loss function's default
        loss = compute_loss(log_probs, values, returns, critic_weight)

    # Apply gradients
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Return total reward
    return tf.math.reduce_sum(rewards)


In [13]:
# Hyper-parameters of the run
LR = 0.02
GAMMA = 0.99
N_EPISODES = 20000
MAX_STEPS = 500
CRITIC_WEIGHT = 1

# Model and optimizer; the model is sized from the module-level `env`
model = ActorCritic(env)
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

# Total reward of each completed episode, in order
rewards = []

# Train until N_EPISODES is reached; interrupting the kernel stops early and
# keeps the rewards gathered so far
try:
    tbar = trange(N_EPISODES)
    for e in tbar:
        # Fresh episode: the first item returned by `env.reset()` is the observation
        init_state = tf.constant(env.reset()[0], dtype=tf.float32)

        reward_e = int(train(init_state, model, optimizer, GAMMA, MAX_STEPS, CRITIC_WEIGHT))

        # Record the result and show it on the progress bar
        rewards.append(reward_e)
        tbar.set_postfix(reward=reward_e)

except KeyboardInterrupt:
    print("Training stopped...")




2022-11-17 20:52:26.406411: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-17 20:52:26.407082: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 20:52:26.407230: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 20:52:26.407296: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA

  0%|          | 0/20000 [00:00<?, ?it/s]

2022-11-17 20:52:28.253165: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] implementation_selector failed: INVALID_ARGUMENT: Invalid format of input node name:  Expected: {forward_node_name}:{index}
2022-11-17 20:52:28.362636: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce GTX 1060 3GB" frequency: 1759 num_cores: 9 environment { key: "architecture" value: "6.1" } environment { key: "cuda" value: "11020" } environment { key: "cudnn" value: "8100" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 1572864 shared_memory_size_per_multiprocessor: 98304 memory_size: 1933770752 bandwidth: 192192000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


In [14]:
# Number of episodes in each rolling window
ROLL_WINDOW = 50
# Where the per-episode statistics are written
CSV_PATH = Path("data/vanilla.csv")

# Make pd
data_pd = pd.DataFrame({"r": rewards})

# Rolling mean plus a band of +/- 1.96 rolling standard deviations around it
data_pd = (data_pd
    .assign(roll_avg_r=lambda x: x.r.rolling(ROLL_WINDOW).mean())
    .assign(roll_std_r=lambda x: x.r.rolling(ROLL_WINDOW).std())
    .assign(upper_r=lambda x: x.roll_avg_r + 1.96 * x.roll_std_r)
    .assign(lower_r=lambda x: x.roll_avg_r - 1.96 * x.roll_std_r)
)

# Create the output folder first so the export does not fail when it is missing
CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
data_pd.to_csv(CSV_PATH)


# Plot
fig = go.Figure()

# Add traces
# Rolling average line
fig.add_trace(
    go.Scatter(x=data_pd.index, y=data_pd.roll_avg_r, mode="lines", name="Reward")
)

# Upper edge of the band (invisible line; only serves as the fill boundary)
fig.add_trace(
    go.Scatter(x=data_pd.index, 
               y=data_pd.upper_r,
               marker=dict(color="#444"),
               line=dict(width=0),
               mode="lines", 
               name="Roll. Avg. Reward",
               showlegend=False)
)

# Lower edge of the band; 'tonexty' fills the area up to the previous trace
fig.add_trace(
    go.Scatter(x=data_pd.index, 
               y=data_pd.lower_r,
               fillcolor='rgba(68, 68, 68, 0.3)',
               marker=dict(color="#444"),
               line=dict(width=0),
               fill='tonexty',
               mode="lines", 
               name="Roll. Avg. Reward",
               showlegend=False)
)


# Update fig options
fig.update_layout(
    template="plotly_white",
    margin=dict(l=20, r=20, t=20, b=20),
    xaxis_title="Episode",
    yaxis_title="Reward"
)

fig.show()

In [224]:
# Scratch check: display the most recent initial observation left by the training loop
init_state

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.04938623,  0.02591613, -0.00872543, -0.04496034], dtype=float32)>

In [225]:
# Scratch check: reshape a single observation to (1, 1, n_features), the layout fed to the LSTM
tf.reshape(init_state, (1, 1, env.observation_space.shape[0]))

<tf.Tensor: shape=(1, 1, 4), dtype=float32, numpy=
array([[[ 0.04938623,  0.02591613, -0.00872543, -0.04496034]]],
      dtype=float32)>