# Cart Pole using REINFORCE with BASELINES

In [2]:
# Full imports
import gym
import mlflow

# Aliased imports
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Partial Import
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from tqdm.notebook import tqdm, trange
from IPython.display import clear_output
from collections import namedtuple

2022-11-04 03:02:21.390143: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-04 03:02:21.483468: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-04 03:02:21.503667: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-04 03:02:21.910142: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: li

In [2]:
# On Linux, remember to run: export "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/"
# To silence the NUMA messages, run in a terminal: "for a in /sys/bus/pci/devices/*; do echo 0 | sudo tee -a $a/numa_node; done"


# List the GPUs TensorFlow can see (an empty list means no GPU is available)
tf.config.list_physical_devices('GPU')

2022-11-03 19:11:22.274355: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-03 19:11:22.277582: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-03 19:11:22.277700: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

With the REINFORCE algorithm, the policy gradient is estimated from a sampled episode as:
$$
    \nabla_{\theta} J(\theta) = \sum_{t=0}^{T - 1} \nabla_{\theta}\log\pi_{\theta}(a_t|s_t)G_t
$$

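Thus, with learning rate $\alpha$, the parameters are updated for every step $t$ of the episode as:

$$
    \theta \leftarrow \theta + \alpha \nabla_{\theta}\log\pi_{\theta}(a_t|s_t)G_t
$$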

The problem with this approach is that policy gradients are relatively unstable and introduce a lot of variance due to the different state-rewards that the system 
observes. To fix this we'll subtract a baseline $b(s_t)$ from the returns, so the update (with learning rate $\alpha$) becomes:
$$
    \theta \leftarrow \theta + \alpha \nabla_{\theta}\log\pi_{\theta}(a_t|s_t) \left( G_t - b(s_t) \right) 
$$

Where $A(s_t) = G_t - b(s_t)$ is the advantage.

In [3]:
# CONSTS
# NOTE(review): VIEW_RANDOM is not read anywhere in the visible notebook - confirm it is still needed.
VIEW_RANDOM = False
# Gym environment id passed to `gym.make`
DEFAULT_ENV = "CartPole-v1"
# Name of the mlflow experiment the runs are logged under
EXPERIMENT_NAME = "cart_pole_baseline"
# Tags attached to the mlflow experiment and to each run
TAGS = {
    "type": "RL",
    "env": "Discrete Cart Pole",
    "algorithm": "REINFORCE",
    "sub-algorithm": "Learned Baseline"
}

In [4]:
# Default env: instantiate the environment named by DEFAULT_ENV
env = gym.make(DEFAULT_ENV)

In [5]:
# Init mlflow
# Create the experiment (with its tags) the first time the notebook runs.
exp_exists = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

if not exp_exists:
    mlflow.create_experiment(EXPERIMENT_NAME, tags=TAGS)

# Always select the experiment: `create_experiment` only registers it and does
# not make it active, so without this call the runs started in the session that
# created the experiment would be logged to mlflow's default experiment.
mlflow.set_experiment(EXPERIMENT_NAME)

In [12]:
# Output folders: data/ receives the metrics CSV, plots/ the model schema images.
# `-p` makes the commands succeed when the folders already exist.
%mkdir -p data
%mkdir -p plots

/bin/bash: /home/main/anaconda3/envs/rlenv/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /home/main/anaconda3/envs/rlenv/lib/libtinfo.so.6: no version information available (required by /bin/bash)


## Implementing REINFORCE

In [139]:
class PolicyNet():
    """Stochastic policy: a small dense network mapping states to action probabilities.

    The network ends in a softmax over ``env.action_space.n`` actions and is
    optimised with Adam; `train` performs one REINFORCE gradient step.
    """

    def __init__(self, env, lr=1e-3) -> None:
        """Build the network and its optimizer.

        Args:
            env: Environment providing ``observation_space.shape`` (input size)
                and ``action_space.n`` (number of outputs).
            lr: Learning rate of the Adam optimizer.
        """
        # Define model
        self.model = tf.keras.Sequential([
            tf.keras.Input(shape=(env.observation_space.shape[0],)),
            tf.keras.layers.Dense(16, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(env.action_space.n, activation="softmax")
        ])

        # Define optimizer
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    def build_distribution(self, states):
        """Return the categorical action distribution for a batch of states."""
        # The last layer is a softmax, so the model outputs probabilities (not logits)
        probs = self.model(states)

        return tfp.distributions.Categorical(probs=probs, dtype=tf.float32)

    def sample_action(self, states):
        """Sample one action per state.

        Returns:
            A single int when `states` holds exactly one state, otherwise an
            integer array with one action per state.
        """
        # Get distribution
        dist = self.build_distribution(states)
        actions = dist.sample().numpy().astype(int)

        # If we only have one action, return an int
        return actions[0] if len(actions) == 1 else actions

    @tf.function(reduce_retracing=True)
    def train(self, states, actions, advantages):
        """Take one policy-gradient step on a batch of transitions.

        Args:
            states: Batch of states, one row per transition.
            actions: Actions taken, encoded as floats (the distribution is
                built with ``dtype=tf.float32``).
            advantages: Advantage weighting each transition's log-probability.
        """
        # Compute loss
        with tf.GradientTape() as tape:
            log_probs = self.build_distribution(states).log_prob(actions)
            # Average over the transitions. The previous normalisation divided by
            # `tf.size(states)`, i.e. transitions * features, which silently
            # scaled the loss down by the observation dimension.
            loss = -tf.reduce_mean(log_probs * advantages)

        # Apply gradients
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

In [140]:
class BaselineNet():
    """Learned baseline b(s): a small dense network regressing the return of a state."""

    def __init__(self, env, lr=1e-3) -> None:
        """Build the network and its optimizer.

        Args:
            env: Environment providing ``observation_space.shape`` (input size).
            lr: Learning rate of the Adam optimizer.
        """
        # Define model
        self.model = tf.keras.Sequential([
            tf.keras.Input(shape=(env.observation_space.shape[0],)),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(16, activation="relu"),
            tf.keras.layers.Dense(1, activation="linear")
        ])

        # Define optimizer
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    def predict(self, states):
        """Return the baseline value of each state with all size-1 axes squeezed out."""
        return tf.squeeze(self.model(np.atleast_2d(states)))

    def advantage(self, states, returns, eps=1e-8):
        """Compute whitened advantages ``returns - b(states)``.

        Args:
            states: Batch of states.
            returns: Return observed from each state.
            eps: Small constant added to the standard deviation so that a batch
                with constant advantages (e.g. a single transition) yields
                zeros instead of NaN.
        """
        adv = returns - self.predict(states).numpy()

        # Whitening of advantages to improve stability
        return (adv - adv.mean()) / (adv.std() + eps)

    @tf.function(reduce_retracing=True)
    def train(self, states, returns):
        """Take one regression step of the baseline towards the observed returns."""
        with tf.GradientTape() as tape:
            # The model outputs shape (batch, 1) while `returns` is (batch,).
            # Drop the trailing axis before comparing: otherwise the difference
            # broadcasts to (batch, batch) and every prediction is regressed
            # towards ALL returns, not just its own.
            values = tf.squeeze(self.model(states), axis=-1)
            loss = tf.reduce_mean(tf.square(returns - values))

        # Apply gradients
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

In [141]:
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "new_state", "done"])

class Agent:
    """REINFORCE-with-baseline agent.

    Owns a policy network and a baseline network, interacts with the
    environment one step at a time and trains both networks from the
    transitions stored since the last `reset`.
    """

    def __init__(self, env, gamma, policy_lr=1e-3, baseline_lr=1e-3) -> None:
        """Create the networks and start a first episode.

        Args:
            env: Environment exposing ``reset()`` -> (state, info) and
                ``step(action)`` -> (state, reward, terminated, truncated, info).
            gamma: Discount factor used to compute the returns.
            policy_lr: Learning rate of the policy network.
            baseline_lr: Learning rate of the baseline network.
        """
        # Init private vars
        self._policy = PolicyNet(env, policy_lr)
        self._baseline = BaselineNet(env, baseline_lr)
        self._env = env
        self._gamma = gamma

        # Init state vars (this also creates the empty experience buffer)
        self.reset()

    def reset(self) -> None:
        """Start a new episode and discard the stored transitions."""
        # Reset state and experience buffer
        self._state, _ = self._env.reset()
        self._experience_buffer = []

    def step(self):
        """Sample an action, apply it and store the transition.

        Returns:
            Tuple ``(new_state, reward, done)`` where ``done`` is True when the
            episode terminated OR was truncated by the environment.
        """
        action = self._policy.sample_action(np.atleast_2d(self._state))
        new_state, reward, terminated, truncated, _ = self._env.step(action)

        # A truncated episode (e.g. time limit) is over as well; ignoring the
        # flag would keep stepping an environment that must be reset.
        done = bool(terminated or truncated)

        # Add to experience buffer
        self._experience_buffer.append(Experience(self._state, action, reward, new_state, done))

        # Update state
        self._state = new_state

        # Return
        return self._state, reward, done

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return G_t = sum_k gamma^k * r_{t+k} for every step, as float32."""
        returns = np.zeros(len(rewards), dtype=np.float32)
        running = 0.0

        # Accumulate backwards: G_t = r_t + gamma * G_{t+1}. Unlike dividing a
        # cumulative sum by gamma**t, this never divides by a power of gamma
        # that has underflowed to zero.
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running

        return returns

    def train(self):
        """Train the baseline and the policy on the stored transitions.

        Does nothing when no transition has been collected.
        """
        # Nothing to learn from an empty episode
        if not self._experience_buffer:
            return

        # Vectorize transitions
        experiences = pd.DataFrame(self._experience_buffer)

        # Compute G_t
        expected_returns = self._discounted_returns(experiences.reward.to_numpy(), self._gamma)

        # Get vars
        states = np.stack(experiences.state.to_numpy()).astype(np.float32)
        actions = experiences.action.to_numpy().astype(np.float32)

        # Advantages are computed with the baseline as it was BEFORE this update
        advantages = self._baseline.advantage(states, expected_returns)

        # Train baseline and policy
        self._baseline.train(states, expected_returns)
        self._policy.train(states, actions, advantages)

    def get_experience_buffer(self):
        """Return the stored transitions as a DataFrame (one row per step)."""
        return pd.DataFrame(self._experience_buffer)


In [148]:
# Define main training loop and start run
# Close any run left open by a previous (interrupted) execution of this cell
mlflow.end_run()
mlflow.start_run(tags=TAGS)

# Define constants
DEFAULT_ENV = "CartPole-v1"
N_EPISODES = 10000
N_STEPS = 500
SEED = 42

# Define parameters
GAMMA = 0.999
POLICY_LR = 2e-3
BASELINE_LR = 2e-3

# Log params
mlflow.log_param("ENV", DEFAULT_ENV)
mlflow.log_param("N_EPISODES", N_EPISODES)
mlflow.log_param("N_STEPS", N_STEPS)
mlflow.log_param("SEED", SEED)
mlflow.log_param("GAMMA", GAMMA)
mlflow.log_param("POLICY_LR", POLICY_LR)
mlflow.log_param("BASELINE_LR", BASELINE_LR)

# Seed the random generators so a run can be repeated: TensorFlow (weight
# initialisation and action sampling), numpy and the environment itself
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Define env
env = gym.make(DEFAULT_ENV)
env.reset(seed=SEED)

# Define agent
agent = Agent(env, GAMMA, POLICY_LR, BASELINE_LR)

# Log nets model schema
tf.keras.utils.plot_model(agent._baseline.model, "plots/baseline.png", show_shapes=True, show_layer_activations=True)
tf.keras.utils.plot_model(agent._policy.model, "plots/policy.png", show_shapes=True, show_layer_activations=True)

mlflow.log_artifact("plots/baseline.png", "model_schemas")
mlflow.log_artifact("plots/policy.png", "model_schemas")

# Aux monitor: one {"e": episode, "r": total reward} record per episode
data = []

# Start training
try:
    for e in (tbar := trange(N_EPISODES)):
        # Init agent
        agent.reset()

        # Gather experiences
        for _ in range(N_STEPS):
            _, _, done = agent.step()

            # Stop if failed
            if done:
                break

        # Update current rewards
        total_reward = agent.get_experience_buffer().reward.sum()
        # Fixed-point format: the previous spec (" 0.3") switched to scientific
        # notation for rewards >= 100 (e.g. "5e+02")
        tbar.set_description(f"Total reward: {total_reward:.1f}")
        tbar.refresh()

        # Save metrics
        data.append({
            "e": e,
            "r": total_reward
        })

        # Log metrics
        mlflow.log_metric(key="reward", step=e, value=total_reward)

        # Train agent
        agent.train()
    mlflow.end_run("FINISHED")

except KeyboardInterrupt:
    print("Training stopped...")
    mlflow.end_run("KILLED")

except Exception:
    # Do not leave the run open (and reported as running) when training crashes
    mlflow.end_run("FAILED")
    raise


  0%|          | 0/10000 [00:00<?, ?it/s]

Training stopped...


In [None]:
# Rolling statistics settings
ROLLING_WINDOW = 50   # number of episodes in the rolling window
BAND_Z = 1.96         # half-width of the band, in rolling standard deviations

# Make pd
# Explicit columns/dtypes keep the cell working when training was stopped
# before the first episode finished (empty `data`).
data_pd = pd.DataFrame(data, columns=["e", "r"]).astype({"e": "int64", "r": "float64"})

# Compute the rolling mean and a +/- BAND_Z * rolling-std band around it
data_pd = (data_pd
    .assign(roll_avg_r=lambda x: x.r.rolling(ROLLING_WINDOW).mean())
    .assign(roll_std_r=lambda x: x.r.rolling(ROLLING_WINDOW).std())
    .assign(upper_r=lambda x: x.roll_avg_r + BAND_Z * x.roll_std_r)
    .assign(lower_r=lambda x: x.roll_avg_r - BAND_Z * x.roll_std_r)
)
# NOTE(review): this used to write "data/vanilla.csv", which looks like a
# copy-paste from the vanilla REINFORCE notebook and would overwrite its
# results - confirm nothing downstream expects the old name.
data_pd.to_csv("data/baseline.csv")


# Plot
fig = go.Figure()

# Add traces
fig.add_trace(
    go.Scatter(x=data_pd.e, y=data_pd.roll_avg_r, mode="lines", name="Roll. Avg. Reward")
)

fig.add_trace(
    go.Scatter(x=data_pd.e, 
               y=data_pd.upper_r,
               marker=dict(color="#444"),
               line=dict(width=0),
               mode="lines", 
               name="Upper bound",
               showlegend=False)
)

# `fill='tonexty'` shades the area between this trace and the previous one
fig.add_trace(
    go.Scatter(x=data_pd.e, 
               y=data_pd.lower_r,
               fillcolor='rgba(68, 68, 68, 0.3)',
               marker=dict(color="#444"),
               line=dict(width=0),
               fill='tonexty',
               mode="lines", 
               name="Lower bound",
               showlegend=False)
)


# Update fig options
fig.update_layout(
    template="plotly_white",
    margin=dict(l=20, r=20, t=20, b=20),
    xaxis_title="Episode",
    yaxis_title="Reward"
)

fig.show()

In [3]:
# Scratch check: a scalar raised to an array is applied element-wise (gamma ** t)
0.99 ** np.array([1, 2, 3])

array([0.99    , 0.9801  , 0.970299])