# Cart Pole using REINFORCE

In [150]:
# Full imports
import gym

# Aliased imports
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Partial Import
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from tqdm.notebook import tqdm, trange
from IPython.display import clear_output
from collections import namedtuple

In [6]:
# On Linux, remember to make the conda libraries visible before starting the kernel:
#   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/
# To silence the NUMA-node messages in the terminal:
#   for a in /sys/bus/pci/devices/*; do echo 0 | sudo tee -a $a/numa_node; done


# List the GPUs visible to TensorFlow (an empty list means it will run on CPU only)
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

The policy gradient we need to estimate when using the REINFORCE algorithm is:
$$
    \nabla_{\theta} J(\theta) = \sum_{t=0}^{T - 1} \nabla_{\theta}\log\pi_{\theta}(a_t|s_t)G_t
$$

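Thus, for every time step $t$ of a sampled episode, the parameters are moved along the gradient (gradient ascent) with learning rate $\alpha$:

$$
    \theta \leftarrow \theta + \alpha \nabla_{\theta}\log\pi_{\theta}(a_t|s_t)G_t
$$

where $G_t = \sum_{k=t}^{T-1} \gamma^{k-t} r_k$ is the discounted return collected from step $t$ onwards.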

## Sample of a random agent

In [8]:
DEFAULT_ENV = "CartPole-v1"
N_DEMO_EPISODES = 10

# Default env. With render_mode="human" gym draws a frame on every
# reset()/step(), so no explicit env.render() call is needed.
env = gym.make(DEFAULT_ENV, render_mode="human")

try:
    # Run sample episodes with uniformly random actions
    for _ in range(N_DEMO_EPISODES):
        # Init episode: reset() returns (observation, info)
        state, _ = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            state, reward, terminated, truncated, _ = env.step(action)

            # An episode ends when the pole falls / cart leaves the track
            # (terminated) or when the time limit is reached (truncated)
            done = terminated or truncated
finally:
    # Close environment even if the demo is interrupted
    env.close()

In [None]:
# Show the built-in help of the underlying (unwrapped) environment
help(env.unwrapped)

## Implementing REINFORCE

In [177]:
# Define experience: one environment transition
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "new_state", "done"])


class Agent:
    """REINFORCE (Monte-Carlo policy gradient) agent.

    The agent plays one episode at a time, storing every transition in an
    internal buffer, and then performs a single policy-gradient update from
    the whole episode.

    Args:
        env: Environment following the gym >= 0.26 API, i.e. ``reset()``
            returns ``(observation, info)`` and ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
        model: Callable Keras model mapping a batch of states to a batch of
            action probabilities (rows sum to one).
        optimizer: Keras optimizer used to apply the gradients.
        gamma: Discount factor applied to future rewards.
    """

    def __init__(self, env, model, optimizer, gamma) -> None:
        # Init private vars
        self._model = model
        self._env = env
        self._opt = optimizer
        self._gamma = gamma

        # Init state vars and the (empty) experience buffer
        self.reset()

    def reset(self) -> None:
        """Start a new episode: reset the environment and clear the buffer."""
        self._state, _ = self._env.reset()
        self._experience_buffer = []

    def step(self):
        """Sample an action from the policy and advance the environment one step.

        Returns:
            Tuple ``(new_state, reward, done)`` where ``done`` is True when
            the episode terminated or was truncated by a time limit.
        """
        # Get action probabilities for the current state (batch of size 1)
        p = self._model(np.array([self._state]))

        # Build distribution and sample action
        dist = tfp.distributions.Categorical(probs=p, dtype=tf.float32)
        action = int(dist.sample().numpy()[0])

        # Perform step on env; both termination and truncation end the episode
        new_state, reward, terminated, truncated, _ = self._env.step(action)
        done = terminated or truncated

        # Add to experience buffer
        self._experience_buffer.append(Experience(self._state, action, reward, new_state, done))

        # Update state
        self._state = new_state

        # Return
        return self._state, reward, done

    # NOTE: inputs have a different leading dimension for every episode
    # length, so tf.function traces a new graph per distinct length.
    @tf.function
    def _train_model(self, state, action, exp_ret):
        """Apply one REINFORCE gradient step.

        Args:
            state: float32 batch of visited states, one row per time step.
            action: float32 vector with the action taken at each time step.
            exp_ret: float32 vector with the (normalized) return of each step.
        """
        with tf.GradientTape() as tape:
            # Model outputs
            probs = self._model(state)

            # Loss: negative log-likelihood of the taken actions weighted by
            # their returns, summed over the episode
            dist = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
            loss = tf.reduce_sum(-dist.log_prob(action) * exp_ret)

        # Compute and apply gradients outside the tape context so the
        # gradient computation itself is not recorded
        grads = tape.gradient(loss, self._model.trainable_variables)
        self._opt.apply_gradients(zip(grads, self._model.trainable_variables))

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return G_t = sum_{k >= t} gamma^(k - t) * r_k for every step t."""
        returns = np.zeros(len(rewards), dtype=np.float64)
        running = 0.0
        # Walk backwards so each return reuses the one that follows it: O(n)
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    def train(self):
        """Update the policy from the transitions stored in the buffer.

        Does nothing when the buffer is empty.
        """
        if not self._experience_buffer:
            return

        # Compute G_t
        rewards = np.array([exp.reward for exp in self._experience_buffer], dtype=np.float64)
        expected_returns = self._discounted_returns(rewards, self._gamma)

        # Normalize returns for numerical stability; the epsilon avoids a
        # division by zero (NaN weights) when all returns are equal, e.g. for
        # a one-step episode
        eps = np.finfo(np.float32).eps
        expected_returns_norm = (expected_returns - expected_returns.mean()) / (expected_returns.std() + eps)

        # Vectorize transitions
        states = np.stack([exp.state for exp in self._experience_buffer]).astype(np.float32)
        actions = np.array([exp.action for exp in self._experience_buffer], dtype=np.float32)
        exp_ret = expected_returns_norm.astype(np.float32)

        # Train model
        self._train_model(states, actions, exp_ret)

    def get_experience_buffer(self):
        """Return the buffered transitions as a DataFrame, one row per step.

        The columns are always the ``Experience`` fields, even when the
        buffer is empty.
        """
        return pd.DataFrame(self._experience_buffer, columns=list(Experience._fields))


In [182]:
# Define parameters
LEARNING_RATE = 1e-3

# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# Define model: two hidden ReLU layers and a softmax head that outputs one
# probability per discrete action
model = tf.keras.Sequential([
    tf.keras.layers.Dense(30, activation="relu", input_shape=(env.observation_space.shape[0],)),
    tf.keras.layers.Dense(30, activation="relu"),
    tf.keras.layers.Dense(env.action_space.n, activation="softmax")
])

# Build model
model.build()

# Print model summary. summary() prints the table itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
model.summary()

Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_87 (Dense)            (None, 30)                150       
                                                                 
 dense_88 (Dense)            (None, 30)                930       
                                                                 
 dense_89 (Dense)            (None, 2)                 62        
                                                                 
Total params: 1,142
Trainable params: 1,142
Non-trainable params: 0
_________________________________________________________________
None


In [183]:
# Define main training loop

# Define constants
DEFAULT_ENV = "CartPole-v1"
N_EPISODES = 10000
N_STEPS = 500

# Define parameters
GAMMA = 0.99

# Define env (no rendering while training)
env = gym.make(DEFAULT_ENV)

# Define agent
agent = Agent(env, model, optimizer, GAMMA)

# Aux monitor: one {"e": episode, "r": total reward} record per episode
data = []

# Start training
try:
    for e in (tbar := trange(N_EPISODES)):
        # Init agent
        agent.reset()
        total_reward = 0.0

        # Gather experiences for at most N_STEPS steps
        for _ in range(N_STEPS):
            _, reward, done = agent.step()
            total_reward += reward

            # Stop if the episode ended
            if done:
                break

        # Update current rewards (fixed-point format; a bare precision such
        # as "0.3" would switch to scientific notation for rewards >= 100)
        tbar.set_description(f"Total reward: {total_reward:.1f}")

        # Save metrics
        data.append({
            "e": e,
            "r": total_reward
        })

        # Train agent on the episode just played
        agent.train()

except KeyboardInterrupt:
    # Allow stopping the training by hand while keeping the collected metrics
    print("Training stopped...")
finally:
    # Release the environment resources whether training finished or not
    env.close()


  0%|          | 0/10000 [00:00<?, ?it/s]

Training stopped...


In [184]:
# Number of episodes averaged by the rolling mean
ROLLING_WINDOW = 50

# Make pd. Explicit columns and dtypes keep this cell working even when
# training was interrupted before the first episode finished (empty `data`).
data_pd = (
    pd.DataFrame(data, columns=["e", "r"])
    .astype({"e": "int64", "r": "float64"})
    .assign(roll_avg_r=lambda x: x["r"].rolling(ROLLING_WINDOW).mean())
)

# Plot
fig = go.Figure()

# Add traces: raw reward per episode and its rolling average
fig.add_trace(
    go.Scatter(x=data_pd.e, y=data_pd.r, mode="lines", name="Reward")
)

fig.add_trace(
    go.Scatter(x=data_pd.e, y=data_pd.roll_avg_r, mode="lines", name="Roll. Avg. Reward")
)

# Update fig options
fig.update_layout(
    template="plotly_white",
    margin=dict(l=20, r=20, t=20, b=20),
    xaxis_title="Episode",
    yaxis_title="Reward"
)

fig.show()