In [3]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from keras import layers

import numpy as np
import tensorflow as tf
import gymnasium as gym
import scipy.signal

2024-03-30 17:36:29.269943: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Experience Replay


In [4]:
def discounted_cumulative_sums(x, discount):
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


class Buffer:
    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.log_probability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, log_probability):
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.log_probability_buffer[self.pointer] = log_probability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean, advantage_std = (
            np.mean(self.advantage_buffer),
            np.std(self.advantage_buffer),
        )
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.log_probability_buffer,
        )


def mlp(x, sizes, activation=keras.activations.tanh, output_activation=None):
    for size in sizes[:-1]:
        x = layers.Dense(units=size, activation=activation)(x)
    return layers.Dense(units=sizes[-1], activation=output_activation)(x)


def log_probabilities(logits, a):
    log_probabilities_all = keras.ops.log_softmax(logits)
    log_probability = keras.ops.sum(
        keras.ops.one_hot(a, num_actions) * log_probabilities_all, axis=1
    )
    return log_probability


seed_generator = keras.random.SeedGenerator(1337)


@tf.function
def sample_action(observation):
    logits = actor(observation)
    action = keras.ops.squeeze(
        keras.random.categorical(logits, 1, seed=seed_generator), axis=1
    )
    return logits, action


@tf.function
def train_policy(
    observation_buffer, action_buffer, log_probability_buffer, advantage_buffer
):
    with tf.GradientTape() as tape:
        ratio = keras.ops.exp(
            log_probabilities(actor(observation_buffer), action_buffer)
            - log_probability_buffer
        )
        min_advantage = keras.ops.where(
            advantage_buffer > 0,
            (1 + clip_ratio) * advantage_buffer,
            (1 - clip_ratio) * advantage_buffer,
        )

        policy_loss = -keras.ops.mean(
            keras.ops.minimum(ratio * advantage_buffer, min_advantage)
        )
    policy_grads = tape.gradient(policy_loss, actor.trainable_variables)
    policy_optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables))

    kl = keras.ops.mean(
        log_probability_buffer
        - log_probabilities(actor(observation_buffer), action_buffer)
    )
    kl = keras.ops.sum(kl)
    return kl


@tf.function
def train_value_function(observation_buffer, return_buffer):
    with tf.GradientTape() as tape:
        value_loss = keras.ops.mean((return_buffer - critic(observation_buffer)) ** 2)
    value_grads = tape.gradient(value_loss, critic.trainable_variables)
    value_optimizer.apply_gradients(zip(value_grads, critic.trainable_variables))

2024-03-30 17:36:35.274026: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-30 17:36:35.274619: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


## Hyperparameters


In [5]:
steps_per_epoch = 4000
epochs = 10
gamma = 0.99
clip_ratio = 0.2
policy_learning_rate = 3e-4
value_function_learning_rate = 1e-3
train_policy_iterations = 80
train_value_iterations = 80
lam = 0.97
target_kl = 0.01
hidden_sizes = (64, 64)

# If you want to see the environment, set render to True
render = True

## Initializations


In [6]:
if render:
    env = gym.make("CartPole-v1", render_mode="rgb_array")
    temp_dir = "frames"
    os.makedirs(temp_dir, exist_ok=True)
else:
    gym.make("CartPole-v1")

observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

buffer = Buffer(observation_dimensions, steps_per_epoch)

observation_input = keras.Input(shape=(observation_dimensions,), dtype="float32")
logits = mlp(observation_input, list(hidden_sizes) + [num_actions])
actor = keras.Model(inputs=observation_input, outputs=logits)
value = keras.ops.squeeze(mlp(observation_input, list(hidden_sizes) + [1]), axis=1)
critic = keras.Model(inputs=observation_input, outputs=value)

policy_optimizer = keras.optimizers.Adam(learning_rate=policy_learning_rate)
value_optimizer = keras.optimizers.Adam(learning_rate=value_function_learning_rate)

observation, _ = env.reset()
episode_return, episode_length = 0, 0

## Train


In [5]:
import imageio

for epoch in range(epochs):
    sum_return = 0
    sum_length = 0
    num_episodes = 0

    for t in range(steps_per_epoch):
        if render:
            frame = env.render()
            frame_path = os.path.join(temp_dir, f"epoch_{epoch}_frame_{t+1:04d}.png")
            imageio.imwrite(frame_path, frame)

        observation = observation.reshape(1, -1)
        logits, action = sample_action(observation)
        observation_new, reward, done, _, _ = env.step(action[0].numpy())
        episode_return += reward
        episode_length += 1

        value_t = critic(observation)
        log_probability_t = log_probabilities(logits, action)

        buffer.store(observation, action, reward, value_t, log_probability_t)

        observation = observation_new

        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, _ = env.reset()
            episode_return, episode_length = 0, 0

    (
        observation_buffer,
        action_buffer,
        advantage_buffer,
        return_buffer,
        log_probability_buffer,
    ) = buffer.get()

    for _ in range(train_policy_iterations):
        kl = train_policy(
            observation_buffer, action_buffer, log_probability_buffer, advantage_buffer
        )
        if kl > 1.5 * target_kl:
            # Early Stopping
            break

    for _ in range(train_value_iterations):
        train_value_function(observation_buffer, return_buffer)

    print(
        f"Epoch: {epoch + 1} | Mean Return: {sum_return / num_episodes} | Mean Length: {sum_length / num_episodes}"
    )

Epoch: 1 | Mean Return: 23.391812865497077 | Mean Length: 23.391812865497077
Epoch: 2 | Mean Return: 28.776978417266186 | Mean Length: 28.776978417266186
Epoch: 3 | Mean Return: 43.47826086956522 | Mean Length: 43.47826086956522
Epoch: 4 | Mean Return: 68.96551724137932 | Mean Length: 68.96551724137932
Epoch: 5 | Mean Return: 97.5609756097561 | Mean Length: 97.5609756097561
Epoch: 6 | Mean Return: 133.33333333333334 | Mean Length: 133.33333333333334
Epoch: 7 | Mean Return: 160.0 | Mean Length: 160.0
Epoch: 8 | Mean Return: 200.0 | Mean Length: 200.0
Epoch: 9 | Mean Return: 166.66666666666666 | Mean Length: 166.66666666666666
Epoch: 10 | Mean Return: 285.7142857142857 | Mean Length: 285.7142857142857


## Generate Video Of Training


In [7]:
import imageio
from tqdm import tqdm

filenames = sorted(
    [os.path.join(temp_dir, f) for f in os.listdir(temp_dir) if f.endswith(".png")]
)
with imageio.get_writer("training_video.mp4", fps=30) as video:
    for filename in tqdm(filenames):
        video.append_data(imageio.imread(filename))

"""# Optionally, clean up by removing the images if no longer needed
for filename in filenames:
    os.remove(filename)
os.rmdir(temp_dir)"""

  video.append_data(imageio.imread(filename))
100%|██████████| 44000/44000 [14:58<00:00, 49.00it/s] 


'# Optionally, clean up by removing the images if no longer needed\nfor filename in filenames:\n    os.remove(filename)\nos.rmdir(temp_dir)'