# PPO DEMO
This notebook shows how to use the PPO implementation.

## Writing your own ActorCriticPolicy with actor and critic networks

Implementing a Gaussian policy for continuous action spaces by inheriting from GaussianActorCriticPolicy and using the TensorFlow model-subclassing method

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense

from policies import GaussianActorCriticPolicy


class MyMlpGaussianActorCriticPolicyIndependentSigma(GaussianActorCriticPolicy):
    """Gaussian actor-critic policy with separate actor/critic MLPs and a state-independent sigma.

    Only the network architecture is defined here; all remaining behaviour is
    inherited from ``GaussianActorCriticPolicy``.
    """

    def create_actor_critic_network(self):
        """Build the actor-critic network using Keras model subclassing.

        Returns:
            A ``keras.Model`` instance whose ``call`` returns the tuple
            ``(mu, sigma, value)``.
        """

        class MyMlpModel(keras.Model):
            """Two independent 256-128-64 ReLU MLPs (actor mean and critic value) plus a free sigma variable."""

            def __init__(self, action_dim):
                """Create all layers.

                Args:
                    action_dim: Size of the ``mu`` output and of the ``sigma`` vector.
                """
                super().__init__()

                # Actor: hidden layers followed by a linear output for the mean.
                self.mu_0 = Dense(256, activation=tf.nn.relu)
                self.mu_1 = Dense(128, activation=tf.nn.relu)
                self.mu_2 = Dense(64, activation=tf.nn.relu)
                self.mu_out = Dense(action_dim, activation=None)

                # "Independent sigma": a trainable variable initialised to zeros that
                # does not depend on the network input.
                # NOTE(review): whether the base policy interprets this as sigma or
                # log-sigma is not visible in this notebook - confirm.
                self.sigma = tf.Variable(initial_value=tf.zeros(action_dim), trainable=True)

                # Critic: same layout as the actor, but with its own weights and a
                # single linear output unit.
                self.v_0 = Dense(256, activation=tf.nn.relu)
                self.v_1 = Dense(128, activation=tf.nn.relu)
                self.v_2 = Dense(64, activation=tf.nn.relu)
                self.value = Dense(1, activation=None)

            @tf.function
            def call(self, inputs):
                """Run actor and critic on ``inputs`` and return ``(mu, sigma, value)``."""
                actor_hidden = self.mu_0(inputs)
                actor_hidden = self.mu_1(actor_hidden)
                actor_hidden = self.mu_2(actor_hidden)
                mu = self.mu_out(actor_hidden)

                critic_hidden = self.v_0(inputs)
                critic_hidden = self.v_1(critic_hidden)
                critic_hidden = self.v_2(critic_hidden)
                va = self.value(critic_hidden)

                # sigma is returned as the raw variable; it is not expanded per input row.
                return mu, self.sigma, va

        # NOTE(review): self._action_dim is not set in this class; presumably the
        # base-class constructor provides it - confirm.
        return MyMlpModel(self._action_dim)

Implementing a policy for discrete action spaces by inheriting from DiscreteActorCriticPolicy and using the TensorFlow functional API

In [None]:
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense

from policies import DiscreteActorCriticPolicy


class MyMlpDiscreteActorCriticPolicy(DiscreteActorCriticPolicy):
    """Discrete-action actor-critic policy built with the Keras functional API.

    With ``shared_networks=True`` the actor and the critic share one MLP trunk
    with two output heads; otherwise each of them gets its own trunk.
    """

    def __init__(self, state_dim, n_actions, shared_networks=False):
        """Store the architecture switch, then delegate to the parent constructor.

        Args:
            state_dim: Forwarded to the parent constructor.
            n_actions: Forwarded to the parent constructor.
            shared_networks: If true, build one shared trunk instead of two.
        """
        # The flag is assigned before super().__init__ runs.
        # NOTE(review): presumably the parent constructor already calls
        # create_actor_critic_network(), which reads this flag - confirm.
        self._shared_networks = shared_networks
        super().__init__(state_dim, n_actions)

    def create_actor_critic_network(self):
        """Return the shared or the separate model, depending on ``shared_networks``."""
        build = self._creat_network if self._shared_networks else self._creat_network_separate
        return build()

    def _creat_network(self):
        """Build one 256-128-64 ReLU trunk feeding both heads; outputs are ``(logits, value)``."""
        observations = keras.Input(shape=self._state_dim)

        hidden = observations
        for units in (256, 128, 64):
            hidden = Dense(units, activation=tf.nn.relu)(hidden)

        # Both heads are linear and read the same trunk output.
        state_value = Dense(1, activation=None)(hidden)
        action_logits = Dense(self._n_actions, activation=None)(hidden)

        return keras.Model(inputs=observations, outputs=(action_logits, state_value))

    def _creat_network_separate(self):
        """Build two independent 256-128-64 ReLU trunks; outputs are ``(logits, value)``."""
        observations = keras.Input(shape=self._state_dim)

        # Actor branch: trunk plus linear logits head.
        actor_hidden = observations
        for units in (256, 128, 64):
            actor_hidden = Dense(units, activation=tf.nn.relu)(actor_hidden)
        action_logits = Dense(self._n_actions, activation=None)(actor_hidden)

        # Critic branch: its own trunk plus a single linear value unit.
        critic_hidden = observations
        for units in (256, 128, 64):
            critic_hidden = Dense(units, activation=tf.nn.relu)(critic_hidden)
        state_value = Dense(1, activation=None)(critic_hidden)

        return keras.Model(inputs=observations, outputs=(action_logits, state_value))

## Training setup

In [None]:
import datetime
import gymnasium as gym
import tensorflow as tf

# --- Experiment configuration (read by the cells below) ---
env_name = "InvertedPendulum-v4"  # environment id handed to Gymnasium
# env_name = "CartPole-v1"  # discrete alternative; set `discrete = True` when using it
# NOTE(review): the original note said CartPole needs gymnasium[box2d]; CartPole is a
# classic-control environment, so that extra is probably not required - confirm.
discrete = False  # True -> discrete policy, False -> Gaussian policy (see the policy cell)
network_type = "mlp"  # "mlp", "cnn" or "rnn"; the discrete policy only supports "mlp"
num_envs = 4  # number of vectorized environments

# Every run logs into its own timestamped directory below logs/<env_name>/.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f'logs/{env_name}/PPO_{run_stamp}'

Create the environment.
Training with RNNs is also possible: set `window_size` to wrap each environment in the `FrameStack` wrapper.

In [None]:
from gymnasium.wrappers import FrameStack

window_size = None  # number of environment observations stacked into one window (None = no stacking)


def make_single_env():
    """Create one (non-vectorized) environment, frame-stacked when `window_size` is set."""
    single_env = gym.make(env_name)
    if window_size is not None:
        single_env = FrameStack(single_env, window_size)
    return single_env


# Both configurations go through the same constructor. This avoids
# `gym.vector.make`, which is deprecated in Gymnasium 0.29 and removed in 1.0;
# with asynchronous=False it produced a SyncVectorEnv as well.
# Alternative: gym.vector.AsyncVectorEnv([...]) to step the environments in subprocesses.
env = gym.vector.SyncVectorEnv([make_single_env for _ in range(num_envs)])

Define the policy to be used

In [None]:
# import default policies for CNNs and LSTMs
from policies import CnnGaussianActorCriticPolicyIndependentSigma, \
    LstmGaussianActorCriticPolicy

# Pick the policy implementation from the `discrete` / `network_type` settings.
if discrete:
    # Only the MLP policy defined in this notebook is available for discrete actions.
    if network_type != "mlp":
        raise ValueError(f"Unknown network type {network_type}")
    policy = MyMlpDiscreteActorCriticPolicy(n_actions=env.single_action_space.n,
                                            state_dim=env.single_observation_space.shape)
else:
    # All Gaussian policies take the same constructor arguments, so select the
    # class by name and build it once instead of repeating the call per branch.
    gaussian_policy_classes = {
        "cnn": CnnGaussianActorCriticPolicyIndependentSigma,
        "rnn": LstmGaussianActorCriticPolicy,
        "mlp": MyMlpGaussianActorCriticPolicyIndependentSigma,
    }
    if network_type not in gaussian_policy_classes:
        # ValueError is still caught by `except Exception`; the message is unchanged.
        raise ValueError(f"Unknown network type {network_type}")
    policy = gaussian_policy_classes[network_type](
        action_dim=env.single_action_space.shape[0],
        state_dim=env.single_observation_space.shape,
        action_space=env.single_action_space,
    )

Use TensorBoard to monitor your progress

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs/

Start the training and define hyperparameters

In [None]:
from agent import Agent

# Reset Keras' global state before building the agent.
# NOTE(review): this runs after `policy` was created in an earlier cell - confirm
# that clearing the session at this point is intended.
keras.backend.clear_session()

# Training settings, collected in one place so they are easy to tune.
# Their exact meaning is defined by agent.Agent, which is not shown in this notebook.
agent_kwargs = dict(
    env=env,
    policy=policy,
    normalize_adv=False,
    log_dir=log_dir,
    verbose=True,
    batch_size=256,
    data_set_repeats=4,
    steps_per_epoch=2048,
)

agent = Agent(**agent_kwargs)
agent.train(epochs=100)

In [None]:
%reload_ext tensorboard