## Basic Demo

In [None]:
import gym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

In [None]:
# Build a single CartPole-v1 environment and wrap it in a one-element DummyVecEnv
env = gym.make('CartPole-v1')
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

In [None]:
# Create a PPO2 model using the MlpPolicy on the vectorized environment (verbose=1)
model = PPO2(MlpPolicy, env, verbose=1)

In [None]:
# Display the model's parameter list as the cell output
model.get_parameter_list()

In [None]:
# Train the agent for 10000 timesteps
model.learn(total_timesteps=10000)

In [None]:
# Run the model in the environment for 1000 steps, rendering after every step
obs = env.reset()
for i in range(1000):
    # Only the action is used; the second value returned by predict is ignored here
    action, _states = model.predict(obs)
    # rewards, dones and info are received but not inspected in this demo loop
    obs, rewards, dones, info = env.step(action)
    env.render()

## Defining a custom policy for TD3 with separate actor and critic architectures: CustomFFPTD3

* CustomFFPTD3 will inherit from TD3Policy class
* The custom policy "CustomFFPTD3" will serve as a substitute for "FeedForwardPolicy", which also inherits from the TD3Policy class
* Any Custom TD3Policy will inherit from this newly created "CustomFFPTD3" class
* It will be based on the more flexible class FeedForwardPolicy(ActorCriticPolicy) https://stable-baselines.readthedocs.io/en/master/_modules/stable_baselines/common/policies.html

In [None]:
import tensorflow as tf
import numpy as np
from gym.spaces import Box

from stable_baselines.td3.policies import TD3Policy
from stable_baselines.sac.policies import mlp
from stable_baselines.common.policies import nature_cnn

In [None]:
class CustomFFPTD3(TD3Policy):
    """
    Policy object that implements a DDPG-like actor critic, using a feed forward neural network. It is only different
    from the existing FeedForwardPolicy for TD3 in the way network architectures are defined: here we can define
    separate architectures for actor and critic networks.

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param net_arch: (dict) The architecture of the actor and critic networks for the policy: key 'pi' holds the
        list of hidden layer sizes of the actor, key 'vf' the list of hidden layer sizes of each critic
        (if None, default to dict(pi=[64, 64], vf=[64, 64]))
    :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction
    :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp")
    :param layer_norm: (bool) enable layer normalisation
    :param act_fun: (tf.func) the activation function to use in the neural network.
    :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, net_arch=None,
                 cnn_extractor=nature_cnn, feature_extraction="cnn",
                 layer_norm=False, act_fun=tf.nn.relu, **kwargs):
        super(CustomFFPTD3, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                           reuse=reuse, scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)
        self.layer_norm = layer_norm
        self.feature_extraction = feature_extraction
        self.cnn_kwargs = kwargs
        self.cnn_extractor = cnn_extractor
        self.reuse = reuse
        if net_arch is None:
            net_arch = dict(pi=[64, 64], vf=[64, 64])
        self.net_arch = net_arch

        # Plain membership tests (instead of `.keys()` combined with bitwise `&`) so that a list-style
        # net_arch triggers this descriptive assertion rather than an unrelated AttributeError.
        assert 'pi' in self.net_arch and 'vf' in self.net_arch, "KeyError: 'pi' and 'vf' keywords missing"
        assert len(self.net_arch['pi']) >= 1, "Error: must have at least one hidden layer for the actor network."
        assert len(self.net_arch['vf']) >= 1, "Error: must have at least one hidden layer for the critics network."

        self.activ_fn = act_fun

    def make_actor(self, obs=None, reuse=False, scope="pi"):
        """
        Creates the actor network, using the hidden layers listed in ``net_arch['pi']``.

        :param obs: (TensorFlow Tensor) The observation input (if None, default to ``self.processed_obs``)
        :param reuse: (bool) whether or not to reuse the variables of the scope
        :param scope: (str) the variable scope name of the actor
        :return: (TensorFlow Tensor) the tanh output of the actor, with ``ac_space.shape[0]`` units
            (also stored in ``self.policy``)
        """
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            pi_h = mlp(pi_h, self.net_arch['pi'], self.activ_fn, layer_norm=self.layer_norm)

            self.policy = policy = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=tf.tanh)

        return policy

    def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn"):
        """
        Creates the two Q-value networks, each using the hidden layers listed in ``net_arch['vf']``.

        :param obs: (TensorFlow Tensor) The observation input (if None, default to ``self.processed_obs``)
        :param action: (TensorFlow Tensor) The action input, concatenated with the observation features (required)
        :param reuse: (bool) whether or not to reuse the variables of the scope
        :param scope: (str) the variable scope name of the critics
        :return: (TensorFlow Tensor, TensorFlow Tensor) the outputs of the two critics
            (also stored in ``self.qf1`` and ``self.qf2``)
        :raises ValueError: if ``action`` is None
        """
        if action is None:
            # Fail early with a clear message instead of an obscure error raised from inside tf.concat
            raise ValueError("make_critics requires an action tensor to concatenate with the observation features")
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                critics_h = tf.layers.flatten(obs)

            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = mlp(qf_h, self.net_arch['vf'], self.activ_fn, layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = mlp(qf_h, self.net_arch['vf'], self.activ_fn, layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

            self.qf1 = qf1
            self.qf2 = qf2

        return self.qf1, self.qf2

    def step(self, obs, state=None, mask=None):
        """
        Evaluates the actor output for the given observation; ``make_actor`` must have been called first,
        since it is what sets ``self.policy``.

        :param obs: The observation fed to ``self.obs_ph``
        :param state: unused by this policy
        :param mask: unused by this policy
        :return: the result of running ``self.policy`` in the session
        """
        return self.sess.run(self.policy, {self.obs_ph: obs})

## Using the different policy in TD3

In [None]:
import gym
import numpy as np

from stable_baselines import TD3
from stable_baselines.td3.policies import FeedForwardPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

In [None]:
# Custom MLP policy with two layers
class CustomTD3Policy(CustomFFPTD3):
    """
    CustomFFPTD3 preset: MLP feature extraction, no layer normalisation, a [16, 16] actor
    and [32, 32] critics. Positional and keyword arguments are forwarded unchanged.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            **kwargs,
            net_arch={"pi": [16, 16], "vf": [32, 32]},
            layer_norm=False,
            feature_extraction="mlp",
        )

In [None]:
# Create and wrap the environment
env = gym.make('Pendulum-v0')
env = DummyVecEnv([lambda: env])

In [None]:
# The noise objects for TD3
# One noise dimension per action dimension (last axis of the action space shape)
n_actions = env.action_space.shape[-1]
# Zero-mean normal noise with sigma 0.1 on every action dimension
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

In [None]:
# Create the TD3 model with the custom policy class and the action noise defined above
model = TD3(CustomTD3Policy, env, action_noise=action_noise, verbose=1)

In [None]:
# Train the agent
model.learn(total_timesteps=10000)

## Trying to log different parameters in Tensorboard

In [None]:
import os

import gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines import results_plotter


best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2).

    Every 1000 calls it loads the results logged under the global ``log_dir``, prints the mean of the
    last 100 entries and saves the model whenever that mean beats the best one seen so far.

    :param _locals: (dict) must contain the model under the key 'self' when a new best is saved
    :param _globals: (dict) unused
    :return: (bool) always True
    """
    global n_steps, best_mean_reward
    call_index = n_steps
    n_steps += 1

    # Print stats every 1000 calls
    if (call_index + 1) % 1000 != 0:
        return True

    # Evaluate policy training performance
    x, y = ts2xy(load_results(log_dir), 'timesteps')
    if len(x) == 0:
        return True

    mean_reward = np.mean(y[-100:])
    print(x[-1], 'timesteps')
    print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

    # New best model, you could save the agent here
    if mean_reward > best_mean_reward:
        best_mean_reward = mean_reward
        # Example for saving best model
        print("Saving new best model")
        # os.path.join keeps the path valid even when log_dir has no trailing separator
        _locals['self'].save(os.path.join(log_dir, 'best_model.pkl'))
    return True

In [None]:
# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = gym.make('CartPole-v1')
# Monitor is given log_dir; the callback reads the results back from that directory via load_results(log_dir)
env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

# NOTE(review): the TensorBoard directory is named "a2c_cartpole" although the model is PPO2; the same
# path is passed to `tensorboard --logdir` in a later cell, so rename both together if it is changed.
model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/")
# Train the agent
time_steps = 10000
model.learn(total_timesteps=int(time_steps), callback=callback)

# Plot the results logged under log_dir against timesteps
results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, 'CartPole-v1')
plt.show()

In [None]:
!tensorboard --logdir ./a2c_cartpole_tensorboard/

## Logging additional trainable vars of my choice on docs example

In [None]:
import tensorflow as tf
import numpy as np

from stable_baselines import SAC

# Create a SAC model on Pendulum-v0 with TensorBoard logging under ./sac/
model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="./sac/", verbose=1)
# Define a new property to avoid global variable
# (the callback checks this flag so that the extra summaries are registered only once)
model.is_tb_set = False

In [None]:
# Display the model's parameter list; get_parameters() is the name-keyed alternative used further down
model.get_parameter_list()  # get_parameters()

In [None]:
# Show the name of the first entry of the parameter list
model.get_parameter_list()[0].name

In [None]:
# Look up a single entry of get_parameters() by name (the same name is used for the histogram in the callback)
model.get_parameters()['target/values_fn/vf/fc0/kernel:0']

In [None]:
def callback(locals_, globals_):
    """
    Callback that registers extra TensorBoard summaries on the model and logs a random scalar.

    On the first call (while the model's ``is_tb_set`` flag is False) it adds a scalar summary of the mean
    of ``value_target`` and a histogram of one target-network kernel, re-merges all summaries into the
    model's ``summary`` attribute and sets the flag. On every call it writes a random scalar
    tagged 'random_value' at the model's current ``num_timesteps``.

    :param locals_: (dict) must contain the model under 'self' and the summary writer under 'writer'
    :param globals_: (dict) unused
    :return: (bool) always True
    :raises KeyError: if the kernel variable is not found in the model's parameter list
    """
    self_ = locals_['self']
    # Log additional tensor
    if not self_.is_tb_set:
        kernel_name = 'target/values_fn/vf/fc0/kernel:0'
        # get_parameters() returns evaluated (NumPy) values, so a histogram built from it would be a
        # frozen snapshot. Use the live variable from get_parameter_list() so the histogram follows training.
        kernel_var = None
        for param in self_.get_parameter_list():
            if param.name == kernel_name:
                kernel_var = param
                break
        if kernel_var is None:
            raise KeyError(kernel_name)

        with self_.graph.as_default():
            tf.summary.scalar('value_target', tf.reduce_mean(self_.value_target))
            tf.summary.histogram(kernel_name, kernel_var)
            self_.summary = tf.summary.merge_all()
        self_.is_tb_set = True
    # Log scalar value (here a random variable)
    value = np.random.random()
    summary = tf.Summary(value=[tf.Summary.Value(tag='random_value', simple_value=value)])
    locals_['writer'].add_summary(summary, self_.num_timesteps)
    return True

In [None]:
# Train for 10000 timesteps with the TensorBoard logging callback
model.learn(10000, callback=callback)