## PPO

#### Libraries

In [3]:
import sys
import time
import warnings
from tabulate import tabulate
import matplotlib.pyplot as plt

from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, TypeVar, Union

import gymnasium as gym
from gymnasium import spaces

import numpy as np
import pandas as pd
import torch as th
from torch.nn import functional as F

from stable_baselines3.common.buffers import RolloutBuffer

from stable_baselines3.common.policies import ActorCriticCnnPolicy, ActorCriticPolicy, BasePolicy, MultiInputActorCriticPolicy
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.buffers import DictRolloutBuffer, RolloutBuffer
from stable_baselines3.common.callbacks import BaseCallback

from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.utils import explained_variance, get_schedule_fn, obs_as_tensor, safe_mean
from stable_baselines3.common.vec_env import VecEnv



In [4]:
# Type variable bound to ``OnPolicyAlgorithm``; used to annotate ``learn`` so that it
# is typed as returning the same (sub)class it was called on.
SelfOnPolicyAlgorithm = TypeVar("SelfOnPolicyAlgorithm", bound="OnPolicyAlgorithm")


class OnPolicyAlgorithm(BaseAlgorithm):
    """
    The base for On-Policy algorithms (ex: A2C/PPO).

    Compared to a plain on-policy base class, every instance additionally keeps a
    ``rollout_counter`` attribute: it starts at 0 and is incremented by one at the
    beginning of every call to :meth:`collect_rollouts`.

    :param policy: The policy model to use (MlpPolicy, CnnPolicy, ...)
    :param env: The environment to learn from (if registered in Gym, can be str)
    :param learning_rate: The learning rate, it can be a function
        of the current progress remaining (from 1 to 0)
    :param n_steps: The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    :param gamma: Discount factor
    :param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator.
        Equivalent to classic advantage when set to 1.
    :param ent_coef: Entropy coefficient for the loss calculation
    :param vf_coef: Value function coefficient for the loss calculation
    :param max_grad_norm: The maximum value for the gradient clipping
    :param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
        instead of action noise exploration (default: False)
    :param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
        Default: -1 (only sample at the beginning of the rollout)
    :param rollout_buffer_class: Rollout buffer class to use. If ``None``, it will be automatically selected.
    :param rollout_buffer_kwargs: Keyword arguments to pass to the rollout buffer on creation.
    :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
        the reported success rate, mean episode length, and mean reward over
    :param tensorboard_log: the log location for tensorboard (if None, no logging)
    :param monitor_wrapper: When creating an environment, whether to wrap it
        or not in a Monitor wrapper.
    :param policy_kwargs: additional arguments to be passed to the policy on creation
    :param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
        debug messages
    :param seed: Seed for the pseudo random generators
    :param device: Device (cpu, cuda, ...) on which the code should be run.
        Setting it to auto, the code will be run on the GPU if possible.
    :param _init_setup_model: Whether or not to build the network at the creation of the instance
    :param supported_action_spaces: The action spaces supported by the algorithm.
    """

    rollout_buffer: RolloutBuffer
    policy: ActorCriticPolicy

    def __init__(
        self,
        policy: Union[str, Type[ActorCriticPolicy]],
        env: Union[GymEnv, str],
        learning_rate: Union[float, Schedule],
        n_steps: int,
        gamma: float,
        gae_lambda: float,
        ent_coef: float,
        vf_coef: float,
        max_grad_norm: float,
        use_sde: bool,
        sde_sample_freq: int,
        rollout_buffer_class: Optional[Type[RolloutBuffer]] = None,
        rollout_buffer_kwargs: Optional[Dict[str, Any]] = None,
        stats_window_size: int = 100,
        tensorboard_log: Optional[str] = None,
        monitor_wrapper: bool = True,
        policy_kwargs: Optional[Dict[str, Any]] = None,
        verbose: int = 0,
        seed: Optional[int] = None,
        device: Union[th.device, str] = "auto",
        _init_setup_model: bool = True,
        supported_action_spaces: Optional[Tuple[Type[spaces.Space], ...]] = None,
    ):
        super().__init__(
            policy=policy,
            env=env,
            learning_rate=learning_rate,
            policy_kwargs=policy_kwargs,
            verbose=verbose,
            device=device,
            use_sde=use_sde,
            sde_sample_freq=sde_sample_freq,
            support_multi_env=True,
            monitor_wrapper=monitor_wrapper,
            seed=seed,
            stats_window_size=stats_window_size,
            tensorboard_log=tensorboard_log,
            supported_action_spaces=supported_action_spaces,
        )

        self.n_steps = n_steps
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.rollout_buffer_class = rollout_buffer_class
        # Normalise ``None`` to an empty dict so it can always be unpacked with ``**``.
        self.rollout_buffer_kwargs = rollout_buffer_kwargs or {}

        # CUSTOM: number of times ``collect_rollouts`` has been entered.
        self.rollout_counter = 0

        if _init_setup_model:
            self._setup_model()

    def _setup_model(self) -> None:
        """
        Create the learning-rate schedule, seed the random generators, and build
        the rollout buffer and the policy (moved to ``self.device``).

        When no ``rollout_buffer_class`` was given, ``DictRolloutBuffer`` is used for
        ``spaces.Dict`` observation spaces and ``RolloutBuffer`` otherwise.
        """
        self._setup_lr_schedule()
        self.set_random_seed(self.seed)

        if self.rollout_buffer_class is None:
            if isinstance(self.observation_space, spaces.Dict):
                self.rollout_buffer_class = DictRolloutBuffer
            else:
                self.rollout_buffer_class = RolloutBuffer

        self.rollout_buffer = self.rollout_buffer_class(
            self.n_steps,
            self.observation_space,  # type: ignore[arg-type]
            self.action_space,
            device=self.device,
            gamma=self.gamma,
            gae_lambda=self.gae_lambda,
            n_envs=self.n_envs,
            **self.rollout_buffer_kwargs,
        )
        self.policy = self.policy_class(  # type: ignore[assignment]
            self.observation_space, self.action_space, self.lr_schedule, use_sde=self.use_sde, **self.policy_kwargs
        )
        self.policy = self.policy.to(self.device)
        # Warn when not using CPU with MlpPolicy
        self._maybe_recommend_cpu()

    def _maybe_recommend_cpu(self, mlp_class_name: str = "ActorCriticPolicy") -> None:
        """
        Recommend to use CPU only when using A2C/PPO with MlpPolicy.

        Emits a ``UserWarning`` when the device is not the CPU and the policy class
        name equals ``mlp_class_name``.

        :param mlp_class_name: The name of the class for the default MlpPolicy.
        """
        policy_class_name = self.policy_class.__name__
        if self.device != th.device("cpu") and policy_class_name == mlp_class_name:
            warnings.warn(
                f"You are trying to run {self.__class__.__name__} on the GPU, "
                "but it is primarily intended to run on the CPU when not using a CNN policy "
                f"(you are using {policy_class_name} which should be a MlpPolicy). "
                "See https://github.com/DLR-RM/stable-baselines3/issues/1245 "
                "for more info. "
                "You can pass `device='cpu'` or `export CUDA_VISIBLE_DEVICES=` to force using the CPU."
                "Note: The model will train, but the GPU utilization will be poor and "
                "the training might take longer than on CPU.",
                UserWarning,
            )

    def collect_rollouts(
        self,
        env: VecEnv,
        callback: BaseCallback,
        rollout_buffer: RolloutBuffer,
        n_rollout_steps: int,
    ) -> bool:
        """
        Collect experiences using the current policy and fill a ``RolloutBuffer``.
        The term rollout here refers to the model-free notion and should not
        be used with the concept of rollout used in model-based RL or planning.

        Each call also increments ``self.rollout_counter`` by one, before any
        other work is done.

        :param env: The training environment
        :param callback: Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: Buffer to fill with rollouts
        :param n_rollout_steps: Number of experiences to collect per environment
        :return: True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        # CUSTOM: count every rollout-collection attempt (kept ahead of the assert
        # below so the counter advances even if the precondition fails).
        self.rollout_counter += 1

        assert self._last_obs is not None, "No previous observation was provided"
        # Switch to eval mode (this affects batch norm / dropout)
        self.policy.set_training_mode(False)

        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor or to TensorDict
                obs_tensor = obs_as_tensor(self._last_obs, self.device)
                actions, values, log_probs = self.policy(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions

            if isinstance(self.action_space, spaces.Box):
                if self.policy.squash_output:
                    # Unscale the actions to match env bounds
                    # if they were previously squashed (scaled in [-1, 1])
                    clipped_actions = self.policy.unscale_action(clipped_actions)
                else:
                    # Otherwise, clip the actions to avoid out of bound error
                    # as we are sampling from an unbounded Gaussian distribution
                    clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

            new_obs, rewards, dones, infos = env.step(clipped_actions)

            self.num_timesteps += env.num_envs

            # Give access to local variables
            callback.update_locals(locals())
            if not callback.on_step():
                return False

            self._update_info_buffer(infos, dones)
            n_steps += 1

            if isinstance(self.action_space, spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)

            # Handle timeout by bootstrapping with value function
            # see GitHub issue #633
            for idx, done in enumerate(dones):
                if (
                    done
                    and infos[idx].get("terminal_observation") is not None
                    and infos[idx].get("TimeLimit.truncated", False)
                ):
                    terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0]
                    with th.no_grad():
                        terminal_value = self.policy.predict_values(terminal_obs)[0]  # type: ignore[arg-type]
                    rewards[idx] += self.gamma * terminal_value

            # Store the transition taken from ``self._last_obs`` (the observation the
            # action was computed from), not from ``new_obs``.
            rollout_buffer.add(
                self._last_obs,  # type: ignore[arg-type]
                actions,
                rewards,
                self._last_episode_starts,  # type: ignore[arg-type]
                values,
                log_probs,
            )
            self._last_obs = new_obs  # type: ignore[assignment]
            self._last_episode_starts = dones

        with th.no_grad():
            # Compute value for the last timestep
            values = self.policy.predict_values(obs_as_tensor(new_obs, self.device))  # type: ignore[arg-type]

        rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

        callback.update_locals(locals())

        callback.on_rollout_end()

        return True

    def train(self) -> None:
        """
        Consume current rollout data and update policy parameters.
        Implemented by individual algorithms.
        """
        raise NotImplementedError

    def _dump_logs(self, iteration: int) -> None:
        """
        Write log.

        :param iteration: Current logging iteration
        """
        assert self.ep_info_buffer is not None
        assert self.ep_success_buffer is not None

        # Elapsed wall-clock seconds, floored at machine epsilon to avoid dividing by zero.
        time_elapsed = max((time.time_ns() - self.start_time) / 1e9, sys.float_info.epsilon)
        fps = int((self.num_timesteps - self._num_timesteps_at_start) / time_elapsed)
        self.logger.record("time/iterations", iteration, exclude="tensorboard")
        if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
            self.logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
            self.logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
        self.logger.record("time/fps", fps)
        self.logger.record("time/time_elapsed", int(time_elapsed), exclude="tensorboard")
        self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
        if len(self.ep_success_buffer) > 0:
            self.logger.record("rollout/success_rate", safe_mean(self.ep_success_buffer))
        self.logger.dump(step=self.num_timesteps)

    def learn(
        self: SelfOnPolicyAlgorithm,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        tb_log_name: str = "OnPolicyAlgorithm",
        reset_num_timesteps: bool = True,
        progress_bar: bool = False,
    ) -> SelfOnPolicyAlgorithm:
        """
        Run the training loop: repeatedly collect a rollout and call :meth:`train`
        until ``self.num_timesteps`` reaches ``total_timesteps`` or
        :meth:`collect_rollouts` reports that the callback stopped it.

        :param total_timesteps: Passed to ``_setup_learn``; the value it returns is
            the timestep budget of the loop
        :param callback: Passed to ``_setup_learn``; the callback it returns is
            notified at training start/end and during rollouts
        :param log_interval: Logs are dumped every ``log_interval`` iterations;
            ``None`` disables the dump
        :param tb_log_name: Forwarded to ``_setup_learn``
        :param reset_num_timesteps: Forwarded to ``_setup_learn``
        :param progress_bar: Forwarded to ``_setup_learn``
        :return: ``self``
        """
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps,
            callback,
            reset_num_timesteps,
            tb_log_name,
            progress_bar,
        )

        callback.on_training_start(locals(), globals())

        assert self.env is not None

        while self.num_timesteps < total_timesteps:
            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)

            if not continue_training:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                assert self.ep_info_buffer is not None
                self._dump_logs(iteration)

            self.train()

        callback.on_training_end()

        return self

    def _get_torch_save_params(self) -> Tuple[List[str], List[str]]:
        """
        Return ``(["policy", "policy.optimizer"], [])``.

        NOTE(review): presumably consumed by the base class's save/load logic to
        decide which attributes are stored via their state dicts; the consumer is
        not visible in this file.
        """
        state_dicts = ["policy", "policy.optimizer"]

        return state_dicts, []

In [5]:
# Type variable bound to ``PPO``; used to annotate ``PPO.learn`` so that it is typed
# as returning the same (sub)class it was called on.
SelfPPO = TypeVar("SelfPPO", bound="PPO")


class PPO(OnPolicyAlgorithm):
    """
    Proximal Policy Optimization algorithm (PPO) (clip version)

    Paper: https://arxiv.org/abs/1707.06347
    Code: This implementation borrows code from OpenAI Spinning Up (https://github.com/openai/spinningup/)
    https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail and
    Stable Baselines (PPO2 from https://github.com/hill-a/stable-baselines)

    Introduction to PPO: https://spinningup.openai.com/en/latest/algorithms/ppo.html

    :param policy: The policy model to use (MlpPolicy, CnnPolicy, ...)
    :param env: The environment to learn from (if registered in Gym, can be str)
    :param learning_rate: The learning rate, it can be a function
        of the current progress remaining (from 1 to 0)
    :param n_steps: The number of steps to run for each environment per update
        (i.e. rollout buffer size is n_steps * n_envs where n_envs is number of environment copies running in parallel)
        NOTE: n_steps * n_envs must be greater than 1 (because of the advantage normalization)
        See https://github.com/pytorch/pytorch/issues/29372
    :param batch_size: Minibatch size
    :param n_epochs: Number of epoch when optimizing the surrogate loss
    :param gamma: Discount factor
    :param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
    :param clip_range: Clipping parameter, it can be a function of the current progress
        remaining (from 1 to 0).
    :param clip_range_vf: Clipping parameter for the value function,
        it can be a function of the current progress remaining (from 1 to 0).
        This is a parameter specific to the OpenAI implementation. If None is passed (default),
        no clipping will be done on the value function.
        IMPORTANT: this clipping depends on the reward scaling.
    :param normalize_advantage: Whether to normalize or not the advantage
    :param ent_coef: Entropy coefficient for the loss calculation
    :param vf_coef: Value function coefficient for the loss calculation
    :param max_grad_norm: The maximum value for the gradient clipping
    :param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
        instead of action noise exploration (default: False)
    :param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
        Default: -1 (only sample at the beginning of the rollout)
    :param rollout_buffer_class: Rollout buffer class to use. If ``None``, it will be automatically selected.
    :param rollout_buffer_kwargs: Keyword arguments to pass to the rollout buffer on creation
    :param target_kl: Limit the KL divergence between updates,
        because the clipping is not enough to prevent large update
        see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
        By default, there is no limit on the kl div.
    :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
        the reported success rate, mean episode length, and mean reward over
    :param tensorboard_log: the log location for tensorboard (if None, no logging)
    :param policy_kwargs: additional arguments to be passed to the policy on creation
    :param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
        debug messages
    :param seed: Seed for the pseudo random generators
    :param device: Device (cpu, cuda, ...) on which the code should be run.
        Setting it to auto, the code will be run on the GPU if possible.
    :param _init_setup_model: Whether or not to build the network at the creation of the instance
    """

    # String names mapped to the actor-critic policy classes imported at the top of
    # the file.
    # NOTE(review): presumably looked up by the base class when `policy` is passed
    # as a string; that lookup is not visible in this file.
    policy_aliases: ClassVar[Dict[str, Type[BasePolicy]]] = {
        "MlpPolicy": ActorCriticPolicy,
        "CnnPolicy": ActorCriticCnnPolicy,
        "MultiInputPolicy": MultiInputActorCriticPolicy,
    }

    def __init__(
        self,
        policy: Union[str, Type[ActorCriticPolicy]],
        env: Union[GymEnv, str],
        learning_rate: Union[float, Schedule] = 3e-4,
        n_steps: int = 2048,
        batch_size: int = 64,
        n_epochs: int = 10,
        gamma: float = 0.99,
        gae_lambda: float = 0.95,
        clip_range: Union[float, Schedule] = 0.2,
        clip_range_vf: Union[None, float, Schedule] = None,
        normalize_advantage: bool = True,
        ent_coef: float = 0.0,
        vf_coef: float = 0.5,
        max_grad_norm: float = 0.5,
        use_sde: bool = False,
        sde_sample_freq: int = -1,
        rollout_buffer_class: Optional[Type[RolloutBuffer]] = None,
        rollout_buffer_kwargs: Optional[Dict[str, Any]] = None,
        target_kl: Optional[float] = None,
        stats_window_size: int = 100,
        tensorboard_log: Optional[str] = None,
        policy_kwargs: Optional[Dict[str, Any]] = None,
        verbose: int = 0,
        seed: Optional[int] = None,
        device: Union[th.device, str] = "auto",
        _init_setup_model: bool = True,
    ):
        """
        Validate the PPO hyper-parameters, store them on the instance, optionally
        build the model, and initialise the custom diagnostic counters/histories.

        The parent constructor is always called with ``_init_setup_model=False``;
        the model is built here, after the PPO-specific attributes are set.
        """
        # Action spaces this PPO implementation declares support for.
        ppo_action_spaces = (
            spaces.Box,
            spaces.Discrete,
            spaces.MultiDiscrete,
            spaces.MultiBinary,
        )
        super().__init__(
            policy,
            env,
            learning_rate=learning_rate,
            n_steps=n_steps,
            gamma=gamma,
            gae_lambda=gae_lambda,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            use_sde=use_sde,
            sde_sample_freq=sde_sample_freq,
            rollout_buffer_class=rollout_buffer_class,
            rollout_buffer_kwargs=rollout_buffer_kwargs,
            stats_window_size=stats_window_size,
            tensorboard_log=tensorboard_log,
            policy_kwargs=policy_kwargs,
            verbose=verbose,
            device=device,
            seed=seed,
            _init_setup_model=False,
            supported_action_spaces=ppo_action_spaces,
        )

        # Advantage normalization over a single sample would produce noisy
        # gradients / NaN, hence the lower bound on the mini-batch size.
        if normalize_advantage:
            assert (
                batch_size > 1
            ), "`batch_size` must be greater than 1. See https://github.com/DLR-RM/stable-baselines3/issues/440"

        if self.env is not None:
            # Total number of transitions held by one rollout: `n_steps * n_envs`.
            buffer_size = self.env.num_envs * self.n_steps
            assert (
                not normalize_advantage
            ) or buffer_size > 1, f"`n_steps * n_envs` must be greater than 1. Currently n_steps={self.n_steps} and n_envs={self.env.num_envs}"
            # Warn when the rollout does not split evenly into mini-batches.
            untruncated_batches, leftover = divmod(buffer_size, batch_size)
            if leftover > 0:
                warnings.warn(
                    f"You have specified a mini-batch size of {batch_size},"
                    f" but because the `RolloutBuffer` is of size `n_steps * n_envs = {buffer_size}`,"
                    f" after every {untruncated_batches} untruncated mini-batches,"
                    f" there will be a truncated mini-batch of size {buffer_size % batch_size}\n"
                    f"We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.\n"
                    f"Info: (n_steps={self.n_steps} and n_envs={self.env.num_envs})"
                )

        # PPO-specific hyper-parameters.
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.clip_range = clip_range
        self.clip_range_vf = clip_range_vf
        self.normalize_advantage = normalize_advantage
        self.target_kl = target_kl

        if _init_setup_model:
            self._setup_model()

        # ---- CUSTOM diagnostics -------------------------------------------
        # Counters: calls to `train`, mini-batch iterations, and epochs started.
        self.train_counter = 0
        self.update_model_loop = 0
        self.epoch_loops = 0

        # Per-rollout statistics appended by `train`.
        self.avg_rollout_rew = []
        self.avg_rollout_adv_mean = []
        self.avg_rollout_adv_var = []

        # Loss histories appended by `train`.
        self.total_losses = []
        self.pg_losses = []
        self.value_losses = []
        self.entropy_losses = []

        # Further training diagnostics.
        self.policy_norm = []
        self.approx_kl_div = []
        self.clip_fraction = []
        self.explained_var = []

        # Gradient norms measured before and after clipping.
        self.grad_norm_before = []
        self.grad_norm_after = []

    def _setup_model(self) -> None:
        """
        Build the model via the parent class, then turn ``clip_range`` (and
        ``clip_range_vf`` when it is not ``None``) into schedule functions.

        A numeric ``clip_range_vf`` must be strictly positive.
        """
        super()._setup_model()

        # Policy clipping is always converted to a schedule.
        self.clip_range = get_schedule_fn(self.clip_range)

        # Value-function clipping is optional: nothing more to do when disabled.
        vf_clip = self.clip_range_vf
        if vf_clip is None:
            return

        if isinstance(vf_clip, (float, int)):
            assert vf_clip > 0, "`clip_range_vf` must be positive, " "pass `None` to deactivate vf clipping"

        self.clip_range_vf = get_schedule_fn(vf_clip)

    def train(self) -> None:
        """
        Update policy using the currently gathered rollout buffer.

        Runs up to ``self.n_epochs`` passes over the rollout buffer in mini-batches
        of ``self.batch_size``, optimising the clipped surrogate objective plus the
        value and entropy terms. Training stops early when ``self.target_kl`` is
        set and the approximate KL divergence of a mini-batch exceeds
        ``1.5 * self.target_kl``.

        Besides the logger records, this method appends diagnostics to the
        instance:

        - once per call: ``avg_rollout_rew``, ``avg_rollout_adv_mean``,
          ``avg_rollout_adv_var`` and ``explained_var``;
        - once per mini-batch, across all epochs of this call: ``total_losses``,
          ``pg_losses``, ``value_losses``, ``entropy_losses``, ``approx_kl_div``
          and ``clip_fraction``;
        - once per optimizer step: ``grad_norm_before`` and ``grad_norm_after``.

        It also increments ``train_counter`` (per call), ``epoch_loops`` (per
        epoch) and ``update_model_loop`` (per mini-batch).
        """
        self.train_counter += 1

        # Debug aid: uncomment to print the rollout buffer before the update.
        # self.display_rollout_buffer(self.rollout_buffer)

        # Rollout-level statistics; `calculate_metrices` is defined outside the
        # part of the file visible here.
        avg_rollout_rew, avg_rollout_adv_mean, avg_rollout_adv_var = self.calculate_metrices(
            self.rollout_buffer.rewards, self.rollout_buffer.episode_starts, self.rollout_buffer.advantages
        )
        self.avg_rollout_rew.append(avg_rollout_rew)
        self.avg_rollout_adv_mean.append(avg_rollout_adv_mean)
        self.avg_rollout_adv_var.append(avg_rollout_adv_var)

        # Switch to train mode (this affects batch norm / dropout)
        self.policy.set_training_mode(True)
        # Update optimizer learning rate
        self._update_learning_rate(self.policy.optimizer)
        # Compute current clip range
        clip_range = self.clip_range(self._current_progress_remaining)  # type: ignore[operator]
        # Optional: clip range for the value function
        if self.clip_range_vf is not None:
            clip_range_vf = self.clip_range_vf(self._current_progress_remaining)  # type: ignore[operator]

        entropy_losses = []
        pg_losses, value_losses = [], []
        clip_fractions = []
        grad_norm_before, grad_norm_after = [], []
        total_losses = []
        # KL values of every mini-batch of this call. `approx_kl_divs` below is
        # reset at each epoch (the logger reports the last epoch only), so a
        # separate list is needed for the history to cover all epochs like the
        # other per-mini-batch histories do.
        all_approx_kl_divs = []

        continue_training = True
        # train for n_epochs epochs
        for epoch in range(self.n_epochs):

            self.epoch_loops += 1

            approx_kl_divs = []
            # Do a complete pass on the rollout buffer
            for rollout_data in self.rollout_buffer.get(self.batch_size):

                self.update_model_loop += 1

                actions = rollout_data.actions
                if isinstance(self.action_space, spaces.Discrete):
                    # Convert discrete action from float to long
                    actions = rollout_data.actions.long().flatten()

                values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)
                values = values.flatten()
                # Normalize advantage
                advantages = rollout_data.advantages
                # Normalization does not make sense if mini batchsize == 1, see GH issue #325
                if self.normalize_advantage and len(advantages) > 1:
                    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

                # ratio between old and new policy, should be one at the first iteration
                ratio = th.exp(log_prob - rollout_data.old_log_prob)

                # clipped surrogate loss
                policy_loss_1 = advantages * ratio
                policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
                policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()

                # Logging
                pg_losses.append(policy_loss.item())
                clip_fraction = th.mean((th.abs(ratio - 1) > clip_range).float()).item()
                clip_fractions.append(clip_fraction)

                if self.clip_range_vf is None:
                    # No clipping
                    values_pred = values
                else:
                    # Clip the difference between old and new value
                    # NOTE: this depends on the reward scaling
                    values_pred = rollout_data.old_values + th.clamp(
                        values - rollout_data.old_values, -clip_range_vf, clip_range_vf
                    )
                # Value loss using the TD(gae_lambda) target
                value_loss = F.mse_loss(rollout_data.returns, values_pred)
                value_losses.append(value_loss.item())

                # Entropy loss favor exploration
                if entropy is None:
                    # Approximate entropy when no analytical form
                    entropy_loss = -th.mean(-log_prob)
                else:
                    entropy_loss = -th.mean(entropy)

                entropy_losses.append(entropy_loss.item())

                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss
                total_losses.append(loss.item())

                # Calculate approximate form of reverse KL Divergence for early stopping
                # see issue #417: https://github.com/DLR-RM/stable-baselines3/issues/417
                # and discussion in PR #419: https://github.com/DLR-RM/stable-baselines3/pull/419
                # and Schulman blog: http://joschu.net/blog/kl-approx.html
                with th.no_grad():
                    log_ratio = log_prob - rollout_data.old_log_prob
                    approx_kl_div = th.mean((th.exp(log_ratio) - 1) - log_ratio).cpu().numpy()
                    approx_kl_divs.append(approx_kl_div)
                    all_approx_kl_divs.append(approx_kl_div)

                if self.target_kl is not None and approx_kl_div > 1.5 * self.target_kl:
                    # NOTE: the loss/KL lists already contain this mini-batch, but
                    # no optimizer step is taken, so no gradient norms are recorded
                    # for it.
                    continue_training = False
                    if self.verbose >= 1:
                        print(f"Early stopping at step {epoch} due to reaching max kl: {approx_kl_div:.2f}")
                    break

                # Optimization step
                self.policy.optimizer.zero_grad()
                loss.backward()

                # Gradient norm before clipping
                grad_norm_before.append(self._grad_l2_norm(self.policy.parameters()))

                # Clip grad norm
                th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)

                # Gradient norm after clipping
                grad_norm_after.append(self._grad_l2_norm(self.policy.parameters()))

                self.policy.optimizer.step()

            self._n_updates += 1
            if not continue_training:
                break

        explained_var = explained_variance(self.rollout_buffer.values.flatten(), self.rollout_buffer.returns.flatten())

        # CUSTOM: accumulate this call's diagnostics into the instance histories.
        self.total_losses.extend(total_losses)
        self.pg_losses.extend(pg_losses)
        self.value_losses.extend(value_losses)
        self.entropy_losses.extend(entropy_losses)
        self.approx_kl_div.extend(all_approx_kl_divs)
        self.clip_fraction.extend(clip_fractions)
        self.grad_norm_before.extend(grad_norm_before)
        self.grad_norm_after.extend(grad_norm_after)
        self.explained_var.append(explained_var)

        # Logs
        self.logger.record("train/entropy_loss", np.mean(entropy_losses))
        self.logger.record("train/policy_gradient_loss", np.mean(pg_losses))
        self.logger.record("train/value_loss", np.mean(value_losses))
        # Mean KL of the last epoch that ran (the list is reset at each epoch).
        self.logger.record("train/approx_kl", np.mean(approx_kl_divs))
        self.logger.record("train/clip_fraction", np.mean(clip_fractions))
        # Loss of the last mini-batch processed.
        self.logger.record("train/loss", loss.item())
        self.logger.record("train/explained_variance", explained_var)
        if hasattr(self.policy, "log_std"):
            self.logger.record("train/std", th.exp(self.policy.log_std).mean().item())

        self.logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
        self.logger.record("train/clip_range", clip_range)
        if self.clip_range_vf is not None:
            self.logger.record("train/clip_range_vf", clip_range_vf)

    @staticmethod
    def _grad_l2_norm(parameters) -> float:
        """
        Return the L2 norm of the gradients of ``parameters`` taken together, i.e.
        the square root of the sum of the squared per-parameter gradient L2 norms.

        Parameters whose ``grad`` is ``None`` are skipped.

        :param parameters: Iterable of parameters (objects with a ``grad`` attribute)
        :return: The combined gradient norm as a Python float
        """
        squared_sum = 0.0
        for param in parameters:
            if param.grad is not None:
                squared_sum += param.grad.data.norm(2).item() ** 2
        return squared_sum ** 0.5

    def learn(
        self: SelfPPO,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        tb_log_name: str = "PPO",
        reset_num_timesteps: bool = True,
        progress_bar: bool = False,
    ) -> SelfPPO:
        """
        Run training by delegating to the parent class' ``learn``.

        Every argument is passed through unchanged, by keyword; this override
        supplies ``"PPO"`` as the default for ``tb_log_name``. See the parent
        implementation for the meaning of each argument.

        :return: whatever the parent ``learn`` returns
        """
        # Collect the arguments once so the delegation below is a single call.
        forwarded = dict(
            total_timesteps=total_timesteps,
            callback=callback,
            log_interval=log_interval,
            tb_log_name=tb_log_name,
            reset_num_timesteps=reset_num_timesteps,
            progress_bar=progress_bar,
        )
        return super().learn(**forwarded)
    
    def display_rollout_buffer(self, rollout_buffer):
        """
        Print every step stored in ``rollout_buffer`` as one untruncated table.

        One table row is produced per buffer index. A marker row whose first
        column reads ``'New Episode'`` is inserted in front of every step whose
        ``episode_starts`` entry is set.

        The pandas display options needed to print the table without wrapping
        or truncation are applied only while printing, so the notebook's global
        pandas settings are left as they were.

        :param rollout_buffer: object exposing ``buffer_size`` and the indexable
            fields ``observations``, ``actions``, ``rewards``, ``returns``,
            ``episode_starts``, ``values``, ``log_probs`` and ``advantages``
        """
        fields = (
            'observations', 'actions', 'rewards', 'returns',
            'episode_starts', 'values', 'log_probs', 'advantages',
        )

        print("\nRollout Buffer Contents:")

        # Prepare data for tabulation
        data = []
        for i in range(rollout_buffer.buffer_size):
            # np.any() instead of bool(): bool() raises on an array with more
            # than one element. Presumably episode_starts[i] holds one flag per
            # parallel environment -- confirm against the buffer class.
            if np.any(rollout_buffer.episode_starts[i]):
                marker = dict.fromkeys(fields, '')
                marker['observations'] = 'New Episode'
                data.append(marker)
            data.append({name: getattr(rollout_buffer, name)[i] for name in fields})

        df = pd.DataFrame(data)
        # Scoped options: restored automatically when the block exits.
        with pd.option_context(
            'display.expand_frame_repr', False,  # Don't wrap to multiple lines
            'display.max_colwidth', None,        # Don't truncate column contents
            'display.width', None,
            'display.max_rows', None,
            'display.max_columns', None,
        ):
            print(df)
        print('')
        print('')
    
    def calculate_metrices(self, rewards, episode_starts, advantages):
        """
        Average per-segment reward and advantage statistics over a sequence of steps.

        The three inputs are flattened and cut into consecutive segments. A new
        segment begins at index 0 and at every index where ``episode_starts``
        equals 1; the last segment always runs to the end of the data. For each
        segment the reward sum, the advantage mean and the advantage variance
        are computed, and the three per-segment lists are then averaged.

        NOTE(review): flattening a 2-D input interleaves its columns. If the
        inputs carry one column per parallel environment, segments would mix
        environments -- confirm that callers only pass single-environment data.

        :param rewards: array-like of per-step rewards
        :param episode_starts: array-like of per-step flags, 1 where a new episode starts
        :param advantages: array-like of per-step advantages
        :return: tuple ``(average_reward, average_adv_mean, average_adv_var)``;
            all three are NaN when the input is empty
        """
        # Ensure inputs are flat numpy arrays
        rewards = np.asarray(rewards).flatten()
        episode_starts = np.asarray(episode_starts).flatten()
        advantages = np.asarray(advantages).flatten()

        # Nothing to average: return NaNs instead of failing on episode_starts[0].
        if rewards.size == 0 or episode_starts.size == 0:
            return np.nan, np.nan, np.nan

        # Segment boundaries: every flagged index, plus index 0 if it is not flagged.
        boundaries = np.flatnonzero(episode_starts == 1)
        if boundaries.size == 0 or boundaries[0] != 0:
            boundaries = np.append(0, boundaries)

        # Always close the final segment at the end of the data. Skipping this
        # when the last step is itself an episode start would drop that step.
        boundaries = np.append(boundaries, len(rewards))

        episode_rewards = []
        advantage_means = []
        advantage_vars = []
        for start, end in zip(boundaries[:-1], boundaries[1:]):
            segment_adv = advantages[start:end]
            episode_rewards.append(np.sum(rewards[start:end]))
            advantage_means.append(np.mean(segment_adv))
            advantage_vars.append(np.var(segment_adv))

        # Average the per-segment statistics
        average_reward = np.mean(episode_rewards)
        average_adv_mean = np.mean(advantage_means)
        average_adv_var = np.mean(advantage_vars)

        return average_reward, average_adv_mean, average_adv_var


In [6]:
def plot_training_results(model):
    """
    Plot the training diagnostics stored on ``model`` in a 3x4 grid of panels.

    The following attributes of ``model`` are read and passed to ``Axes.plot``:
    ``total_losses``, ``pg_losses``, ``value_losses``, ``entropy_losses``,
    ``avg_rollout_rew``, ``grad_norm_before``, ``grad_norm_after``,
    ``clip_fraction``, ``approx_kl_div``, ``explained_var``,
    ``avg_rollout_adv_mean`` and ``avg_rollout_adv_var``.

    Panel 7 is a titled placeholder without data, and the unused twelfth panel
    is hidden. The figure is shown with ``plt.show()``; nothing is returned.

    :param model: object carrying the attributes listed above
    """
    per_update = 'Mini-Batch Update'
    per_rollout = '# of Rollout Buffers filled (total_timesteps/n_steps)'

    fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(18,10))
    axs = axs.flatten()

    axs[0].plot(model.total_losses, color='blue')
    axs[0].set_title('Total Loss')
    axs[0].set_ylabel('Value')
    axs[0].set_xlabel(per_update)

    axs[1].plot(model.pg_losses, color='blue')
    axs[1].set_title('Policy Loss')
    axs[1].set_xlabel(per_update)

    axs[2].plot(model.value_losses, color='blue')
    axs[2].set_title('Value Loss')
    axs[2].set_xlabel(per_update)

    axs[3].plot(model.entropy_losses, color='blue')
    axs[3].set_title('Entropy Loss')
    axs[3].set_xlabel(per_update)

    axs[4].plot(model.avg_rollout_rew, color='blue')
    axs[4].set_title('Avg Reward per Sequence')
    axs[4].set_xlabel('# of Rollout Buffer filled (total_timesteps/n_steps)')
    axs[4].set_ylabel('Reward')

    # Gradient norms are appended once per mini-batch (just before each
    # optimizer step), so the x axis counts mini-batch updates, not rollouts.
    axs[5].plot(model.grad_norm_before, color='blue', label='before clipping')
    axs[5].plot(model.grad_norm_after, color='red', label='after clipping')
    axs[5].set_title('Gradient Norm')
    axs[5].set_xlabel(per_update)
    axs[5].set_ylabel('Norm')
    axs[5].legend()

    axs[6].plot(model.clip_fraction, color='blue')
    axs[6].set_title('Clip Fraction (how many times it was applied)')
    axs[6].set_xlabel(per_update)
    axs[6].set_ylabel('Ratio in %')

    # Placeholder panel: no data is plotted here yet.
    #axs[7].plot(model.kl_div, color='blue')
    axs[7].set_title('KL Div. real vs. sim sequences')
    axs[7].set_xlabel('')
    axs[7].set_ylabel('')

    # x axis is the update index, y axis the plotted value.
    axs[8].plot(model.approx_kl_div, color='blue')
    axs[8].set_title('Approx. KL Div. old vs. new policy')
    axs[8].set_xlabel(per_update)
    axs[8].set_ylabel('Value')

    # One explained-variance value is stored per train() call, i.e. per rollout.
    axs[9].plot(model.explained_var, color='blue')
    axs[9].set_title('Explained Variance')
    axs[9].set_xlabel(per_rollout)
    axs[9].set_ylabel('Value')
    axs[9].set_ylim(-0.2,1.2)

    axs[10].plot(model.avg_rollout_adv_mean, color='blue', label='mean')
    axs[10].plot(model.avg_rollout_adv_var, color='red', label='variance')
    axs[10].set_title('Advantage Mean & Variance')
    axs[10].set_xlabel(per_rollout)
    axs[10].set_ylabel('Value')
    axs[10].legend()

    # The 3x4 grid has one panel more than is used; hide its empty frame.
    # (Advantage histograms from model.adv_hist1 / model.adv_hist2 are not
    # plotted; they would need additional panels.)
    axs[11].set_axis_off()

    plt.tight_layout()
    plt.show()

def train_and_evaluate(env, lr, bs, total_timesteps):
    """
    Train a PPO model on ``env``, evaluate it, print the score and plot diagnostics.

    A ``PPO`` model with the ``"MlpPolicy"`` policy is built with the given
    learning rate and batch size (all other hyperparameters are fixed below),
    trained for ``total_timesteps``, evaluated with ``evaluate_policy`` over
    100 episodes on the same ``env``, and its training curves are drawn with
    ``plot_training_results``.

    ``env`` is closed before this function returns, including when any of the
    steps above raises.

    :param env: environment handed to ``PPO`` and ``evaluate_policy``; closed on exit
    :param lr: value passed to ``PPO`` as ``learning_rate``
    :param bs: value passed to ``PPO`` as ``batch_size``
    :param total_timesteps: value passed to ``model.learn``
    """
    try:
        model = PPO(
            policy="MlpPolicy",
            env=env,
            verbose=0,

            n_steps=2048,
            batch_size=bs,
            n_epochs=10,

            learning_rate=lr,
            gamma=0.99,
            gae_lambda=0.95,
            clip_range=0.2,
            clip_range_vf=None,
            ent_coef=0.0,
            vf_coef=0.5,
            max_grad_norm=0.5,
        )

        model.learn(total_timesteps=total_timesteps)

        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
        print(f"Batch Size: {bs} Learning rate: {lr}")
        print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
        print("---")
        plot_training_results(model)
    finally:
        # Release the environment even if training or evaluation failed.
        env.close()



#### PPO Parameters Explanation

##### Key Parameters

- **total_timesteps**: Total number of environment interactions
- **n_steps**: Number of environment interactions filled in the rollout buffer
- **batch_size**: Mini-batch size used for each model update (every mini-batch of batch_size samples is drawn from the n_steps transitions stored in the rollout buffer)
- **n_epochs**: Number of passes over the same rollout buffer; each pass splits the buffer into n_steps / batch_size mini-batches and performs one model update per mini-batch

##### Relationships and Calculations

- **Rollout Buffer Size** = n_steps
- **Rollout Buffers Loop** = total_timesteps / n_steps
- **Train Loop** = total_timesteps / n_steps
- **N_epoch Loop** = Train Loop * n_epochs
- **Number of Updates** = Train Loop * n_epochs * (n_steps / batch_size)

The parameters batch_size, learning_rate, and n_epochs play a crucial role in training performance.

In [7]:
# total_timesteps:  total # of environment interactions
# n_steps:          # of environment interactions filled in the rollout buffer
# batch_size:       mini-batch size used for each model update (every mini-batch is drawn from the n_steps transitions in the rollout buffer)
# n_epochs:         # of passes over the same rollout buffer; each pass performs n_steps / batch_size mini-batch updates

# Rollout Buffer Size = n_steps
# Rollout Buffers Loop = total_timesteps / n_steps
# Train Loop = total_timesteps / n_steps
# N_epoch Loop = Train Loop * n_epochs
# Number of Updates = Train Loop * n_epochs * (n_steps / batch_size)

# print('# of Train Loops: ', model.train_counter)
# print('# of Rollout Buffer filling: ',model.rollout_counter)
# print('# of N_epochs', model.epoch_loops)
# print('# of updates', model.update_model_loop)

**Pendulum-v1**: Learning rate: **3e-4** Batch Size: **32**, total timesteps: **500'000**

**FrozenLake-v1**: Learning rate: **5e-4** Batch Size: **64**, total timesteps: **200'000**

**Taxi-v3**: Learning rate: **5e-4** Batch Size: **64**, total timesteps: **50'000**

**LunarLander-v2**: Learning rate: **7e-4** Batch Size: **64**, total timesteps: **300'000**

In [None]:
# Grid search over mini-batch sizes (outer) and learning rates (inner).
# Each combination gets a freshly created LunarLander-v2 environment and is
# trained, evaluated and plotted by train_and_evaluate.
batch_sizes = [32,64,128,256]
learning_rates = [5e-5, 1e-4, 5e-4, 1e-3, 5e-3]

for bs, lr in [(size, rate) for size in batch_sizes for rate in learning_rates]:
    train_and_evaluate(gym.make("LunarLander-v2"), lr, bs, total_timesteps=200000)

# # After finding the best lr, you can train for longer
# best_lr = 3e-4  # Replace with the best lr you found
# env = gym.make('Pendulum-v1')
# model = PPO("MlpPolicy", env, learning_rate=best_lr, n_steps=2048, batch_size=64, n_epochs=10, verbose=0)
# model.learn(total_timesteps=1000000)

# # Evaluate the final model
# mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20)
# print(f"Final model performance:")
# print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

#env.close()