In [6]:
%autosave 0

Autosave disabled


## Idea.
We will use as much class inheritance as possible so we are making few changes.

We will be copying from the source code we downloaded for version 1.8.0 of Stable Baselines3.

## Create custom environment.

In [7]:
import gym
from gym.spaces import MultiDiscrete
from gym.spaces import Discrete
from gym.spaces import Box
import numpy as np
from numpy.random import default_rng
from stable_baselines3.common.env_checker import check_env

# https://www.youtube.com/watch?v=R5S2FmtFnt8&ab_channel=DibyaChakravorty
class CustomEnv(gym.Env):
    def __init__(self):
        # Need to define observation and action space.
        action_lows = np.array([1, 0, 0, 0.05])
        action_highs = np.array([4, 2, 4, 10])
        self.action_space = Box(low=action_lows, high=action_highs)

        # Observation space will have just t and r. Both are floats. t is in [0, 100] and r is in [-100, 100].
        obs_lows = np.array([0, -100])
        obs_highs = np.array([100, 100])
        self.observation_space = Box(low=obs_lows, high=obs_highs)

        # Also declare a random number generator.
        self.rng = default_rng()

        # This will always store the current observation.
        self.current_obs = None
    
    def reset(self):
        # The reset() method will initialise an episode (fix the problem parameters by sampling from the parameter space).
        # Each episode, we will have different starting observation space.
        normal = min(100, max(0, self.rng.normal(loc=48.57142857, scale=15.89249598)))
        uniform = self.rng.uniform(-100, 100)

        # Return the first (initial) observation.
        self.current_obs = np.array([normal, uniform], dtype="float32")

        return self.current_obs
    
    def step(self, action):
        # The input "action" will be a numpy array like np.array([2, 3, 2, 4]).
        c_r = action[0]
        m_r = action[1]
        s_r = action[2]
        p = action[3]

        normal, uniform = self.current_obs

        # Need to calculate the next observation (even though wont need it).
        # Dont have randomness, have a deterministic value for r_gain and prof as just using regression. Sals is bounded by 0.
        sals = max(0, 44.06153 + -4.42618 * p + 0.167923 * uniform + -0.31689 * normal)

        r_per_sal = -0.13906 + 0.067546 * c_r + 0.266495 * m_r + 0.097398 * s_r + -0.20013 * p
        r_at_end = min(100, max(-100, uniform + sals * r_per_sal))
        r_gain = r_at_end - uniform

        prof = sals * p - sals * (0.3 * c_r + 0.09 * m_r + 0.16 * s_r + 0.12)

        self.current_obs = np.array([normal, r_at_end], dtype="float32")

        # Need to calculate reward. Will just have 1 reward for now, prof.
        reward = prof

        # Need to compute done (always True).
        done = True

        return self.current_obs, reward, done, {}

In [9]:
from stable_baselines3 import PPO

env = CustomEnv()
env.reset()

model = PPO("MlpPolicy", env, verbose=1)

TIMESTEPS = 100
iters = 0
while True:
    iters += 1
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name=f"PPO")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -4.52    |
| time/              |          |
|    fps             | 1688     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------


KeyboardInterrupt: 

In [11]:
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy

env = CustomEnv()
env.reset()

model = PPO(ActorCriticPolicy, env, verbose=1)

TIMESTEPS = 100
iters = 0
while True:
    iters += 1
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name=f"PPO")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -5.23    |
| time/              |          |
|    fps             | 1800     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------


KeyboardInterrupt: 

## Custom PPO class.
This class will be exactly the same (via inheritance) but we need to change the MlpExtractor class to one of my own.

In [20]:
import torch as th
from torch import nn
from typing import Dict, List, Tuple, Type, Union

class CustomMlpExtractor(nn.Module):
    def __init__(
        self,
        feature_dim: int,
        net_arch: Union[List[int], Dict[str, List[int]]],
        activation_fn: Type[nn.Module],
        device: Union[th.device, str] = "auto",
    ) -> None:
        super().__init__()
        device = get_device(device)
        policy_net: List[nn.Module] = []
        value_net: List[nn.Module] = []
        last_layer_dim_pi = feature_dim
        last_layer_dim_vf = feature_dim
        
        print("In CustomMlpExtractor")

        # save dimensions of layers in policy and value nets
        if isinstance(net_arch, dict):
            # Note: if key is not specificed, assume linear network
            pi_layers_dims = net_arch.get("pi", [])  # Layer sizes of the policy network
            vf_layers_dims = net_arch.get("vf", [])  # Layer sizes of the value network
        else:
            pi_layers_dims = vf_layers_dims = net_arch
        # Iterate through the policy layers and build the policy net
        for curr_layer_dim in pi_layers_dims:
            policy_net.append(nn.Linear(last_layer_dim_pi, curr_layer_dim))
            policy_net.append(activation_fn())
            last_layer_dim_pi = curr_layer_dim
        # Iterate through the value layers and build the value net
        for curr_layer_dim in vf_layers_dims:
            value_net.append(nn.Linear(last_layer_dim_vf, curr_layer_dim))
            value_net.append(activation_fn())
            last_layer_dim_vf = curr_layer_dim

        # Save dim, used to create the distributions
        self.latent_dim_pi = last_layer_dim_pi
        self.latent_dim_vf = last_layer_dim_vf

        # Create networks
        # If the list of layers is empty, the network will just act as an Identity module
        self.policy_net = nn.Sequential(*policy_net).to(device)
        self.value_net = nn.Sequential(*value_net).to(device)

    def forward(self, features: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
        """
        :return: latent_policy, latent_value of the specified network.
            If all layers are shared, then ``latent_policy == latent_value``
        """
        return self.forward_actor(features), self.forward_critic(features)

    def forward_actor(self, features: th.Tensor) -> th.Tensor:
        return self.policy_net(features)

    def forward_critic(self, features: th.Tensor) -> th.Tensor:
        return self.value_net(features)

In [49]:
from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, FlattenExtractor)
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
from gym import spaces
from stable_baselines3.common.type_aliases import Schedule
from stable_baselines3.common.utils import get_device

class CustomActorCriticPolicy(ActorCriticPolicy):
    def __init__(
        self,
        observation_space: spaces.Space,
        action_space: spaces.Space,
        lr_schedule: Schedule,
        net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None,
        activation_fn: Type[nn.Module] = nn.Tanh,
        ortho_init: bool = True,
        use_sde: bool = False,
        log_std_init: float = 0.0,
        full_std: bool = True,
        use_expln: bool = False,
        squash_output: bool = False,
        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        share_features_extractor: bool = True,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None
    ):
        if optimizer_kwargs is None:
            optimizer_kwargs = {}
            # Small values to avoid NaN in Adam optimizer
            if optimizer_class == th.optim.Adam:
                optimizer_kwargs["eps"] = 1e-5
        super().__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch,
            activation_fn,
            ortho_init,
            use_sde,
            log_std_init,
            full_std,
            use_expln,
            squash_output,
            features_extractor_class,
            features_extractor_kwargs,
            share_features_extractor,
            normalize_images,
            optimizer_class,
            optimizer_kwargs
        )
    
    def _build_mlp_extractor(self) -> None:
        """
        Create the policy and value networks.
        Part of the layers can be shared.
        """
        self.mlp_extractor = CustomMlpExtractor(
            self.features_dim,
            net_arch=self.net_arch,
            activation_fn=self.activation_fn,
            device=self.device,
        )

In [50]:
from stable_baselines3 import PPO

env = CustomEnv()
env.reset()

model = PPO(CustomActorCriticPolicy, env, verbose=1)

TIMESTEPS = 100
iters = 0
while True:
    iters += 1
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name=f"PPO")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
In CustomMlpExtractor
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -5.91    |
| time/              |          |
|    fps             | 1824     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 10.3     |
| time/              |          |
|    fps             | 1735     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 4096     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 16       |
| time/              |          |
|    fps             | 1757     |


KeyboardInterrupt: 

In [51]:
print(model.policy.mlp_extractor.policy_net)
print(model.policy.mlp_extractor.value_net)

Sequential(
  (0): Linear(in_features=2, out_features=64, bias=True)
  (1): Tanh()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): Tanh()
)
Sequential(
  (0): Linear(in_features=2, out_features=64, bias=True)
  (1): Tanh()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): Tanh()
)
