# **Inroduction:**

This notebook trains a reinforcement learning agent using the Proximal Policy Optimization (PPO) algorithm to interact with a specific environment, called "HVAC" (just for example), from the pyRDDLGym package.

In [1]:
!git clone https://github.com/ataitler/pyRDDLGym.git




Cloning into 'pyRDDLGym'...
remote: Enumerating objects: 10142, done.[K
remote: Counting objects: 100% (741/741), done.[K
remote: Compressing objects: 100% (290/290), done.[K
remote: Total 10142 (delta 502), reused 676 (delta 447), pack-reused 9401[K
Receiving objects: 100% (10142/10142), 7.49 MiB | 16.46 MiB/s, done.
Resolving deltas: 100% (6826/6826), done.


In [2]:
%cd /content/pyRDDLGym

/content/pyRDDLGym


In [3]:
!pip install -r requirements.txt



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pillow>=9.2.0 (from -r requirements.txt (line 2))
  Downloading Pillow-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Collecting ply (from -r requirements.txt (line 6))
  Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ply, pillow
  Attempting uninstall: pillow
    Found existing installation: Pillow 8.4.0
    Uninstalling Pillow-8.4.0:
      Successfully uninstalled Pillow-8.4.0
Successfully installed pillow-9.5.0 ply-3.11


In [4]:
!pip install stable-baselines3==2.0.0a5 shimmy>=0.2.1 matplotlib>=3.5.0 pillow>=9.2.0 gym>=0.24.0 numpy>=1.22 pygame ply

In [5]:

from stable_baselines3 import PPO
import gym
from pyRDDLGym import RDDLEnv
from pyRDDLGym import ExampleManager

  if not hasattr(tensorboard, "__version__") or LooseVersion(
  x = re.search("instance\d+.*", file)


In [6]:
from gym import spaces

  and should_run_async(code)


In [7]:
from stable_baselines3.common.vec_env import DummyVecEnv

In [8]:
class FlatActionWrapper(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        if isinstance(env.action_space, spaces.Box):
            self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(self.flat_dim(env.action_space),))
        else:
            self.action_space = env.action_space

    @staticmethod
    def flat_dim(action_space):
        if isinstance(action_space, spaces.Box):
            return np.prod(action_space.shape)
        elif isinstance(action_space, spaces.Discrete):
            return action_space.n
        elif isinstance(action_space, spaces.Dict):
            return sum([FlatActionWrapper.flat_dim(action_space[key]) for key in action_space.spaces.keys()])
        else:
            raise ValueError(f"Unexpected action space: {action_space}")

    def step(self, action):
        action = FlatActionWrapper.unflatten_action(self.env.action_space, action)
        return super().step(action)

    @staticmethod
    def unflatten_action(action_space, action):
        if isinstance(action_space, spaces.Box):
            return action.reshape(action_space.shape)
        elif isinstance(action_space, spaces.Discrete):
            return int(action)
        elif isinstance(action_space, spaces.Dict):
            unflattened_action = {}
            start = 0
            for key, subspace in action_space.spaces.items():
                dim = FlatActionWrapper.flat_dim(subspace)
                unflattened_action[key] = FlatActionWrapper.unflatten_action(subspace, action[start:start+dim])
                start += dim
            return unflattened_action


class MyRDDLAgent:
    def __init__(self, env=None, action_space=None, num_actions=3, seed=None):
        if env is not None:
            self.set_env(env, action_space)

    def set_env(self, env, action_space):
        self.original_env = env
        self.env = FlatActionWrapper(env)
        self.action_space = action_space
        self.model = PPO("MultiInputPolicy", self.env, verbose=1)

    def train(self, total_timesteps):
        self.model.learn(total_timesteps)

    def predict(self, state):
        flat_action, _ = self.model.predict(state, deterministic=True)
        action = FlatActionWrapper.unflatten_action(self.original_env.action_space, flat_action)
        return action


In [10]:
environments = ['RaceCar', 'UAV_mixed', 'UAV_discrete', 'UAV_continuous', 'Reservoir_continuous', 'Reservoir_discrete', 
                'PowerGen_discrete', 'PowerGen_continuous', 'MountainCar', 'RecSim', 'HVAC']

# Create the agent without an environment first
agent = MyRDDLAgent()



In [11]:
for env_id in environments:
    EnvInfo = ExampleManager.GetEnvInfo(env_id)

    myEnv = RDDLEnv.RDDLEnv(domain=EnvInfo.get_domain(),
                            instance=EnvInfo.get_instance(0),
                            enforce_action_constraints=True,
                            debug=True)
    
    # Update the environment of the agent
    agent.set_env(env = myEnv, action_space=myEnv.action_space)

    # Train the agent
    agent.train(total_timesteps=10000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


AssertionError: ignored

I'm sorry for misunderstanding your request before. The error indeed is pointing towards the incompatibility of the action space. The PPO algorithm provided by Stable Baselines only supports a limited set of action spaces by default, namely, Discrete, Box, MultiDiscrete, and MultiBinary.

This is the reason why the FlatActionWrapper was used in the previous example. It was meant to flatten the Dictionary action space into a single Box action space that the PPO can accept. However, it appears that the Stable Baselines PPO implementation is not accepting this flattened action space as it's meant to be.