# 📚 Case Study 101b: Reinforcement Learning Based Control, Multi-Agent

Before you start:
- Make sure you have completed [](TUT000_setup.ipynb) through [](TUT003_variables.ipynb), 
as well as [](TUT101a_app-rllib-monoagent.ipynb).

What you will learn in this chapter:
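- How to define a multi-agent environment with per-zone action spaces, observation spaces, and rewards.
- How to configure RLlib's PPO with one policy per agent through a policy mapping function.
- How to train and evaluate the resulting multi-agent controller.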

In [4]:
import numpy as _numpy_

from controllables.core.tools.gymnasium import BoxSpace, DictSpace
from controllables.core.tools.rllib import MultiAgentEnv
from controllables.energyplus import Actuator, OutputMeter
from controllables.energyplus import examples


# Agent/space key -> EnergyPlus zone key that the agent controls and observes.
# Kept in one place so the action, observation and reward tables cannot drift apart.
_ZONE_REFS = {
    'zone_1-1': 'SPACE1-1',
    'zone_2-1': 'SPACE2-1',
}


def _meter_reader(meter_type):
    """Return a one-argument callable that reads the output meter `meter_type`.

    The meter type is fixed at the moment this factory is called. A bare
    `lambda` written inside a comprehension would instead look up the loop
    variable only when it is eventually invoked (i.e. after the loop has
    finished), so every agent would end up reading the meter of the *last*
    zone in the table.
    """
    def read(x):
        return x[OutputMeter.Ref(type=meter_type)].cast(_numpy_.array)
    return read


class UserMultiAgentEnv(MultiAgentEnv):
    """Multi-agent environment with one agent per zone.

    Each agent
    - acts by setting the 'Heating Setpoint' actuator of its own zone
      (a scalar in the range [15, 20]),
    - observes the `EnergyTransfer:Zone:<zone>` meter of its own zone and the
      shared `Electricity:HVAC` meter,
    - is rewarded with the ratio energy-transfer / energy-consumption
      (0 when the consumption is 0).
    """

    # Per-agent action space: a single scalar bound to the zone's thermostat actuator.
    action_spaces = {
        space_key: DictSpace({
            'thermostat': BoxSpace(
                low=15., high=20.,
                dtype=_numpy_.float32,
                shape=(),
            ).bind(
                Actuator.Ref(
                    type='Zone Temperature Control',
                    control_type='Heating Setpoint',
                    key=ref_key,
                )
            ),
        })
        for space_key, ref_key in _ZONE_REFS.items()
    }

    # Per-agent observation space: two unbounded scalars read from output meters.
    observation_spaces = {
        space_key: DictSpace({
            'energy-transfer': BoxSpace(
                low=-_numpy_.inf, high=+_numpy_.inf,
                dtype=_numpy_.float32,
                shape=(),
            ).bind(
                # the f-string is evaluated here, once per zone, so each
                # agent gets the meter of its own zone
                _meter_reader(f'EnergyTransfer:Zone:{ref_key}')
            ),
            'energy-consumption': BoxSpace(
                low=-_numpy_.inf, high=+_numpy_.inf,
                dtype=_numpy_.float32,
                shape=(),
            ).bind(
                # same meter for every agent
                _meter_reader('Electricity:HVAC')
            ),
        })
        for space_key, ref_key in _ZONE_REFS.items()
    }

    # Per-agent reward: energy transferred per unit of energy consumed;
    # falls back to 0 to avoid dividing by a zero consumption reading.
    rewards = {
        space_key: lambda agent: (
            agent.observation.value['energy-transfer']
            / agent.observation.value['energy-consumption']
        ) if agent.observation.value['energy-consumption'] != 0. else 0.
        for space_key in _ZONE_REFS
    }

    def __init__(self, config: dict = dict()):
        """Initialize the env with the class-level spaces and rewards.

        Args:
            config: Extra settings forwarded to the base class. Entries with
                the keys 'action_spaces', 'observation_spaces' or 'rewards'
                override the class-level defaults. The shared default dict
                is only unpacked here, never mutated.
        """
        super().__init__({
            'action_spaces': self.__class__.action_spaces,
            'observation_spaces': self.__class__.observation_spaces,
            'rewards': self.__class__.rewards,
            **config,
        })

    def run(self):
        """Create the example system, attach this env to it and block until it finishes."""
        system = examples.systems.X5ZoneAirCooled(repeat=True)
        #system.add('logging:progress')
        # schedule a (recurring) episode associated with the system
        self.__attach__(system).schedule_episode()
        # run the system in background
        system.start()
        # wait for the system to finish to keep this env alive
        system.wait()

In [5]:
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.algorithms.ppo import PPO, PPOConfig

# TODO doc policy mapping
# One policy per zone agent, each built from that agent's own spaces.
zone_policies = {
    space_key: PolicySpec(
        action_space=UserMultiAgentEnv.action_spaces[space_key],
        observation_space=UserMultiAgentEnv.observation_spaces[space_key],
    )
    for space_key in ['zone_1-1', 'zone_2-1']
}


def _policy_for_agent(agent_id, *args, **kwargs):
    """Route every agent to the policy named after its own id."""
    return str(agent_id)


ppo_config = (
    PPOConfig()
    .environment(UserMultiAgentEnv)
    .rollouts(
        # NOTE this env (an `ExternalEnv`) does not support connectors
        enable_connectors=False,
        # TODO for eval
        create_env_on_local_worker=True,
        # TODO rm
        #num_rollout_workers=0,
        #rollout_fragment_length=1_000,
    )
    .multi_agent(
        policies=zone_policies,
        policy_mapping_fn=_policy_for_agent,
    )
    # TODO
    .resources(num_gpus=1.)
)

algo = PPO(ppo_config)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


In [6]:
# Run 20 training iterations, printing each iteration index with its result dict.
for i in range(20):
    result = algo.train()
    print(i, result)



0 {'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'zone_1-1': {'learner_stats': {'allreduce_latency': np.float64(0.0), 'grad_gnorm': np.float32(1.3372318), 'cur_kl_coeff': np.float64(0.20000000000000004), 'cur_lr': np.float64(5.0000000000000016e-05), 'total_loss': np.float64(9.969870257377625), 'policy_loss': np.float64(-0.0033367656733995924), 'vf_loss': np.float64(9.972520725925763), 'vf_explained_var': np.float64(-0.0030253227179249127), 'kl': np.float64(0.0034315682342139554), 'entropy': np.float64(1.3916464991867543), 'entropy_coeff': np.float64(0.0)}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': np.float64(125.0), 'num_grad_updates_lifetime': np.float64(480.5), 'diff_num_grad_updates_vs_sampler_policy': np.float64(479.5)}, 'zone_2-1': {'learner_stats': {'allreduce_latency': np.float64(0.0), 'grad_gnorm': np.float32(1.221515), 'cur_kl_coeff': np.float64(0.20000000000000004), 'cur_lr': np.float64(5.0000000000000016e-05), 'total_loss': np.float64(9.



19 {'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'zone_1-1': {'learner_stats': {'allreduce_latency': np.float64(0.0), 'grad_gnorm': np.float32(1.7174817), 'cur_kl_coeff': np.float64(0.025000000000000005), 'cur_lr': np.float64(5.0000000000000016e-05), 'total_loss': np.float64(9.969602938493093), 'policy_loss': np.float64(-0.0035884468161384575), 'vf_loss': np.float64(9.973036576310793), 'vf_explained_var': np.float64(-0.005631253619988759), 'kl': np.float64(0.006192493883015976), 'entropy': np.float64(0.9751135553543766), 'entropy_coeff': np.float64(0.0)}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': np.float64(125.0), 'num_grad_updates_lifetime': np.float64(18720.5), 'diff_num_grad_updates_vs_sampler_policy': np.float64(479.5)}, 'zone_2-1': {'learner_stats': {'allreduce_latency': np.float64(0.0), 'grad_gnorm': np.float32(2.1821375), 'cur_kl_coeff': np.float64(0.006250000000000001), 'cur_lr': np.float64(5.0000000000000016e-05), 'total_loss': np.float6

In [7]:
# TODO evaluate plot
# Evaluate the current policies; as the last expression of the cell, the
# returned metrics dict is displayed as the cell output.
# NOTE(review): the recorded output reports `num_episodes: 0` and NaN reward
# statistics although 20000 env steps were sampled — presumably no episode
# finished within the evaluation run; confirm the evaluation duration against
# the episode length before interpreting these metrics.
algo.evaluate()

{'env_runners': {'episode_reward_max': nan,
  'episode_reward_min': nan,
  'episode_reward_mean': nan,
  'episode_len_mean': nan,
  'episode_media': {},
  'episodes_timesteps_total': 0,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [], 'episode_lengths': []},
  'sampler_perf': {},
  'num_faulty_episodes': 0,
  'connector_metrics': {},
  'num_episodes': 0,
  'episode_return_max': nan,
  'episode_return_min': nan,
  'episode_return_mean': nan,
  'episodes_this_iter': 0},
 'num_agent_steps_sampled_this_iter': 40000,
 'num_env_steps_sampled_this_iter': 20000,
 'timesteps_this_iter': 20000}

