# 📚 Case Study 101a: Reinforcement Learning Based Control, Mono-Agent

Before you start:
- Make sure you have completed [](TUT000_setup.ipynb) through [](TUT003_variables.ipynb).

What you will learn in this chapter:
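- How to declare an environment whose action and observation spaces are bound to EnergyPlus actuators and output variables
- How to define a reward function that penalizes the thermostat control error
- How to train a PPO agent with RLlib against a running simulation
- How to evaluate the trained agent while plotting its reward live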

In [1]:
import numpy as _numpy_

from controllables.core.tools.gymnasium import BoxSpace, DictSpace
from controllables.core.tools.rllib import Env
from controllables.energyplus import Actuator, OutputVariable
from controllables.energyplus import examples


class UserEnv(Env):
    r"""
    Single-agent environment that drives one zone's heating setpoint.

    The class declares, as class attributes, the action space, the
    observation space and the reward function, and forwards them to the
    base :class:`Env` as its default configuration. Entries in the `config`
    passed to the constructor override these defaults.
    """

    # Action: a scalar `float32` heating setpoint limited to [15, 20],
    # bound to the 'Heating Setpoint' actuator of 'MAIN ZONE'.
    # NOTE(review): units are presumably degrees Celsius - confirm.
    action_space = DictSpace({
        'thermostat': BoxSpace(
            low=15., high=20.,
            dtype=_numpy_.float32,
            shape=(),
        ).bind(
            Actuator.Ref(
                type='Zone Temperature Control',
                control_type='Heating Setpoint',
                key='MAIN ZONE',
            )
        )
    })

    # Observation: an unbounded scalar `float32`, read from the
    # 'Zone Mean Air Temperature' output variable of 'MAIN ZONE' and
    # cast through `numpy.array`.
    observation_space = DictSpace({
        'temperature': BoxSpace(
            low=-_numpy_.inf, high=+_numpy_.inf,
            dtype=_numpy_.float32,
            shape=(),
        ).bind(
            lambda x: x[OutputVariable.Ref(
                type='Zone Mean Air Temperature',
                key='MAIN ZONE',
            )].cast(_numpy_.array)
        ),
    })

    reward = lambda agent: -abs(
        agent.observation['temperature'].value - agent.action['thermostat'].value
    )
    r"""
    Reward function.

    This reward function aims to minimize the control error,
    i.e., the difference between the thermostat setpoint and the actual temperature.
    """

    def __init__(self, config: dict | None = None):
        r"""
        Initialize the environment.

        :param config:
            Optional configuration overrides. Its entries are merged on top
            of the class-level `action_space`, `observation_space` and
            `reward` defaults, so a key present in `config` wins.
            `None` (the default) means "no overrides".
        """

        # A `None` sentinel replaces the former `dict()` default so that no
        # mutable object is shared between calls.
        super().__init__({
            'action_space': self.__class__.action_space,
            'observation_space': self.__class__.observation_space,
            'reward': self.__class__.reward,
            **(config or {}),
        })

    def run(self):
        r"""
        Start the simulation and feed it episodes forever.

        Builds the example system, attaches progress logging and this
        environment to it, starts it, and then loops without end: each
        iteration starts an episode, steps it 100 times and ends it.
        This method never returns.
        """

        # create a system
        system = examples.systems.X1ZoneEvapCooler(repeat=True)
        # enable progress reporting
        system.add('logging:progress')
        # attach this environment to the system
        system.add(self)
        # run the system in background
        system.start()

        # continuously...
        while True:
            # start a new episode
            episode_id = self.start_episode()
            # loop: here, it runs 100 steps of the episode
            for _ in range(100):
                # NOTE(review): presumably this aligns each step with one
                # simulation 'timestep' event - confirm against the
                # `controllables` event API.
                with system.events['timestep'].wait(deferred=True):
                    self.step_episode(episode_id)
            # end the started episode after the above steps
            self.end_episode(episode_id)

### Training

In [2]:
from ray.rllib.algorithms.ppo import PPO, PPOConfig

# Describe the training setup first, then build the PPO algorithm from it.
train_config = (
    PPOConfig()
    .environment(UserEnv)
    # NOTE connectors stay disabled: this env (an `ExternalEnv`)
    # does not support them
    .env_runners(enable_connectors=False)
    # TODO
    .resources(num_gpus=1.)
)

algo = PPO(train_config)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2025-01-22 14:56:35,641	INFO worker.py:1816 -- Started a local Ray instance.
  0%|          | 0/100.0 [00:00<?, ?it/s]
  0%|          | 0/100.0 [00:00<?, ?it/s, EnergyPlus, Version 23.2.0-7636e6b3e9, YMD=2025.01.22 14:56]


In [3]:
# Run two training iterations, printing each iteration's index
# followed by the result that `algo.train()` returned for it.
for iteration in range(2):
    result = algo.train()
    print(iteration, result)

  1%|          | 1.0/100.0 [00:01<01:35,  1.04it/s, Starting Simulation at 07/21 for DENVER CENTENNIAL ANN CLG 1% CONDNS DB=>MWB]
  1%|          | 1.0/100.0 [00:01<01:35,  1.04it/s, Starting Simulation at 01/01/2013 for RUN PERIOD 1]                          
  2%|▏         | 2.0/100.0 [00:01<01:16,  1.27it/s, Starting Simulation at 01/01/2013 for RUN PERIOD 1]
  1%|          | 1.0/100.0 [00:01<01:34,  1.04it/s, Starting Simulation at 07/21 for DENVER CENTENNIAL ANN CLG 1% CONDNS DB=>MWB]
  1%|          | 1.0/100.0 [00:01<01:34,  1.04it/s, Starting Simulation at 01/01/2013 for RUN PERIOD 1]                          
  5%|▌         | 5.0/100.0 [00:07<03:16,  2.07s/it, Starting Simulation at 01/01/2013 for RUN PERIOD 1][32m [repeated 6x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
  6%|▌         | 6.0/



0 {'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'custom_metrics': {}, 'learner_stats': {'cur_kl_coeff': np.float64(0.20000000000000004), 'cur_lr': np.float64(5.0000000000000016e-05), 'total_loss': np.float64(9.68377141542332), 'policy_loss': np.float64(0.0037309974632276), 'vf_loss': np.float64(9.679496995864376), 'vf_explained_var': np.float64(-1.2561839113953293e-08), 'kl': np.float64(0.002717184874990214), 'entropy': np.float64(1.384358350820439), 'entropy_coeff': np.float64(0.0)}, 'model': {}, 'num_grad_updates_lifetime': np.float64(465.5), 'diff_num_grad_updates_vs_sampler_policy': np.float64(464.5)}}, 'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}, 'env_runners': {'episode_reward_max': np.float64(-82.64355307789427), 'episode_reward_min': np.float64(-673.4723131254235), 'episode_reward_mean': np.float64(-159.8474477815283), 'episode_len_mean': np.float64(100.0), 

### Evaluation

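To evaluate the trained agent, create a second PPO instance that runs the environment on the local worker (so that plots can be displayed in this notebook), restore the trained weights into it, and plot the reward over time while it evaluates.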

In [4]:
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from controllables.core.tools.records import VariableRecords


class PlottingCallbacks(DefaultCallbacks):
    r"""
    RLlib callbacks that record and plot the agent's reward over time.

    The records and the plot are created once, when the first episode
    starts; every episode step afterwards polls the records for new values.
    """

    def __init__(self):
        # Chain to the base initializer so that whatever state
        # `DefaultCallbacks` sets up is not skipped by this override.
        super().__init__()
        # Created lazily on the first episode start, when the env is available.
        self.env_records = None

    def on_episode_start(self, *, episode, worker, **kwargs):
        r"""
        Create the records and show the plot on the first episode start.

        :param episode: The episode being started (unused here).
        :param worker: The rollout worker; its `env` is the environment recorded.
        """

        env = worker.env
        system = env.system
        if self.env_records is None:
            # Track the system time against the agent's reward.
            self.env_records = records = VariableRecords({
                '🕰️': system['time'],
                '🍩': env.agent.reward,
            })
            # NOTE(review): `% 1_000` presumably throttles plot refreshes to
            # every 1000th 'change' event - confirm against the events API.
            display(
                records.plot.scatter(x='🕰️', y='🍩')
                .watch(records.events['change'] % 1_000)
            )

    def on_episode_step(self, *, episode, **kwargs):
        r"""
        Poll the records on every episode step.

        :param episode: The episode being stepped (unused here).
        """

        self.env_records.poll()


# Evaluation gets its own algorithm instance, configured separately
# from the training one.
eval_config = (
    PPOConfig()
    .environment(UserEnv)
    .env_runners(
        # NOTE this env (an `ExternalEnv`) does not support connectors
        enable_connectors=False,
        # no distributed workers, so plotting happens locally;
        # the env is created on the local worker instead
        num_rollout_workers=0,
        create_env_on_local_worker=True,
    )
    .evaluation(evaluation_num_workers=0)
    .callbacks(PlottingCallbacks)
)
algo_eval = PPO(eval_config)

# save the trained algorithm, then load that checkpoint into the evaluation one
trained_checkpoint = algo.save()
algo_eval.restore(trained_checkpoint)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


  0%|          | 0/100.0 [00:00<?, ?it/s]

2025-01-22 14:56:51,836	INFO trainable.py:583 -- Restored on 192.168.200.249 from checkpoint: Checkpoint(filesystem=local, path=/tmp/tmpyzvimlpi)


In [None]:
# Evaluate with the restored algorithm. As the last expression of the cell,
# the returned metrics dict is displayed as the cell's output.
algo_eval.evaluate()



<controllables.core.tools.plot.PlotlyBackend at 0x7fbc003ed810>

{'env_runners': {'episode_reward_max': np.float64(-79.27454931567762),
  'episode_reward_min': np.float64(-792.3468276367258),
  'episode_reward_mean': np.float64(-365.21884606320907),
  'episode_len_mean': np.float64(100.0),
  'episode_media': {},
  'episodes_timesteps_total': 40000,
  'policy_reward_min': {'default_policy': np.float64(-792.3468276367258)},
  'policy_reward_max': {'default_policy': np.float64(-79.27454931567762)},
  'policy_reward_mean': {'default_policy': np.float64(-365.21884606320907)},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [np.float64(-106.15530114141913),
    np.float64(-319.8654978991499),
    np.float64(-699.2440713630454),
    np.float64(-99.95683243003666),
    np.float64(-104.2540073813481),
    np.float64(-109.42411286478915),
    np.float64(-130.73264340845847),
    np.float64(-108.4236095632685),
    np.float64(-86.26987239261581),
    np.float64(-93.68688267807977),
    np.float64(-81.02062479726891),
    np.float64(-85.5781500202084

Exception in thread Thread-7:
Traceback (most recent call last):
  File "/home/AD/user/lab/controllables-core/packages/controllables/energyplus/events.py", line 129, in cb_
    return cb(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/AD/user/lab/controllables-core/packages/controllables/energyplus/events.py", line 159, in _state
    self._event.__call__(
  File "/home/AD/user/lab/controllables-core/packages/controllables/energyplus/events.py", line 98, in __call__
    return super().__call__(context)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/AD/user/lab/controllables-core/packages/controllables/core/callbacks.py", line 496, in __call__
    return self._callables.__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/AD/user/lab/controllables-core/packages/controllables/core/callables.py", line 89, in __call__
    res[f] = f(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^
  File "/home/AD/user/lab/controllables-core/packages