# 📚 Case Study 101a: Reinforcement Learning Based Control, Mono-Agent

Before you start:
- Make sure you have completed [](TUT000_setup.ipynb) through [](TUT003_variables.ipynb).

What you will learn in this chapter:
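- How to define a single-agent environment (action space, observation space, and reward) bound to EnergyPlus variables.
- How to train a PPO policy on that environment with RLlib.
- How to evaluate the trained policy while plotting the reward live.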

In [1]:
import numpy as _numpy_

from controllables.core.tools.gymnasium import BoxSpace, DictSpace
from controllables.core.tools.rllib import Env
from controllables.energyplus import Actuator, OutputVariable
from controllables.energyplus import examples


class UserEnv(Env):
    r"""
    Single-agent environment for thermostat control of `MAIN ZONE`.

    The agent's action is the zone heating setpoint and its observation is
    the zone mean air temperature. Both spaces are bound to references
    into the simulated system.
    """

    # Action: one scalar in [15, 20] (presumably degrees Celsius - confirm),
    # bound to the heating setpoint actuator of `MAIN ZONE`.
    action_space = DictSpace({
        'thermostat': BoxSpace(
            low=15., high=20.,
            dtype=_numpy_.float32,
            shape=(),
        ).bind(
            Actuator.Ref(
                type='Zone Temperature Control',
                control_type='Heating Setpoint',
                key='MAIN ZONE',
            )
        ),
    })

    # Observation: one unbounded scalar, bound to the mean air temperature
    # output variable of `MAIN ZONE`.
    observation_space = DictSpace({
        'temperature': BoxSpace(
            low=-_numpy_.inf, high=+_numpy_.inf,
            dtype=_numpy_.float32,
            shape=(),
        ).bind(
            OutputVariable.Ref(
                type='Zone Mean Air Temperature',
                key='MAIN ZONE',
            )
        ),
    })

    reward = lambda agent: -abs(
        agent.observation['temperature'].value - agent.action['thermostat'].value
    )
    r"""
    Reward function.

    This reward function aims to minimize the control error, 
    i.e., the difference between the thermostat setpoint and the actual temperature.
    """

    def __init__(self, config: dict | None = None):
        r"""
        Initialize the environment.

        The class-level spaces and reward are used as defaults; any entry
        with the same key in `config` takes precedence.

        :param config: Optional configuration overrides. `None` (the
            default) means "no overrides"; a `None` sentinel is used
            instead of a shared mutable default dict.
        """
        overrides = {} if config is None else config
        super().__init__({
            'action_space': self.__class__.action_space,
            'observation_space': self.__class__.observation_space,
            'reward': self.__class__.reward,
            **overrides,
        })

    def run(self):
        r"""
        Drive the simulation and feed it to the learner as episodes.

        Starts an example system in the background, then loops forever,
        cutting the running simulation into fixed-length episodes.
        This method never returns.
        """
        # number of environment steps that make up one episode
        steps_per_episode = 100

        # create a system
        system = examples.systems.X1ZoneEvapCooler(repeat=True)
        # enable progress reporting
        system.add('logging:progress')
        # attach this environment to the system
        system.add(self)
        # run the system in background
        system.start()

        # continuously...
        while True:
            # start a new episode
            episode_id = self.start_episode()
            # loop: step the episode once per awaited `timestep` event
            for _ in range(steps_per_episode):
                with system.events['timestep'].wait(deferred=True):
                    self.step_episode(episode_id)
            # end the started episode after the above steps
            self.end_episode(episode_id)

### Training

In [2]:
from ray.rllib.algorithms.ppo import PPO, PPOConfig

# Build the PPO trainer for `UserEnv`.
# `.env_runners(...)` is used here (rather than `.rollouts(...)`) so that
# this config matches the evaluation config built later in the notebook.
algo = PPO(
    PPOConfig()
    .environment(UserEnv)
    .env_runners(
        # NOTE this env (an `ExternalEnv`) does not support connectors
        enable_connectors=False,
    )
    # TODO
    .resources(num_gpus=1.)
)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


2024-10-11 14:30:29,932	INFO worker.py:1786 -- Started a local Ray instance.
  0%|          | 0/100 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s, Initializing HVAC]                                          
  0%|          | 0/100 [00:00<?, ?it/s, Initializing Simulation]                                    


In [3]:
# run two training iterations and print each iteration's result
for iteration in range(2):
    result = algo.train()
    print(iteration, result)

  0%|          | 0.0/100 [00:00<?, ?it/s, Warming up {2}]                                                            
  1%|          | 1.0/100 [00:00<01:09,  1.42it/s, Starting Simulation at 07/21 for DENVER CENTENNIAL ANN CLG 1% CONDNS DB=>MWB]
  1%|          | 1.0/100 [00:01<01:09,  1.42it/s, Warming up {4}]                                                              
  1%|          | 1.0/100 [00:01<01:07,  1.47it/s, Warming up {2}]                                                              
  1%|          | 1.0/100 [00:01<01:09,  1.42it/s, Starting Simulation at 01/01/2013 for RUN PERIOD 1]
  1%|          | 1.0/100 [00:01<01:07,  1.47it/s, Starting Simulation at 01/01/2013 for RUN PERIOD 1]
  2%|▏         | 2.0/100 [00:01<01:16,  1.28it/s, Starting Simulation at 01/01/2013 for RUN PERIOD 1]
  0%|          | 0.0/100 [00:00<?, ?it/s, Warming up {2}]                                                            
  1%|          | 1.0/100 [00:00<01:07,  1.47it/s, Starting Simulation at 0



0 {'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'custom_metrics': {}, 'learner_stats': {'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 9.703443007315359, 'policy_loss': 0.002579946020838394, 'vf_loss': 9.700014789130098, 'vf_explained_var': -1.986821492513021e-09, 'kl': 0.0042412850173360475, 'entropy': 1.4620032405340544, 'entropy_coeff': 0.0}, 'model': {}, 'num_grad_updates_lifetime': 465.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}, 'env_runners': {'episode_reward_max': -100.27354907989502, 'episode_reward_min': -692.2425556182861, 'episode_reward_mean': -162.7630206823349, 'episode_len_mean': 100.0, 'episode_media': {}, 'episodes_timesteps_total': 4000, 'policy_reward_min': {'default_policy': -692.2425556182861}, 'policy_reward_max': {'default_policy': -100.27354907989502

### Evaluation

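Evaluate the trained policy with a separate algorithm instance that runs entirely in the local process (no remote workers), so that the reward can be plotted live while the simulation advances.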

In [4]:
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from controllables.core.tools.records import VariableRecords


class PlottingCallbacks(DefaultCallbacks):
    r"""
    Callbacks that record and plot the agent's reward against system time.

    The records and the plot are created once, on the first episode start,
    and are polled on every episode step thereafter.
    """

    def __init__(self):
        # let the base class initialize its own state first
        super().__init__()
        # created lazily on the first episode start, when the env is available
        self.env_records = None

    def on_episode_start(self, *, episode, worker, **kwargs):
        r"""Create the records and display the plot on the first episode."""
        # the records and plot are shared across episodes: set up only once
        if self.env_records is not None:
            return

        env = worker.env
        system = env.system
        self.env_records = records = VariableRecords({
            '🕰️': system['time'],
            '🍩': env.agent.reward,
        })
        display(
            records.plot.scatter(x='🕰️', y='🍩')
            # NOTE(review): `% 1_000` presumably refreshes the plot only on
            # every 1000th change event - confirm
            .watch(records.events['change'] % 1_000)
        )

    def on_episode_step(self, *, episode, **kwargs):
        r"""Poll the records once per episode step."""
        # nothing to poll if the records have not been set up yet
        if self.env_records is None:
            return
        self.env_records.poll()


# configuration of a separate algorithm instance used only for evaluation
eval_config = (
    PPOConfig()
    .environment(UserEnv)
    .env_runners(
        # NOTE this env (an `ExternalEnv`) does not support connectors
        enable_connectors=False,
        # disable distributed workers for local plotting
        num_rollout_workers=0,
        create_env_on_local_worker=True,
    )
    .evaluation(evaluation_num_workers=0)
    .callbacks(PlottingCallbacks)
)
algo_eval = PPO(eval_config)

# save the trained algorithm and restore that checkpoint into the evaluation instance
algo_eval.restore(algo.save())

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


  0%|          | 0/100 [00:00<?, ?it/s]

2024-10-11 14:30:46,236	INFO trainable.py:583 -- Restored on 192.168.200.249 from checkpoint: Checkpoint(filesystem=local, path=/tmp/tmpsdsrhrf_)


In [5]:
# run evaluation on the restored instance; the returned metrics dict
# is shown as the cell output
algo_eval.evaluate()



<controllables.core.tools.plot.PlotlyBackend at 0x7f371c2a1e10>

{'env_runners': {'episode_reward_max': -89.72452068328857,
  'episode_reward_min': -786.6589460372925,
  'episode_reward_mean': -358.1904717040062,
  'episode_len_mean': 100.0,
  'episode_media': {},
  'episodes_timesteps_total': 40000,
  'policy_reward_min': {'default_policy': -786.6589460372925},
  'policy_reward_max': {'default_policy': -89.72452068328857},
  'policy_reward_mean': {'default_policy': -358.1904717040062},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [-102.94939613342285,
    -316.2675256729126,
    -659.9846363067627,
    -117.86398696899414,
    -116.76044654846191,
    -123.20420837402344,
    -113.22254180908203,
    -102.05380058288574,
    -116.11430168151855,
    -91.32668876647949,
    -123.42814350128174,
    -89.72452068328857,
    -103.8595199584961,
    -135.62759113311768,
    -125.70651054382324,
    -190.1004238128662,
    -144.19491291046143,
    -125.39299869537354,
    -110.02676105499268,
    -113.25980281829834,
    -121.3205394744873,

Exception in thread Thread-7:
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/tmp/ipykernel_938592/3944663123.py", line 66, in run
  File "/home/AD/user/lab/EnergyPlus-OOEP/packages/controllables/core/tools/rllib/env.py", line 255, in step_episode
    self.agent.action.value = self.get_action(episode_id)
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/AD/user/lab/EnergyPlus-OOEP/packages/controllables/core/tools/rllib/env.py", line 218, in get_action
    return super().get_action(
           ^^^^^^^^^^^^^^^^^^^
  File "/home/AD/user/lab/EnergyPlus-OOEP/.venv/lib/python3.11/site-packages/ray/rllib/env/external_env.py", line 139, in get_action
    return episode.wait_for_action(observation)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/AD/user/lab/EnergyPlus-OOEP/.venv/lib/python3.11/site-packages/ray/rllib/env/external_env.py", line 297, in wait_for_action
    ret