In [1]:
#%pip install --extra-index-url https://test.pypi.org/simple ../../EnergyPlus-OOEP/
#%pip install git+https://github.com/NTU-CCA-HVAC-OPTIM-a842a748/EnergyPlus-Datasets.git
#%pip install ipywidgets tqdm gymnasium torch ray[rllib]

# 🤖 Reinforcement Learning

TODO description

Kickstart the simulation.

In [2]:
from controllables.energyplus import (
    World,
    #WeatherModel,
    #Report,
    Actuator,
    OutputVariable,
)

from energyplus.dataset.basic import dataset as _epds_

world = World(
    input=World.Specs.Input(
        world=(
            _epds_.models / '1ZoneEvapCooler.idf'
        ),
        weather=(_epds_.weathers / 'USA_CO_Denver-Aurora-Buckley.AFB.724695_TMY3.epw'),
    ),
    output=World.Specs.Output(
        #report=('/tmp/ooep-report-9e1287d2-8e75-4cf5-bbc5-f76580b56a69'),
    ),
    runtime=World.Specs.Runtime(
        recurring=True,
        #design_day=True,
    ),
)

Configure the controller (an RLlib algorithm).

In [3]:
import numpy as _numpy_
import gymnasium as _gymnasium_

from controllables.energyplus import (
    Actuator,
    OutputVariable,
)
from controllables.core.tools.gymnasium import (
    BoxSpace,
    DictSpace,
)
from controllables.core.tools.ray import ExternalEnv

from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.algorithms.callbacks import DefaultCallbacks


config = (
    PPOConfig()
    .environment(
        ExternalEnv, 
        env_config=ExternalEnv.Config(
            action_space=DictSpace({
                'thermostat': BoxSpace(
                    low=15., high=20.,
                    dtype=_numpy_.float32,
                    shape=(),
                ).bind(
                    Actuator.Ref(
                        type='Zone Temperature Control',
                        control_type='Heating Setpoint',
                        key='MAIN ZONE',
                    )            
                )
            }),    
            observation_space=DictSpace({
                'temperature': BoxSpace(
                    low=-_numpy_.inf, high=+_numpy_.inf,
                    dtype=_numpy_.float32,
                    shape=(),
                ).bind(
                    OutputVariable.Ref(
                        type='Zone Air Temperature',
                        key='MAIN ZONE',
                    )
                ),
            }),
            system=lambda: world,
            reward_function=lambda agent: 1,
            episode_events={
                'step': 'begin_zone_timestep_after_init_heat_balance',
            },
        )
    )
    .rollouts(
        create_env_on_local_worker=True,
        #num_rollout_workers=10,
        num_rollout_workers=0,
        enable_connectors=False,
    )
    .framework('torch')
    .evaluation(
        #evaluation_interval=1,
        #evaluation_num_workers=0
    )
    .resources(num_gpus=1.)
)

algo = PPO(
    config=config,
)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


Train the algorithm.

In [4]:
world.add('logging:progress').start()

  0%|          | 0/100 [00:00<?, ?it/s]

<controllables.energyplus.world.World at 0x7f07097b5b90>

In [5]:
# start training
for _ in range(20):
    print(algo.train())



{'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'custom_metrics': {}, 'learner_stats': {'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 9.937928897078319, 'policy_loss': -0.06264554547766844, 'vf_loss': 10.0, 'vf_explained_var': 0.7428920326694366, 'kl': 0.002872142626594762, 'entropy': 1.4138359632543338, 'entropy_coeff': 0.0}, 'model': {}, 'num_grad_updates_lifetime': 465.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}, 'sampler_results': {'episode_reward_max': nan, 'episode_reward_min': nan, 'episode_reward_mean': nan, 'episode_len_mean': nan, 'episode_media': {}, 'episodes_this_iter': 0, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [], 'episode_lengths': []}, 'sampler_perf': {}, 'num_faulty_ep

Evaluate the algorithm.

In [6]:
algo.evaluate()

{'evaluation': {'sampler_results': {'episode_reward_max': 52847.0,
   'episode_reward_min': 52847.0,
   'episode_reward_mean': 52847.0,
   'episode_len_mean': 52848.0,
   'episode_media': {},
   'episodes_this_iter': 1,
   'policy_reward_min': {},
   'policy_reward_max': {},
   'policy_reward_mean': {},
   'custom_metrics': {},
   'hist_stats': {'episode_reward': [52847.0], 'episode_lengths': [52848]},
   'sampler_perf': {'mean_raw_obs_processing_ms': 0.10332065805842851,
    'mean_inference_ms': 0.90695389286474,
    'mean_action_processing_ms': 0.08013065799669808,
    'mean_env_wait_ms': 0.2798091494760819,
    'mean_env_render_ms': 0.0},
   'num_faulty_episodes': 0,
   'connector_metrics': {}},
  'episode_reward_max': 52847.0,
  'episode_reward_min': 52847.0,
  'episode_reward_mean': 52847.0,
  'episode_len_mean': 52848.0,
  'episode_media': {},
  'episodes_this_iter': 1,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hi

Stop the simulation environment when we are done!

In [7]:
world.stop()