In [2]:
!python --version

Python 3.11.7


In [3]:
"""
%%capture

# The environment we will be working with
%pip install CityLearn==2.1.2

# For participant interactions (buttons)
%pip install ipywidgets

# To generate static figures
%pip install matplotlib
%pip install seaborn

# Provide standard RL algorithms
%pip install stable-baselines3

# Enable gym compatibility with later stable-baselines3 versions
%pip install shimmy

# Results submission
%pip install requests
%pip install beautifulsoup4
"""

'\n%%capture\n\n# The environment we will be working with\n%pip install CityLearn==2.1.2\n\n# For participant interactions (buttons)\n%pip install ipywidgets\n\n# To generate static figures\n%pip install matplotlib\n%pip install seaborn\n\n# Provide standard RL algorithms\n%pip install stable-baselines3\n\n# Enable gym compatibility with later stable-baselines3 versions\n%pip install shimmy\n\n# Results submission\n%pip install requests\n%pip install beautifulsoup4\n'

In [4]:
import sys, subprocess
print("Python:", sys.executable)
print("Pip:   ", subprocess.run(["which","pip"], capture_output=True, text=True).stdout)

# System operations
import os

# Type hinting
from typing import Any, List, Mapping, Tuple, Union

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates

import import_ipynb

# Data manipulation
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import torch as th

# CityLearn
from citylearn.data import DataSet
from citylearn.reward_function import RewardFunction, SolarPenaltyReward

# Baseline RL algorithms
from stable_baselines3 import SAC
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecMonitor, VecNormalize
from stable_baselines3.common.callbacks import EvalCallback, CallbackList
from stable_baselines3.common.monitor import Monitor


# set all plotted figures without margins
plt.rcParams['axes.xmargin'] = 0
plt.rcParams['axes.ymargin'] = 0
%matplotlib inline

Python: /hkfs/home/haicore/iai/cj9272/citylearn_env/bin/python
Pip:    /software/all/jupyter/ai/2025-05-23/bin/pip



In [5]:
# --- keep only electrical storage so we have 1D continuous action to discretize ---
def keep_only_electrical_storage(schema: dict) -> dict:
    if 'actions' in schema:
        for a in list(schema['actions'].keys()):
            schema['actions'][a]['active'] = (a == 'electrical_storage')
    if 'buildings' in schema:
        for b in schema['buildings']:
            if 'actions' in b:
                for a in list(b['actions'].keys()):
                    b['actions'][a]['active'] = (a == 'electrical_storage')
    return schema
# Dataset
DATASET_NAME = 'citylearn_challenge_2023_phase_3_1'
schema = DataSet().get_schema(DATASET_NAME)
schema['root_directory'] = r'/hkfs/home/haicore/iai/cj9272/Bachelorthesis_DQN_Agent/data/datasets/citylearn_challenge_2023_phase_3_1'
schema = keep_only_electrical_storage(schema) # Activate only the electrical storage control (fix "Expected 18 actions but got 1")
# Set reward function
schema['reward_function'] = { # CostReward Function
    'type': 'citylearn.reward_function.CostReward',
    'attributes': {}
}
# Set pricing file
price_file = 'pricing_germany_2023_june_to_august.csv'  # Pricing CSV
if 'buildings' not in schema:
    raise RuntimeError("schema does not contain 'buildings' (make sure schema is loaded first)")
for bname, bconf in schema['buildings'].items():
    bconf['pricing'] = price_file

# Building
#root_directory = schema['root_directory']
root_directory = 'Bachelorthesis_DQN_Agent/data/datasets/citylearn_challenge_2023_phase_3_1'
building_name = 'Building_1'
# Weather data
filename = schema['buildings'][building_name]['weather']
filepath = os.path.join(root_directory, filename)
weather_data = pd.read_csv(filepath)
# Pricing data (simple)
filename = schema['buildings'][building_name]['pricing']
filepath = os.path.join(root_directory, filename)
pricing_data = pd.read_csv(filepath)
# Carbon Intensity data
filename = schema['buildings'][building_name]['carbon_intensity']
filepath = os.path.join(root_directory, filename)
carbon_intensity_data = pd.read_csv(filepath)
# building data
filename = schema['buildings'][building_name]['energy_simulation']
filepath = os.path.join(root_directory, filename)
building_data = pd.read_csv(filepath)

# Display building data
# display(building_data.head())
# display(building_data.describe(include='all'))

bld = building_data.copy()
wth = weather_data.copy()
prc = pricing_data.copy()
car = carbon_intensity_data.copy()

print(building_data.columns.tolist())

INFO:root:Go here /home/iai/cj9272/.cache/citylearn/v2.4.1/datasets/citylearn_challenge_2023_phase_3_1/schema.json 


['month', 'hour', 'day_type', 'daylight_savings_status', 'indoor_dry_bulb_temperature', 'average_unmet_cooling_setpoint_difference', 'indoor_relative_humidity', 'non_shiftable_load', 'dhw_demand', 'cooling_demand', 'heating_demand', 'solar_generation', 'occupant_count', 'indoor_dry_bulb_temperature_cooling_set_point', 'indoor_dry_bulb_temperature_heating_set_point', 'hvac_mode']


In [6]:
observations = [
    # building_df
    'month', 'hour', 'day_type', 'daylight_savings_status',
    'indoor_dry_bulb_temperature',
    'average_unmet_cooling_setpoint_difference',
    'indoor_relative_humidity',
    'non_shiftable_load', 'dhw_demand',
    'cooling_demand', 'heating_demand',
    'solar_generation', 'occupant_count',
    'indoor_dry_bulb_temperature_cooling_set_point',
    'indoor_dry_bulb_temperature_heating_set_point', 'hvac_mode',
    # weather_df
    'outdoor_dry_bulb_temperature',
    'outdoor_relative_humidity',
    'diffuse_solar_irradiance',
    'direct_solar_irradiance',
    'outdoor_dry_bulb_temperature_predicted_1',
    'outdoor_dry_bulb_temperature_predicted_2',
    'outdoor_dry_bulb_temperature_predicted_3',
    'outdoor_relative_humidity_predicted_1',
    'outdoor_relative_humidity_predicted_2',
    'outdoor_relative_humidity_predicted_3',
    'diffuse_solar_irradiance_predicted_1',
    'diffuse_solar_irradiance_predicted_2',
    'diffuse_solar_irradiance_predicted_3',
    'direct_solar_irradiance_predicted_1',
    'direct_solar_irradiance_predicted_2',
    'direct_solar_irradiance_predicted_3',
    # carbon_df 
    'carbon_intensity',
    # pricing_df
    'electricity_pricing',
    'electricity_pricing_predicted_1',
    'electricity_pricing_predicted_2',
    'electricity_pricing_predicted_3'
]
bld_cols = [
            'month', 'hour', 'day_type', 'daylight_savings_status',
            'indoor_dry_bulb_temperature',
            'average_unmet_cooling_setpoint_difference',
            'indoor_relative_humidity', 'non_shiftable_load',
            'dhw_demand', 'cooling_demand', 'heating_demand',
            'solar_generation', 'occupant_count',
            'indoor_dry_bulb_temperature_cooling_set_point',
            'indoor_dry_bulb_temperature_heating_set_point',
            'hvac_mode'
]
wth_cols = [
    'outdoor_dry_bulb_temperature',
    'outdoor_relative_humidity',
    'diffuse_solar_irradiance',
    'direct_solar_irradiance'
]

### Deep Q-Learning Agent
#### 1. Reward Function


In [7]:
class CustomReward(RewardFunction):
    def __init__(self, capacity: float):
        self.capacity = capacity
        self.prev_net_load = None

    def __call__(self, obs: Mapping[str, float], action_frac: float) -> float:
        # 1) Net load & cost
        net_load   = obs['non_shiftable_load'] - action_frac*self.capacity # max: 11.25 kW
        price = obs['electricity_pricing'] # 0.06605 or 0.03025
        cost  = max(0.0, net_load) * price # deviation range: 0-11.25*0.06605 = ~0-0.75$
        #norm_cost = cost / 0.75 # normalize to [0,1]

        # return -(w_cost*cost + w_pen*comfort_penalty + w_emis*emis + w_ramp*ramp)        
        return -cost

In [8]:
class TrainLoggerCallback(BaseCallback):
    """Logging State, Action, Reward per step and Loss per update phase."""
    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Will collect a dict per env-step
        self.rows = []
        # Loss values and their timesteps (global)
        self.losses = []
        self.loss_timesteps = []
        # Completed episode returns (global list)
        self.episode_rewards = []
        # Placeholders for per-env tracking
        self._current_ep_rewards = []         # sum of rewards in current episode per env
        self._current_ep_counts = []          # episode index per env
        self._current_step_in_episode = []    # step counter (0..T-1) per env

        # DataFrames to populate at end
        self.df = pd.DataFrame()
        self.ep_df = pd.DataFrame()

    def _on_training_start(self) -> None:
        try:
            n_envs = self.training_env.num_envs
        except AttributeError:
            n_envs = 1
        # initialize counters per sub-env
        self._current_ep_rewards = [0.0] * n_envs
        self._current_ep_counts = [1] * n_envs
        self._current_step_in_episode = [0] * n_envs
        super()._on_training_start()

    def _on_step(self) -> bool:
        obs_vec = self.locals.get("new_obs")
        acts    = self.locals.get("actions")
        rews    = self.locals.get("rewards")
        dones   = self.locals.get("dones")
        step    = int(self.num_timesteps)

        # log loss if present
        loss_val = self.logger.name_to_value.get("train/loss")
        if loss_val is not None:
            self.losses.append(float(loss_val))
            self.loss_timesteps.append(step)

        # iterate each sub-env
        for idx, (obs, act, rew, done) in enumerate(zip(obs_vec, acts, rews, dones)):
            # flatten observation
            flat = obs.flatten().tolist()
            # build row with metadata
            row = {f"x{i}": flat[i] for i in range(len(flat))}
            row.update({
                "env_id": idx,
                "episode": self._current_ep_counts[idx],
                "step_in_ep": self._current_step_in_episode[idx],
                "action": int(act),
                "reward": float(rew),
                "global_step": step
            })
            self.rows.append(row)

            # accumulate per-episode reward
            self._current_ep_rewards[idx] += float(rew)
            # increment step in episode
            self._current_step_in_episode[idx] += 1

            # if end of episode for this env
            if done:
                # log reward and finalize episode
                print(f"Env {idx} Episode {self._current_ep_counts[idx]} done at global step {step}, total reward: {self._current_ep_rewards[idx]:.3f}")
                self.episode_rewards.append(self._current_ep_rewards[idx])
                # reset for next episode
                self._current_ep_rewards[idx] = 0.0
                self._current_ep_counts[idx] += 1
                self._current_step_in_episode[idx] = 0

        return True

    def _on_training_end(self) -> None:
        # build full-step DataFrame
        self.df = pd.DataFrame(self.rows)
        # build episodes summary DataFrame
        self.ep_df = pd.DataFrame({
            "episode_global": range(1, len(self.episode_rewards) + 1),
            "return": self.episode_rewards
        })
        super()._on_training_end()

In [9]:
# Minimal DQN on standard CityLearnEnv (single building, single action)
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor

from citylearn.data import DataSet
from citylearn.citylearn import CityLearnEnv


# --- wrapper: single-agent SB3 env on top of CityLearnEnv + action discretization ---
class CityLearnDQNWrapper(gym.Env):
    """Wrap standard CityLearnEnv for SB3 DQN:
       - single central agent, we control building 0
       - discretize the [-1, 1] storage action into n_bins
    """
    metadata = {"render_modes": []}

    def __init__(self, cl_env: CityLearnEnv, n_bins: int = 5):
        super().__init__()
        self.cl = cl_env
        self.n_bins = int(n_bins)
        assert self.n_bins >= 2, "n_bins must be >= 2"

        # Build discrete action space
        self.action_space = spaces.Discrete(self.n_bins)

        # Infer observation shape (use building 0 obs)
        self.cl.reset()
        obs0 = np.array(self.cl.observations[0], dtype=np.float32)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=obs0.shape, dtype=np.float32
        )

        # Precompute action broadcast size (how many controls for bldg 0)
        self.N = self.cl.action_space[0].shape[0]  # should be 1 if only electrical_storage is active

    def _map_discrete_to_frac(self, a: int) -> float:
        # map {0..n_bins-1} -> [-1, 1]
        if self.n_bins == 1:
            return 0.0
        return -1.0 + 2.0 * (a / (self.n_bins - 1))

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        self.cl.reset(seed=seed)
        obs = np.array(self.cl.observations[0], dtype=np.float32)
        return obs, {}

    def step(self, a: int):
        # map discrete to continuous fraction
        frac = float(np.clip(self._map_discrete_to_frac(int(a)), -1.0, 1.0))
        # build CityLearn action format: list per building -> flat list for building 0
        actions = [[frac] * self.N]  # control building 0 only
        obs_all, rewards, terminated, truncated, _info = self.cl.step(actions)

        obs = np.array(obs_all[0], dtype=np.float32)
        reward = float(rewards[0])
        term = bool(terminated)
        trunc = bool(truncated)
        return obs, reward, term, trunc, {}

# ----------------- build env -----------------

cl_env = CityLearnEnv(schema, central_agent=True)
env = Monitor(CityLearnDQNWrapper(cl_env, n_bins=5))

# Train env
train_env = SubprocVecEnv(env)
train_env = VecMonitor(train_env)
train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

# New eval env
eval_env = DummyVecEnv([lambda: Monitor(CityLearnEnv(building_df=bld, pricing_df=prc, weather_df=wth, carbon_df=car, n_bins=5))])
eval_env = VecMonitor(eval_env) 
eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=False, clip_obs=10.0)

# Evaluate every 2k steps over 5 episodes
eval_callback = EvalCallback(
    eval_env,
    log_path="logs/eval/",
    best_model_save_path="logs/best_model/",
    #eval_freq=len(building_data), # run evaluation every 5k timesteps, TODO: change to 5_000
    eval_freq=1000, # run evaluation every n timesteps, TODO: change n to 5_000
    n_eval_episodes=5,       # average over 5 full episodes
    deterministic=True,
    verbose=1
)

train_callback = TrainLoggerCallback()

# ----------------- train DQN (super simple) -----------------
model = DQN(
    "MlpPolicy",
    train_env,
    learning_rate=3e-4,
    buffer_size=50_000,
    batch_size=256,
    learning_starts=2_000,
    train_freq=256,
    target_update_interval=1_000,
    gamma=0.98,
    exploration_fraction=0.3,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    verbose=1,
    policy_kwargs=dict(net_arch=[256, 256]),
)

# Model training
#model.learn(total_timesteps=100_000)
T = len(building_data) # 1 episode = 2208 timesteps
num_episodes = 10 # 10 episodes
model.learn(
    total_timesteps=num_episodes * T,
    callback=CallbackList([train_callback, eval_callback])
)

print("Eval timesteps:", eval_callback.evaluations_timesteps)
print("Eval results   :", eval_callback.evaluations_results)


print("---------------Train callback: \n", train_callback.df)
print("---------------Episode rewards: \n", train_callback.ep_df)


# Evaluation results
all_rewards = eval_callback.evaluations_results # List of lists: each element are rewards from an eval round
eval_steps  = eval_callback.evaluations_timesteps # timesteps at which the evaluations were run

# Mean reward per round
mean_rewards = [np.mean(r) for r in all_rewards]

# ----------------- quick evaluation (one episode) -----------------
obs, _ = env.reset()
done = False
ret = 0.0
while not done:
    action = model.predict(obs, deterministic=True)[0]
    obs, r, term, trunc, _ = env.step(action)
    ret += r
    done = term or trunc
print(f"Episode return: {ret:.3f}")


/hkfs/home/haicore/iai/cj9272/Bachelorthesis_DQN_Agent/data/datasets/citylearn_challenge_2023_phase_3_1
Dataset '/hkfs/home/haicore/iai/cj9272/Bachelorthesis_DQN_Agent/data/datasets/citylearn_challenge_2023_phase_3_1' copied to '/hkfs/home/haicore/iai/cj9272/Bachelorthesis_DQN_Agent/data/datasets/citylearn_challenge_2023_phase_3_1/../../../../results/2025-08-27_15-43-41'


TypeError: object of type 'Monitor' has no len()