In [70]:
import datetime as dt
import numpy as np
import math
import matplotlib.pyplot as plt
from typing import Literal
import pandas as pd
import time
import os

from FleetRL.fleet_env import FleetEnv

from stable_baselines3.common.vec_env import VecNormalize, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback, ProgressBarCallback, BaseCallback
from stable_baselines3.common.logger import HParam

from pink import PinkActionNoise
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise, NormalActionNoise

In [71]:
# ---------------------------------------------------------------------------
# Run configuration: everything a reader might want to tune lives in this cell
# ---------------------------------------------------------------------------
# NOTE(review): run_name mentions "arbitrage" while scenario below is "tariff" -
# confirm that the label matches the experiment actually being run.
run_name: str = "LMD_2022_arbitrage_PPO"
n_train_steps: int = 48  # length of one training episode, in hours
n_eval_steps: int = 8600  # length of one evaluation episode, in hours
n_eval_episodes: int = 1  # how many episodes to run per evaluation
n_evs: int = 2  # fleet size (number of EVs)
n_envs: int = 2  # parallel envs - must be 1 if train_freq = (1, episode) (or default)
time_steps_per_hour: int = 4  # temporal resolution
use_case: str = "lmd"  # used to build file names
scenario: Literal["arb", "tariff"] = "tariff"  # "arb" = arbitrage, "tariff" = tariff
gen_new_schedule: bool = True  # build a fresh schedule (see schedule generator and its config)
gen_new_test_schedule: bool = True  # build a fresh schedule for testing the agent

# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------
norm_obs_in_env: bool = False  # FleetRL-side observation normalization (max, min)
vec_norm_obs: bool = True  # SB3-side observation normalization (rolling)
vec_norm_rew: bool = True  # SB3-side reward normalization (rolling)
total_steps: int = 1_000_000  # total number of training time steps
saving_interval: int = 50_000  # save the model every this many time steps

In [72]:
# Keyword arguments handed to every FleetEnv instance - adjust settings if necessary.
# Additional settings can be changed in the config files.
env_kwargs = {"schedule_name": str(n_evs) + "_" + str(use_case) + ".csv",
              "building_name": "load_" + str(use_case) + ".csv",
              "use_case": use_case,
              "include_building": True,
              "include_pv": True,
              "time_picker": "random",
              "deg_emp": False,
              "include_price": True,
              "ignore_price_reward": False,
              "ignore_invalid_penalty": False,
              "ignore_overcharging_penalty": False,
              "ignore_overloading_penalty": False,
              "episode_length": n_train_steps,
              "normalize_in_env": norm_obs_in_env,
              "verbose": 0,
              "aux": True,
              "log_data": False,
              "calculate_degradation": True,
              "target_soc": 0.85,
              "gen_schedule": gen_new_schedule,
              "gen_start_date": "2022-01-01 00:00",
              "gen_end_date": "2022-12-31 23:59:59",
              "gen_name": "my_sched.csv",
              # Keep the generated schedule in line with the configured fleet size
              # instead of a hard-coded single EV.
              "gen_n_evs": n_evs,
              "seed": 42
              }

# Price / tariff settings per scenario. Both scenarios read the same spot price
# file; they differ in markup, multiplier, feed-in deduction and tariff file.
scenario_settings = {
    "tariff": {"spot_markup": 10,
               "spot_mul": 1.5,
               "feed_in_ded": 0.25,
               "price_name": "spot_2021_new.csv",
               "tariff_name": "fixed_feed_in.csv"},
    "arb": {"spot_markup": 0,
            "spot_mul": 1,
            "feed_in_ded": 0,
            "price_name": "spot_2021_new.csv",
            "tariff_name": "spot_2021_new_tariff.csv"},
}

# Fail loudly on a typo in `scenario`; otherwise the price settings would be
# silently missing from env_kwargs.
if scenario not in scenario_settings:
    raise ValueError(f"Unknown scenario {scenario!r}; expected one of {sorted(scenario_settings)}")

env_kwargs.update(scenario_settings[scenario])

In [73]:
# Training environments: n_envs FleetEnv instances running in subprocesses.
train_vec_env = make_vec_env(FleetEnv,
                             n_envs=n_envs,
                             vec_env_cls=SubprocVecEnv,
                             env_kwargs=env_kwargs)

# training=True: the running observation/reward statistics are updated while learning.
train_norm_vec_env = VecNormalize(venv=train_vec_env,
                                  norm_obs=vec_norm_obs,
                                  norm_reward=vec_norm_rew,
                                  training=True,
                                  clip_reward=10.0)

# Switch the shared kwargs over to evaluation settings. The dict is mutated in
# place on purpose: the test-schedule cell further down builds on these values.
# The evaluation envs reuse the schedule file named by "gen_name" and do not
# generate a new one.
env_kwargs["time_picker"] = "eval"
env_kwargs["gen_schedule"] = False
env_kwargs["schedule_name"] = env_kwargs["gen_name"]

eval_vec_env = make_vec_env(FleetEnv,
                            n_envs=n_envs,
                            vec_env_cls=SubprocVecEnv,
                            env_kwargs=env_kwargs)

# Evaluation must not update the normalization statistics (training=False), and
# reward normalization is only needed for learning (norm_reward=False).
# EvalCallback synchronizes the statistics from the training env before each
# evaluation, so the eval wrapper only has to apply them.
eval_norm_vec_env = VecNormalize(venv=eval_vec_env,
                                 norm_obs=vec_norm_obs,
                                 norm_reward=False,
                                 training=False,
                                 clip_reward=10.0)

In [74]:
if gen_new_test_schedule:
    # Derive the test schedule file name: "<name>.csv" -> "<name>_test.csv".
    # The suffix is sliced off explicitly; str.strip(".csv") would remove any of
    # the characters '.', 'c', 's', 'v' from BOTH ends and mangle names such as
    # "sched_evs.csv".
    test_sched_name = env_kwargs["gen_name"]
    if test_sched_name.endswith(".csv"):
        test_sched_name = test_sched_name[:-len(".csv")]
    test_sched_name = test_sched_name + "_test" + ".csv"

    env_kwargs["gen_schedule"] = True
    env_kwargs["gen_name"] = test_sched_name

    # Throwaway environment whose only job is to trigger schedule generation
    # (presumably done while FleetEnv is constructed - the following lines
    # already rely on the file existing). Close it right away so its worker
    # subprocess does not linger.
    schedule_gen_vec_env = make_vec_env(FleetEnv,
                                        n_envs=1,
                                        vec_env_cls=SubprocVecEnv,
                                        env_kwargs=env_kwargs)
    schedule_gen_vec_env.close()

    # From here on, load the freshly generated schedule instead of regenerating it.
    env_kwargs["gen_schedule"] = False
    env_kwargs["schedule_name"] = test_sched_name

test_vec_env = make_vec_env(FleetEnv,
                            n_envs=n_envs,
                            vec_env_cls=SubprocVecEnv,
                            env_kwargs=env_kwargs)

# NOTE(review): training=True lets this wrapper keep updating its running
# statistics; if it is only used to test a trained agent, training=False with
# loaded statistics is probably intended - confirm against the testing code.
test_norm_vec_env = VecNormalize(venv=test_vec_env,
                                 norm_obs=vec_norm_obs,
                                 norm_reward=vec_norm_rew,
                                 training=True,
                                 clip_reward=10.0)

In [75]:
# Periodic evaluation on the normalized evaluation envs. The base frequency of
# 10000 is divided by the number of parallel envs, with a floor of 1.
# NOTE(review): the episode count is fixed at 5 here and does not use the
# n_eval_episodes setting from the configuration cell - confirm this is intended.
eval_callback = EvalCallback(
    eval_norm_vec_env,
    n_eval_episodes=5,
    eval_freq=max(1, 10000 // n_envs),
    deterministic=True,
    render=False,
    warn=True,
    verbose=1,
)

class HyperParamCallback(BaseCallback):
    """
    Callback that records the model's hyperparameters, together with a set of
    metric tags, once at the start of training so they are logged to tensorboard.
    """

    def _on_training_start(self) -> None:
        """Record an ``HParam`` entry under the ``"hparams"`` key of the logger."""
        agent = self.model

        # Hyperparameters read from the model being trained.
        hyperparameters = {
            "algorithm": type(agent).__name__,
            "learning rate": agent.learning_rate,
            "gamma": agent.gamma,
        }

        # Metric tags paired with the hyperparameters; the values given here
        # are only initial placeholders.
        tracked_metrics = {
            "rollout/ep_len_mean": 0,
            "train/value_loss": 0.0,
        }

        # HParam entries are excluded from every text-based output format.
        text_formats = ("stdout", "log", "json", "csv")
        self.logger.record(
            "hparams",
            HParam(hyperparameters, tracked_metrics),
            exclude=text_formats,
        )

    def _on_step(self) -> bool:
        """Do nothing on each step; always return ``True`` so training continues."""
        return True

# Progress-bar callback; it is handed to model.learn() together with the
# evaluation and hyperparameter callbacks.
progress_bar = ProgressBarCallback()

# A Weights & Biases (wandb) callback could be added as well - check the documentation.

In [76]:
# Instance of the HyperParamCallback class defined above; passed to model.learn().
hyperparameter_callback = HyperParamCallback()

In [77]:
# Pink action-noise setup.
# NOTE(review): neither action_noise nor param_noise is passed to the PPO
# constructor in the visible cells - confirm whether they are still needed.
param_noise = None
noise_scale = 0.1

# Noise sequence length = number of time steps in one training episode.
seq_len = time_steps_per_hour * n_train_steps

# Size of the last axis of the (normalized) training env's action space.
n_actions = train_norm_vec_env.action_space.shape[-1]

action_noise = PinkActionNoise(noise_scale, seq_len, n_actions)

In [78]:
# PPO agent trained on the normalized, vectorized training environment.
model = PPO(
    # policy, environment and logging
    policy="MlpPolicy",
    env=train_norm_vec_env,
    tensorboard_log="./tb_log",
    verbose=1,
    # rollout collection
    n_steps=2048,
    gamma=0.993,
    gae_lambda=0.9,
    # optimisation
    learning_rate=0.0005,
    batch_size=128,
    n_epochs=8,
    max_grad_norm=0.5,
    # loss terms
    clip_range=0.2,
    clip_range_vf=None,
    normalize_advantage=True,
    ent_coef=0.0008,
    vf_coef=0.5,
)

Using cpu device


In [79]:
# (Re)load the TensorBoard notebook extension and start TensorBoard on
# "./tb_log", the directory the PPO model above logs to, on port 6006.
# NOTE(review): --bind_all serves TensorBoard on all network interfaces; drop
# the flag if the dashboard only needs to be reachable from this machine.
%reload_ext tensorboard
%tensorboard --logdir ./tb_log --bind_all --port 6006

Launching TensorBoard...

In [80]:
# Label and timestamp that identify this training run in file and log names.
comment = run_name
time_now = int(time.time())

# Per-run output locations for saved agents and logs.
trained_agents_dir = f"./FleetRL/RL_agents/trained_agents/vec_PPO-{time_now}-{run_name}"
logs_dir = f"./FleetRL/RL_agents/trained_agents/logs/vec_PPO-{time_now}-{run_name}"

# exist_ok=True creates missing directories and is a no-op for existing ones,
# without the check-then-create race of a separate os.path.exists() test.
for run_dir in (trained_agents_dir, logs_dir):
    os.makedirs(run_dir, exist_ok=True)

In [81]:
# Location of the "latest" agent and its VecNormalize statistics. It does not
# change between iterations, so it is defined and created once, up front.
log_dir = "./tmp/vec_PPO/"
os.makedirs(log_dir, exist_ok=True)
stats_path = os.path.join(log_dir, f"vec_normalize-{comment}.pkl")

# Train in chunks of saving_interval steps and checkpoint after every chunk.
for i in range(int(total_steps // saving_interval)):
    # reset_num_timesteps=False keeps the step counter (and the tensorboard
    # curve) continuous across the successive learn() calls.
    model.learn(total_timesteps=saving_interval,
                reset_num_timesteps=False,
                tb_log_name=f"PPO_{time_now}_{comment}",
                callback=[eval_callback, hyperparameter_callback, progress_bar])

    # Label the checkpoint with the nominal number of steps trained so far:
    # the first checkpoint holds saving_interval steps of training, not 0.
    steps_trained = saving_interval * (i + 1)
    model.save(f"{trained_agents_dir}/{steps_trained}")

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(log_dir + f"PPO-fleet_{comment}")
    train_norm_vec_env.save(stats_path)

Logging to ./tb_log/PPO_1692134414_LMD_2022_arbitrage_PPO_0


Output()

----------------------------------
| rollout/           |           |
|    ep_len_mean     | 192       |
|    ep_rew_mean     | -1.53e+03 |
| time/              |           |
|    fps             | 87        |
|    iterations      | 1         |
|    time_elapsed    | 47        |
|    total_timesteps | 4096      |
----------------------------------


KeyboardInterrupt: 