In [1]:
import numpy as np
import time
import os

import FleetRL
from FleetRL.fleet_env.fleet_environment import FleetEnv

from stable_baselines3 import TD3
from stable_baselines3.common.vec_env import VecNormalize, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, ProgressBarCallback, BaseCallback
from stable_baselines3.common.logger import HParam

from pink import PinkNoiseDist, PinkActionNoise

In [2]:
# Identifier of this experiment; `comment` is an alias used when naming
# TensorBoard runs and saved artefacts.
comment = run_name = "td3_LMD_full"

In [3]:
# Unix timestamp (whole seconds) so that every run writes into its own folders.
time_now = int(time.time())
trained_agents_dir = f"./trained/vec_TD3-{time_now}-{run_name}"
logs_dir = f"./logs/vec_TD3-{time_now}-{run_name}"

# exist_ok=True replaces the check-then-create pattern: it is shorter and does
# not fail if the directory appears between the check and the creation.
os.makedirs(trained_agents_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

In [4]:
# Number of parallel worker processes used for collecting training experience.
n_cpu = 2

# Training environments: one FleetEnv per subprocess, all sharing the same
# configuration. Normalisation is left to the VecNormalize wrapper, hence
# normalize_in_env=False.
train_vec_env = make_vec_env(
    FleetEnv,
    n_envs=n_cpu,
    vec_env_cls=SubprocVecEnv,
    env_kwargs=dict(
        # input data
        schedule_name="50_lmd.csv",
        building_name="load_lmd.csv",
        use_case="lmd",
        # episode setup
        time_picker="static",
        episode_length=48,
        # observation content
        include_building=True,
        include_pv=True,
        include_price=True,
        aux=True,
        normalize_in_env=False,
        # reward / penalty switches
        deg_emp=False,
        ignore_price_reward=False,
        ignore_invalid_penalty=False,
        ignore_overcharging_penalty=False,
        ignore_overloading_penalty=False,
        # logging
        verbose=0,
    ),
)

In [5]:
# Wrap the training environments so observations and rewards are normalised
# with running statistics (training=True); rewards are clipped at 10.
train_norm_vec_env = VecNormalize(
    venv=train_vec_env,
    training=True,
    norm_obs=True,
    norm_reward=True,
    clip_reward=10.0,
)

In [6]:
# Evaluation environment: a single FleetEnv configured like the training
# environments, except that time_picker is set to "eval".
eval_vec_env = make_vec_env(
    FleetEnv,
    n_envs=1,
    vec_env_cls=SubprocVecEnv,
    env_kwargs=dict(
        # input data
        schedule_name="50_lmd.csv",
        building_name="load_lmd.csv",
        use_case="lmd",
        # episode setup
        time_picker="eval",
        episode_length=48,
        # observation content
        include_building=True,
        include_pv=True,
        include_price=True,
        aux=True,
        normalize_in_env=False,
        # reward / penalty switches
        deg_emp=False,
        ignore_price_reward=False,
        ignore_invalid_penalty=False,
        ignore_overcharging_penalty=False,
        ignore_overloading_penalty=False,
        # logging
        verbose=0,
    ),
)

In [7]:
# Normalisation wrapper for evaluation.
# - training=False: evaluation episodes must not update the running statistics.
#   EvalCallback copies the statistics over from the training environment
#   before each evaluation, so the agent sees identically scaled observations.
# - norm_reward=False: evaluation rewards stay on the environment's own scale.
# clip_reward is kept for parity with the training wrapper; it has no effect
# while norm_reward is False.
eval_norm_vec_env = VecNormalize(venv=eval_vec_env,
                                 norm_obs=True,
                                 norm_reward=False,
                                 training=False,
                                 clip_reward=10.0)

In [8]:
# Periodic evaluation during training.
# eval_freq is divided by n_cpu because the callback fires once per vectorised
# step (n_cpu timesteps), giving one evaluation roughly every 10000 timesteps.
eval_callback = EvalCallback(
    eval_env=eval_norm_vec_env,
    n_eval_episodes=5,
    eval_freq=max(10000 // n_cpu, 1),
    deterministic=True,
    render=False,
    warn=True,
    verbose=1,
)

In [9]:
class HyperParamCallback(BaseCallback):

    """
    Logs the model's hyperparameters, together with the metrics to track, to
    TensorBoard when training starts so that they appear in the HPARAMS tab.
    """

    def _on_training_start(self) -> None:
        """Record the hyperparameters and the tracked metrics via the logger."""
        hparam_dict = {
            "algorithm": self.model.__class__.__name__,
            "learning rate": self.model.learning_rate,
            "gamma": self.model.gamma,
            "tau": self.model.tau,
            "learning starts": self.model.learning_starts,
            "batch size": self.model.batch_size,
            "buffer size": self.model.buffer_size,
            "policy_delay": self.model.policy_delay,
        }

        # The keys must name scalars that the algorithm actually logs, otherwise
        # TensorBoard has nothing to associate with the hyperparameters.
        # TD3 reports actor and critic losses; it never logs "train/value_loss".
        metric_dict = {
            "rollout/ep_len_mean": 0,
            "train/critic_loss": 0.0,
            "train/actor_loss": 0.0,
        }

        # HParam objects are only meaningful for TensorBoard, so every other
        # output format is excluded.
        self.logger.record(
            "hparams",
            HParam(hparam_dict, metric_dict),
            exclude=("stdout", "log", "json", "csv")
        )

    def _on_step(self) -> bool:
        """Do nothing per step; returning True lets training continue."""
        return True


In [10]:
# Instance of the hyperparameter logger; it is handed to model.learn() together
# with the evaluation callback.
hyperparameter_callback = HyperParamCallback()

In [11]:
# No parameter-space noise is used; exploration comes from action noise only.
param_noise = None

# Exploration noise settings.
noise_scale = 0.2
# 192 steps, the episode length reported by the evaluation log
# (presumably 48 h at four steps per hour - TODO confirm against FleetEnv).
seq_len = 4 * 48
# Size of the action vector, read from the last axis of the action space shape.
n_actions = train_norm_vec_env.action_space.shape[-1]

# Pink action noise, passed positionally as (scale, sequence length, action dimension).
action_noise = PinkActionNoise(noise_scale, seq_len, n_actions)

In [12]:
# TD3 agent acting on the normalised, vectorised training environment.
model = TD3(
    policy="MlpPolicy",
    env=train_norm_vec_env,
    # optimisation
    learning_rate=0.001,
    batch_size=100,
    gamma=0.97,
    tau=0.01,
    # replay buffer and update schedule
    buffer_size=1000000,
    learning_starts=20000,
    train_freq=(4, "step"),
    # exploration
    action_noise=action_noise,
    # logging
    tensorboard_log="./tb_log",
    verbose=0,
)


In [None]:
# Number of timesteps trained between two checkpoints.
saving_interval = 50000

# Location of the rolling "latest" snapshot. These values never change, so they
# are computed once instead of on every iteration.
log_dir = "./tmp/vec_td3/"
os.makedirs(log_dir, exist_ok=True)
stats_path = os.path.join(log_dir, f"vec_normalize-{comment}.pkl")

for i in range(1, 60):
    # reset_num_timesteps=False continues the timestep counter and the
    # TensorBoard run instead of starting from zero on every call.
    model.learn(total_timesteps=saving_interval,
                reset_num_timesteps=False,
                tb_log_name=f"TD3_{time_now}_{comment}",
                callback=[eval_callback, hyperparameter_callback])

    # Numbered checkpoint of the agent ...
    checkpoint_steps = saving_interval * i
    model.save(f"{trained_agents_dir}/{checkpoint_steps}")
    # ... together with the VecNormalize statistics it was trained with.
    # Without them an intermediate checkpoint cannot be evaluated correctly,
    # because the only other stats file is overwritten on every iteration.
    train_norm_vec_env.save(
        os.path.join(trained_agents_dir, f"vec_normalize-{checkpoint_steps}.pkl")
    )

    # Rolling "latest" snapshot: agent plus matching normalisation statistics.
    model.save(log_dir + f"td3-fleet_{comment}")
    train_norm_vec_env.save(stats_path)

Eval num_timesteps=10000, episode_reward=-181.46 +/- 142.47
Episode length: 192.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-178.28 +/- 154.20
Episode length: 192.00 +/- 0.00
New best mean reward!
Eval num_timesteps=30000, episode_reward=-13.70 +/- 8.21
Episode length: 192.00 +/- 0.00
New best mean reward!
Eval num_timesteps=40000, episode_reward=-25.88 +/- 19.48
Episode length: 192.00 +/- 0.00
Eval num_timesteps=50000, episode_reward=-22.45 +/- 13.44
Episode length: 192.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=-13.19 +/- 7.93
Episode length: 192.00 +/- 0.00
New best mean reward!
Eval num_timesteps=70000, episode_reward=-20.61 +/- 4.42
Episode length: 192.00 +/- 0.00
Eval num_timesteps=80000, episode_reward=-10.03 +/- 5.57
Episode length: 192.00 +/- 0.00
New best mean reward!
Eval num_timesteps=90000, episode_reward=-22.86 +/- 12.41
Episode length: 192.00 +/- 0.00
Eval num_timesteps=100000, episode_reward=-13.18 +/- 8.75
Episode length: 192.00