In [1]:
import numpy as np
import time
import os

import FleetRL
from FleetRL.fleet_env.fleet_environment import FleetEnv

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, ProgressBarCallback, BaseCallback
from stable_baselines3.common.logger import HParam

from pink import PinkNoiseDist, PinkActionNoise

In [2]:
# Identifier of this experiment; it is embedded in output folder and file names.
# `comment` is an alias carrying the same value.
run_name = comment = "CT_2021_arbitrage_PPO"

In [3]:
# The integer Unix timestamp is part of both folder names below.
time_now = int(time.time())
trained_agents_dir = f"./trained/vec_PPO-{time_now}-{run_name}"
logs_dir = f"./logs/vec_PPO-{time_now}-{run_name}"

# Create both output folders unless they are already present.
for _output_dir in (trained_agents_dir, logs_dir):
    if not os.path.exists(_output_dir):
        os.makedirs(_output_dir)

In [4]:
# Keyword arguments forwarded to every FleetEnv instance used for training.
# NOTE(review): the meaning of the individual options is defined by FleetRL;
# this configuration differs from the evaluation one only in `time_picker`.
env_args = dict(
    schedule_name="ct_sched_single.csv",
    building_name="load_ct.csv",
    price_name="spot_2021_new.csv",
    tariff_name="spot_2021_new_tariff.csv",
    use_case="ct",
    verbose=False,
    time_picker="random",
    episode_length=48,
    calculate_degradation=True,
    log_data=False,
    normalize_in_env=False,
    aux=True,
    spot_markup=0,
    spot_mul=1,
    feed_in_ded=0,
)

# Number of environment copies; SubprocVecEnv is requested as the vectorized
# environment class, with one FleetEnv per copy.
n_cpu = 10
train_vec_env = make_vec_env(
    FleetEnv,
    n_envs=n_cpu,
    env_kwargs=env_args,
    vec_env_cls=SubprocVecEnv,
)

In [5]:
# Wrap the training environments so that observations and rewards are
# normalized; `training=True` is passed so the wrapper starts in training mode.
train_norm_vec_env = VecNormalize(
    venv=train_vec_env,
    training=True,
    norm_obs=True,
    norm_reward=True,
    clip_reward=10.0,
)

In [6]:
# Keyword arguments for the evaluation environment. The values mirror the
# training configuration except for `time_picker`, which is "eval" here.
# Note that this rebinds `env_args`.
env_args = dict(
    schedule_name="ct_sched_single.csv",
    building_name="load_ct.csv",
    price_name="spot_2021_new.csv",
    tariff_name="spot_2021_new_tariff.csv",
    use_case="ct",
    verbose=False,
    time_picker="eval",
    episode_length=48,
    calculate_degradation=True,
    log_data=False,
    normalize_in_env=False,
    aux=True,
    spot_markup=0,
    spot_mul=1,
    feed_in_ded=0,
)

# A single environment is used for evaluation.
eval_vec_env = make_vec_env(
    FleetEnv,
    n_envs=1,
    env_kwargs=env_args,
    vec_env_cls=SubprocVecEnv,
)

In [7]:
# Normalization wrapper for the evaluation environment.
# - training=False: evaluation episodes must not update the running
#   normalization statistics.
# - norm_reward=False: reward normalization is only useful while training, so
#   the wrapper leaves evaluation rewards unscaled.
# NOTE(review): EvalCallback is expected to copy the training environment's
# statistics into this wrapper before each evaluation - confirm for the
# installed stable-baselines3 version.
eval_norm_vec_env = VecNormalize(venv=eval_vec_env,
                                 norm_obs=True,
                                 norm_reward=False,
                                 training=False,
                                 clip_reward=10.0)

In [8]:
# Periodic evaluation on the dedicated evaluation environment.
# The frequency is divided by the number of parallel training environments;
# the captured output below shows evaluations 10000 timesteps apart.
eval_callback = EvalCallback(
    eval_env=eval_norm_vec_env,
    n_eval_episodes=5,
    eval_freq=max(10000 // n_cpu, 1),
    deterministic=True,
    render=False,
    verbose=1,
    warn=True,
)

In [9]:
class HyperParamCallback(BaseCallback):

    """
    Saves hyperparameters and metrics at start of training, logging to tensorboard.

    Only hyperparameters that the attached algorithm actually defines are
    recorded. This lets the callback work with PPO, which has no ``tau``,
    ``learning_starts``, ``buffer_size`` or ``policy_delay``, as well as with
    off-policy algorithms that do.
    """

    # (label shown in tensorboard, attribute name looked up on the model)
    HPARAM_ATTRIBUTES = (
        ("learning rate", "learning_rate"),
        ("gamma", "gamma"),
        ("batch size", "batch_size"),
        # off-policy algorithms
        ("tau", "tau"),
        ("learning starts", "learning_starts"),
        ("buffer size", "buffer_size"),
        ("policy_delay", "policy_delay"),
        # on-policy algorithms such as PPO
        ("n_steps", "n_steps"),
        ("n_epochs", "n_epochs"),
        ("gae_lambda", "gae_lambda"),
        ("ent_coef", "ent_coef"),
        ("vf_coef", "vf_coef"),
        ("max_grad_norm", "max_grad_norm"),
    )

    def _on_training_start(self) -> None:
        """Record the available hyperparameters once, when training begins."""
        hparam_dict = {"algorithm": self.model.__class__.__name__}

        for label, attribute in self.HPARAM_ATTRIBUTES:
            value = getattr(self.model, attribute, None)
            # Missing attributes come back as None and are skipped, as are
            # values that are not plain scalars/strings (for example a
            # learning-rate schedule passed as a callable).
            if isinstance(value, (bool, int, float, str)):
                hparam_dict[label] = value

        # Metrics are referenced by their tag; the values given here are
        # placeholders.
        metric_dict = {
            "rollout/ep_len_mean": 0,
            "train/value_loss": 0.0,
        }

        # HParam records are only written to tensorboard; every other output
        # format is excluded.
        self.logger.record(
            "hparams",
            HParam(hparam_dict, metric_dict),
            exclude=("stdout", "log", "json", "csv")
        )

    def _on_step(self) -> bool:
        """Return True so that training always continues."""
        return True


In [10]:
# Instantiate the hyperparameter-logging callback.
# NOTE(review): the training cell passes only `eval_callback` to `model.learn`,
# so this instance is currently not used.
hyperparameter_callback = HyperParamCallback()

In [11]:
# Pink action-noise settings.
# NOTE(review): PPO's constructor has no `action_noise` parameter, so this
# object is not attached to the model below. The names are kept so that any
# cell referring to them still works.
n_actions = train_norm_vec_env.action_space.shape[-1]
param_noise = None
noise_scale = 0.1
seq_len = 48 * 4
action_noise = PinkActionNoise(noise_scale, seq_len, n_actions)

# PPO agent trained on the normalized, vectorized training environments.
# A second `PPO(...)` call that used to follow passed off-policy arguments
# (train_freq, learning_starts, buffer_size, tau, action_noise) which PPO does
# not accept; it raised a TypeError and has been removed. `model` therefore
# keeps the configuration below, exactly as it effectively did before.
model = PPO(policy="MlpPolicy",
            env=train_norm_vec_env,
            verbose=0,
            tensorboard_log="./tb_log",
            gamma=0.99,
            learning_rate=0.0005,
            batch_size=128,
            n_epochs=8,
            gae_lambda=0.9,
            clip_range=0.2,
            clip_range_vf=None,
            normalize_advantage=True,
            ent_coef=0.0005,
            vf_coef=0.5,
            max_grad_norm=0.5,
            n_steps=2048)

In [12]:
# Resume from the checkpoint written by the training loop.
#
# `VecNormalize.load` is a static method that RETURNS a new wrapper, so its
# result has to be assigned; calling it on the existing instance and dropping
# the return value leaves the normalization statistics untouched. The loaded
# wrapper is built around the raw vectorized environment rather than around
# the already-normalized one, which would normalize twice.
train_norm_vec_env = VecNormalize.load("./tmp/vec_PPO/vec_normalize-CT_2021_arbitrage_PPO.pkl",
                                       venv=train_vec_env)
# Keep updating the running statistics while training continues.
train_norm_vec_env.training = True

model = PPO.load("./tmp/vec_PPO/PPO-fleet_CT_2021_arbitrage_PPO.zip", env=train_norm_vec_env,
                 custom_objects={"observation_space": train_norm_vec_env.observation_space,
                                 "action_space": train_norm_vec_env.action_space})
# Hyperparameters of the resumed agent can be overridden here, e.g.:
# model.ent_coef = 0.0005

In [None]:
# Number of timesteps requested from `model.learn` between two checkpoints.
saving_interval = 50000

# Location of the "latest" agent and its normalization statistics; these are
# the files the resume cell loads. Both paths are loop-invariant.
log_dir = "./tmp/vec_PPO/"
os.makedirs(log_dir, exist_ok=True)
stats_path = os.path.join(log_dir, f"vec_normalize-{comment}.pkl")

for i in range(100):
    # reset_num_timesteps=False continues the timestep counter, and with it
    # the tensorboard curve, across successive `learn` calls.
    model.learn(total_timesteps=saving_interval,
                reset_num_timesteps=False,
                tb_log_name=f"PPO_{time_now}_{comment}",
                callback=[eval_callback])

    # Label the checkpoint with the steps requested so far in this session.
    # `i + 1` is used because the first checkpoint is written after one full
    # interval, not after zero steps.
    model.save(f"{trained_agents_dir}/{saving_interval * (i + 1)}")

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(log_dir + f"PPO-fleet_{comment}")
    train_norm_vec_env.save(stats_path)

Eval num_timesteps=5744400, episode_reward=-46.71 +/- 48.06
Episode length: 192.00 +/- 0.00
New best mean reward!
Eval num_timesteps=5754400, episode_reward=-69.56 +/- 70.61
Episode length: 192.00 +/- 0.00
Eval num_timesteps=5764400, episode_reward=-40.70 +/- 39.43
Episode length: 192.00 +/- 0.00
New best mean reward!
Eval num_timesteps=5774400, episode_reward=-14.30 +/- 50.93
Episode length: 192.00 +/- 0.00
New best mean reward!
Eval num_timesteps=5784400, episode_reward=-24.63 +/- 54.20
Episode length: 192.00 +/- 0.00
Eval num_timesteps=5794400, episode_reward=27.07 +/- 7.58
Episode length: 192.00 +/- 0.00
New best mean reward!
Eval num_timesteps=5804400, episode_reward=-26.49 +/- 48.44
Episode length: 192.00 +/- 0.00
Eval num_timesteps=5814400, episode_reward=-92.95 +/- 202.66
Episode length: 192.00 +/- 0.00
Eval num_timesteps=5824400, episode_reward=-87.53 +/- 184.10
Episode length: 192.00 +/- 0.00
Eval num_timesteps=5834400, episode_reward=-20.28 +/- 46.35
Episode length: 192.00 +