In [17]:
from optuna.trial import Trial
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from pink import PinkNoiseDist, PinkActionNoise
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, ProgressBarCallback
from stable_baselines3.common.logger import HParam
from FleetRL.fleet_env.fleet_environment import FleetEnv
import optuna
from stable_baselines3 import TD3
from stable_baselines3.common.evaluation import evaluate_policy
import time
import os

In [18]:
run_name = "td3_full_vecnorm_clip5_hp_study_aux"

In [19]:
time_now = int(time.time())
trained_agents_dir = f"./trained/vec_TD3-{time_now}-{run_name}"
logs_dir = f"./logs/vec_TD3-{time_now}-{run_name}"

if not os.path.exists(trained_agents_dir):
    os.makedirs(trained_agents_dir)

if not os.path.exists(logs_dir):
    os.makedirs(logs_dir)

In [20]:
n_cpu = 4
train_vec_env = make_vec_env(FleetEnv,
                             n_envs=n_cpu,
                             vec_env_cls=SubprocVecEnv,
                             env_kwargs={
                                 "schedule_name": "lmd_sched_single.csv",
                                 "building_name": "load_lmd.csv",
                                 "include_building": True,
                                 "include_pv": True,
                                 "time_picker": "random",
                                 "deg_emp": False,
                                 "include_price": True,
                                 "ignore_price_reward": False,
                                 "ignore_invalid_penalty": False,
                                 "ignore_overcharging_penalty": False,
                                 "ignore_overloading_penalty": False,
                                 "episode_length": 48,
                                 "normalize_in_env": False,
                                 "verbose": 0,
                                 "aux": True,
                                 "use_case": "lmd"
                             })

In [21]:
vec_norm_train_env = VecNormalize(venv=train_vec_env,
                                  training=True,
                                  norm_obs=True,
                                  norm_reward=True,
                                  clip_reward=10.0)

In [22]:
n_cpu = 4
eval_vec_env = make_vec_env(FleetEnv,
                             n_envs=1,
                             vec_env_cls=SubprocVecEnv,
                             env_kwargs={
                                 "schedule_name": "lmd_sched_single.csv",
                                 "building_name": "load_lmd.csv",
                                 "include_building": True,
                                 "include_pv": True,
                                 "time_picker": "eval",
                                 "deg_emp": False,
                                 "include_price": True,
                                 "ignore_price_reward": False,
                                 "ignore_invalid_penalty": False,
                                 "ignore_overcharging_penalty": False,
                                 "ignore_overloading_penalty": False,
                                 "episode_length": 48,
                                 "normalize_in_env": False,
                                 "verbose": 0,
                                 "aux": True,
                                 "use_case": "lmd"
                             })

In [23]:
vec_norm_eval_env = VecNormalize(venv=eval_vec_env,
                                 training=True,
                                 norm_obs=True,
                                 norm_reward=True,
                                 clip_reward=10.0)

In [24]:
class HyperParamCallback(BaseCallback):
    """
    Saves hyperparameters and metrics at start of training, logging to tensorboard
    """

    def _on_training_start(self) -> None:
        hparam_dict = {
            "algorithm": self.model.__class__.__name__,
            "learning rate": self.model.learning_rate,
            "gamma": self.model.gamma,
            "tau": self.model.tau,
            "learning starts": self.model.learning_starts,
            "batch size": self.model.batch_size,
            "buffer size": self.model.buffer_size,
            "policy_delay": self.model.policy_delay,
        }

        metric_dict = {
            "rollout/ep_len_mean": 0,
            "train/value_loss": 0.0,
        }

        self.logger.record(
            "hparams",
            HParam(hparam_dict, metric_dict),
            exclude=("stdout", "log", "json", "csv")
        )

    def _on_step(self) -> bool:
        return True


In [25]:
hp_callback = HyperParamCallback()
progress_bar_callback = ProgressBarCallback()

In [26]:
n_actions = vec_norm_train_env.action_space.shape[-1]
param_noise = None
seq_len = 48 * 4

In [27]:
def objective(trial: Trial):
    noise_scale = trial.suggest_categorical("noise_scale", [0.1, 0.2, 0.3, 0.4, 0.5])
    action_noise = PinkActionNoise(noise_scale, seq_len, n_actions)
    gamma = trial.suggest_categorical('gamma', [0.9, 0.91, 0.95, 0.97, 0.99])
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 3e-3, 5e-3, 1e-2, 3e-2, 5e-2, 1e-1])
    buffer_size = trial.suggest_categorical('buffer_size', [10000, 50000, 50000, 1000000])
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256, 512])
    tau = trial.suggest_categorical('tau', [0.0001, 0.0005, 0.0008, 0.001, 0.003, 0.005, 0.01])
    learning_starts = trial.suggest_categorical('learning_starts', [5000, 10000, 50000])
    train_freq = trial.suggest_categorical('train_freq', [2, 4, 8])

    model = TD3('MlpPolicy',
            env=vec_norm_train_env,
            verbose=0,
            learning_rate=learning_rate,
            learning_starts=learning_starts,
            buffer_size=buffer_size,
            batch_size=batch_size,
            gamma=gamma,
            tau=tau,
            train_freq=train_freq,
            action_noise=action_noise,
            tensorboard_log="./tb_log_hp"
           )

    model.learn(50000, callback=hp_callback)

    mean_reward, _ = evaluate_policy(model, env=vec_norm_eval_env, n_eval_episodes=5, deterministic=True)
    return mean_reward

In [28]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print(study.best_params)
print(study.best_value)
print(study.best_trial)

[I 2023-06-21 12:46:43,107] A new study created in memory with name: no-name-3739491d-0b81-459d-ba8c-539c64577440


Output()

[I 2023-06-21 12:47:17,664] Trial 0 finished with value: -171.7580381 and parameters: {'noise_scale': 0.1, 'gamma': 0.9, 'learning_rate': 5e-05, 'buffer_size': 50000, 'batch_size': 512, 'tau': 0.003, 'learning_starts': 50000, 'train_freq': 4}. Best is trial 0 with value: -171.7580381.


Output()

Process ForkServerProcess-10:
Process ForkServerProcess-6:
Process ForkServerProcess-8:
Process ForkServerProcess-7:
Process ForkServerProcess-9:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/enzo/Desktop/FleetRL/venv/lib/python3.10/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 34, in _worker
    cmd, data = remote.recv()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (

KeyboardInterrupt: 